Example #1
def sparkconfig():
    # spark configuration options

    # conf = SparkConf()
    # conf.setMaster("spark://3.168.100.58:7077") # uncomment for standalone cluster
    # conf.setMaster("local")   # uncomment for local execution
    # conf.setAppName("demo_chain")
    # conf.set("spark.executor.memory", "2g")
    # conf.set("spark.default.parallelism", 56)  # 48)
    # conf.set("spark.sql.inMemoryColumnarStorage.compressed","true")
    # conf.set("sql.inMemoryColumnarStorage.batchSize",2000)

    # AMAZON AWS EMR
    conf = SparkConf()
    conf.setMaster("yarn-client")	#client gets output to terminals
    #conf.setMaster("yarn-cluster")	# this seems to runf aster but can't confirm
    conf.set("spark.default.parallelism",648)
    conf.setAppName("spark_markov_chain")
    conf.set("spark.executor.memory", "22g")
    conf.set("spark.executor.instances",9)
    conf.set("spark.executor.cores",9)
    conf.set("spark.yarn.executor.memoryOverhead",800)
    conf.set("spark.rdd.compress","True")
    conf.set("spark.shuffle.consolidateFiles","True")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    return conf
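A minimal usage sketch for the EMR helper above (a sketch, assuming pyspark is installed and the script runs on a node that can reach the YARN cluster; the tuning values inside sparkconfig() are specific to that cluster):

from pyspark import SparkContext

sc = SparkContext(conf=sparkconfig())   # build the context from the tuned conf
print(sc.defaultParallelism)            # reflects spark.default.parallelism
sc.stop()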
def configureSpark():
	conf = SparkConf()
	conf.setMaster("local")
	conf.setAppName("Apache Spark Alarm Parser")
	conf.set("spark.executor.memory", "1g")
	sc = SparkContext(conf = conf)
	return sc
Example #3
    def __connected_yarn_spark_cluster(self, pilotcompute_description):

        number_cores = 1
        if "number_cores" in pilotcompute_description:
            number_cores = int(pilotcompute_description["number_cores"])

        number_of_processes = 1
        if "number_of_processes" in pilotcompute_description:
            number_of_processes = int(pilotcompute_description["number_of_processes"])

        executor_memory = "1g"
        if "physical_memory_per_process" in pilotcompute_description:
            executor_memory = pilotcompute_description["physical_memory_per_process"]

        conf = SparkConf()
        conf.set("spark.num.executors", str(number_of_processes))
        conf.set("spark.executor.instances", str(number_of_processes))
        conf.set("spark.executor.memory", executor_memory)
        conf.set("spark.executor.cores", number_cores)
        if pilotcompute_description is not None:
            for i in pilotcompute_description.keys():
                if i.startswith("spark"):
                    conf.set(i, pilotcompute_description[i])
        conf.setAppName("Pilot-Spark")
        conf.setMaster("yarn-client")
        sc = SparkContext(conf=conf)
        sqlCtx = SQLContext(sc)
        pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
        return pilot
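For reference, a sketch of the pilotcompute_description dict the method above expects; the keys mirror the ones it reads, and any additional key that starts with "spark" is forwarded to the SparkConf unchanged (the values here are illustrative):

pilotcompute_description = {
    "number_of_processes": 4,              # becomes spark.executor.instances
    "number_cores": 2,                     # becomes spark.executor.cores
    "physical_memory_per_process": "4g",   # becomes spark.executor.memory
    "spark.yarn.queue": "default",         # forwarded verbatim (starts with "spark")
}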
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                        '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
Example #5
    def spark_config(self):
        if self._spark_config is None:
            os.environ['SPARK_SUBMIT_CLASSPATH'] = ','.join(self.spex_conf.spark_config.jars)

            conf = SparkConf()
            conf.setAppName(self.spex_conf.spark_config.name)
            conf.setMaster(self.spex_conf.spark_config.master)

            conf.set('spark.rdd.compress', 'true')
            conf.set('spark.io.compression.codec', 'lz4')
            conf.set('spark.mesos.coarse',
                     'true' if self.spex_conf.spark_config.coarse_mode else 'false')

            # TODO - Setup all the other cruft as needed
            #conf.set('spark.executor.memory', '4g')
            #conf.set('spark.cores.max', '16')
            #conf.set('spark.task.cpus', '6')

            # TODO - bind port for spark web ui

            self._spark_config = conf

        config = self._spark_config

        # These are always set, if someone changes them we simply set them back
        config.set('spark.executor.uri', self.artifact_resolver(self.spex_conf.spark_distro))
        config.setExecutorEnv(key='PYSPARK_PYTHON', value='./%s daemon' % self.spex_conf.spex_name)
        return config
Example #6
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating and SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example #7
def main(args):

    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name

    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    #  Getting response_codes from log objects and caching it
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Resonse Codes: " + str(log_count))
    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    response200 = cnt.filter(lambda x: x[0] == "200").map(lambda (x, y): y).collect()
    print("###########################")
    print("##  Success Rate : " + str(int(response200[0])*100/log_count) + " %  ##")
    print("###########################")
def read_conf():
    """
    Setting up spark contexts
    """
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("Testing")
    return conf
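A quick local smoke test for read_conf() (a sketch; it only assumes a local pyspark installation):

from pyspark import SparkContext

sc = SparkContext(conf=read_conf())
print(sc.parallelize(range(10)).sum())   # prints 45
sc.stop()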
Example #9
 def _test_broadcast_on_driver(self, *extra_confs):
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     bs = self.sc.broadcast(value=5)
     self.assertEqual(5, bs.value)
Example #10
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py','EventParallelize.py'])
 
    return sc
Example #11
    def __init__(self, master, name):
        self.name=name
        self.master=master

        print "init spark ..."
        os.environ["HADOOP_HOME"]="D:\code\wqr\hadoop-common-2.2.0-bin"
        conf = SparkConf()
        conf.setMaster(self.master)
        conf.setAppName(self.name)

        self.sc = SparkContext(conf=conf)
Example #12
 def init(self):
     os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local[10]")
     conf.setAppName("PySparkShell")
     conf.set("spark.executor.memory", "2g")
     conf.set("spark.driver.memory", "1g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)        
Example #13
 def _test_multiple_broadcasts(self, *extra_confs):
     """
     Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
     and also multiple jobs.
     """
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     self._test_encryption_helper([5])
     self._test_encryption_helper([5, 10, 20])
Example #14
 def __connected_spark_cluster(self, resource_url, pilot_description=None):
     conf = SparkConf()
     conf.setAppName("Pilot-Spark")
     if pilot_description is not None:
         for i in pilot_description.keys():
             if i.startswith("spark"):
                 conf.set(i, pilot_description[i])
     conf.setMaster(resource_url)
     print(conf.toDebugString())
     sc = SparkContext(conf=conf)
     sqlCtx = SQLContext(sc)
     pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
     return pilot
def main():
   count=0
   #Initializing Spark Configuration for the Master Node
   config = SparkConf().setAppName('DiskDetection_App')
   config.setMaster('local[6]')                                                    #indicates the number of threads on the master node
   sc = SparkContext(conf=config)                                                  # Initializing the Spark Context
   for i in os.listdir(os.environ["MODEL_CSV_FILEPATH"]):
       # Loop to restrict training to 20 models (only for better analysis purpose)
       if count < 20:
          modelName = os.path.splitext(i)[0]
          print(modelName)
          predictMain(modelName,sc)
          count+=1
Example #16
def main():
    conf = SparkConf()
    conf.setMaster('local[*]')
    conf.setAppName('spark-basic')
    sc = SparkContext(conf=conf)
    churn_df = read_dataset(sc, "churn_no_header.csv")
    pipeline = build_pipeline()
    training_data, test_data = train_test_split(churn_df, 0.2)
    model = pipeline.fit(training_data)
    predictions = model.transform(test_data)
    predictions.show(20)

    (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
    print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
Example #17
def configureSpark(app_name, master):
	
	#Configure SPARK
	conf = SparkConf().setAppName(app_name)
	conf = conf.setMaster(master)
	spark_context = SparkContext(conf=conf)
	return spark_context
Example #18
def configureSpark():
	#Configure SPARK
	conf = SparkConf().setAppName("a")
	conf = conf.setMaster("local[*]")
	conf = conf.set("spark.executor.memory", "2g").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").set("spark.kryoserializer.buffer", "256").set("spark.akka.frameSize", "500").set("spark.rpc.askTimeout", "30").set('spark.executor.cores', '4').set('spark.driver.memory','2g')

	sc = SparkContext(conf=conf)
	return sc
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                        'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model',
                        help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)
    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url,
                                                 rest_url, som))

    ssc.start()
    ssc.awaitTermination()
Example #20
def configureSpark(app_name, master):
	
	#Configure SPARK
	conf = SparkConf().setAppName(app_name)
	conf = conf.setMaster(master)
	#conf.set("fs.s3n.awsAccessKeyId", "")
	#conf.set("fs.s3n.awsSecretAccessKey", "")
	spark_context = SparkContext(conf=conf)
	return spark_context
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    sc = None
    try:
        sc.stop()
    except Exception:
        pass
    sc = SparkContext(conf=sc_conf)

    return sc
Example #22
def main():
    conf = SparkConf()
    conf.setMaster('local[*]')
    conf.setAppName('renewer-prediction-spark')
    filename = '/Users/andyyoo/scikit_learn_data/renewer/Orange_Dataset.no.header.csv'
    sc = SparkContext(conf=conf)
    df = read_dataset(sc, filename)
    df = pipe_index_string_cols(df, cols=["label"])
    df = pipe_assemble_features(df, excluded_cols=["label"])
    df = pipe_scale_cols(df, with_mean=True, with_std=True, use_dense_vector=False)
    df.show()

    training_data, test_data = train_test_split(df, 0.2)
    model = rf_classifier().fit(training_data)
    predictions = model.transform(test_data)
    predictions.show(20)

    (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
    print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
def main():
    # master = 'local[2]'
    master = 'spark://192.168.9.164:7077'
    app_name = 'test-broadcast'
    # spark_home = '/data01/app/bigdata/spark'  # local
    spark_home = '/home/hadoop/app/spark'  # test

    pyFiles = ['mysql_utils.py']
    spark_conf = SparkConf()
    spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home)
    sc = SparkContext(conf=spark_conf)
    for path in (pyFiles or []):
        sc.addPyFile(path)

    external_cache = get_api_deviceinfo()

    deviceinfo_b = sc.broadcast(external_cache)


    sc.stop()
def main():
    conf = SparkConf()
    conf.setMaster("spark://192.168.199.123:8070")
    conf.setAppName("User Profile Spark")

    sc = SparkContext(conf=conf)
    print("connection sucessed with Master", conf)
    data = [1, 2, 3, 4]
    distData = sc.parallelize(data)
    print(distData.collect())
    #
    raw = open(TRACKS_PATH, 'r').read().split("\n")
    tackfile = sc.parallelize(raw)

    tackfile = tackfile.filter(lambda line: len(line.split(',')) == 6)
    tbycust = tackfile.map(lambda line: make_tracks_kv(line)).reduceByKey(lambda a, b: a + b)

    custdata = tbycust.mapValues(lambda a: compute_stats_byuser(a))

    print(custdata.first())
Example #25
def run_cluster():
    """ 集群模式
    """

    "集群配置"
    conf = SparkConf()
    conf.setMaster("spark://jldrp-4:7077")
    conf.setAppName("WebCat")

    ts_chunks = []
    if len(sys.argv) > 1:
        test_set_file_path = sys.argv[1]
    else: exit("未提供测试集文件! 现在退出...")
    with open(test_set_file_path) as test_set_file:
       ts_chunks = split_file(test_set_file)

    sc = SparkContext(pyFiles=["ProvincesCities.csv", "stopwords.txt", "training.set.balanced.40", "text_cat.py"]
                     ,conf=conf         
                     )
    #ts = sc.textFile("hdfs://jldrp-4:7077/user/liulx/webcat/gt100.gt100.valid.test.set")
    #res = ts.flatMap(lambda x: text_cat.pipe(x)).collect()
    res = sc.parallelize(ts_chunks).flatMap(lambda x: text_cat.pipe(x))
    res.saveAsTextFile("hdfs://jldrp-4:8020/webcat/cat.result")
def words_count_mapReduce():
   
  # configuration
  APP_NAME = 'word count'
  conf = SparkConf().setAppName(APP_NAME)
  conf = conf.setMaster('spark://ukko160:7077')
  sc = SparkContext(conf=conf)

  # actual function
  lines = sc.textFile("../spark-1.4.1-bin-hadoop2.6/README.md")
  table = lines.flatMap(f_flatmapper).map(f_mapper).reduce(f_reducer)
  for x in table:
    print(x)

  pass
def count_line_mapReduce():
  '''
  This function will count the number of lines in the file.
  It is implemented with mapReduce heuristics.
  '''
  # spark configuration
  APP_NAME = 'count lines in mapReduce'
  conf = SparkConf().setAppName(APP_NAME)
  conf = conf.setMaster('spark://ukko160:7077')
  sc = SparkContext(conf=conf)

  # mapReduce function call
  lines = sc.textFile('../spark-1.4.1-bin-hadoop2.6/README.md')
  lineLength = lines.map(count_line_map)
  totalLength = lineLength.reduce(count_line_reduce)
  return totalLength
  pass
def count_lines_lambdaExpression():
  '''
  The function is to compute the number of line in a text file.
  The function is implemented with python lambda expression.
  '''
  # configuration
  APP_NAME = 'count lines'
  conf = SparkConf().setAppName(APP_NAME)
  conf = conf.setMaster('spark://ukko160:7077')
  sc = SparkContext(conf=conf)

  # actuall lambda
  lines = sc.textFile('../spark-1.4.1-bin-hadoop2.6/README.md')
  lineLength = lines.map(lambda s: len(s))
  totalLength = lineLength.reduce(lambda a,b: a+b)
  return totalLength
  pass
def words_count_lambdaExpression():
   
  # configuration
  APP_NAME = 'words count with python lambda expression'
  conf = SparkConf().setAppName(APP_NAME)
  conf = conf.setMaster('spark://ukko160:7077')
  sc = SparkContext(conf=conf)

  # actual function
  lines = sc.textFile("../spark-1.4.1-bin-hadoop2.6/README.md")
  words = lines.flatMap(lambda x: x.split(' '))
  pairs = words.map(lambda x: (x,1))
  count = pairs.reduceByKey(lambda x,y: x+y)

  for x in count.collect():
    print(x)

  pass
Example #30
def main(arglist):

    with open("log_file_x.txt", "a") as f:
        f.write("Start time of sort...... %s\n" % datetime.datetime.now())

    print("Start time...... %s" % datetime.datetime.now())

    # mapreduce params
    path = arglist[0]
    output = arglist[1]
    minPartitions = int(arglist[2])

    # initialize
    conf = SparkConf()
    conf = conf.setMaster('local').setAppName("PythonSort").set("spark.driver.memory", "10g").set("spark.driver.maxResultSize", "3g")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(path)
    counts = lines.flatMap(lambda x: x.split('\n')) \
                  .map(lambda x: (x, 1)) \
                  .sortByKey()
    counts.saveAsTextFile(output)
    # # print(rdd)
    # f = open(output, 'w')
    # f.writelines('\n'.join(rdd))
    # f.close()

    # # write to one single file
    # single_output = open('single_output', 'w')
    # for i in range(minPartitions):
    #     file_name = 'part-000' + ('0'+str(i) if i < 10 else str(i))
    #     file_path = os.path.join(output, file_name)
    #     file = open(file_path, 'r')
    #
    #     single_output.write(''.join(file))
    # single_output.close()
    sc.stop()

    print("End time of sort...... %s" % datetime.datetime.now())
    with open("log_file_x.txt", "a") as f:
        f.write("End time of sort...... %s\n" % datetime.datetime.now())
Example #31
# -*- coding: utf-8 -*-
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris
import pandas
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark2pmml import PMMLBuilder
from pyspark.ml.feature import RFormula

# Configure the Spark client
conf = SparkConf().setAppName("lr_spark").set(
    "spark.jars",
    "./jpmml-sparkml-executable-1.4.5.jar")  # 注意: 这里需要加载jpmml jar
conf = conf.setMaster("local")
sc = SparkContext(conf=conf)

# Load the sklearn training data
iris = load_iris()
# feature matrix
features = pandas.DataFrame(iris.data, columns=iris.feature_names)
# target matrix
targets = pandas.DataFrame(iris.target, columns=['Species'])
# merge features and targets
merged = pandas.concat([features, targets], axis=1)

# Create a SparkSession
sess = SparkSession(sc)

# Create a Spark DataFrame
raw_df = sess.createDataFrame(merged)
Example #32
import os
import os.path
from pyspark import SparkContext

if 'BACKEND_COMPUTE_MASTER' in os.environ:
    master = os.environ['BACKEND_COMPUTE_MASTER']
else:
    master = 'localhost'

logFile = os.path.join("/tmp/data/README.md")

from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.setMaster('spark://' + master + ':7077')
conf.setAppName("simpleapp")
sc = SparkContext(conf=conf)
logData = sc.textFile(logFile).cache()

numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()

print "Lines with a: %i, lines with b: %i" % (numAs, numBs)
import atexit

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, MultilayerPerceptronClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from sklearn.metrics import matthews_corrcoef
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import lit
from pyspark.sql.functions import rand
# from numba import jit

conf = SparkConf()
conf.set("spark.executor.memory", "6G")
conf.set("spark.driver.memory", "4G")
conf.set("spark.executor.cores", "4")
conf.set("spark.sql.crossJoin.enabled", "true")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.default.parallelism", "4")
conf.setMaster('local[4]')

atexit.register(lambda: spark.stop())

spark = SparkSession \
    .builder.config(conf=conf) \
    .appName("bosch-spark-magic").getOrCreate()


# @jit
def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0
    else:
Example #34
    # HDFS driver to use with Petastorm.
    PETASTORM_HDFS_DRIVER = 'libhdfs'

    # ================ #
    # DATA PREPARATION #
    # ================ #

    print('================')
    print('Data preparation')
    print('================')

    # Create Spark session for data preparation.
    conf = SparkConf().setAppName('data_prep').set(
        'spark.sql.shuffle.partitions', '16')
    if args.processing_master:
        conf.setMaster(args.processing_master)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    train_csv = spark.read.csv('%s/train.csv' % args.data_dir, header=True)
    test_csv = spark.read.csv('%s/test.csv' % args.data_dir, header=True)

    store_csv = spark.read.csv('%s/store.csv' % args.data_dir, header=True)
    store_states_csv = spark.read.csv('%s/store_states.csv' % args.data_dir,
                                      header=True)
    state_names_csv = spark.read.csv('%s/state_names.csv' % args.data_dir,
                                     header=True)
    google_trend_csv = spark.read.csv('%s/googletrend.csv' % args.data_dir,
                                      header=True)
    weather_csv = spark.read.csv('%s/weather.csv' % args.data_dir, header=True)

    def expand_date(df):
conf = SparkConf()
conf.set("spark.executor.memory", "5g")
# conf.set("spark.sql.shuffle.partitions", "1000")
# conf.set("spark.yarn.executor.memoryOverhead", "512m")
conf.set("spark.network.timeout", "2000")
conf.set("spark.sql.broadcastTimeout", "300000")
# conf.set("spark.dynamicAllocation.enabled","true")
# conf.set("spark.shuffle.service.enabled", "true")
# conf.set("spark.local.dir", "/yelp-dataset/spark-tmp")
# conf.set("spark.driver.memory","512m")
# conf.set("spark.driver.maxResultSize","10g")
# sc = SparkContext("local[*]", "Simple App", conf=conf)
# sc.setCheckpointDir('/tmp')
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6'
conf.setMaster(SPARK_URL)
sc = SparkContext(conf=conf)
sql_sc = SQLContext(sc)

# In[4]:


def save(df, target_dir, name):
    df.write.mode("overwrite").parquet(DIR_ROOT + "/" + target_dir + "/" +
                                       name)


def ren(df, exclude=[]):
    replacements = {c: c + "_2" for c in df.columns if str(c) not in exclude}
    replacements = [
        col(c).alias(replacements.get(c)) for c in df.columns
Example #36
def get_sparkcontext():
    conf = SparkConf()
    conf.setAppName("sparkDemo")
    conf.setMaster("local[5]")
    spark_context = SparkContext(conf=conf)
    return spark_context
Example #37

def saveTrans(data, sdate, timeSpan):
    res = data[(data[1] == data[3]) & (data[1] != data[2])]
    trans = pandas.read_csv('SubwayFlowConf/trans', header=None)
    #trans = trans[0].tolist()
    #res = res[res[1] in trans]
    res = pandas.merge(res, trans, left_on=1, right_on=0)
    res.to_csv('result/trans/' + sdate + '_' + str(timeSpan),
               header=None,
               index=None)


def timeIndex(self, x):
    L = x.split(':')
    return str(((int(L[0]) * 60) + int(L[1])) / self.span)


def resetIndex(self, x):
    return '%02d:%02d' % ((x * self.span) / 60, (x * self.span) % 60)


if __name__ == '__main__':
    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster(master)
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.master(master).appName(APP_NAME).getOrCreate()
    # Execute Main functionality
    main(sc, spark)
Example #38
    :param a: mosaic from mosaic n-gram a
    :param b: mosaic from mosaic n-gram b
    :return: whichever has a bigger mosaic_value
    """
    if mosaic_value(a) > mosaic_value(b):
        return a
    elif mosaic_value(a) == mosaic_value(b) and a < b:
        return a
    else:
        return b


if __name__ == "__main__":

    conf = SparkConf()
    conf.setMaster('spark://hadoop-master:7077')
    conf.setAppName('spark-basic')
    sc = SparkContext(conf=conf)

    host = 'hadoop-master:54310'
    text_file = sc.textFile("hdfs://" + host + "/" + sys.argv[1])

    #counts = text_file.flatMap(permute)\
    #    .reduceByKey(aggregate_by_mosaic).map(lambda x: (x[1], x[0]))\
    #    .reduceByKey(remove_duplicates).sortBy(ascending=False, keyfunc=lambda x: x[0][1])\
    #    .map(lambda x: '{0}\t{1}'.format(x[1], x[0][1]))

    #for comparing spark permute with awk permute
    #counts = text_file.flatMap(permute).map(lambda x: '{0}\t{1}'.format(x[0], x[1][0]))

    #for computing permute TIME
Example #39
def geopyspark_conf(master=None, appName=None, additional_jar_dirs=[]):
    """Construct the base SparkConf for use with GeoPySpark.  This configuration
    object may be used as is, or may be adjusted according to the user's needs.

    Note:
        The GEOPYSPARK_JARS_PATH environment variable may contain a colon-separated
        list of directories to search for JAR files to make available via the
        SparkConf.

    Args:
        master (string): The master URL to connect to, such as "local" to run
            locally with one thread, "local[4]" to run locally with 4 cores, or
            "spark://master:7077" to run on a Spark standalone cluster.
        appName (string): The name of the application, as seen in the Spark
            console
        additional_jar_dirs (list, optional): A list of directory locations that
            might contain JAR files needed by the current script.  Already
            includes $(pwd)/jars.

    Returns:
        SparkConf
    """

    conf = SparkConf()

    if not appName:
        raise ValueError("An appName must be provided")
    else:
        conf.setAppName(appName)

    if master:
        conf.setMaster(master)

    if 'GEOPYSPARK_JARS_PATH' in os.environ:
        additional_jar_dirs = additional_jar_dirs + os.environ[
            'GEOPYSPARK_JARS_PATH'].split(':')

    conf.set(key='spark.ui.enabled', value='false')
    conf.set(key='spark.serializer',
             value='org.apache.spark.serializer.KryoSerializer')
    conf.set(key='spark.kryo.registrator',
             value='geopyspark.geotools.kryo.ExpandedKryoRegistrator')

    current_location = os.path.dirname(os.path.realpath(__file__))
    cwd = os.getcwd()

    local_prefixes = [
        os.path.abspath(os.path.join(current_location, 'jars')),
        os.path.abspath(os.path.join(cwd, 'jars')),
        os.path.abspath(os.path.join(cwd, '../geopyspark/jars'))
    ]
    possible_jars = [
        os.path.join(prefix, '*.jar')
        for prefix in local_prefixes + additional_jar_dirs
    ]
    configuration = os.path.join(current_location, 'command',
                                 'geopyspark.conf')

    if not possible_jars:
        if os.path.isfile(configuration):
            with open(os.path.join(configuration)) as config_file:
                possible_jars.append(os.path.relpath(config_file.read(), cwd))

    module_jars = [os.path.abspath(resource_filename('geopyspark.jars', JAR))]

    jar_dirs = [(jar, os.path.dirname(jar)) for jar in module_jars]

    for jar, jar_dir in jar_dirs:
        if jar_dir not in local_prefixes:
            possible_jars.append(jar)

    returned = [glob.glob(jar_files) for jar_files in possible_jars]
    jars = [jar for sublist in returned for jar in sublist]

    if not jars:
        raise IOError(
            "Failed to find any jars. Looked at these paths {}".format(
                possible_jars))

    jar_string = ",".join(set(jars))
    conf.set(key='spark.jars', value=jar_string)
    conf.set(key='spark.driver.memory', value='8G')
    conf.set(key='spark.executor.memory', value='8G')

    return conf
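A hedged usage sketch for geopyspark_conf(): it assumes the GeoPySpark jars can be found (for example via the GEOPYSPARK_JARS_PATH environment variable described in the docstring) and simply builds a SparkContext from the returned conf:

from pyspark import SparkContext

conf = geopyspark_conf(master="local[4]", appName="geopyspark-demo")
sc = SparkContext(conf=conf)
print(conf.get("spark.serializer"))   # org.apache.spark.serializer.KryoSerializer
sc.stop()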
    # Set topic name
    set_global_topic_name(config)

    # Read pyspark submit path from conf file
    pyspark_environ = config['Resources']['pyspark_environ']

    # import kafka libraries to run code from terminal
    environ['PYSPARK_SUBMIT_ARGS'] = pyspark_environ

    # Setup spark conf
    sparkConf = SparkConf("TwitterDataAnalysis")

    # Number of receivers = 2
    # One for kafka and other for rdd processing
    sparkConf.setMaster("local[2]")

    # Create spark context from above configuration
    sc = SparkContext(conf=sparkConf)

    # Set log level to error
    sc.setLogLevel("ERROR")

    # Create Streaming context
    # Get data from stream every 60 secs
    ssc = StreamingContext(sc, 60)

    # Setup checkpoint for RDD recovery
    ssc.checkpoint("checkpointTwitterApp")

    # Reading parameters from conf file
Example #41
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
# import RDD only if it is actually used; otherwise this can be ignored
from pyspark import RDD

# import pyspark class Row from module sql
from pyspark.sql import *

# import DBUtils
#pip install DBUtils
#import DBUtils

# common code
conf = SparkConf().setAppName("window1")
conf.setMaster('local')
sc = SparkContext(conf=conf)

spark = SparkSession(sc)  # required if DataFrames are used

# the RDDs below are a working example
rdd1 = sc.parallelize([(1, 2)])
rdd2 = sc.parallelize([(3, 4)])

df1 = spark.createDataFrame(rdd1)
df2 = spark.createDataFrame(rdd2)

unionDF = df1.union(df2)

hasattr(rdd1, "createDataFrame")
hasattr(rdd2, "createDataFrame")
#unionDF.show()
Example #42
from pyspark import SparkContext,SparkConf

# 2. we set up the context: in fact we create an RDD (resilient data set) in RAM; here it is local, but normally it lives on a cluster

conf = SparkConf().setAppName('testing').setMaster('local')
sc=SparkContext(conf=conf)

# 3. we load a file
txt = sc.textFile('datasets/copyright.txt')

# Syntax on Linux ....
# txt = sc.textFile('file:////usr/share/doc/python/copyright')

# 3b. If we had wanted to connect to a cluster, here is the code:

""" 
conf = pyspark.SparkConf()
conf.setMaster('spark://head_node:56887')
conf.set('spark.authenticate', True)
conf.set('spark.authenticate.secret', 'secret-key')
sc = SparkContext(conf=conf) 
"""

# 4. print the number of lines in the file
print("the file has this many lines:", txt.count())

# 5. keep only the lines containing the term 'python'
python_lines = txt.filter(lambda line: 'python' in line.lower())

# 6. print the number of lines in the filtered result
print("number of lines containing the term python:", python_lines.count())
Example #43
    y = property(lambda self: self._y, _set_y, doc='The number of difference '
                                                   'vectors used.')
    z = property(lambda self: self._z, _set_z, doc='Crossover scheme.')
    F = property(lambda self: self._F, _set_F, doc='Weight used during '
                                                   'mutation.')
    CR = property(lambda self: self._CR, _set_CR, doc='Weight used during '
                                                      'bin crossover.')


if __name__ == '__main__':
    start_time = time.time()

    args = sys.argv
    sconf = SparkConf()
    sconf.setAppName("lda")
    sconf.setMaster(args[1])
    sconf.set("spark.executor.memory", "6g")
    sconf.set("spark.driver.memory", "6g")
    sconf.set("spark.driver.maxResultSize", "6g")
    sconf.set("spark.yarn.executor.memoryOverhead", "2g")
    sconf.set("spark.yarn.driver.memoryOverhead", "2g")

    sconf.set("spark.eventLog.enabled", "true")
    sconf.set("spark.eventLog.dir", "hdfs://" + args[3] + "/user/" + args[4] + "/Logs/")
    sc = SparkContext(conf=sconf)
    #labels=[int(args[5])]
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]

    random.seed(1)
    bounds = [(10, 100), (0, 1), (0, 1)]
    result = {}
Example #44
    df_pred.show()

    # Join prediction with original data.
    df_pred = df_pred.join(df, 'id')
    df_pred.show()

    sys.exit(1)

    # # # for maxIter in range(300, 1000, 100):
    # # # for x in range(1, 50):
    # # # # Build the model (cluster the data)
    # # # clusters = KMeans.train(mat, x, maxIterations=maxIter, initializationMode="random")

    # # # WSSSE = mat.map(lambda point: error(clusters, point)).reduce(lambda x, y: x + y)
    # # # print("cluster {0}: maxIter {1}: Within Set Sum of Squared Error = {2}".format(x, maxIter, WSSSE))


if __name__ == "__main__":
    # SparkContext represents connection to a Spark cluster.
    conf = SparkConf()
    conf.setAppName("Spark Machine Learning App")
    conf.setMaster('local[2]')
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .getOrCreate()

    readDataFromES()
Example #45
    def functionToCreateContext(self):
        # Define Spark configuration
        conf = SparkConf()
        conf.setMaster(self.config['master_url'])
        conf.setAppName(self.config['app_name'])
        # conf.set("spark.cores.max", "2")
        conf.set("spark.streaming.backpressure.enabled", True)
        # conf.set("spark.streaming.backpressure.initialRate", "60")
        # Can set the max rate per kafka partition if needed
        conf.set("spark.streaming.kafka.maxRatePerPartition", "100")
        # Initialize a SparkContext
        sc = SparkContext(conf=conf)
        spark = SparkSession(sc)
        # Set the batch interval to be 1 sec
        ssc = StreamingContext(sc, self.config['batch_interval'])

        def savetohdfs(rdd):
            if not rdd.isEmpty():
                schema = StructType([
                    StructField("IP", StringType(), True),
                    StructField("user_identifier", StringType(), True),
                    StructField("user_id", StringType(), True),
                    StructField("user_name", StringType(), True),
                    StructField("time", StringType(), True),
                    StructField("Method", StringType(), True),
                    StructField("URI", StringType(), True),
                    StructField("HTTP-Code", StringType(), True),
                    StructField("code", StringType(), True),
                    StructField("size", StringType(), True),
                    StructField("device", StringType(), True),
                    StructField("tenant_id", StringType(), True),
                    StructField("timezone", StringType(), True),
                    StructField("OS", StringType(), True),
                    StructField("browser", StringType(), True),
                    StructField("country", StringType(), True),
                    StructField("screenResolution", StringType(), True),
                    StructField("action", StringType(), True),
                    StructField("referrer", StringType(), True),
                    StructField("timeonpage", StringType(), True),
                    StructField("supplier_id", StringType(), True),
                    StructField("product", StringType(), True),
                    StructField("geolocation", StringType(), True)
                ])
                #
                # schema =['IP','user_identifier','user_id','user_name','time','Message','code',
                #  'size','device','user_name','tenant_id','timezone','OS', 'browser',
                #  'country','screenResolution','action','referrer','timezone',
                #  'supplier_id','product','geolocation']

                df = rdd.toDF(schema)
                df.write.mode("Overwrite").format('json').save(
                    "hdfs://localhost:9820/user/rzariwal/stream")

        # # Consume Kafka streams directly, without receivers
        lines = KafkaUtils.createDirectStream(
            ssc, [self.topic], {"metadata.broker.list": self.addr})
        lines1 = lines.map(lambda x: x[1])
        lines1.cache()
        val_sum_lines = lines1.window(self.report_interval,
                                      self.batch_interval)
        val_sum_lines_top_ip = val_sum_lines.filter(lambda x: 'HEARTBEAT' not in x) \
            .map(lambda x: (x.split(' ')[0].rstrip(' '), x.split(' ')[1].rstrip(' '), x.split(' ')[2].rstrip(' '),
                            x.split(' ')[3].rstrip(' '), x.split(' ')[4].lstrip('['), x.split(' ')[6].lstrip('"'), x.split(' ')[7].rstrip(' '),
                            x.split(' ')[8].rstrip('"'), x.split(' ')[9].rstrip(' '), x.split(' ')[10].rstrip(' '),
                            x.split(' ')[11].rstrip(' '), x.split(' ')[12].rstrip(' '), x.split(' ')[13].rstrip(' '),
                            x.split(' ')[14].rstrip(' '), x.split(' ')[15].rstrip(' '), x.split(' ')[16].rstrip(' '),
                            x.split(' ')[17].rstrip(' '), x.split(' ')[18].rstrip(' '), x.split(' ')[19].rstrip(' '),
                            x.split(' ')[20].rstrip(' '), x.split(' ')[21].rstrip(' '), x.split('[')[2].rstrip('] '),
                            x.split('[')[3].rstrip('] ')))

        val_sum_lines_top_ip.foreachRDD(savetohdfs)
        val_sum_lines_top_ip1 = val_sum_lines_top_ip.map(
            lambda x: (x[3], x[4], x[6], x[8]))
        val_sum_lines_top_ip1.pprint()

        def savetheresult(rdd):
            if not rdd.isEmpty():
                hbase_table = 'flexigym'
                hconn = happybase.Connection('localhost')
                ctable = hconn.table(hbase_table)
                hconn.open()

                for row in rdd.collect():
                    time = datetime.now()
                    counter = str(time) + row[0]
                    ctable.put(
                        counter, {
                            b'Page_Visted:': row[2],
                            b'Response_Code:': row[3],
                            b'Time:': row[1],
                            b'User_Name:': row[0]
                        })

                schema = ["User_Name", "Page_Visted", "Response_Code", "Time"]
                rdd.toDF(schema).groupBy("User_Name", "Page_Visted", "Response_Code", "Time") \
                    .count() \
                    .show(truncate=False)

        val_sum_lines_top_ip1.foreachRDD(savetheresult)
        return ssc
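Because the factory above returns a fully wired StreamingContext, it pairs naturally with StreamingContext.getOrCreate for checkpoint-based recovery. A sketch, assuming the enclosing class is instantiated as job and that the factory also calls ssc.checkpoint() on the same directory (the path below is illustrative):

checkpoint_dir = "hdfs://localhost:9820/user/rzariwal/checkpoint"   # illustrative path
ssc = StreamingContext.getOrCreate(checkpoint_dir, job.functionToCreateContext)
ssc.start()
ssc.awaitTermination()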
Example #46
import findspark
findspark.init()
import time
from pyspark import SparkContext as sc
from pyspark import SparkConf
from pyspark.sql import SparkSession as ss
from pyspark.sql.types import *
conf = SparkConf()
conf.setMaster("spark://Sarthaks-MBP:7077").setAppName(
    'IPL Analytics Job').set("spark.executor.memory", "512m")
spark = sc(conf=conf)

a = spark.textFile("Dataset/*.csv").map(lambda line: line.split(",")).filter(
    lambda line: line[0].strip() == "ball").collect()
player_vs_player = {}
for line in a:
    details = line
    if details[0].strip() == 'ball':
        players1 = (details[4], details[6])
        players2 = (details[5], details[6])

        if players1 in player_vs_player.keys():
            player_vs_player[players1]['total'] += int(details[7])
            player_vs_player[players1]['runs'][int(details[7])] += 1
            player_vs_player[players1]['balls'] += 1
            if details[9] != '""' and details[9] != 'run out' and players1[
                    0].strip() == details[10].strip():
                player_vs_player[players1]['wickets'] += 1
        else:
            player_vs_player[players1] = {}
            player_vs_player[players1]['total'] = int(details[7])
Example #47
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all_client = m.transform(train_all_client)

    client_master = m.transform(client_master)

    sqlContext.dropTempTable("df_master_train_table")

    nn = 1000
    popshared = 0.30
    num_indices = (int)(popshared * client_master.count())
    tree_type = "kd_tree"
    nn, popshared, num_indices

    train_pd = train_all_client.toPandas()
    test_pd = client_master.toPandas()

    freq_table = findNearestNeighbour_client(train_pd, test_pd, nn,
                                             num_indices, tree_type)

    sqlContext.createDataFrame(freq_table[['cust_id', 'freq']], ).repartition(
        1).write.format("com.databricks.spark.csv").save(output_path)


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("yarn")
    sc = SparkContext(conf=conf)
    main(sc)
Example #48
        'merck': 5,
        'nike': 6,
        'verizon': 8
    }
    context = x[1][0]
    return3 = x[1][1]
    context2 = context.split("\t")[1]
    key = context.split("\t")[0]
    return2 = return3.split(",")[dic2[compdic[key]]]
    res = "\t".join([x[0], key, return2, context2])
    return res


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    df = sc.textFile("tweets/*.txt")
    parts = df.map(lambda x: (x.split(",")[0], ",".join(x.split(",")[1:])))
    #parts.collect()
    #parts = df.map(lambda x: (x.split(" ")[0], x)))
    #df1=df.take(1)
    #print df1
    df2 = sc.textFile("tweets2/step1.txt")
    parts2 = df2.map(lambda l:
                     ((l.split("\t")[0], "\t".join(l.split("\t")[1:]))))
    parts3 = parts2.join(parts)
    part4 = parts3.map(mapp)
    part4.saveAsTextFile("step33/step3.txt")
    #print parts4.take(1)
    #df3=df2.take(1)
Example #49
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/jars/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Number of elements in RDD is 8
Computation succeeded!
"""



from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('yarn')
conf.setAppName('spark-yarn')
conf.setExecutorEnv('HADOOP_CONF_DIR','$HADOOP_HOME/etc/hadoop')
conf.setExecutorEnv('YARN_CONF_DIR','$HADOOP_HOME/etc/hadoop')
sc = SparkContext(conf=conf)


def mod(x):
    import numpy as np
    return (x, np.mod(x, 2))


rdd = sc.parallelize(range(1000)).map(mod).take(10)
print(rdd)

"""
>>>
SLF4J: Class path contains multiple SLF4J bindings.
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.util import MLUtils
from pyspark.sql import Row
from pyspark.ml.feature import StopWordsRemover, Tokenizer, RegexTokenizer

from nltk.stem import PorterStemmer

if __name__ == "__main__":
    #get the parameters
    brokers, topic = sys.argv[1:]
    #number of features to be used in tf idf calculation
    numFeatures = 1000

    #Spark configuration
    conf = SparkConf()
    conf.setMaster('spark://VM10-1-0-20:7077')
    conf.setAppName('sentiment_stream')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    #sliding window configuration (in seconds)
    ssc = StreamingContext(sc, 20)

    #set list of broker
    kafkaParams = {"metadata.broker.list": brokers}

    counter_model = sc.accumulator(1)

    #stemming words
    def porter_stem(words):
        stem = [PorterStemmer().stem(x) for x in words]
Example #51
from scipy.stats import ttest_ind
from scipy.signal import savgol_filter
from collections import OrderedDict
from multiprocessing import Pool
from functools import partial
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
#from single_molecule_mechanics.ProteinModels import xSeriesWLCe
from .ProteinModels import xSeriesWLCe
from collections import OrderedDict
from multiprocessing import Pool
from functools import partial

#create a spark context
conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]').set('spark.executor.memory', '2G').set(
    'spark.driver.memory', '8G').set('spark.driver.maxResultSize', '15G'))
sc = SparkContext(conf=conf)

### NOTE: you will need to re-create this zip file every time you want to run this code. This is super annoying. We should look into using Dask instead of Spark.
print(
    'NOTE: you will need to re-create this zip file every time you want to run this code. This is super annoying. We should look into using Dask instead of Spark.'
)
sc.addPyFile("/home/tbartsch/source/repos/single_molecule_mechanics.zip")


class TimeSeriesLoader(object):
    '''Provides data structures and methods to analyze single-molecule data.'''
    def __init__(self):
        #define some default values
        data = np.empty((3, 2))
        data.fill(np.nan)
Example #52
def initSparkConf(isLocal, appName):
    conf = SparkConf()
    conf.setAppName(appName)
    if isLocal is True:
        conf.setMaster("local[*]")
    return conf
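A small usage sketch for initSparkConf() (assuming pyspark is available; isLocal=True selects local[*], otherwise the master is left for spark-submit to supply):

from pyspark import SparkContext

conf = initSparkConf(True, "init-spark-conf-demo")
sc = SparkContext(conf=conf)
print(sc.master)   # local[*]
sc.stop()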
# spark-submit --master yarn --num-executors 10 --jars spark-csv-assembly-1.4.0.jar amazon_book.rating.py
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, IntegerType, StringType, FloatType, TimestampType, StructField

conf = SparkConf()
conf.setAppName('spark-workshop')
conf.setMaster('yarn-client')

sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

customSchema = StructType(
        [
            StructField("user", StringType(), True),
            StructField("item", StringType(), True),
            StructField("rating", FloatType(), True),
            StructField("timestamp", IntegerType(), True)
            ]
        )

df = sqlContext.read.format("com.databricks.spark.csv")\
    .option("header", "false")\
    .option("inferSchema", "false")\
    .schema(customSchema)\
    .load("/project/public/spark-workshop/amazon_ratings_Books.csv")

df.registerTempTable("tb")

sqlContext.sql("drop table if exists default.amazon_book_rating")
Example #54
from pyspark import SparkConf, SparkContext

conf1 = SparkConf()
conf1.setMaster("local")
conf1.setAppName("Sixth program")

sc = SparkContext(conf=conf1)

List1 = [1, 2, 3, 4, 5, 5, 5, 3]
Rdd = sc.parallelize(List1)

Data1 = Rdd.countByValue()

for K in Data1:
    print(K, "...", Data1[K])
print("======================")
for K in Data1:
    print(K, "...", 3)


def add(x, y):
    return x + y


def maxi(x, y):
    if x > y:
        return x
    else:
        return y

Example #55
# spark-basic.py
from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('spark://gabriela-TM1703:7077')
conf.setAppName('spark-basic')
sc = SparkContext(conf=conf)

# ACO code
import numpy as np
from random import randrange
from aco_algorithm import AntColony

N = 10
rand_matrix = np.random.randint(1, 101, size=(N, N))  # randint's upper bound is exclusive; replaces the deprecated random_integers
rand_dist = (rand_matrix + rand_matrix.T) / 2
for i in range(N):
    rand_dist[i][i] = np.inf
for i in range((int)(N / 10)):
    j = randrange(0, N - 1)
    k = randrange(0, N - 1)
    rand_dist[j][k] = np.inf
    rand_dist[k][j] = np.inf

ant_colony = AntColony(rand_dist, 20, 100, 0.95, alpha=1, beta=1)
shortest_path = ant_colony.run(0, 5)

print("--- Final result is: ---")
print(shortest_path)
Example #56
from pyspark import SparkContext
from pyspark import SparkConf

import os
import sys

conf = SparkConf()

sparkmaster = sys.argv[1]
wordcountfile = sys.argv[2]

conf.setMaster(sparkmaster)
conf.setAppName("test")

sc = SparkContext(conf=conf)

file = sc.textFile(wordcountfile)

counts = file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)

output = counts.collect()

for (k, v) in output:
    print k + ": " + str(v)
Example #57
#!/usr/bin/env python
from pyspark import SparkContext, SparkConf
import pickle

conf = SparkConf()
conf.setMaster("spark://dmlhdpc10:7077")
conf.setAppName("VFKMMProject")
conf.set("spark.executor.memory", "5g")
conf.set("spark.ui.port", "44041")
sc = SparkContext(conf=conf, pyFiles=['lib.zip'])
#numOfCores = 96
# sc = SparkContext(appName="PythonProject", pyFiles=['lib.zip'])
import numpy as np
import time
import argparse
from lib.splitter import split
from lib.bagger import get_size_no, partition, bag, pair, cartesian
from lib.kmm import computeBeta
from lib.evaluation import computeNMSE
from lib.scaleKMM import *
from lib.util import *
from lib.bagger import *
from lib.caculate import *
import csv


def kmmProcess():
    parser = argparse.ArgumentParser()
    parser.add_argument('-b',
                        "--bagging",
                        type=int,
Example #58
        return False
    else:
        return True


if __name__ == "__main__":
    APP_NAME = "hw3_problem04"
    MASTER_URL = "spark://192.168.1.103:7077"
    HOME_PATH = "D:\\data\\"
    SHINGLING_K = 1
    DOCUMENT_ONE_INDEX = 0
    DOCUMENT_TWO_INDEX = 3

    # initialization
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster(MASTER_URL)
    sc = SparkContext(conf=conf)

    # read the files and preprocess them to get each article's body.text
    news_rdd = preprocess_data(
        sc.wholeTextFiles(HOME_PATH + "reut2-[0-9]*.sgm"))
    # get the body.text of the two articles to compare
    news_rdd = get_target_news(DOCUMENT_ONE_INDEX, DOCUMENT_TWO_INDEX,
                               news_rdd)
    # compute the shingles of the two articles
    news_shingles_rdd = create_shingles(news_rdd)

    # run the linear search
    start_time = time.time()
    linear_search_result = linear_search(news_shingles_rdd)
    linear_search_time = time.time() - start_time
Example #59
        vals['telephone'] - 1, vals['foreign_wkr'] - 1
    ]
    return LabeledPoint(label, feats)


def train_decision_tree(lp, german_cfi):
    return DecisionTree.trainClassifier(lp,
                                        numClasses=2,
                                        categoricalFeaturesInfo=german_cfi,
                                        impurity='gini',
                                        maxDepth=3,
                                        maxBins=5)


## Main functionality


def main(sc):
    dat = sqlCtx.sql('SELECT * FROM german')
    lp = dat.rdd.filter(lambda x: x.cred).map(german_lp).cache()
    predictions = model.predict(lp.map(lambda x: x.features))
    labelsAndPredictions = lp.map(lambda lp: lp.label).zip(predictions)
    trainErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(lp.count())


if __name__ == "__main__":
    conf = SparkConf().setAppName("german_spark_submit")
    conf = conf.setMaster("yarn cluster")
    sc = SparkContext(conf=conf)
    main(sc)
Example #60
APP_NAME = 'IE2-Project-Homocide-Reports'
INPUT_DATA = "C:/Users/MWeil/Documents/GitHub/IE2-Project/emb/emb-out.txt"
OUTPUT_LABEL = "C:/Users/MWeil/Documents/GitHub/IE2-Project/data/homicide-reports/database_new_label_emb.json"
INPUT_MODEL = "C:/Users/MWeil/Documents/GitHub/IE2-Project/embedding-clustering/kmeans_model_7"


def run_kmeans(sc):
    cpu_count = multiprocessing.cpu_count()

    # Load Data
    dataset = sc.textFile(INPUT_DATA, cpu_count)
    dataset = dataset.map(
        lambda line: array([float(x) for x in line.split(';')]))

    # Load Model
    sameModel = KMeansModel.load(sc, INPUT_MODEL)

    # Predict cluster labels per row
    labels = sameModel.predict(dataset).collect()

    # Save labels in json file
    with open(OUTPUT_LABEL, 'w') as out_f:
        json.dump(labels, out_f)


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf.setMaster('local[*]')
    sc = SparkContext(conf=conf)
    run_kmeans(sc)