def sparkconfig():
    # Spark configuration options
    # conf = SparkConf()
    # conf.setMaster("spark://3.168.100.58:7077")  # uncomment for standalone cluster
    # conf.setMaster("local")                      # uncomment for local execution
    # conf.setAppName("demo_chain")
    # conf.set("spark.executor.memory", "2g")
    # conf.set("spark.default.parallelism", 56)  # 48)
    # conf.set("spark.sql.inMemoryColumnarStorage.compressed", "true")
    # conf.set("spark.sql.inMemoryColumnarStorage.batchSize", 2000)

    # AMAZON AWS EMR
    conf = SparkConf()
    conf.setMaster("yarn-client")      # client mode sends output to the terminal
    # conf.setMaster("yarn-cluster")   # this seems to run faster, but can't confirm
    conf.set("spark.default.parallelism", 648)
    conf.setAppName("spark_markov_chain")
    conf.set("spark.executor.memory", "22g")
    conf.set("spark.executor.instances", 9)
    conf.set("spark.executor.cores", 9)
    conf.set("spark.yarn.executor.memoryOverhead", 800)
    conf.set("spark.rdd.compress", "True")
    conf.set("spark.shuffle.consolidateFiles", "True")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    return conf
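# Illustrative usage of sparkconfig() above (a minimal sketch, not part of the
# original snippet; assumes the YARN cluster and executor resources requested
# there are actually available):
from pyspark import SparkContext

def run_with_sparkconfig():
    sc = SparkContext(conf=sparkconfig())
    try:
        # trivial job, just to confirm the executors come up
        print(sc.parallelize(range(1000)).sum())
    finally:
        sc.stop()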
def configureSpark(): conf = SparkConf() conf.setMaster("local") conf.setAppName("Apache Spark Alarm Parser") conf.set("spark.executor.memory", "1g") sc = SparkContext(conf = conf) return sc
def __connected_yarn_spark_cluster(self, pilotcompute_description):
    number_cores = 1
    if "number_cores" in pilotcompute_description:
        number_cores = int(pilotcompute_description["number_cores"])

    number_of_processes = 1
    if "number_of_processes" in pilotcompute_description:
        number_of_processes = int(pilotcompute_description["number_of_processes"])

    executor_memory = "1g"
    if "physical_memory_per_process" in pilotcompute_description:
        executor_memory = pilotcompute_description["physical_memory_per_process"]

    conf = SparkConf()
    conf.set("spark.num.executors", str(number_of_processes))
    conf.set("spark.executor.instances", str(number_of_processes))
    conf.set("spark.executor.memory", executor_memory)
    conf.set("spark.executor.cores", number_cores)
    if pilotcompute_description is not None:
        for i in pilotcompute_description.keys():
            if i.startswith("spark"):
                conf.set(i, pilotcompute_description[i])
    conf.setAppName("Pilot-Spark")
    conf.setMaster("yarn-client")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
def main(): parser = argparse.ArgumentParser( description='process some log messages, storing them and signaling ' 'a rest server') parser.add_argument('--mongo', help='the mongodb url', required=True) parser.add_argument('--rest', help='the rest endpoint to signal', required=True) parser.add_argument('--port', help='the port to receive from ' '(default: 1984)', default=1984, type=int) parser.add_argument('--appname', help='the name of the spark application ' '(default: SparkharaLogCounter)', default='SparkharaLogCounter') parser.add_argument('--master', help='the master url for the spark cluster') parser.add_argument('--socket', help='the socket to attach for streaming text data ' '(default: caravan-pathfinder)', default='caravan-pathfinder') args = parser.parse_args() mongo_url = args.mongo rest_url = args.rest sconf = SparkConf().setAppName(args.appname) if args.master: sconf.setMaster(args.master) sc = SparkContext(conf=sconf) ssc = StreamingContext(sc, 1) lines = ssc.socketTextStream(args.socket, args.port) lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url)) ssc.start() ssc.awaitTermination()
def spark_config(self): if self._spark_config is None: os.environ['SPARK_SUBMIT_CLASSPATH'] = ','.join(self.spex_conf.spark_config.jars) conf = SparkConf() conf.setAppName(self.spex_conf.spark_config.name) conf.setMaster(self.spex_conf.spark_config.master) conf.set('spark.rdd.compress', 'true') conf.set('spark.io.compression.codec', 'lz4') conf.set('spark.mesos.coarse', 'true' if self.spex_conf.spark_config.coarse_mode else 'false') # TODO - Setup all the other cruft as needed #conf.set('spark.executor.memory', '4g') #conf.set('spark.cores.max', '16') #conf.set('spark.task.cpus', '6') # TODO - bind port for spark web ui self._spark_config = conf config = self._spark_config # These are always set, if someone changes them we simply set them back config.set('spark.executor.uri', self.artifact_resolver(self.spex_conf.spark_distro)) config.setExecutorEnv(key='PYSPARK_PYTHON', value='./%s daemon' % self.spex_conf.spex_name) return config
def main(): # Setting the cluster configuration parameters conf = SparkConf() conf.setMaster("spark://localhost:7077") conf.setAppName("Tweet App") conf.set("spark.executor.memory", "3g") conf.set("spark.driver.memory", "4g") # Creating a Spark Context with conf file sc = SparkContext(conf=conf) # Creating and SQL context to perform SQL queries sqlContext = SQLContext(sc) # Define the data path curr_path = os.path.dirname(os.path.abspath(__file__)) json_name = "out.json" json_file_path = os.path.join(curr_path + "/../Spark_Jobs/data/", json_name) parquet_file_path = createSQLContext(json_file_path, sqlContext) print(parquet_file_path) # Read from parquet file parquetFile = sqlContext.read.parquet(parquet_file_path) parquetFile.registerTempTable("tweets") counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets") print("============= Count =================") print("Count:: " + str(counter.collect()[0].cnt))
def main(args):
    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name

    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)
    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    # Getting response_codes from log objects and caching it
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Response Codes: " + str(log_count))
    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    response200 = cnt.filter(lambda x: x[0] == "200").map(lambda kv: kv[1]).collect()
    print("###########################")
    print("## Success Rate : " + str(int(response200[0]) * 100 / log_count) + " % ##")
    print("###########################")
def read_conf(): """ Setting up spark contexts """ conf = SparkConf() conf.setMaster("local[*]") conf.setAppName("Testing") return conf
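# Illustrative usage of read_conf() above (a minimal sketch, assuming a local
# pyspark installation; not part of the original snippet):
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(conf=read_conf())
    print(sc.parallelize([1, 2, 3]).count())  # expected output: 3
    sc.stop()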
def _test_broadcast_on_driver(self, *extra_confs): conf = SparkConf() for key, value in extra_confs: conf.set(key, value) conf.setMaster("local-cluster[2,1,1024]") self.sc = SparkContext(conf=conf) bs = self.sc.broadcast(value=5) self.assertEqual(5, bs.value)
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")  # note: overrides the app name set above
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf,
                      pyFiles=['app.py', 'contourGenerator.py', 'EventParallelize.py'])
    return sc
def __init__(self, master, name):
    self.name = name
    self.master = master
    print("init spark ...")
    os.environ["HADOOP_HOME"] = r"D:\code\wqr\hadoop-common-2.2.0-bin"
    conf = SparkConf()
    conf.setMaster(self.master)
    conf.setAppName(self.name)
    self.sc = SparkContext(conf=conf)
def init(self): os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2" # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY> # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY> conf = SparkConf() conf.setMaster("local[10]") conf.setAppName("PySparkShell") conf.set("spark.executor.memory", "2g") conf.set("spark.driver.memory", "1g") self.sc = SparkContext(conf=conf) self.sqlContext = SQLContext(self.sc)
def _test_multiple_broadcasts(self, *extra_confs): """ Test broadcast variables make it OK to the executors. Tests multiple broadcast variables, and also multiple jobs. """ conf = SparkConf() for key, value in extra_confs: conf.set(key, value) conf.setMaster("local-cluster[2,1,1024]") self.sc = SparkContext(conf=conf) self._test_encryption_helper([5]) self._test_encryption_helper([5, 10, 20])
def __connected_spark_cluster(self, resource_url, pilot_description=None): conf = SparkConf() conf.setAppName("Pilot-Spark") if pilot_description!=None: for i in pilot_description.keys(): if i.startswith("spark"): conf.set(i, pilot_description[i]) conf.setMaster(resource_url) print(conf.toDebugString()) sc = SparkContext(conf=conf) sqlCtx = SQLContext(sc) pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx) return pilot
def main(): count=0 #Initializing Spark Configuration for the Master Node config = SparkConf().setAppName('DiskDetection_App') config.setMaster('local[6]') #indicates the number of threads on the master node sc = SparkContext(conf=config) # Initializing the Spark Context for i in os.listdir(os.environ["MODEL_CSV_FILEPATH"]): # Loop to restrict training to 20 models (only for better analysis purpose) if count < 20: modelName = os.path.splitext(i)[0] print modelName predictMain(modelName,sc) count+=1
def main(): conf = SparkConf() conf.setMaster('local[*]') conf.setAppName('spark-basic') sc = SparkContext(conf=conf) churn_df = read_dataset(sc, "churn_no_header.csv") pipeline = build_pipeline() training_data, test_data = train_test_split(churn_df, 0.2) model = pipeline.fit(training_data) predictions = model.transform(test_data) print predictions.show(20) (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR']) print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
def configureSpark(app_name, master): #Configure SPARK conf = SparkConf().setAppName(app_name) conf = conf.setMaster(master) spark_context = SparkContext(conf=conf) return spark_context
def configureSpark(): #Configure SPARK conf = SparkConf().setAppName("a") conf = conf.setMaster("local[*]") conf = conf.set("spark.executor.memory", "2g").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").set("spark.kryoserializer.buffer", "256").set("spark.akka.frameSize", "500").set("spark.rpc.askTimeout", "30").set('spark.executor.cores', '4').set('spark.driver.memory','2g') sc = SparkContext(conf=conf) return sc
def main(): parser = argparse.ArgumentParser( description='process some log messages, storing them and signaling ' 'a rest server') parser.add_argument('--mongo', help='the mongodb url', required=True) parser.add_argument('--rest', help='the rest endpoint to signal', required=True) parser.add_argument('--port', help='the port to receive from ' '(default: 1984)', default=1984, type=int) parser.add_argument('--appname', help='the name of the spark application ' '(default: SparkharaLogCounter)', default='SparkharaLogCounter') parser.add_argument('--master', help='the master url for the spark cluster') parser.add_argument('--socket', help='the socket ip address to attach for streaming ' 'text data (default: caravan-pathfinder)', default='caravan-pathfinder') parser.add_argument('--model', help='the serialized model to use', default='model.json') args = parser.parse_args() mongo_url = args.mongo rest_url = args.rest model = args.model sconf = SparkConf().setAppName(args.appname) if args.master: sconf.setMaster(args.master) sc = SparkContext(conf=sconf) ssc = StreamingContext(sc, 1) somv = fromJSON(model) som = sc.broadcast(somv) log4j = sc._jvm.org.apache.log4j log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN) lines = ssc.socketTextStream(args.socket, args.port) lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url, som)) ssc.start() ssc.awaitTermination()
def configureSpark(app_name, master): #Configure SPARK conf = SparkConf().setAppName(app_name) conf = conf.setMaster(master) #conf.set("fs.s3n.awsAccessKeyId", "") #conf.set("fs.s3n.awsSecretAccessKey", "") spark_context = SparkContext(conf=conf) return spark_context
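# Hypothetical usage of the configureSpark(app_name, master) helper above
# (the app name, master URL and input file are assumptions for illustration):
sc = configureSpark("word-count-demo", "local[*]")
lines = sc.textFile("README.md")
print(lines.count())
sc.stop()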
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # The original initialized sc to None and then called sc.stop() inside a
    # try/except, so the context was always created in the except branch.
    # Creating it directly has the same effect.
    sc = SparkContext(conf=sc_conf)
    return sc
def main(): conf = SparkConf() conf.setMaster('local[*]') conf.setAppName('renewer-prediction-spark') filename = '/Users/andyyoo/scikit_learn_data/renewer/Orange_Dataset.no.header.csv' sc = SparkContext(conf=conf) df = read_dataset(sc, filename) df = pipe_index_string_cols(df, cols=["label"]) df = pipe_assemble_features(df, excluded_cols=["label"]) df = pipe_scale_cols(df, with_mean=True, with_std=True, use_dense_vector=False) df.show() training_data, test_data = train_test_split(df, 0.2) model = rf_classifier().fit(training_data) predictions = model.transform(test_data) print predictions.show(20) (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR']) print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
def main(): # master = 'local[2]' master = 'spark://192.168.9.164:7077' app_name = 'test-broadcast' # spark_home = '/data01/app/bigdata/spark' # local spark_home = '/home/hadoop/app/spark' # test pyFiles = ['mysql_utils.py'] spark_conf = SparkConf() spark_conf.setMaster(master).setAppName(app_name).setSparkHome(spark_home) sc = SparkContext(conf=spark_conf) for path in (pyFiles or []): sc.addPyFile(path) external_cache = get_api_deviceinfo() deviceinfo_b = sc.broadcast(external_cache) sc.stop()
def main():
    conf = SparkConf()
    conf.setMaster("spark://192.168.199.123:8070")
    conf.setAppName("User Profile Spark")
    sc = SparkContext(conf=conf)
    print("connection succeeded with Master", conf)

    data = [1, 2, 3, 4]
    distData = sc.parallelize(data)
    print(distData.collect())

    raw = open(TRACKS_PATH, 'r').read().split("\n")
    tackfile = sc.parallelize(raw)
    tackfile = tackfile.filter(lambda line: len(line.split(',')) == 6)
    tbycust = tackfile.map(lambda line: make_tracks_kv(line)).reduceByKey(lambda a, b: a + b)
    custdata = tbycust.mapValues(lambda a: compute_stats_byuser(a))
    print(custdata.first())
def run_cluster():
    """Cluster mode."""
    # Cluster configuration
    conf = SparkConf()
    conf.setMaster("spark://jldrp-4:7077")
    conf.setAppName("WebCat")
    ts_chunks = []
    if len(sys.argv) > 1:
        test_set_file_path = sys.argv[1]
    else:
        exit("No test-set file provided! Exiting now...")
    with open(test_set_file_path) as test_set_file:
        ts_chunks = split_file(test_set_file)
    sc = SparkContext(pyFiles=["ProvincesCities.csv", "stopwords.txt",
                               "training.set.balanced.40", "text_cat.py"],
                      conf=conf)
    # ts = sc.textFile("hdfs://jldrp-4:7077/user/liulx/webcat/gt100.gt100.valid.test.set")
    # res = ts.flatMap(lambda x: text_cat.pipe(x)).collect()
    res = sc.parallelize(ts_chunks).flatMap(lambda x: text_cat.pipe(x))
    res.saveAsTextFile("hdfs://jldrp-4:8020/webcat/cat.result")
def words_count_mapReduce(): # configuration APP_NAME = 'word count' conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster('spark://ukko160:7077') sc = SparkContext(conf=conf) # actual function lines = sc.textFile("../spark-1.4.1-bin-hadoop2.6/README.md") table = lines.flatMap(f_flatmapper).map(f_mapper).reduce(f_reducer) for x in table: print x pass
def count_line_mapReduce(): ''' This function will count the number of lines in the file. It is implemented with mapReduce heuristics. ''' # spark configuration APP_NAME = 'count lines in mapReduce' conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster('spark://ukko160:7077') sc = SparkContext(conf=conf) # mapReduce function call lines = sc.textFile('../spark-1.4.1-bin-hadoop2.6/README.md') lineLength = lines.map(count_line_map) totalLength = lineLength.reduce(count_line_reduce) return totalLength pass
def count_lines_lambdaExpression(): ''' The function is to compute the number of line in a text file. The function is implemented with python lambda expression. ''' # configuration APP_NAME = 'count lines' conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster('spark://ukko160:7077') sc = SparkContext(conf=conf) # actuall lambda lines = sc.textFile('../spark-1.4.1-bin-hadoop2.6/README.md') lineLength = lines.map(lambda s: len(s)) totalLength = lineLength.reduce(lambda a,b: a+b) return totalLength pass
def words_count_lambdaExpression(): # configuration APP_NAME = 'words count with python lambda expression' conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster('spark://ukko160:7077') sc = SparkContext(conf=conf) # actual function lines = sc.textFile("../spark-1.4.1-bin-hadoop2.6/README.md") words = lines.flatMap(lambda x: x.split(' ')) pairs = words.map(lambda x: (x,1)) count = pairs.reduceByKey(lambda x,y: x+y) for x in count.collect(): print x pass
def main(arglist):
    with open("log_file_x.txt", "a") as f:
        f.write("Start time of sort...... %s\n" % datetime.datetime.now())
    print("Start time...... %s" % datetime.datetime.now())

    # mapreduce params
    path = arglist[0]
    output = arglist[1]
    minPartitions = int(arglist[2])

    # initialize
    conf = SparkConf()
    conf = conf.setMaster('local').setAppName("PythonSort") \
               .set("spark.driver.memory", "10g") \
               .set("spark.driver.maxResultSize", "3g")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(path)
    counts = lines.flatMap(lambda x: x.split('\n')) \
                  .map(lambda x: (x, 1)) \
                  .sortByKey()
    counts.saveAsTextFile(output)

    # # print(rdd)
    # f = open(output, 'w')
    # f.writelines('\n'.join(rdd))
    # f.close()
    # # write to one single file
    # single_output = open('single_output', 'w')
    # for i in range(minPartitions):
    #     file_name = 'part-000' + ('0' + str(i) if i < 10 else str(i))
    #     file_path = os.path.join(output, file_name)
    #     file = open(file_path, 'r')
    #     # single_output.write(''.join(file))
    # single_output.close()

    sc.stop()
    print("End time of sort...... %s" % datetime.datetime.now())
    with open("log_file_x.txt", "a") as f:
        f.write("End time of sort...... %s\n" % datetime.datetime.now())
# -*- coding: utf-8 -*-
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris
import pandas
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark2pmml import PMMLBuilder
from pyspark.ml.feature import RFormula

# Configure the Spark client
conf = SparkConf().setAppName("lr_spark").set(
    "spark.jars", "./jpmml-sparkml-executable-1.4.5.jar")  # note: the jpmml jar must be loaded here
conf = conf.setMaster("local")
sc = SparkContext(conf=conf)

# Load sklearn's training data
iris = load_iris()
# Feature matrix
features = pandas.DataFrame(iris.data, columns=iris.feature_names)
# Target matrix
targets = pandas.DataFrame(iris.target, columns=['Species'])
# Merge the two
merged = pandas.concat([features, targets], axis=1)
# Create a SparkSession
sess = SparkSession(sc)
# Create a Spark DataFrame
raw_df = sess.createDataFrame(merged)
import os import os.path from pyspark import SparkContext if 'BACKEND_COMPUTE_MASTER' in os.environ: master = os.environ['BACKEND_COMPUTE_MASTER'] else: master = 'localhost' logFile = os.path.join("/tmp/data/README.md") from pyspark import SparkConf, SparkContext conf = SparkConf() conf.setMaster('spark://' + master + ':7077') conf.setAppName("simpleapp") sc = SparkContext(conf=conf) logData = sc.textFile(logFile).cache() numAs = logData.filter(lambda s: 'a' in s).count() numBs = logData.filter(lambda s: 'b' in s).count() print "Lines with a: %i, lines with b: %i" % (numAs, numBs)
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, MultilayerPerceptronClassifier from pyspark.ml.tuning import CrossValidator, ParamGridBuilder from sklearn.metrics import matthews_corrcoef from pyspark.ml.feature import VectorAssembler from pyspark.sql.functions import lit from pyspark.sql.functions import rand # from numba import jit conf = SparkConf() conf.set("spark.executor.memory", "6G") conf.set("spark.driver.memory", "4G") conf.set("spark.executor.cores", "4") conf.set("spark.sql.crossJoin.enabled", "true") conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.default.parallelism", "4") conf.setMaster('local[4]') atexit.register(lambda: spark.stop()) spark = SparkSession \ .builder.config(conf=conf) \ .appName("bosch-spark-magic").getOrCreate() # @jit def mcc(tp, tn, fp, fn): sup = tp * tn - fp * fn inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) if inf == 0: return 0 else:
# HDFS driver to use with Petastorm. PETASTORM_HDFS_DRIVER = 'libhdfs' # ================ # # DATA PREPARATION # # ================ # print('================') print('Data preparation') print('================') # Create Spark session for data preparation. conf = SparkConf().setAppName('data_prep').set( 'spark.sql.shuffle.partitions', '16') if args.processing_master: conf.setMaster(args.processing_master) spark = SparkSession.builder.config(conf=conf).getOrCreate() train_csv = spark.read.csv('%s/train.csv' % args.data_dir, header=True) test_csv = spark.read.csv('%s/test.csv' % args.data_dir, header=True) store_csv = spark.read.csv('%s/store.csv' % args.data_dir, header=True) store_states_csv = spark.read.csv('%s/store_states.csv' % args.data_dir, header=True) state_names_csv = spark.read.csv('%s/state_names.csv' % args.data_dir, header=True) google_trend_csv = spark.read.csv('%s/googletrend.csv' % args.data_dir, header=True) weather_csv = spark.read.csv('%s/weather.csv' % args.data_dir, header=True) def expand_date(df):
conf = SparkConf() conf.set("spark.executor.memory", "5g") # conf.set("spark.sql.shuffle.partitions", "1000") # conf.set("spark.yarn.executor.memoryOverhead", "512m") conf.set("spark.network.timeout", "2000") conf.set("spark.sql.broadcastTimeout", "300000") # conf.set("spark.dynamicAllocation.enabled","true") # conf.set("spark.shuffle.service.enabled", "true") # conf.set("spark.local.dir", "/yelp-dataset/spark-tmp") # conf.set("spark.driver.memory","512m") # conf.set("spark.driver.maxResultSize","10g") # sc = SparkContext("local[*]", "Simple App", conf=conf) # sc.setCheckpointDir('/tmp') os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6' conf.setMaster(SPARK_URL) sc = SparkContext(conf=conf) sql_sc = SQLContext(sc) # In[4]: def save(df, target_dir, name): df.write.mode("overwrite").parquet(DIR_ROOT + "/" + target_dir + "/" + name) def ren(df, exclude=[]): replacements = {c: c + "_2" for c in df.columns if str(c) not in exclude} replacements = [ col(c).alias(replacements.get(c)) for c in df.columns
def get_sparkcontext(): conf = SparkConf() conf.setAppName("sparkDemo") conf.setMaster("local[5]") spark_context = SparkContext(conf=conf) return spark_context
def saveTrans(data, sdate, timeSpan): res = data[(data[1] == data[3]) & (data[1] != data[2])] trans = pandas.read_csv('SubwayFlowConf/trans', header=None) #trans = trans[0].tolist() #res = res[res[1] in trans] res = pandas.merge(res, trans, left_on=1, right_on=0) res.to_csv('result/trans/' + sdate + '_' + str(timeSpan), header=None, index=None) def timeIndex(self, x): L = x.split(':') return str(((int(L[0]) * 60) + int(L[1])) / self.span) def resetIndex(self, x): return '%02d:%02d' % ((x * self.span) / 60, (x * self.span) % 60) if __name__ == '__main__': # Configure Spark conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster(master) sc = SparkContext(conf=conf) spark = SparkSession.builder.master(master).appName(APP_NAME).getOrCreate() # Execute Main functionality main(sc, spark)
:param a: mosaic from mosaic n-gram a :param b: mosaic from mosaic n-gram b :return: whichever has a bigger mosaic_value """ if mosaic_value(a) > mosaic_value(b): return a elif mosaic_value(a) == mosaic_value(b) and a < b: return a else: return b if __name__ == "__main__": conf = SparkConf() conf.setMaster('spark://hadoop-master:7077') conf.setAppName('spark-basic') sc = SparkContext(conf=conf) host = 'hadoop-master:54310' text_file = sc.textFile("hdfs://" + host + "/" + sys.argv[1]) #counts = text_file.flatMap(permute)\ # .reduceByKey(aggregate_by_mosaic).map(lambda x: (x[1], x[0]))\ # .reduceByKey(remove_duplicates).sortBy(ascending=False, keyfunc=lambda x: x[0][1])\ # .map(lambda x: '{0}\t{1}'.format(x[1], x[0][1])) #for comparing spark permute with awk permute #counts = text_file.flatMap(permute).map(lambda x: '{0}\t{1}'.format(x[0], x[1][0])) #for computing permute TIME
def geopyspark_conf(master=None, appName=None, additional_jar_dirs=[]): """Construct the base SparkConf for use with GeoPySpark. This configuration object may be used as is , or may be adjusted according to the user's needs. Note: The GEOPYSPARK_JARS_PATH environment variable may contain a colon-separated list of directories to search for JAR files to make available via the SparkConf. Args: master (string): The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster. appName (string): The name of the application, as seen in the Spark console additional_jar_dirs (list, optional): A list of directory locations that might contain JAR files needed by the current script. Already includes $(pwd)/jars. Returns: SparkConf """ conf = SparkConf() if not appName: raise ValueError("An appName must be provided") else: conf.setAppName(appName) if master: conf.setMaster(master) if 'GEOPYSPARK_JARS_PATH' in os.environ: additional_jar_dirs = additional_jar_dirs + os.environ[ 'GEOPYSPARK_JARS_PATH'].split(':') conf.set(key='spark.ui.enabled', value='false') conf.set(key='spark.serializer', value='org.apache.spark.serializer.KryoSerializer') conf.set(key='spark.kryo.registrator', value='geopyspark.geotools.kryo.ExpandedKryoRegistrator') current_location = os.path.dirname(os.path.realpath(__file__)) cwd = os.getcwd() local_prefixes = [ os.path.abspath(os.path.join(current_location, 'jars')), os.path.abspath(os.path.join(cwd, 'jars')), os.path.abspath(os.path.join(cwd, '../geopyspark/jars')) ] possible_jars = [ os.path.join(prefix, '*.jar') for prefix in local_prefixes + additional_jar_dirs ] configuration = os.path.join(current_location, 'command', 'geopyspark.conf') if not possible_jars: if os.path.isfile(configuration): with open(os.path.join(configuration)) as config_file: possible_jars.append(os.path.relpath(config_file.read(), cwd)) module_jars = [os.path.abspath(resource_filename('geopyspark.jars', JAR))] jar_dirs = [(jar, os.path.dirname(jar)) for jar in module_jars] for jar, jar_dir in jar_dirs: if jar_dir not in local_prefixes: possible_jars.append(jar) returned = [glob.glob(jar_files) for jar_files in possible_jars] jars = [jar for sublist in returned for jar in sublist] if not jars: raise IOError( "Failed to find any jars. Looked at these paths {}".format( possible_jars)) jar_string = ",".join(set(jars)) conf.set(key='spark.jars', value=jar_string) conf.set(key='spark.driver.memory', value='8G') conf.set(key='spark.executor.memory', value='8G') return conf
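# A minimal sketch of how geopyspark_conf() above might be used (assumes
# geopyspark is installed so its bundled jars can be found; app name and
# master are illustrative):
from pyspark import SparkContext

conf = geopyspark_conf(master="local[4]", appName="geopyspark-example")
sc = SparkContext(conf=conf)
print(conf.get("spark.jars"))  # comma-separated list of jars that were located
sc.stop()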
# Set topic name
set_global_topic_name(config)

# Read pyspark submit path from conf file
pyspark_environ = config['Resources']['pyspark_environ']

# import kafka libraries to run code from terminal
environ['PYSPARK_SUBMIT_ARGS'] = pyspark_environ

# Setup spark conf (SparkConf's positional argument is loadDefaults,
# so the app name must be set explicitly)
sparkConf = SparkConf().setAppName("TwitterDataAnalysis")

# Number of receivers = 2
# One for kafka and other for rdd processing
sparkConf.setMaster("local[2]")

# Create spark context from above configuration
sc = SparkContext(conf=sparkConf)

# Set log level to error
sc.setLogLevel("ERROR")

# Create Streaming context
# Get data from stream every 60 secs
ssc = StreamingContext(sc, 60)

# Setup checkpoint for RDD recovery
ssc.checkpoint("checkpointTwitterApp")

# Reading parameters from conf file
from pyspark.sql import SparkSession from pyspark import SparkContext, SparkConf # import rdd if at all used else can be ignored from pyspark import RDD # import pyspark class Row from module sql from pyspark.sql import * # import DBUtils #pip install DBUtils #import DBUtils # common code conf = SparkConf().setAppName("window1") conf.setMaster('local') sc = SparkContext(conf=conf) spark = SparkSession(sc) #if Dataframe is used, this has to be there #below rdd is is working example rdd1 = sc.parallelize([(1, 2)]) rdd2 = sc.parallelize([(3, 4)]) df1 = spark.createDataFrame(rdd1) df2 = spark.createDataFrame(rdd2) unionDF = df1.union(df2) hasattr(rdd1, "createDataFrame") hasattr(rdd2, "createDataFrame") #unionDF.show()
from pyspark import SparkContext, SparkConf

# 2. Specify the context: this is the connection on which RDDs (resilient
#    distributed datasets) are created in memory. Here it is local, but
#    normally it would point at a cluster.
conf = SparkConf().setAppName('testing').setMaster('local')
sc = SparkContext(conf=conf)

# 3. Import a text file
txt = sc.textFile('datasets/copyright.txt')
# Syntax on Linux ...
# txt = sc.textFile('file:////usr/share/doc/python/copyright')

# 3b. If we had wanted to connect to a cluster, here is the code:
"""
conf = pyspark.SparkConf()
conf.setMaster('spark://head_node:56887')
conf.set('spark.authenticate', True)
conf.set('spark.authenticate.secret', 'secret-key')
sc = SparkContext(conf=conf)
"""

# 4. Print the number of lines in the file
print("Number of lines in the file:", txt.count())

# 5. Filter the lines containing the term 'python'
python_lines = txt.filter(lambda line: 'python' in line.lower())

# 6. Print the number of lines in the filtered result
print("Number of lines containing the term 'python':", python_lines.count())
y = property(lambda self: self._y, _set_y, doc='The number of difference ' 'vectors used.') z = property(lambda self: self._z, _set_z, doc='Crossover scheme.') F = property(lambda self: self._F, _set_F, doc='Weight used during ' 'mutation.') CR = property(lambda self: self._CR, _set_CR, doc='Weight used during ' 'bin crossover.') if __name__ == '__main__': start_time = time.time() args = sys.argv sconf = SparkConf() sconf.setAppName("lda") sconf.setMaster(args[1]) sconf.set("spark.executor.memory", "6g") sconf.set("spark.driver.memory", "6g") sconf.set("spark.driver.maxResultSize", "6g") sconf.set("spark.yarn.executor.memoryOverhead", "2g") sconf.set("spark.yarn.driver.memoryOverhead", "2g") sconf.set("spark.eventLog.enabled", "true") sconf.set("spark.eventLog.dir", "hdfs://" + args[3] + "/user/" + args[4] + "/Logs/") sc = SparkContext(conf=sconf) #labels=[int(args[5])] labels = [1, 2, 3, 4, 5, 6, 7, 8, 9] random.seed(1) bounds = [(10, 100), (0, 1), (0, 1)] result = {}
df_pred.show() # Join prediction with original data. df_pred = df_pred.join(df, 'id') df_pred.show() sys.exit(1) # # # for maxIter in range(300, 1000, 100): # # # for x in range(1, 50): # # # # Build the model (cluster the data) # # # clusters = KMeans.train(mat, x, maxIterations=maxIter, initializationMode="random") # # # WSSSE = mat.map(lambda point: error(clusters, point)).reduce(lambda x, y: x + y) # # # print("cluster {0}: maxIter {1}: Within Set Sum of Squared Error = {2}".format(x, maxIter, WSSSE)) if __name__ == "__main__": # SparkContext represents connection to a Spark cluster. conf = SparkConf() conf.setAppName("Spark Machine Learning App") conf.setMaster('local[2]') sc = SparkContext(conf=conf) sc.setLogLevel("WARN") spark = SparkSession \ .builder \ .config(conf=conf) \ .getOrCreate() readDataFromES()
def functionToCreateContext(self): # Define Spark configuration conf = SparkConf() conf.setMaster(self.config['master_url']) conf.setAppName(self.config['app_name']) # conf.set("spark.cores.max", "2") conf.set("spark.streaming.backpressure.enabled", True) # conf.set("spark.streaming.backpressure.initialRate", "60") # Can set the max rate per kafka partition if needed conf.set("spark.streaming.kafka.maxRatePerPartition", "100") # Initialize a SparkContext sc = SparkContext(conf=conf) spark = SparkSession(sc) # Set the batch interval to be 1 sec ssc = StreamingContext(sc, self.config['batch_interval']) def savetohdfs(rdd): if not rdd.isEmpty(): schema = StructType([ StructField("IP", StringType(), True), StructField("user_identifier", StringType(), True), StructField("user_id", StringType(), True), StructField("user_name", StringType(), True), StructField("time", StringType(), True), StructField("Method", StringType(), True), StructField("URI", StringType(), True), StructField("HTTP-Code", StringType(), True), StructField("code", StringType(), True), StructField("size", StringType(), True), StructField("device", StringType(), True), StructField("tenant_id", StringType(), True), StructField("timezone", StringType(), True), StructField("OS", StringType(), True), StructField("browser", StringType(), True), StructField("country", StringType(), True), StructField("screenResolution", StringType(), True), StructField("action", StringType(), True), StructField("referrer", StringType(), True), StructField("timeonpage", StringType(), True), StructField("supplier_id", StringType(), True), StructField("product", StringType(), True), StructField("geolocation", StringType(), True) ]) # # schema =['IP','user_identifier','user_id','user_name','time','Message','code', # 'size','device','user_name','tenant_id','timezone','OS', 'browser', # 'country','screenResolution','action','referrer','timezone', # 'supplier_id','product','geolocation'] df = rdd.toDF(schema) df.write.mode("Overwrite").format('json').save( "hdfs://localhost:9820/user/rzariwal/stream") # # Consume Kafka streams directly, without receivers lines = KafkaUtils.createDirectStream( ssc, [self.topic], {"metadata.broker.list": self.addr}) lines1 = lines.map(lambda x: x[1]) lines1.cache() val_sum_lines = lines1.window(self.report_interval, self.batch_interval) val_sum_lines_top_ip = val_sum_lines.filter(lambda x: 'HEARTBEAT' not in x) \ .map(lambda x: (x.split(' ')[0].rstrip(' '), x.split(' ')[1].rstrip(' '), x.split(' ')[2].rstrip(' '), x.split(' ')[3].rstrip(' '), x.split(' ')[4].lstrip('['), x.split(' ')[6].lstrip('"'), x.split(' ')[7].rstrip(' '), x.split(' ')[8].rstrip('"'), x.split(' ')[9].rstrip(' '), x.split(' ')[10].rstrip(' '), x.split(' ')[11].rstrip(' '), x.split(' ')[12].rstrip(' '), x.split(' ')[13].rstrip(' '), x.split(' ')[14].rstrip(' '), x.split(' ')[15].rstrip(' '), x.split(' ')[16].rstrip(' '), x.split(' ')[17].rstrip(' '), x.split(' ')[18].rstrip(' '), x.split(' ')[19].rstrip(' '), x.split(' ')[20].rstrip(' '), x.split(' ')[21].rstrip(' '), x.split('[')[2].rstrip('] '), x.split('[')[3].rstrip('] '))) val_sum_lines_top_ip.foreachRDD(savetohdfs) val_sum_lines_top_ip1 = val_sum_lines_top_ip.map( lambda x: (x[3], x[4], x[6], x[8])) val_sum_lines_top_ip1.pprint() def savetheresult(rdd): if not rdd.isEmpty(): hbase_table = 'flexigym' hconn = happybase.Connection('localhost') ctable = hconn.table(hbase_table) hconn.open() for row in rdd.collect(): time = datetime.now() counter = str(time) + row[0] ctable.put( counter, { 
b'Page_Visted:': row[2], b'Response_Code:': row[3], b'Time:': row[1], b'User_Name:': row[0] }) schema = ["User_Name", "Page_Visted", "Response_Code", "Time"] rdd.toDF(schema).groupBy("User_Name", "Page_Visted", "Response_Code", "Time") \ .count() \ .show(truncate=False) val_sum_lines_top_ip1.foreachRDD(savetheresult) return ssc
import findspark findspark.init() import time from pyspark import SparkContext as sc from pyspark import SparkConf from pyspark.sql import SparkSession as ss from pyspark.sql.types import * conf = SparkConf() conf.setMaster("spark://Sarthaks-MBP:7077").setAppName( 'IPL Analytics Job').set("spark.executor.memory", "512m") spark = sc(conf=conf) a = spark.textFile("Dataset/*.csv").map(lambda line: line.split(",")).filter( lambda line: line[0].strip() == "ball").collect() player_vs_player = {} for line in a: details = line if details[0].strip() == 'ball': players1 = (details[4], details[6]) players2 = (details[5], details[6]) if players1 in player_vs_player.keys(): player_vs_player[players1]['total'] += int(details[7]) player_vs_player[players1]['runs'][int(details[7])] += 1 player_vs_player[players1]['balls'] += 1 if details[9] != '""' and details[9] != 'run out' and players1[ 0].strip() == details[10].strip(): player_vs_player[players1]['wickets'] += 1 else: player_vs_player[players1] = {} player_vs_player[players1]['total'] = int(details[7])
inputCol="scaled_features", outputCol="scaled_weighted_features") train_all_client = m.transform(train_all_client) client_master = m.transform(client_master) sqlContext.dropTempTable("df_master_train_table") nn = 1000 popshared = 0.30 num_indices = (int)(popshared * client_master.count()) tree_type = "kd_tree" nn, popshared, num_indices train_pd = train_all_client.toPandas() test_pd = client_master.toPandas() freq_table = findNearestNeighbour_client(train_pd, test_pd, nn, num_indices, tree_type) sqlContext.createDataFrame(freq_table[['cust_id', 'freq']], ).repartition( 1).write.format("com.databricks.spark.csv").save(output_path) if __name__ == "__main__": conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster("yarn") sc = SparkContext(conf=conf) main(sc)
'merck': 5, 'nike': 6, 'verizon': 8 } context = x[1][0] return3 = x[1][1] context2 = context.split("\t")[1] key = context.split("\t")[0] return2 = return3.split(",")[dic2[compdic[key]]] res = "\t".join([x[0], key, return2, context2]) return res if __name__ == "__main__": conf = SparkConf().setAppName(APP_NAME) conf = conf.setMaster("local[*]") sc = SparkContext(conf=conf) df = sc.textFile("tweets/*.txt") parts = df.map(lambda x: (x.split(",")[0], ",".join(x.split(",")[1:]))) #parts.collect() #parts = df.map(lambda x: (x.split(" ")[0], x))) #df1=df.take(1) #print df1 df2 = sc.textFile("tweets2/step1.txt") parts2 = df2.map(lambda l: ((l.split("\t")[0], "\t".join(l.split("\t")[1:])))) parts3 = parts2.join(parts) part4 = parts3.map(mapp) part4.saveAsTextFile("step33/step3.txt") #print parts4.take(1) #df3=df2.take(1)
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/jars/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Number of elements in RDD is 8
Computation succeeded!
"""

from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('yarn')
conf.setAppName('spark-yarn')
conf.setExecutorEnv('HADOOP_CONF_DIR', '$HADOOP_HOME/etc/hadoop')
conf.setExecutorEnv('YARN_CONF_DIR', '$HADOOP_HOME/etc/hadoop')
sc = SparkContext(conf=conf)


def mod(x):
    import numpy as np
    return (x, np.mod(x, 2))


rdd = sc.parallelize(range(1000)).map(mod).take(10)
print(rdd)

"""
>>> SLF4J: Class path contains multiple SLF4J bindings.
from pyspark.mllib.feature import HashingTF, IDF from pyspark.mllib.util import MLUtils from pyspark.sql import Row from pyspark.ml.feature import StopWordsRemover, Tokenizer, RegexTokenizer from nltk.stem import PorterStemmer if __name__ == "__main__": #get the parameters brokers, topic = sys.argv[1:] #number of features to be used in tf idf calculation numFeatures = 1000 #Spark configuration conf = SparkConf() conf.setMaster('spark://VM10-1-0-20:7077') conf.setAppName('sentiment_stream') sc = SparkContext(conf=conf) spark = SparkSession(sc) #sliding window configuration (in seconds) ssc = StreamingContext(sc, 20) #set list of broker kafkaParams = {"metadata.broker.list": brokers} counter_model = sc.accumulator(1) #stemming words def porter_stem(words): stem = [PorterStemmer().stem(x) for x in words]
from scipy.stats import ttest_ind from scipy.signal import savgol_filter from collections import OrderedDict from multiprocessing import Pool from functools import partial from pyspark import SparkContext, SparkConf from pyspark.sql import SQLContext #from single_molecule_mechanics.ProteinModels import xSeriesWLCe from .ProteinModels import xSeriesWLCe from collections import OrderedDict from multiprocessing import Pool from functools import partial #create a spark context conf = SparkConf().setAppName("App") conf = (conf.setMaster('local[*]').set('spark.executor.memory', '2G').set( 'spark.driver.memory', '8G').set('spark.driver.maxResultSize', '15G')) sc = SparkContext(conf=conf) ### NOTE: you will need to re-create this zip file every time you want to run this code. This is super annoying. We should look into using Dask instead of Spark. print( 'NOTE: you will need to re-create this zip file every time you want to run this code. This is super annoying. We should look into using Dask instead of Spark.' ) sc.addPyFile("/home/tbartsch/source/repos/single_molecule_mechanics.zip") class TimeSeriesLoader(object): '''Provides data structures and methods to analyze single-molecule data.''' def __init__(self): #define some default values data = np.empty((3, 2)) data.fill(np.nan)
def initSparkConf(isLocal, appName): conf = SparkConf() conf.setAppName(appName) if isLocal is True: conf.setMaster("local[*]") return conf
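# Sketch pairing initSparkConf() above with a context (the helper function and
# app name below are illustrative assumptions, not part of the original code):
from pyspark import SparkContext

def initSparkContext(conf):
    return SparkContext(conf=conf)

sc = initSparkContext(initSparkConf(True, "local-test-app"))
print(sc.master)  # local[*]
sc.stop()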
# spark-submit --master yarn --num-executors 10 --jars spark-csv-assembly-1.4.0.jar amazon_book.rating.py from pyspark import SparkContext from pyspark import SparkConf from pyspark.sql import HiveContext from pyspark.sql.types import StructType, IntegerType, StringType, FloatType, TimestampType, StructField conf = SparkConf() conf.setAppName('spark-workshop') conf.setMaster('yarn-client') sc = SparkContext(conf=conf) sqlContext = HiveContext(sc) customSchema = StructType( [ StructField("user", StringType(), True), StructField("item", StringType(), True), StructField("rating", FloatType(), True), StructField("timestamp", IntegerType(), True) ] ) df = sqlContext.read.format("com.databricks.spark.csv")\ .option("header", "false")\ .option("inferSchema", "false")\ .schema(customSchema)\ .load("/project/public/spark-workshop/amazon_ratings_Books.csv") df.registerTempTable("tb") sqlContext.sql("drop table if exists default.amazon_book_rating")
from pyspark import SparkConf, SparkContext conf1 = SparkConf() conf1.setMaster("local") conf1.setAppName("Sixth program") sc = SparkContext(conf=conf1) List1 = [1, 2, 3, 4, 5, 5, 5, 3] Rdd = sc.parallelize(List1) Data1 = Rdd.countByValue() for K in Data1: print(K, "...", Data1[K]) print("======================") for K in Data1: print(K, "...", 3) def add(x, y): return x + y def maxi(x, y): if x > y: return x else: return y
# spark-basic.py from pyspark import SparkConf from pyspark import SparkContext conf = SparkConf() conf.setMaster('spark://gabriela-TM1703:7077') conf.setAppName('spark-basic') sc = SparkContext(conf=conf) # ACO code import numpy as np from random import randrange from aco_algorithm import AntColony N = 10 rand_matrix = np.random.random_integers(1, 100, size=(N, N)) rand_dist = (rand_matrix + rand_matrix.T) / 2 for i in range(N): rand_dist[i][i] = np.inf for i in range((int)(N / 10)): j = randrange(0, N - 1) k = randrange(0, N - 1) rand_dist[j][k] = np.inf rand_dist[k][j] = np.inf ant_colony = AntColony(rand_dist, 20, 100, 0.95, alpha=1, beta=1) shortest_path = ant_colony.run(0, 5) print("--- Final result is: ---") print(shortest_path)
from pyspark import SparkContext from pyspark import SparkConf import os import sys conf = SparkConf() sparkmaster = sys.argv[1] wordcountfile = sys.argv[2] conf.setMaster(sparkmaster) conf.setAppName("test") sc = SparkContext(conf=conf) file = sc.textFile(wordcountfile) counts = file.flatMap(lambda line: line.split(" ")) \ .map(lambda word: (word, 1)) \ .reduceByKey(lambda a, b: a + b) output = counts.collect() for (k, v) in output: print k + ": " + str(v)
# !/usr/bin/env python from pyspark import SparkContext, SparkConf import pickle conf = SparkConf() conf.setMaster("spark://dmlhdpc10:7077") conf.setAppName("VFKMMProject") conf.set("spark.executor.memory", "5g") conf.set("spark.ui.port", "44041") sc = SparkContext(conf=conf, pyFiles=['lib.zip']) #numOfCores = 96 # sc = SparkContext(appName="PythonProject", pyFiles=['lib.zip']) import numpy as np import time import argparse from lib.splitter import split from lib.bagger import get_size_no, partition, bag, pair, cartesian from lib.kmm import computeBeta from lib.evaluation import computeNMSE from lib.scaleKMM import * from lib.util import * from lib.bagger import * from lib.caculate import * import csv def kmmProcess(): parser = argparse.ArgumentParser() parser.add_argument('-b', "--bagging", type=int,
        return False
    else:
        return True


if __name__ == "__main__":
    APP_NAME = "hw3_problem04"
    MASTER_URL = "spark://192.168.1.103:7077"
    HOME_PATH = "D:\\data\\"
    SHINGLING_K = 1
    DOCUMENT_ONE_INDEX = 0
    DOCUMENT_TWO_INDEX = 3

    # Initialization
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster(MASTER_URL)
    sc = SparkContext(conf=conf)

    # Read the files and preprocess them to get the body.text of each news article
    news_rdd = preprocess_data(
        sc.wholeTextFiles(HOME_PATH + "reut2-[0-9]*.sgm"))
    # Get the body.text of the two news articles to compare
    news_rdd = get_target_news(DOCUMENT_ONE_INDEX, DOCUMENT_TWO_INDEX, news_rdd)
    # Compute the shingles of the two articles
    news_shingles_rdd = create_shingles(news_rdd)
    # Run linear search
    start_time = time.time()
    linear_search_result = linear_search(news_shingles_rdd)
    linear_search_time = time.time() - start_time
        vals['telephone'] - 1, vals['foreign_wkr'] - 1
    ]
    return LabeledPoint(label, feats)


def train_decision_tree(lp, german_cfi):
    return DecisionTree.trainClassifier(lp, numClasses=2,
                                        categoricalFeaturesInfo=german_cfi,
                                        impurity='gini', maxDepth=3, maxBins=5)


## Main functionality
def main(sc):
    # `sqlCtx` and `model` are assumed to be created elsewhere in the original script
    dat = sqlCtx.sql('SELECT * FROM german')
    lp = dat.rdd.filter(lambda x: x.cred).map(german_lp).cache()
    predictions = model.predict(lp.map(lambda x: x.features))
    labelsAndPredictions = lp.map(lambda p: p.label).zip(predictions)
    trainErr = labelsAndPredictions.filter(
        lambda vp: vp[0] != vp[1]).count() / float(lp.count())


if __name__ == "__main__":
    conf = SparkConf().setAppName("german_spark_submit")
    conf = conf.setMaster("yarn-cluster")
    sc = SparkContext(conf=conf)
    main(sc)
APP_NAME = 'IE2-Project-Homocide-Reports' INPUT_DATA = "C:/Users/MWeil/Documents/GitHub/IE2-Project/emb/emb-out.txt" OUTPUT_LABEL = "C:/Users/MWeil/Documents/GitHub/IE2-Project/data/homicide-reports/database_new_label_emb.json" INPUT_MODEL = "C:/Users/MWeil/Documents/GitHub/IE2-Project/embedding-clustering/kmeans_model_7" def run_kmeans(sc): cpu_count = multiprocessing.cpu_count() # Load Data dataset = sc.textFile(INPUT_DATA, cpu_count) dataset = dataset.map( lambda line: array([float(x) for x in line.split(';')])) # Load Model sameModel = KMeansModel.load(sc, INPUT_MODEL) # Predict cluster labels per row labels = sameModel.predict(dataset).collect() # Save labels in json file with open(OUTPUT_LABEL, 'w') as out_f: json.dump(labels, out_f) if __name__ == "__main__": conf = SparkConf().setAppName(APP_NAME) conf.setMaster('local[*]') sc = SparkContext(conf=conf) run_kmeans(sc)