def main():
    """Create a Spark/SQL context pair, run ``csv2df_events`` and stop Spark.

    The SparkContext is stopped in a ``finally`` clause so that an exception
    raised by ``csv2df_events`` does not leave a live context behind.
    """
    # Imports are function-local, as in the original.
    from pyspark import SparkContext
    from pyspark.sql import SQLContext

    sc = SparkContext()
    try:
        sqlCtx = SQLContext(sc)
        csv2df_events(sqlCtx)
    finally:
        sc.stop()
def main(input_path, output_path):
    """Extract the JABODETABEK records from a pipe-delimited dataset.

    Every line of ``input_path`` is split on ``'|'`` and passed through
    ``data_tranform`` together with a lookup table built from
    ``CELL_MASTER_FILE`` (column 0 -> column 9).  Records whose last field
    equals ``'JABODETABEK'`` are kept, the number of distinct values of their
    second-to-last field is printed, and the records are saved as text to
    ``output_path``.
    """
    spark = SparkContext(appName='Data_Analysis')

    # Raw dataset: one list of fields per input line.
    records = spark.textFile(input_path).map(lambda row: row.split('|'))

    # Validation against the data dictionary is currently a pass-through.
    checked = records

    # Fixed cell master file, used for the cgi and bbc mapping.
    cell_lookup = {}
    with open(CELL_MASTER_FILE, 'r') as master_file:
        for raw_line in master_file:
            columns = raw_line.strip().split('|')
            cell_lookup[columns[0]] = columns[9]

    enriched = checked.map(lambda row: data_tranform(row, cell_lookup))

    # Filter out records for JABODETABEK.
    jabodetabek = enriched.filter(lambda row: row[-1] == 'JABODETABEK')

    # One (key, 1) pair per record, merged per key; counting the merged
    # pairs yields the number of distinct keys.
    tower_total = (
        jabodetabek
        .map(lambda row: (row[-2], 1))
        .reduceByKey(lambda left, right: left + right)
        .count()
    )
    print('Number of cell tower for Jabodetabek: %d' % tower_total)

    jabodetabek.saveAsTextFile(output_path)
class ZeppelinReporterTest(unittest.TestCase):
    """Checks the HTML that ``ZeppelinReporter`` writes for a small DataFrame."""

    def setUp(self):
        # Fresh Spark/SQL contexts and a three-row fixture for every test.
        self.sc = SparkContext()
        self.sql = SQLContext(self.sc)
        rows = [(1, "a"), (1, None), (3, "c")]
        self.df = self.sql.createDataFrame(rows)

    def tearDown(self):
        self.sc.stop()

    def test_output(self):
        # NOTE(review): the whitespace inside this literal is reproduced
        # exactly as it appears in this view, where it looks collapsed onto
        # one line -- confirm against the reporter's real output.
        expected_output = """ %html </p> <h4>Checking [_1: bigint, _2: string]</h4> <h5>It has a total number of 2 columns and 3 rows.</h5> <table> <tr><td style="padding:3px">❌</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr> <tr><td style="padding:3px">✅</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr> </table> <p hidden> """.strip()

        with patch("pyddq.reporters.get_field") as get_field:
            # Route the reporter's output into an in-memory stream.
            stream = ByteArrayOutputStream()
            stream.jvm = self.df._sc._jvm
            get_field.return_value = stream.jvm_obj

            # "_1" alone repeats the value 1; ("_1", "_2") is unique.
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            zeppelin = Mock()
            check.run([ZeppelinReporter(zeppelin)])

            self.assertEqual(stream.get_output(), expected_output)
def run(self):
    """Count the rows of ``myUser.csv`` per gender and write the counts.

    The CSV is read with Spark, its first 20 comma-separated columns are
    registered as the temporary table ``users`` (all columns typed as
    strings), and the ``gender`` column is counted.  The schema string is
    written to ``self.output()`` first, followed by one ``str()``-formatted
    ``(row, count)`` pair per distinct gender.

    The SparkContext is stopped in a ``finally`` clause; previously it was
    never stopped, so a second context could not be created in the same
    process.
    """
    sc = SparkContext("local", "gender")
    try:
        sqlContext = SQLContext(sc)
        _out = self.output().open('w')

        # Debug output about the upstream task.
        print(type(self.required_tasks['insert_source'].output()))
        print(self.required_tasks['insert_source'])

        lines = sc.textFile("myUser.csv")
        parts = lines.map(lambda l: l.split(","))
        # Exactly the first 20 columns; a shorter row raises IndexError.
        users = parts.map(lambda p: tuple(p[i] for i in range(20)))

        schemaString = (
            "userId lmsUserId lmsName orgName name gender registrationDate emailId "
            "mothertounge highestEduDegree goals city state active firstAccesDate "
            "lastAccessDate allowCert yearOfBirth pincode aadharId"
        )
        print(schemaString)
        _out.write(schemaString)

        fields = [StructField(field_name, StringType(), True)
                  for field_name in schemaString.split()]
        schema = StructType(fields)

        schemaUser = sqlContext.applySchema(users, schema)
        schemaUser.registerTempTable("users")

        results = sqlContext.sql("SELECT gender FROM users")
        genders = results.map(lambda p: (p, 1))
        counts = genders.reduceByKey(lambda a, b: a + b)

        for name in counts.collect():
            _out.write(str(name))
        # Closed only on success, as in the original.
        # NOTE(review): if this is an atomic target, closing it on failure
        # would publish a partial file -- confirm before changing.
        _out.close()
    finally:
        sc.stop()
def do_all(f_path, out_name):
    """Fit a linear regression on ``f_path`` and save per-record predictions.

    Steps: parse lines with ``parseKeepD`` (records whose first element is
    ``None`` are dropped), publish the feature column means and variances
    through the module globals ``means`` and ``varis``, rescale points with
    ``conv_label_pt``, train ``LinearRegressionWithSGD``, and save
    ``(label, prediction, p[1])`` triples to ``out_loc + out_name``.

    Returns:
        ``(intercept, weights, se, disparity, ssr, N)`` where ``ssr`` is the
        sum of squared residuals, ``N`` the record count and ``se`` the
        result of ``std_errors(data, ssr / N, N)``.

    The SparkContext is stopped even if a step fails.
    """
    global means
    global varis

    sc = SparkContext()
    try:
        data = sc.textFile(f_path)
        # Identity test instead of ``!= None``.
        data = data.map(parseKeepD).filter(lambda p: p[0] is not None)

        # Scale features: column statistics are read by conv_label_pt via
        # the module globals.
        features = data.map(lambda x: x[0].features)
        summary = Statistics.colStats(features)
        means = summary.mean()
        varis = summary.variance()

        # Scale the points.
        data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))

        # Train model.
        model = LinearRegressionWithSGD().train(
            data.map(lambda x: x[0]), intercept=True, regType='none')

        # Calculate disparity: (label, prediction, carried-along value).
        disparity = data.map(
            lambda p: (p[0].label, model.predict(p[0].features), p[1]))

        # Sum of squared residuals and record count.
        ssr = disparity.map(lambda x: (x[0] - x[1]) ** 2).sum()
        N = disparity.count()

        MSE = ssr / float(N)
        se = std_errors(data, MSE, N)
        disparity.saveAsTextFile(out_loc + out_name)
    finally:
        sc.stop()

    return model.intercept, model.weights, se, disparity, ssr, N
def query12_input(query_name, conf=None, output_persist=False):
    """Run ``query_name`` twice against cached Hive tables; return the 2nd run.

    The tables ``web_sales``, ``item`` and ``date_dim`` of database
    ``tpcds_text_db_1_50`` are marked as cached.  The first
    ``execute_sql`` call (made with ``output_persist``) is discarded; the
    result of the second call is returned after a ``describe().show()`` on
    its ``'output'`` entry.

    The SparkContext is stopped in a ``finally`` clause so a failing query
    does not leak it.
    """
    sc = SparkContext(conf=conf)
    try:
        sqlContext = HiveContext(sc)

        # SQL statements can be run by using the sql methods provided by
        # sqlContext.
        sqlContext.sql("use tpcds_text_db_1_50")

        for table_name in ("web_sales", "item", "date_dim"):
            sqlContext.cacheTable(table_name)

        # Discard the first query.
        execute_sql(query_name, sqlContext, output_persist)

        # Check the re-run statistics.
        output = execute_sql(query_name, sqlContext)
        # NOTE(review): show() prints and returns None, so this entry ends up
        # None; kept as-is because callers may rely on it -- confirm intent.
        output['describe'] = output['output'].describe().show()
    finally:
        sc.stop()
    return output
def main():
    """Sum ``element_wise_product`` over all input rows; write a 10-wide grid.

    ``sys.argv[1]`` is the Spark input path and ``sys.argv[2]`` a local
    output file.  Each line is split on whitespace, mapped through
    ``element_wise_product`` and combined with ``add_tuples``.  The combined
    values are written separated by single spaces, ten per line.
    """
    import re

    inputs = sys.argv[1]
    output = sys.argv[2]

    conf = SparkConf().setAppName('scalable multiplication')
    sc = SparkContext(conf=conf)

    # Compare versions numerically: as strings, '1.10.0' >= '1.5.1' is False.
    version = tuple(int(part) for part in re.findall(r'\d+', sc.version)[:3])
    assert version >= (1, 5, 1)

    text = sc.textFile(inputs)

    # Split the row to get individual numbers.
    row = text.map(lambda line: line.split())

    # element_wise_product is applied per line and the returned matrices
    # are added together to get the final matrix.
    sub = row.map(element_wise_product).reduce(add_tuples)

    # Formatted output: break the line after every tenth value.
    with open(output, 'w') as result:
        for position, value in enumerate(sub, 1):
            result.write(str(value) + " ")
            if position % 10 == 0:
                result.write("\n")
def main(argList):
    """Sort a text file by the first ten characters of each line.

    ``argList[0]`` is the input path and ``argList[1]`` the output
    directory (results go to ``<argList[1]>/output``).  Optional flags
    ``-inputPartition N`` and ``-outputPartition N`` set the partition
    counts; the output count defaults to the input count, which defaults
    to 1.  Each output line is the key plus the remainder with ``'\\n'``
    characters stripped, followed by ``'\\r'``.
    """
    # Guard clause: both paths are required.
    if len(argList) < 2:
        print("no input file specified and or output")
        usage()
        sys.exit()

    def _int_option(flag, fallback):
        # Value following `flag` on the command line, or `fallback`.
        if flag in argList:
            return int(argList[argList.index(flag) + 1])
        return fallback

    inp = _int_option('-inputPartition', 1)
    onp = _int_option('-outputPartition', inp)

    # Create Spark Context for non-local mode.
    sc = SparkContext()

    keyed = sc.textFile(argList[0], inp, use_unicode=True) \
        .map(lambda record: (record[0:10], record[10:]))
    ordered = keyed.sortByKey(True, onp) \
        .map(lambda pair: (pair[0] + pair[1].strip('\n')) + '\r')
    ordered.saveAsTextFile(argList[1] + '/output')
def main():
    """Attach accepted-name data to parsed name strings and save the result.

    Reads ``name_string_indices.tsv`` from ``mysql_export_dir``, parses the
    ``name`` column with ``parse_spark``, pairs each parsed JSON object
    with its source row, and splits the pairs by ``is_synonym``.  Synonyms
    are left-outer-joined to accepted names on ``to_key_value``; both
    groups are then unioned, flattened with ``join_fields`` and written to
    ``output_dir_name_string_indices``.
    """
    cleanup()

    sc = SparkContext()
    spark = SparkSession(sc)

    tsv_path = os.path.join(mysql_export_dir, "name_string_indices.tsv")
    frame = spark.read.csv(
        tsv_path, header=True, inferSchema=True, sep='\t', nullValue='NULL')

    raw_names = frame.select('name').rdd.map(lambda row: row['name'])
    parsed = parse_spark(sc, raw_names).map(json.loads)
    # Pair every parsed name with the row it came from.
    names_json = parsed.zip(frame.rdd)

    synonyms = names_json.filter(lambda entry: is_synonym(entry))
    accepted = names_json.filter(lambda entry: not is_synonym(entry))

    accepted_keyed = accepted.map(to_key_value)
    synonyms_resolved = (
        synonyms
        .map(to_key_value)
        .leftOuterJoin(accepted_keyed)
        .map(add_accepted_data_to_synonym_name)
    )
    accepted_resolved = accepted.map(add_accepted_data_accepted_name)

    combined = sc.union([synonyms_resolved, accepted_resolved])
    combined.map(join_fields).saveAsTextFile(output_dir_name_string_indices)
def main():
    """Transform ``measure_dates.csv`` row by row into ``measures_data``.

    Both paths live under ``utils.data_home``.  Each line is split with
    ``utils.to_row_sep``, passed through ``transform_row`` and turned back
    into a string with ``utils.to_row_string``.
    """
    sc = SparkContext(appName="Transforming Eff Care")

    source_path = utils.data_home + "/measure_dates.csv"
    target_path = utils.data_home + "/measures_data"

    rows = sc.textFile(source_path).map(utils.to_row_sep)
    rendered = rows.map(transform_row).map(utils.to_row_string)
    rendered.saveAsTextFile(target_path)
def KMeansModel(dataPath, label, k, character, master):
    """Cluster a delimited file with k-means and report a purity score.

    Args:
        dataPath: path of the text file to load.
        label: ``0`` when the label is the first column; any other value
            means the label is the last column.
        k: number of clusters passed to ``km.train``.
        character: field delimiter used in the file (replaced by ``','``).
        master: Spark master URL.

    Returns:
        A report string: a header line, one ``index:center`` line per
        cluster centre, and ``Acc: <percent>%``.  The percentage is, summed
        over the distinct labels, the size of the largest cluster among the
        points carrying that label, divided by the number of points.

    Changes from the previous version: each point is predicted once
    instead of once per distinct label; feature rows are built as lists
    (a lazy ``map`` object on Python 3 would not be); the SparkContext is
    stopped even when a step fails.
    """
    sc = SparkContext(master)
    try:
        rows = sc.textFile(dataPath).map(
            lambda line: line.replace(character, ',').split(','))

        if label == 0:
            labels_rdd = rows.map(lambda fields: float(fields[0]))
            train_data = rows.map(
                lambda fields: [float(part) for part in fields[1:]])
        else:
            labels_rdd = rows.map(lambda fields: float(fields[-1]))
            train_data = rows.map(
                lambda fields: [float(part) for part in fields[:-1]])

        distinct_labels = labels_rdd.distinct().collect()
        point_labels = labels_rdd.collect()

        model = km.train(train_data, k)

        points = train_data.collect()
        train = len(points)
        # Predict once per point; the label loop below only compares.
        predictions = [model.predict(point) for point in points]

        acc = 0
        for cur_label in distinct_labels:
            ksum = np.zeros(k, dtype=int)
            for point_label, cluster in zip(point_labels, predictions):
                if point_label == cur_label:
                    ksum[cluster] += 1
            acc += max(ksum)

        report = ["KMeans Result: "]
        report.extend(str(index) + ":" + str(center)
                      for index, center in enumerate(model.centers))
        report.append("Acc: " + str((float(acc) / train) * 100) + "%")
        return "\n".join(report)
    finally:
        sc.stop()
def main():
    """Reduce ``permutation`` over all input rows and write the result grid.

    ``sys.argv[1]`` is the Spark input path and ``sys.argv[2]`` a local
    output file.  Lines are split on single spaces; the number of columns
    is taken from the first row.  The reduced flat sequence is written
    ``ncol`` values per line, each value followed by a space.

    Fixes: rows were sliced with a hard-coded width of 3 while stepping by
    ``ncol``, which truncated or repeated data for any other width; the
    Spark version was compared as a string; the builtin ``input`` was
    shadowed; the file was not closed on error.
    """
    import re

    input_path = sys.argv[1]
    output_path = sys.argv[2]

    conf = SparkConf().setAppName('Matrix Multiplication')
    sc = SparkContext(conf=conf)

    # Compare versions numerically: as strings, '1.10.0' >= '1.5.1' is False.
    version = tuple(int(part) for part in re.findall(r'\d+', sc.version)[:3])
    assert version >= (1, 5, 1)

    rows = sc.textFile(input_path).map(lambda line: line.split(' ')).cache()
    ncol = len(rows.take(1)[0])

    flat_result = rows.map(permutation).reduce(add_tuples)

    with open(output_path, 'w') as output_file:
        for start in range(0, len(flat_result), ncol):
            for element in flat_result[start:start + ncol]:
                output_file.write(str(element) + ' ')
            output_file.write('\n')
class TestWordCounter(unittest.TestCase):
    """Exercises ``WordCounter.getMaxValues`` on small in-memory movie lists."""

    def setUp(self):
        # Local Spark context shared by the test body, plus the object
        # under test.
        spark_conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=spark_conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = [
            "1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy",
        ]
        # Two (word, titles) pairs.
        expected = (
            ('ToyA', ['::ToyA StoryA ToyA (1995)::']),
            ('Toy', ['::Toy Story Toy (1995)::']),
        )
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), expected)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = [
            "1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
            "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy",
        ]
        # A single (word, titles) pair: the extra parentheses in the
        # original did not add a tuple level.
        expected = (
            'ToyA',
            ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'],
        )
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), expected)
def recom(matrix_file_name, user_file_name, output="re.out"):
    """Build recommendations from two sequence files and save them as text.

    Args:
        matrix_file_name: sequence file holding the matrix.
        user_file_name: sequence file holding the user's entries.
        output: directory the single-partition result is written to.

    The matrix rows whose key occurs among the user's keys are collected
    into the module global ``mt`` (read by ``flat_recom``).  Candidates
    produced by ``flat_recom`` are merged with ``reduce_recom``, entries
    whose key the user already has are dropped, and the rest is sorted by
    ``int(value)``.

    Changes: the user's keys are held in a set, so the two membership
    filters no longer scan a list for every record; the sort key no longer
    uses a tuple-parameter lambda (Python 2 only syntax).
    """
    sc = SparkContext("local[8]", "Recommendation")

    # Read the two sequence files to be manipulated.
    matrix = sc.sequenceFile(matrix_file_name)
    user = sc.sequenceFile(user_file_name)

    # - flatMap: one input -> zero or more outputs
    # - map: one input -> exactly one output
    # - reduceByKey: groups by key and aggregates the values of each key
    user_tuples = user.flatMap(flat_user) \
                      .map(map_user) \
                      .sortByKey(keyfunc=lambda k: int(k))

    # NOTE(review): assumes the keys are hashable (e.g. strings) -- confirm.
    keys = set(user_tuples.keys().collect())

    matrix_tuples = matrix.flatMap(flat_matrix) \
                          .map(map_matrix) \
                          .filter(lambda x: x[0] in keys)

    global mt
    mt = matrix_tuples.collectAsMap()

    recm = user_tuples.flatMap(flat_recom) \
                      .reduceByKey(reduce_recom) \
                      .filter(lambda x: x[0] not in keys) \
                      .sortBy(lambda pair: int(pair[1]))

    # Write everything out to OUTPUT as a single partition.
    recm.coalesce(1).saveAsTextFile(output)
def stackexchange_json_spark_job():
    """Join StackExchange questions with their answers and index them.

    Question and answer JSON part files are read from HDFS (server and
    folder names come from ``bluebook_conf``), mapped with
    ``stackexchange_json_mapper`` and joined by key.  Each joined record
    becomes ``(key, {'ques': ..., 'ans': ...})`` and is written through the
    Elasticsearch Hadoop output format using
    ``stackoverflow_es_write_conf``.
    """
    server = bluebook_conf.HDFS_FQDN

    def _part_files(folder_name):
        # Glob matching every part file of one HDFS folder.
        return "hdfs://" + server + "/" + folder_name + "/part-*"

    conf = SparkConf().setAppName("stackexchange_json_spark_job")
    spark_context = SparkContext(conf=conf)

    json_ques_folder_address = _part_files(
        bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME)
    json_ans_folder_address = _part_files(
        bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME)

    # Questions and answers are read separately from HDFS.
    questions = spark_context.textFile(json_ques_folder_address) \
        .map(lambda line: stackexchange_json_mapper(line, 'ques'))
    answers = spark_context.textFile(json_ans_folder_address) \
        .map(lambda line: stackexchange_json_mapper(line, 'ans'))

    # Join answers with their respective questions.
    def _as_document(joined):
        key, (question, answer) = joined
        return (key, {'ques': question, 'ans': answer})

    ques_ans = questions.join(answers).map(_as_document)

    ques_ans.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=stackoverflow_es_write_conf)
def main(argv):
    '''Add newly streamed plays to the users profile matrix.

    matrixDirectory: the hdfs directory where we find users profile matrix.
        It is assumed to be compressed and split in several files.
    streamFiles: the files used to update the matrix,
        in userId|country|artistId|trackId format.
    outputFile: optional output directory for the updated matrix.
        By default, we simply overwrite the current one.

    Fix: the previous version used an inner ``join``, which (a) dropped
    every matrix entry that had no new stream event, (b) dropped events for
    pairs not yet in the matrix, and (c) produced one ``count + 1`` row per
    event instead of accumulating them.  With the overwrite default, that
    lost data.  Counts are now merged with ``union`` + ``reduceByKey``.
    '''
    matrixDirectory, streamFiles, outputFile = getArguments(argv)
    sc = SparkContext(appName="usersProfile")

    # Open both the matrix and the not-yet-processed stream files and turn
    # them into (key, value) pairs with key = (user, track).
    matrix = (sc.textFile(matrixDirectory + "*.gz")
              # A list, not map(): the fields are indexed below.
              .map(lambda line: [int(value) for value in line.split(" ")])
              .map(lambda t: ((t[0], t[1]), t[2])))

    streamData = (sc.textFile(streamFiles)
                  .map(lambda line: line.split("|"))
                  .map(lambda t: ((int(t[0]), int(t[3])), 1)))

    outData = (matrix.union(streamData)
               # existing count plus one per event => ((user, track), new_count)
               .reduceByKey(lambda left, right: left + right)
               .sortByKey()
               # prepare output lines: "user track count"
               .map(lambda t: " ".join(map(str, (t[0][0], t[0][1], t[1])))))

    saveAsTextFile(outData, path=outputFile, overwrite=True)
def run():
    """Mine frequent item sets and rules from the configured HDFS file.

    Starting from the frequent single items, candidate sets are grown with
    ``item_plus`` and filtered with ``freq`` until no frequent set remains.
    Rules are produced from the collected sets and scored with
    ``probability`` (threshold 0.5); the scored output is printed.

    Returns:
        ``(freq_result, dict_rule)``: the collected frequent item sets and
        the rule dictionary.

    The SparkContext is stopped even if a step fails.
    """
    sc = SparkContext(master=spark_addr, appName=app_name)
    try:
        rdd = sc.textFile(hdfs_addr + file_path, 2) \
                .map(lambda line: format_list(line)).cache()

        # ******** create rules ********
        supp = float(rdd.count()) * supp_rate

        item = create_item(rdd)        # candidate single items
        item = freq(rdd, item, supp)   # frequent single items
        one_item = item
        freq_items = item

        while item.count() > 0:
            more_item = item_plus(sc, item)
            item = freq(rdd, more_item, supp)
            freq_items = freq_items.union(item)

        # freq_items is key/value; the key's type is frozenset.
        freq_result = freq_items.collect()
        one_result = one_item.keys().collect()

        dict_rule = produce_rule(freq_result, one_result)
        out, total = probability(rdd, dict_rule, 0.5)
        out1 = out.collect()
        # Same text as the former multi-argument print statement.
        print("%s %s %s %s"
              % ("$$$$$$$$$$$$$$$$$$$$$$$out=", out1, "all=", total))
    finally:
        sc.stop()
    return freq_result, dict_rule
class SparkContextFactory:
    """Creates a YARN-client SparkContext and SQLContext for the "MrT" app.

    Attributes set by ``__init__``: ``conf`` (the SparkConf), ``sc`` (the
    SparkContext) and ``sqlContextInstance`` (the SQLContext).
    """

    def __init__(self):
        # Specify spark home.
        os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"

        # Specify the pyspark path so its libraries can be accessed by this
        # application; the imports below rely on it.
        sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SQLContext

        self.conf = (
            SparkConf()
            .setMaster("yarn-client")
            .setAppName("MrT")
            .set("spark.executor.memory", "5g")
            .set("spark.driver.memory", "10g")
        )

        shipped_modules = [
            "ComputeCovHistory.py",
            "go.py",
            "risk_DSconvert.py",
            "ewstats.py",
            "ewstatsRDD.py",
            "ewstatswrap.py",
        ]
        self.sc = SparkContext(conf=self.conf, pyFiles=shipped_modules)

        # toDF is a monkey patch executed inside the SQLContext constructor,
        # so a SQLContext has to be created before toDF can be used.
        self.sqlContextInstance = SQLContext(self.sc)

    def disconnect(self):
        """Stop the SparkContext created by this factory."""
        self.sc.stop()
def init_spark_context(details=[]):
    """Create the module-global ``spark_context`` once; later calls are no-ops.

    Args:
        details: extra strings shown in the Spark application name.  The
            user name, the build type and (when ``BUILD_URL`` is set) the
            build URL are appended to a copy of this list.

    The master URL comes from ``YB_SPARK_MASTER_URL`` when set, otherwise
    from the TSAN or regular default depending on the build type.

    Fix: the argument is copied before being extended.  Previously
    ``details += [...]`` mutated the shared default list and any list the
    caller passed in.
    """
    global spark_context
    if spark_context:
        return

    # Work on a private copy; never touch the default or the caller's list.
    details = list(details)

    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext

    # We sometimes fail tasks due to unsynchronized clocks, so we should
    # tolerate a fair number of retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure
    # tolerance mechanism is just for the resilience of the test framework
    # itself.
    SparkContext.setSystemProperty('spark.task.maxFailures',
                                   str(SPARK_TASK_MAX_FAILURES))

    if build_type == 'tsan':
        logging.info("Using a separate default Spark cluster for TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL_TSAN
    else:
        logging.info("Using the regular default Spark cluster for non-TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL

    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL',
                                      default_spark_master_url)

    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type),
    ]
    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))

    spark_context = SparkContext(
        spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
def solve_puzzle(master, output, height, width, slaves):
    """Expand states level by level from the solved board and save them.

    ``HEIGHT``, ``WIDTH`` and ``level`` are module globals set here.
    Starting from the solved board, every round applies ``bfs_flat_map``
    and merges duplicates with ``bfs_reduce`` until the number of states
    stops changing.  States are repartitioned every tenth level.  The
    final states are passed through ``unhash_board`` and written to
    ``output`` in ``slaves`` partitions.
    """
    global HEIGHT, WIDTH, level
    HEIGHT, WIDTH, level = height, width, 0

    sc = SparkContext(master, "python")

    NUM_WORKERS = slaves
    sol = Sliding.solution(WIDTH, HEIGHT)

    # Map-reduce processing: the frontier starts with the solved state only.
    frontier = sc.parallelize((make_state(level, sol),))
    previous_count, current_count = 0, 1

    while previous_count != current_count:
        level += 1
        if level % 10 == 0:
            frontier = frontier.partitionBy(PARTITION_COUNT)
        frontier = frontier.flatMap(bfs_flat_map).reduceByKey(bfs_reduce)
        # Right-hand side is evaluated first: old count, then the new one.
        previous_count, current_count = current_count, frontier.count()

    # Output.
    frontier = frontier.map(unhash_board)
    frontier.coalesce(NUM_WORKERS).saveAsTextFile(output)
    sc.stop()
def load_cut_to_rdd(input_file, result_file):
    """Compute dense TF-IDF document vectors and dump them to a text file.

    ``input_file`` lines are parsed by ``parseKV`` into
    ``((doc_id, term), tf)`` pairs and summed per key.  The inverse
    document frequency of a term is ``log(num_doc / (doc_count + 1))``.
    ``result_file`` receives a ``"<num_doc> <num_term>\\r\\n"`` header
    followed by one line per document with tab-terminated vector values.

    Changes: term positions come from a dictionary instead of
    ``list.index`` (a linear scan per record); the result file is closed
    by ``with`` and the SparkContext stopped by ``finally``; the unused
    ``nonzero_count`` is gone; tuple-parameter lambdas (Python 2 only) are
    replaced by indexing.
    """
    sc = SparkContext(appName='PythonKMeans',
                      master="mesos://219.224.135.91:5050")
    try:
        lines = sc.textFile(input_file)
        data = lines.map(parseKV).cache()

        # Records are ((tid, term), tf).
        doc_term_tf = data.reduceByKey(add).cache()

        num_doc = doc_term_tf.map(lambda rec: rec[0][0]).distinct().count()
        terms_list = doc_term_tf.map(lambda rec: rec[0][1]).distinct().collect()
        num_term = len(terms_list)
        # term -> column index; terms_list holds distinct terms, so this
        # matches what terms_list.index(term) returned.
        term_index = {term: position
                      for position, term in enumerate(terms_list)}

        term_idf = (doc_term_tf
                    .map(lambda rec: (rec[0][1], 1.0))
                    .reduceByKey(add)
                    .mapValues(lambda idf: math.log(float(num_doc) / (idf + 1))))

        # Joined records are (term, ((tid, tf), idf)).
        tfidf_join = (doc_term_tf
                      .map(lambda rec: (rec[0][1], (rec[0][0], rec[1])))
                      .join(term_idf))

        def _to_feature(joined):
            # (term, ((tid, tf), idf)) -> (tid, (column, tf * idf))
            term, ((tid, tf), idf) = joined
            return (tid, (term_index[term], tf * idf))

        tfidf = tfidf_join.map(_to_feature)
        doc_vec = (tfidf.groupByKey()
                   .mapValues(lambda feature:
                              Vectors.sparse(num_term, feature).toArray())
                   .cache())

        with open(result_file, 'w') as f:
            f.write('%s %s\r\n' % (num_doc, num_term))
            for (tid, feature) in doc_vec.collect():
                f.write("".join(str(num) + "\t" for num in feature))
                f.write("\n")
    finally:
        sc.stop()
    return
def solve_puzzle(master, output, height, width, slaves):
    """Expand hashed boards level by level from the solution and save them.

    ``HEIGHT``, ``WIDTH`` and ``level`` are module globals set here.  Every
    round applies ``bfs_flat_map`` and merges duplicates with
    ``bfs_reduce``; the data is repartitioned every 12th level.  The state
    count is refreshed every 6th level, and the loop stops once the count
    remembered from the previous refresh is no longer smaller than the
    current one.  The result is written to ``output`` in ``slaves``
    partitions.
    """
    global HEIGHT, WIDTH, level
    HEIGHT, WIDTH, level = height, width, 0

    sc = SparkContext(master, "python")

    # Map-reduce processing: start from the hashed solved board.
    solved_board = Sliding.solution(WIDTH, HEIGHT)
    solved_hash = Sliding.board_to_hash(WIDTH, HEIGHT, solved_board)
    states = sc.parallelize([(solved_hash, level), ])

    seen_before = 0
    seen_now = 1
    while seen_before < seen_now:
        level += 1
        states = states.flatMap(bfs_flat_map)
        if level % 12 == 0:
            states = states.partitionBy(PARTITION_COUNT)
        states = states.reduceByKey(bfs_reduce)
        # NOTE(review): both statements are assumed to belong to this
        # branch (count only every 6th level) -- confirm against the
        # original indentation.
        if level % 6 == 0:
            seen_before = seen_now
            seen_now = states.count()

    # Output.
    states.coalesce(slaves).saveAsTextFile(output)
    sc.stop()
def main(name, divide):
    """Compute a cost for every edge of graph ``name`` and fetch the result.

    Args:
        name: graph name; ``<name>_list.txt`` is read locally and
            ``edgelist/<name>`` is read from HDFS.
        divide: multiplier for the number of input partitions
            (``192 * 4 * int(divide)``).

    The whole graph is broadcast to the workers as a node-link JSON string;
    each edge line is split on spaces and passed to ``edge_to_cost``.  The
    costs are saved on HDFS, copied to the local ``costs`` directory and
    post-processed with ``Reformat``.

    Changes: the ``hdfs dfs -get`` call uses an argument list instead of a
    shell string, so ``name`` is never interpreted by a shell; the
    SparkContext is stopped even if the job fails; the block of dead code
    that was acting as the docstring has been removed.
    """
    # The edge list is expected to be on HDFS already (edgelist/<name>).
    new_g = networkx.read_adjlist(name + "_list.txt")
    # NOTE(review): the original comment called this file "an edge list",
    # but it is read with read_adjlist -- confirm the file format.

    sc = SparkContext(appName="Sorted_removal")
    try:
        dataG = json_graph.node_link_data(new_g)
        stringG = json.dumps(dataG)
        originalG = sc.broadcast(stringG)

        edges = sc.textFile("hdfs://scrapper/user/xiaofeng/edgelist/" + name,
                            192 * 4 * int(divide))
        costs = edges.map(lambda line: line.split(" ")) \
                     .map(lambda edge: edge_to_cost(edge, originalG.value))
        costs.saveAsTextFile("hdfs://scrapper/user/xiaofeng/costs_" + name)
    finally:
        sc.stop()

    subprocess.check_call([
        "hdfs", "dfs", "-get",
        "costs_" + name,
        "/home/xiaofeng/facebook/FacebookProject/costs/",
    ])
    Reformat("/home/xiaofeng/facebook/FacebookProject/costs/costs_" + name + "/",
             name)
def count_triangles(data, master="local[2]"):
    """
    @brief: Count triangles using Spark
    @param data: The data location for the input files
    @param master: The master URL as defined at
    https://spark.apache.org/docs/1.1.0/submitting-applications.html#master-urls

    Keys emitted by ``GetTriad`` are summed with ``add``; keys whose total
    exceeds 1 are kept and the first element of each kept pair is written,
    one per line, to ``triangles.out``.  The function refuses to run when
    that file already exists.

    Fix: the assertion message used a backslash continuation inside the
    string literal, which embedded stray characters in the message.
    """
    #################  NO EDITS HERE ###################
    assert not os.path.exists("triangles.out"), \
        "File: triangles.out already exists"
    sc = SparkContext(master, "Triangle Count")
    start = time()
    ###############  END NO EDITS HERE  ################

    people = sc.textFile(data)
    triad = people.flatMap(GetTriad) \
                  .reduceByKey(add) \
                  .filter(lambda x: x[1] > 1)
    output = triad.collect()

    #################  NO EDITS HERE  ###################
    print("\n\n*****************************************")
    print("\nTotal algorithm time: %.4f sec \n" % (time() - start))
    print("*****************************************\n\n")
    ###############  END NO EDITS HERE ################

    # Write the result to file serially.
    with open("triangles.out", "wb") as f:
        for friends in output:
            f.write(friends[0] + "\n")
def bmRun(self):
    """Connect DB from Spark and Run/Profile Query.

    Loads the two JDBC tables named in ``self.tbName``, registers them
    (and the helper SQL functions MONTH, YEAR, DAY) with Spark SQL, runs
    ``self.sqlStm`` while timing it, appends the timing and memory figures
    to the benchmark output file, and finally plots a demo query.

    Changes: the benchmark file is closed by ``with`` (it was never
    closed) and the SparkContext is stopped in ``finally``.

    NOTE(review): the extent of the two ``with Timer()`` blocks was
    reconstructed on the assumption that ``tm.secs`` is only available
    after the block exits -- confirm against the Timer class.
    """
    # Create output file for results.
    print("Create benchmark output file for recoring...")
    with open("/Users/mira67/Downloads/benchmark_output.txt", "w") as file_out:
        print("start query evaluation, load tables from DB and register tables in Spark...")

        sc = None
        try:
            # Load data with Spark.
            with Timer() as tm:
                sc = SparkContext("local", "penguin")
                sqlContext = SQLContext(sc)

                # Which tables must be in memory depends on the queries.
                df1 = sqlContext.read.jdbc(url=self.url, table=self.tbName[0],
                                           lowerBound=0, upperBound=350,
                                           numPartitions=200)
                df1.registerTempTable(self.tbName[0])
                df2 = sqlContext.read.jdbc(url=self.url, table=self.tbName[1],
                                           lowerBound=0, upperBound=350,
                                           numPartitions=200)
                df2.registerTempTable(self.tbName[1])

                # Register helper functions for SQL: slices of a
                # 'YYYY-MM-DD...' style string.
                sqlContext.registerFunction("MONTH", lambda x: x[5:7], StringType())
                sqlContext.registerFunction("YEAR", lambda x: x[0:4], StringType())
                sqlContext.registerFunction("DAY", lambda x: x[8:10], StringType())

                rdf1 = sqlContext.sql("SELECT * FROM " + self.tbName[0])
                rdf2 = sqlContext.sql("SELECT * FROM " + self.tbName[1])
                sqlContext.registerDataFrameAsTable(rdf1, self.mtb[0])
                sqlContext.registerDataFrameAsTable(rdf2, self.mtb[1])

                mem_use = self.memory_usage_psutil()
                print("memory_use_load %s" % mem_use)
            print("=> elasped load data: %s ms" % (tm.secs * 1000))

            # Query with Spark.
            with Timer() as tm:
                rdf = sqlContext.sql(self.sqlStm)  # tables must be registered first
                print("Data schema from query:")
                rdf.printSchema()
                mem_use = self.memory_usage_psutil()
                print("memory_use_load %s" % mem_use)
            print("=> elasped: %s ms" % (tm.secs * 1000))
            file_out.write("Query Time %s Memory %s\n"
                           % (str(tm.secs * 1000), str(mem_use)))

            # Example query.
            day1 = sqlContext.sql("SELECT * FROM ssmi t1, map t2 WHERE t1.DATE BETWEEN '1990-01-01' AND '1990-01-01' AND t1.LOCID = t2.ID ORDER BY t1.LOCID")

            # Call plot.
            demoplt = qplt.queryPlot()
            demoplt.qMapDemo(day1)
        finally:
            # Stop the SparkContext, if it was created.
            if sc is not None:
                sc.stop()
def main(arglist):
    """Validate sorted output files with Spark and log the error count.

    ``arglist[0]`` is the path read with ``wholeTextFiles`` and
    ``arglist[1]`` the minimum number of partitions.  Start time, end time
    and the summed result of ``separateBlocks`` are appended to
    ``log_file_v.txt``.
    """
    log_path = "log_file_v.txt"

    with open(log_path, "a") as log:
        log.write("Start time of validation...... %s\n" % datetime.datetime.now())
    print("Start time of validation...... %s" % datetime.datetime.now())

    # mapreduce params
    output = arglist[0]
    minPartitions = int(arglist[1])

    # initialize
    sc = SparkContext(appName="PythonValidate")

    files = sc.wholeTextFiles(output, minPartitions=minPartitions)
    print('partitions', files.getNumPartitions())

    error_count = files.mapPartitions(separateBlocks).sum()
    sc.stop()

    print("End time of validation...... %s" % datetime.datetime.now())

    # The file is closed when the block exits.
    with open(log_path, "a") as log:
        log.write("End time of validation...... %s\n" % datetime.datetime.now())
        log.write("Error count of sorted file...... %s" % error_count)
def main(argv=None):
    '''Count tokens in a sequence file; this is called if run from command line.

    Command-line options: ``-i/--input`` (required) is the sequence file,
    ``-o/--output`` an optional UTF-8 file for ``"<token> <count>"`` lines,
    and ``-p/--printToLog`` prints the counts to standard output.  The
    module globals ``goodJsonRecords`` and ``badJsonRecords`` are reset to
    fresh Spark accumulators.

    NOTE(review): ``argv`` is accepted but not forwarded to
    ``parse_args()``, which therefore always reads ``sys.argv``.  Left
    unchanged because callers may pass the full ``sys.argv`` -- confirm.
    '''
    global goodJsonRecords, badJsonRecords

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help="Seq input file on cluster.",
                        required=True)
    parser.add_argument('-o', '--output', help="UTF-8 output file on cluster.",
                        required=False)
    parser.add_argument('-p', '--printToLog', help="Print results to log.",
                        required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)

    data = sc.sequenceFile(args.input,
                           "org.apache.hadoop.io.Text",
                           "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()
    ordered_tags = sorted(tagCounts)

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output is not None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in ordered_tags:
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print("========================================")
    print("goodJsonRecords = %d" % goodJsonRecords.value)
    print("badJsonRecords = %d" % badJsonRecords.value)
    if args.printToLog:
        for k in ordered_tags:
            print("%s %s" % (json.dumps(k), tagCounts[k]))
    print("========================================")
def stackexchange_xml_spark_job():
    """Split the StackExchange XML dump into question and answer JSON files.

    The XML file on HDFS is mapped line by line with
    ``stackexchange_xml_mapper``.  Records carrying a ``'posttypeid'`` of
    ``'1'`` (questions) or ``'2'`` (answers) are converted with ``jsoner``
    and saved to their respective HDFS folders.  Server, folder and file
    names come from ``bluebook_conf``.
    """
    hdfs_root = "hdfs://" + bluebook_conf.HDFS_FQDN + "/"

    xml_file_address = (hdfs_root
                        + bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME
                        + bluebook_conf.STACKEXCHANGE_XML_FILE_NAME)
    json_ques_folder_address = (hdfs_root
                                + bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME)
    json_ans_folder_address = (hdfs_root
                               + bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME)

    conf = SparkConf()
    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)

    xml_lines = spark_context.textFile(xml_file_address)

    def _posts_of_type(post_type_id):
        # Parsed posts with the given 'posttypeid', rendered as JSON.
        return (xml_lines.map(stackexchange_xml_mapper)
                .filter(lambda dic: 'posttypeid' in dic)
                .filter(lambda dic: dic['posttypeid'] == post_type_id)
                .map(lambda d: jsoner(d)))

    # Ques and Ans files are stored separately depending on 'posttypeid':
    #   Ques -> posttypeid == 1
    #   Ans  -> posttypeid == 2
    ques = _posts_of_type('1')
    ans = _posts_of_type('2')

    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
def __init__(self, file_path, train_file, test_file, real_file=None):
    """Load the training, test and (optional) real-purchase data as RDDs.

    file_path: the folder where data files reside
    train_file: (user, item, rating) quote records
    test_file: (user, item) records, preferences to be predicted
    real_file: (user, option, value) real purchase records, can be None if
        it doesn't exist

    For this specific project:
    item here is the combination of options with their values,
        e.g. item 10 denotes option A with choice 0; item 21 denotes
        option B with choice 1
    rating is the number of quotes for a certain item by a user

    Sets ``self.train_data``, ``self.test_data`` and, only when
    ``real_file`` is given, ``self.real_data`` (keyed by the first two
    columns).
    """
    self.file_path = file_path

    config = (
        SparkConf()
        .setMaster("local")
        .setAppName("Kaggle")
        .set("spark.executor.memory", "2g")
        .set("spark.storage.memoryFraction", "1")
    )
    sc = SparkContext(conf=config)

    def _cached_lines(file_name):
        # Raw text lines of one local file, marked for caching.
        return sc.textFile("file:" + self.file_path + file_name).cache()

    def _as_floats(line):
        return [float(x) for x in line.split(',')]

    self.train_data = _cached_lines(train_file) \
        .map(lambda line: array([float(x) for x in line.split(',')]))
    self.test_data = _cached_lines(test_file).map(_as_floats)

    if real_file:
        self.real_data = _cached_lines(real_file) \
            .map(_as_floats) \
            .map(lambda r: ((r[0], r[1]), r[2]))
def main():
    """Run a slow Spark job in the background and report its progress.

    While the background job has not produced a result, the status
    tracker is polled once a second and the state of every job and stage
    is printed.  The job's result is printed at the end.
    """
    conf = SparkConf().set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(appName="PythonStatusAPIDemo", conf=conf)

    def run():
        # Two delayed map stages around a reduceByKey.
        numbers = sc.parallelize(range(10), 10).map(delayed(2))
        counted = numbers.map(lambda x: (x, 1)) \
                         .reduceByKey(lambda x, y: x + y)
        return counted.map(delayed(2)).collect()

    result = call_in_background(run)
    tracker = sc.statusTracker()

    while result.empty():
        for job_id in tracker.getJobIdsForGroup():
            job = tracker.getJobInfo(job_id)
            print("Job %s status:  %s" % (job_id, job.status))
            for stage_id in job.stageIds:
                stage = tracker.getStageInfo(stage_id)
                if not stage:
                    continue
                print("Stage %d: %d tasks total (%d active, %d complete)" %
                      (stage_id, stage.numTasks, stage.numActiveTasks,
                       stage.numCompletedTasks))
        time.sleep(1)

    print("Job results are: %s" % (result.get(),))
    sc.stop()
def closestCenter(xy, centers):
    """Return the id of the centre nearest to ``xy`` plus a countable point.

    Args:
        xy: a coordinate pair (longitude, latitude).
        centers: a list of ``(id, (longitude, latitude))`` entries.

    Returns:
        ``(center_id, (xy[0], xy[1], 1))``; distances are measured with
        ``haversine`` and the first centre wins a tie.
    """
    best_id = centers[0][0]
    best_dist = haversine(xy, centers[0][1])
    for candidate in centers[1:]:  # lengths should be 5
        dist = haversine(xy, candidate[1])
        if dist < best_dist:
            best_dist, best_id = dist, candidate[0]
    return (best_id, (xy[0], xy[1], 1))


#-------- program --------#
appName = "Kmeans App"
conf = SparkConf().setAppName(appName)
sc = SparkContext(conf=conf)

HDFS = "hdfs://master:9000/"

# Pointer to the file.
rides = sc.textFile(HDFS + "yellow_tripdata_1m.csv")

# Remove incorrect data lines (those containing a 0.0 coordinate).
filter_rides = rides.filter(myfilter)

# Map data to the (column 3, column 4) coordinate pair.
coords = filter_rides.map(
    lambda line: (float(line.split(",")[3]), float(line.split(",")[4])))
# TODO: cache `coords` for extra grade.

# Initial centres: the first five points, as a plain list (not an RDD).
centers = list(enumerate(coords.take(5)))

for i in range(0, MAX_ITER):
    # Emit (id of centre, (x, y, 1)) for every point.
    mapped = coords.map(lambda tup: closestCenter(tup, centers))
    # Sum coordinates and counts per centre id, then turn the sums into
    # averages (only the value is transformed).
    avg = mapped \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2])) \
        .mapValues(lambda v: (v[0] / v[2], v[1] / v[2]))
from skimage import io
import numpy as np
import matplotlib.pyplot as plt
from skimage.exposure import rescale_intensity
from scipy import ndimage as ndi
import math
from skimage.morphology import skeletonize
from django.views.decorators.http import condition
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.ml.linalg import Vectors
from pyspark import SparkConf, SparkContext

# `os.system("export ...")` only sets the variable inside a short-lived
# subshell and has no effect on this process.  Setting it in os.environ makes
# the option visible to the JVM that SparkContext launches below.
os.environ["_JAVA_OPTIONS"] = "-Xms1g -Xmx40g"

conf = SparkConf().set("spark.driver.maxResultSize", "5g")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)


def as_gray(image_filter, image, *args, **kwargs):
    """Apply *image_filter* to the grayscale version of *image*.

    The image is converted with ``rgb2gray`` and the filter is then called
    with the gray image followed by any extra positional/keyword arguments.
    Used as the handler passed to ``adapt_rgb``.
    """
    gray_image = rgb2gray(image)
    return image_filter(gray_image, *args, **kwargs)


@adapt_rgb(as_gray)
def original_gray(image):
    """Return the image unchanged (after the ``as_gray`` adaptation)."""
    return image


@adapt_rgb(as_gray)
def skeleton_gray(image):
    """Return the skeletonized image (after the ``as_gray`` adaptation)."""
    return skeletonize(image)
if not bf[nOffset + k]: bHit = False break nOffset += bits_per_slice if bHit == True: yield t if __name__ == '__main__': sApp = 'spark' nPart = 38*14*4 #sRef = op.join(sHdfsDir, 'hg38.fa.nb.enc.gzip') sRef = op.join(sHdfsDir, 'chr21.fa.nb.enc.gzip') sInput = op.join(sHdfsDir, 'first1M.fa.nb.enc') sSeeds = op.join(sHdfsDir, 'seed.enc') # print default SparkConf sf = SparkConf() print sf.toDebugString() sc = SparkContext(appName=sApp) rdd = sc.textFile(op.join(sHdfsDir,'half.enc'), use_unicode=False) nTotal = rdd.count() sc.stop() print nTotal
"""
Created on Mon Dec 14 16:13:29 2020

@author: prach
"""
import re
from pyspark import SparkConf, SparkContext

# Compiled once; splits on runs of non-word characters (Unicode aware).
_WORD_SPLITTER = re.compile(r'\W+', re.UNICODE)


def normalizeWords(text):
    """Lower-case *text* and split it on runs of non-word characters."""
    return _WORD_SPLITTER.split(text.lower())


conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext(conf=conf)

input = sc.textFile("file:///sparkcourse/book.txt")
rdd = input.flatMap(normalizeWords)

# (word, 1) pairs summed per word -> (word, count)
rdd1 = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

# Flip to (count, word) and sort by count.  The RDD method is `sortByKey`;
# the previous spelling `sortBykey` raised AttributeError.
rdd2 = rdd1.map(lambda x: (x[1], x[0])).sortByKey()

# Already sorted; a second sort by the same key would only repeat the shuffle.
rdd3 = rdd2

results = rdd3.collect()
for result in results:
    count = str(result[0])
class HailContext(object):
    """The main entrypoint for Hail functionality.

    :param sc: spark context, will be auto-generated if None
    :type sc: :class:`.pyspark.SparkContext`

    :param appName: Spark application identifier

    :param master: Spark cluster master

    :param local: local resources to use

    :param log: log path

    :param quiet: suppress log messages

    :param append: write to end of log file instead of overwriting

    :param parquet_compression: level of on-disk annotation compression

    :param min_block_size: minimum file split size in MB

    :param branching_factor: branching factor for tree aggregation

    :param tmp_dir: temporary directory for file merging

    :ivar sc: Spark context
    :vartype sc: :class:`.pyspark.SparkContext`
    """

    # Text types accepted wherever a single path or key name may be given.
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # unicode paths are no longer mistaken for a list of characters.
    _STRING_TYPES = (str, type(u''))

    def __init__(self, sc=None, appName="Hail", master=None, local='local[*]',
                 log='hail.log', quiet=False, append=False,
                 parquet_compression='uncompressed', min_block_size=1,
                 branching_factor=50, tmp_dir='/tmp'):
        from pyspark import SparkContext
        SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        driver = scala_package_object(self._hail.driver)

        if not sc:
            # No context supplied: let the JVM side configure and create one,
            # then wrap it in a Python SparkContext.
            self._jsc = driver.configureAndCreateSparkContext(
                appName, joption(master), local, parquet_compression, min_block_size)
            self.sc = SparkContext(gateway=self._gateway,
                                   jsc=self._jvm.JavaSparkContext(self._jsc))
        else:
            self.sc = sc
            # sc._jsc is a JavaSparkContext
            self._jsc = sc._jsc.sc()

        driver.configureHail(branching_factor, tmp_dir)
        driver.configureLogging(log, quiet, append)

        self._jsql_context = driver.createSQLContext(self._jsc)
        self._sql_context = SQLContext(self.sc, self._jsql_context)

    @classmethod
    def _as_list(cls, value):
        """Return ``[value]`` for a single string, otherwise ``list(value)``."""
        if isinstance(value, cls._STRING_TYPES):
            return [value]
        return list(value)

    def _jstate(self, jvds):
        """Build the JVM driver ``State`` wrapping *jvds* (which may be None)."""
        return self._hail.driver.State(
            self._jsc, self._jsql_context, jvds,
            scala_object(self._jvm.scala.collection.immutable, 'Map').empty())

    def _run_command(self, vds, pargs):
        """Look up and run a top-level Hail command.

        :param vds: dataset the command operates on, or None.

        :param pargs: command name followed by its arguments; every element
            must be a string because it is copied into a Java String array.

        :return: the dataset held by the resulting state.
        :rtype: :class:`.VariantDataset`
        """
        jargs = jarray(self._jvm.java.lang.String, pargs)
        t = self._hail.driver.ToplevelCommands.lookup(jargs)
        cmd = t._1()
        cmd_args = t._2()
        jstate = self._jstate(vds._jvds if vds is not None else None)

        try:
            result = cmd.run(jstate, cmd_args)
        except Py4JJavaError as e:
            raise_py4j_exception(e)

        return VariantDataset(self, result.vds())

    def grep(self, regex, path, max_count=100):
        r"""Grep big files, like, really fast.

        **Examples**

        Print all lines containing the string ``hello`` in *file.txt*:

        >>> hc.grep('hello','data/file.txt')

        Print all lines containing digits in *file1.txt* and *file2.txt*:

        >>> hc.grep('\d', ['data/file1.txt','data/file2.txt'])

        **Background**

        :py:meth:`~hail.HailContext.grep` mimics the basic functionality of
        Unix ``grep`` in parallel, printing results to screen. This command is
        provided as a convenience to those in the statistical genetics
        community who often search enormous text files like VCFs. Find
        background on regular expressions at `RegExr <http://regexr.com/>`_.

        :param str regex: The regular expression to match.

        :param path: The files to search.
        :type path: str or list of str

        :param int max_count: The maximum number of matches to return.
        """
        pargs = ["grep", regex]
        pargs.extend(self._as_list(path))
        pargs.append('--max-count')
        pargs.append(str(max_count))

        self._run_command(None, pargs)

    def import_annotations_table(self, path, variant_expr, code=None, npartitions=None, config=None):
        """Import variants and variant annotations from a delimited text file
        (text table) as a sites-only VariantDataset.

        :param path: The files to import.
        :type path: str or list of str

        :param str variant_expr: Expression to construct a variant
            from a row of the text table.  Must have type Variant.

        :param code: Expression to build the variant annotations.
        :type code: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.VariantDataset`
        """
        pargs = ['importannotations', 'table']
        pargs.extend(self._as_list(path))

        pargs.append('--variant-expr')
        pargs.append(variant_expr)

        if code:
            pargs.append('--code')
            pargs.append(code)

        if npartitions:
            pargs.append('--npartition')
            # Arguments end up in a Java String array, so convert explicitly
            # (as the other import_* methods do).
            pargs.append(str(npartitions))

        if not config:
            config = TextTableConfig()

        pargs.extend(config._as_pargs())

        return self._run_command(None, pargs)

    def import_bgen(self, path, tolerance=0.2, sample_file=None, npartitions=None):
        """Import .bgen files as VariantDataset

        :param path: .bgen files to import.
        :type path: str or list of str

        :param float tolerance: If the sum of the dosages for a
            genotype differ from 1.0 by more than the tolerance, set
            the genotype to missing.

        :param sample_file: The sample file.
        :type sample_file: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :return: A dataset imported from the bgen file.
        :rtype: :class:`.VariantDataset`
        """
        pargs = ["importbgen"]
        pargs.extend(self._as_list(path))

        if sample_file:
            pargs.append('--samplefile')
            pargs.append(sample_file)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(str(npartitions))

        pargs.append('--tolerance')
        pargs.append(str(tolerance))

        return self._run_command(None, pargs)

    def import_gen(self, path, sample_file=None, tolerance=0.02, npartitions=None, chromosome=None):
        """Import .gen files as VariantDataset.

        **Examples**

        Read a .gen file and a .sample file and write to a .vds file::

        >>> (hc.import_gen('data/example.gen', sample_file='data/example.sample')
        >>>    .write('data/example.vds'))

        Load multiple files at the same time with `Hadoop glob patterns <../reference.html#hadoopglob>`_::

        >>> (hc.import_gen('data/example.chr*.gen', sample_file='data/example.sample')
        >>>    .write('data/example.vds'))

        **Notes**

        For more information on the .gen file format, see `here <http://www.stats.ox.ac.uk/%7Emarchini/software/gwas/file_format.html#mozTocId40300>`_.

        To ensure that the .gen file(s) and .sample file are correctly prepared for import:

        - If there are only 5 columns before the start of the dosage data (chromosome field is missing), you must specify the chromosome using the ``chromosome`` parameter

        - No duplicate sample IDs are allowed

        The first column in the .sample file is used as the sample ID ``s.id``.

        .. _dosagefilters:

        **Dosage representation**

        Since dosages are understood as genotype probabilities, :py:meth:`~hail.HailContext.import_gen` automatically sets to missing those genotypes for which the sum of the dosages is a distance greater than the ``tolerance`` parameter from 1.0.  The default tolerance is 0.02, so a genotype with sum .97 or 1.03 is filtered out, whereas a genotype with sum .98 or 1.02 remains.

        :py:meth:`~hail.HailContext.import_gen` normalizes all dosages to sum to 1.0. Therefore, an input dosage of (0.98, 0.0, 0.0) will be stored as (1.0, 0.0, 0.0) in Hail.

        Even when the dosages sum to 1.0, Hail may store slightly different values than the original GEN file (maximum observed difference is 3E-4).

        **Annotations**

        :py:meth:`~hail.HailContext.import_gen` adds the following variant annotations:

         - **va.varid** (*String*) -- 2nd column of .gen file if chromosome present, otherwise 1st column.

         - **va.rsid** (*String*) -- 3rd column of .gen file if chromosome present, otherwise 2nd column.

        :param path: .gen files to import.
        :type path: str or list of str

        :param sample_file: The sample file.
        :type sample_file: str or None

        :param float tolerance: If the sum of the dosages for a genotype differ from 1.0 by more than the tolerance, set the genotype to missing.

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param chromosome: Chromosome if not listed in the .gen file.
        :type chromosome: str or None

        :rtype: :class:`.VariantDataset`
        :return: A dataset imported from a .gen and .sample file.
        """
        pargs = ["importgen"]
        pargs.extend(self._as_list(path))

        if sample_file:
            pargs.append('--samplefile')
            pargs.append(sample_file)

        if chromosome:
            pargs.append('--chromosome')
            pargs.append(chromosome)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(str(npartitions))

        # `is not None` rather than truthiness: an explicit tolerance of 0.0
        # must be forwarded instead of silently falling back to the default.
        if tolerance is not None:
            pargs.append('--tolerance')
            pargs.append(str(tolerance))

        return self._run_command(None, pargs)

    def import_keytable(self, path, key_names, npartitions=None, config=None):
        """Import delimited text file (text table) as KeyTable.

        :param path: files to import.
        :type path: str or list of str

        :param key_names: The name(s) of fields to be considered keys
        :type key_names: str or list of str

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.KeyTable`
        """
        path_args = self._as_list(path)

        # Multiple key names are handed to the JVM as one comma-separated string.
        if not isinstance(key_names, self._STRING_TYPES):
            key_names = ','.join(key_names)

        if not npartitions:
            npartitions = self.sc.defaultMinPartitions

        if not config:
            config = TextTableConfig()

        return KeyTable(self, self._hail.keytable.KeyTable.importTextTable(
            self._jsc, jarray(self._jvm.java.lang.String, path_args),
            key_names, npartitions, config._to_java()))

    def import_plink(self, bed, bim, fam, npartitions=None, delimiter='\\\\s+', missing='NA', quantpheno=False):
        """Import PLINK binary file (BED, BIM, FAM) as VariantDataset

        **Examples**

        Import data from a PLINK binary file:

        >>> vds = (hc.import_plink(bed="data/test.bed",
        >>>                        bim="data/test.bim",
        >>>                        fam="data/test.fam"))

        **Implementation Details**

        Only binary SNP-major mode files can be read into Hail. To convert your file from individual-major mode to SNP-major mode, use PLINK to read in your fileset and use the ``--make-bed`` option.

        The centiMorgan position is not currently used in Hail (Column 3 in BIM file).

        The ID (``s.id``) used by Hail is the individual ID (column 2 in FAM file).

        .. warning::

            No duplicate individual IDs are allowed.

        Chromosome names (Column 1) are automatically converted in the following cases:

          - 23 => "X"
          - 24 => "Y"
          - 25 => "X"
          - 26 => "MT"

        **Annotations**

        :py:meth:`~hail.HailContext.import_plink` adds the following annotations:

         - **va.rsid** (*String*) -- Column 2 in the BIM file.
         - **sa.famID** (*String*) -- Column 1 in the FAM file. Set to missing if ID equals "0".
         - **sa.patID** (*String*) -- Column 3 in the FAM file. Set to missing if ID equals "0".
         - **sa.matID** (*String*) -- Column 4 in the FAM file. Set to missing if ID equals "0".
         - **sa.isFemale** (*String*) -- Column 5 in the FAM file. Set to missing if value equals "-9", "0", or "N/A".
           Set to true if value equals "2". Set to false if value equals "1".
         - **sa.isCase** (*String*) -- Column 6 in the FAM file. Only present if ``quantpheno`` equals False.
           Set to missing if value equals "-9", "0", "N/A", or the value specified by ``missing``.
           Set to true if value equals "2". Set to false if value equals "1".
         - **sa.qPheno** (*String*) -- Column 6 in the FAM file. Only present if ``quantpheno`` equals True.
           Set to missing if value equals ``missing``.

        :param str bed: PLINK BED file.

        :param str bim: PLINK BIM file.

        :param str fam: PLINK FAM file.

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param str missing: The string used to denote missing values **only** for the phenotype field. This is in addition to "-9", "0", and "N/A" for case-control phenotypes.

        :param str delimiter: FAM file field delimiter regex.

        :param bool quantpheno: If True, FAM phenotype is interpreted as quantitative.

        :return: A dataset imported from a PLINK binary file.
        :rtype: :class:`.VariantDataset`
        """
        pargs = ["importplink"]

        pargs.append('--bed')
        pargs.append(bed)

        pargs.append('--bim')
        pargs.append(bim)

        pargs.append('--fam')
        pargs.append(fam)

        if npartitions:
            pargs.append('--npartition')
            # Arguments end up in a Java String array, so convert explicitly.
            pargs.append(str(npartitions))

        if quantpheno:
            pargs.append('--quantpheno')

        pargs.append('--missing')
        pargs.append(missing)

        pargs.append('--delimiter')
        pargs.append(delimiter)

        return self._run_command(None, pargs)

    def read(self, path, sites_only=False):
        """Read .vds files as VariantDataset

        When loading multiple .vds files, they must have the same
        sample IDs, split status and variant metadata.

        :param path: .vds files to read.
        :type path: str or list of str

        :param bool sites_only: If True, create sites-only
          VariantDataset.  Don't load sample ids, sample annotations
          or genotypes.

        :return: A dataset read from disk
        :rtype: :class:`.VariantDataset`
        """
        pargs = ["read"]
        pargs.extend(self._as_list(path))

        if sites_only:
            pargs.append("--skip-genotypes")

        return self._run_command(None, pargs)

    def write_partitioning(self, path):
        """Write partitioning.json.gz file for legacy VDS file.

        :param str path: path to VDS file.
        """
        self._hail.variant.VariantSampleMatrix.writePartitioning(self._jsql_context, path)

    def import_vcf(self, path, force=False, force_bgz=False, header_file=None, npartitions=None,
                   sites_only=False, store_gq=False, pp_as_pl=False, skip_bad_ad=False):
        """Import .vcf files as VariantDataset

        :param path: .vcf files to read.
        :type path: str or list of str

        :param bool force: If True, load .gz files serially.

        :param bool force_bgz: If True, load .gz files as blocked gzip files (BGZF)

        :param header_file: File to load VCF header from.  If not specified, the first file in path is used.
        :type header_file: str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param bool sites_only: If True, create sites-only
            VariantDataset.  Don't load sample ids, sample annotations
            or genotypes.

        :param bool store_gq: If True, store GQ FORMAT field instead of computing from PL.

        :param bool pp_as_pl: If True, store PP FORMAT field as PL.  EXPERIMENTAL.

        :param bool skip_bad_ad: If True, set AD FORMAT field with
            wrong number of elements to missing, rather than setting
            the entire genotype to missing.

        :return: A dataset imported from the VCF file
        :rtype: :class:`.VariantDataset`
        """
        pargs = ["importvcf"]
        pargs.extend(self._as_list(path))

        if force:
            pargs.append('--force')

        if force_bgz:
            pargs.append('--force-bgz')

        if header_file:
            pargs.append('--header-file')
            pargs.append(header_file)

        if npartitions:
            pargs.append('--npartition')
            pargs.append(str(npartitions))

        if pp_as_pl:
            pargs.append('--pp-as-pl')

        if skip_bad_ad:
            pargs.append('--skip-bad-ad')

        if sites_only:
            pargs.append('--skip-genotypes')

        if store_gq:
            pargs.append('--store-gq')

        return self._run_command(None, pargs)

    def index_bgen(self, path):
        """Index .bgen files.  import_bgen cannot run without these indices.

        :param path: .bgen files to index.
        :type path: str or list of str
        """
        pargs = ["indexbgen"]
        pargs.extend(self._as_list(path))

        self._run_command(None, pargs)

    def balding_nichols_model(self, populations, samples, variants, partitions=None,
                              pop_dist=None, fst=None, af_dist=UniformDist(0.1, 0.9),
                              seed=0):
        r"""Generate a VariantDataset using the Balding-Nichols model.

        **Examples**

        To generate a VDS with 3 populations, 100 samples in total, and 1000 variants:

        >>> vds = hc.balding_nichols_model(3, 100, 1000)

        To generate a VDS with 4 populations, 40 samples, 150 variants, 10 partitions, population distribution [0.1, 0.2, 0.3, 0.4], :math:`F_st` values [.02, .06, .04, .12], ancestral allele frequencies drawn from a truncated beta distribution with a = 0.01 and b = 2.0 over the interval [0.05, 1], and random seed 1:

        >>> vds = hc.balding_nichols_model(4, 40, 150, 10,
        >>>                                pop_dist=[0.1, 0.2, 0.3, 0.4],
        >>>                                fst=[.02, .06, .04, .12],
        >>>                                af_dist=hail.stats.TruncatedBetaDist(a=0.01, b=2.0, minVal=0.05, maxVal=1.0),
        >>>                                seed=1)

        **Notes**

        Hail is able to randomly generate a VDS using the Balding-Nichols model.

        - :math:`K` populations are labeled by integers 0, 1, ..., K - 1
        - :math:`N` samples are named by strings 0, 1, ..., N - 1
        - :math:`M` variants are defined as ``1:1:A:C``, ``1:2:A:C``, ..., ``1:M:A:C``
        - The default ancestral frequency distribution :math:`P_0` is uniform on [0.1, 0.9]. Options are UniformDist(minVal, maxVal), BetaDist(a, b), and TruncatedBetaDist(a, b, minVal, maxVal). All three classes are located in hail.stats.
        - The population distribution :math:`\pi` defaults to uniform
        - The :math:`F_{st}` values default to 0.1
        - The number of partitions defaults to one partition per million genotypes (i.e., samples * variants / 10^6) or 8, whichever is larger

        The Balding-Nichols model models genotypes of individuals from a structured population comprising :math:`K` homogeneous subpopulations
        that have each diverged from a single ancestral population (a `star phylogeny`). We take :math:`N` samples and :math:`M` bi-allelic variants in perfect
        linkage equilibrium. The relative sizes of the subpopulations are given by a probability vector :math:`\pi`; the ancestral allele frequencies are
        drawn independently from a frequency spectrum :math:`P_0`; the subpopulations have diverged with possibly different :math:`F_{ST}` parameters :math:`F_k`
        (here and below, lowercase indices run over a range bounded by the corresponding uppercase parameter, e.g. :math:`k = 1, \ldots, K`).
        For each variant, the subpopulation allele frequencies are drawn from a `beta distribution <https://en.wikipedia.org/wiki/Beta_distribution>`_, a useful continuous approximation of
        the effect of genetic drift. We denote the individual subpopulation memberships by :math:`k_n`, the ancestral allele frequences by :math:`p_{0, m}`,
        the subpopulation allele frequencies by :math:`p_{k, m}`, and the genotypes by :math:`g_{n, m}`. The generative model is then given by:

        .. math::
            k_n \,&\sim\, \pi

            p_{0,m}\,&\sim\, P_0

            p_{k,m}\mid p_{0,m}\,&\sim\, \mathrm{Beta}(\mu = p_{0,m},\, \sigma^2 = F_k p_{0,m}(1 - p_{0,m}))

            g_{n,m}\mid k_n, p_{k, m} \,&\sim\, \mathrm{Binomial}(2, p_{k_n, m})

        We have parametrized the beta distribution by its mean and variance; the usual parameters are :math:`a = (1 - p)(1 - F)/F,\; b = p(1-F)/F` with :math:`F = F_k,\; p = p_{0,m}`.

        **Annotations**

        :py:meth:`~hail.HailContext.balding_nichols_model` adds the following global, sample, and variant annotations:

         - **global.nPops** (*Int*) -- Number of populations
         - **global.nSamples** (*Int*) -- Number of samples
         - **global.nVariants** (*Int*) -- Number of variants
         - **global.popDist** (*Array[Double]*) -- Normalized population distribution indexed by population
         - **global.Fst** (*Array[Double]*) -- F_st values indexed by population
         - **global.seed** (*Int*) -- Random seed
         - **global.ancestralAFDist** (*Struct*) -- Information about ancestral allele frequency distribution
         - **sa.pop** (*Int*) -- Population of sample
         - **va.ancestralAF** (*Double*) -- Ancestral allele frequency
         - **va.AF** (*Array[Double]*) -- Allele frequency indexed by population

        :param int populations: Number of populations.

        :param int samples: Number of samples.

        :param int variants: Number of variants.

        :param int partitions: Number of partitions.

        :param pop_dist: Unnormalized population distribution
        :type pop_dist: array of float or None

        :param fst: F_st values
        :type fst: array of float or None

        :param af_dist: Ancestral allele frequency distribution
        :type af_dist: :class:`.UniformDist` or :class:`.BetaDist` or :class:`.TruncatedBetaDist`

        :param int seed: Random seed.

        :rtype: :class:`.VariantDataset`
        :return: A VariantDataset generated by the Balding-Nichols model.
        """
        # None is passed through as an empty option; a Python list must first
        # be converted to a JVM double array.
        if pop_dist is None:
            jvm_pop_dist_opt = joption(pop_dist)
        else:
            jvm_pop_dist_opt = joption(jarray(self._jvm.double, pop_dist))

        if fst is None:
            jvm_fst_opt = joption(fst)
        else:
            jvm_fst_opt = joption(jarray(self._jvm.double, fst))

        return VariantDataset(self, self._hail.stats.BaldingNicholsModel.apply(
            self._jsc, populations, samples, variants,
            jvm_pop_dist_opt, jvm_fst_opt, seed,
            joption(partitions), af_dist._jrep()))

    def dataframe_to_keytable(self, df, keys=None):
        """Convert Spark SQL DataFrame to KeyTable.

        Spark SQL data types are converted to Hail types in the obvious way as follows:

        .. code-block:: text

          BooleanType => Boolean
          IntegerType => Int
          LongType => Long
          FloatType => Float
          DoubleType => Double
          StringType => String
          BinaryType => Binary
          ArrayType => Array
          StructType => Struct

        Unlisted Spark SQL data types are currently unsupported.

        :param keys: List of key column names.  Defaults to no keys.
        :type keys: list of string or None

        :return: The DataFrame as a KeyTable.
        :rtype: :class:`.KeyTable`
        """
        # None instead of a shared mutable default list.
        if keys is None:
            keys = []

        jkeys = jarray(self._jvm.java.lang.String, keys)
        return KeyTable(self, self._hail.keytable.KeyTable.fromDF(df._jdf, jkeys))

    def stop(self):
        """ Shut down the Hail Context """
        self.sc.stop()
        self.sc = None
#Author: Andre Foote
#Date: 27th May 2018
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import Row
import numpy as np
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import sys

if __name__ == "__main__":
    sc = SparkContext(appName="SparkProblem")
    sqlContext = SQLContext(sc)

    # Read DataSample.csv (with header row, inferred column types) into a DataFrame
    csv_reader = sqlContext.read.format('com.databricks.spark.csv')
    csv_reader = csv_reader.options(header='true', inferschema='true')
    df = csv_reader.load('DataSample.csv')

    # Suspicious requests are rows sharing timestamp and geo-coordinates;
    # keep a single row for each such combination.
    df_no_dupes = df.dropDuplicates(['TimeSt', 'Latitude', 'Longitude'])
    # NOTE(review): the header row is already excluded by header='true', so
    # this "-1" looks like an off-by-one -- confirm the intent.
    no_dupes_length = df_no_dupes.count()-1

    # The points of interest (poi) as (latitude, longitude)
    poiList = [
        (53.546167000000004, -113.48573400000001),
        (45.521629, -73.566024),
        (45.22483, -63.232729000000006),
    ]

    # Each of the following functions takes one row of DataSample.csv and
    # returns, as a float, the straight-line distance in coordinate space
    # between that row and one POI: distance1 for POI1, distance2 for POI2,
    # distance3 for POI3.
    def distance1(row):
        poi_lat, poi_lon = poiList[0]
        d_lat = row.Latitude - poi_lat
        d_lon = row.Longitude - poi_lon
        return float(np.sqrt(d_lat**2 + d_lon**2))
import sys
import pyspark
import string
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.window import Window
from pyspark.sql.functions import *

if __name__ == "__main__":
    # Fail early with a readable message instead of an IndexError after
    # Spark has already been started.
    if len(sys.argv) < 5:
        sys.exit("usage: %s <inFile> <supp> <conf> <prot>" % sys.argv[0])

    sc = SparkContext()
    spark = SparkSession \
        .builder \
        .appName("sql") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # SQLContext's first argument is the SparkContext; the session is passed
    # separately.  Previously the SparkSession was given as the context.
    sqlContext = SQLContext(sc, sparkSession=spark)

    # get command-line arguments
    inFile = sys.argv[1]
    supp = sys.argv[2]
    conf = sys.argv[3]
    prot = sys.argv[4]
from pyspark import SparkContext from pyspark.sql import SparkSession, DataFrame from pyspark.sql import SQLContext import os import csv import time start_time = time.time() filename = '/home/fieldtest1/CATT_Intern/TripRecords/TripRecordsFebruary1.csv' sc = SparkContext() spark = SparkSession \ .builder \ .appName("Python Spark SQL basic example") \ .config("spark.some.config.option", "some-value") \ .getOrCreate() def combine_csv(path_to_csv, path_to_save): list_csv = os.listdir(path_to_csv) all_csv = [] for csv_ in list_csv: if csv_[-1] == 'v': path = path_to_csv + '/' + csv_ all_csv.append(open(path, mode='r', newline='')) with open(path_to_save, 'w', newline='', encoding='utf-8') as g: writer = csv.writer(g) for csv_ in all_csv: for row in csv_: writer.writerow(row.rstrip(',,\r\n').split(',')) def extract_crossed_trips(crossed_trips_path, trip_records_name, month):
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext

sc = SparkContext("local", "Simple App")
sqlContext = SQLContext(sc)

surveys = sc.textFile('/user/w205/hospital_compare/surveys_responses.csv')

# Drop rows with unavailable values, split on commas and keep only rows with
# the expected 33 columns.
surveyfiltered = surveys.filter(lambda x: "Not Available" not in x)
surveyparts1 = surveyfiltered.map(lambda l: l.split(','))
# `==` compares the value; `is` compared object identity and only worked by
# accident for small integers.
surveyparts = surveyparts1.filter(lambda l: len(l) == 33)

# (source id, column 0, column 31 as int, column 32 as int); the field names
# for these positions come from schemaString below.
surveys_table = surveyparts.map(
    lambda l: ('hcahps', l[0], int(l[31].strip('"')), int(l[32].strip('"'))))

schemaString = 'sid hid base_score consistency_score'
# One type per field of schemaString, in order.
surveyfieldtypes = [StringType(), StringType(), IntegerType(), IntegerType()]
surveyfields = [
    StructField(field_name, field_type, True)
    for field_name, field_type in zip(schemaString.split(), surveyfieldtypes)
]
surveyschema = StructType(surveyfields)

schemasurveys = sqlContext.createDataFrame(surveys_table, surveyschema)
schemasurveys.registerTempTable('surveys_table')

# save files
return ((str(flightDate), yDest), (flight[11], flight[18], flight[6], flight[10], flight[25], float(flight[38].strip('\"')))) # Origin = 11 # Dest = 18 #Airline = 6 # Flight Number = 10 # CRSDepTime = 25 # ArrDelay = 38 # year = 0 # Month = 2 # DayofMonth = 3 conf = SparkConf() sc = SparkContext(conf=conf) allFiles = [] allFiles = getFileNames() rdd = sc.textFile(','.join(allFiles)) runningFlights = rdd.map(lambda line: line.split(',')) \ .filter(notCancelled) \ .filter(isFloat) flightXY = runningFlights.filter( lambda x: float(x[25].strip('\"')) < 1200).map(extractInfo) flightYZ = runningFlights.filter(lambda x: float(x[25].strip( '\"')) > 1200).map(lambda flight: extractInfo(flight, True))
return failures def kupiecTestStatistic(total, failures, confidenceLevel): failureRatio = float(failures)/ total logNumer = (total - failures) * math.log1p(-confidenceLevel) * failures * math.log(confidenceLevel) logDenom = (total - failures) * math.log1p(-failureRatio) + failures * math.log(failureRatio) return -2 * (logNumer - logDenom) def kupiecTestPValue(stocksReturns, valueAtRisk, confidenceLevel): failures = countFailures(stocksReturns, valueAtRisk) total = len(stocksReturns[0]) testStatistic = kupiecTestStatistic(total, failures, confidenceLevel) return 1 - stats.chi2.cdf(testStatistic, 1) if __name__ == "__main__": sc = SparkContext(appName="VaR") (stocksReturns, factorsReturns) = readStocksAndFactors("/Users/Karim/Downloads/VaR-Data/") plotDistribution(factorsReturns[2]) plotDistribution(factorsReturns[3]) numTrials = 10000000 parallelism = 1000 baseSeed = 1001L trials = computeTrialReturns(stocksReturns, factorsReturns, sc, baseSeed, numTrials,parallelism) trials.cache() valueAtRisk = fivePercentVaR(trials) conditionalValueAtRisk = fivePercentCVaR(trials) print("VaR 5%: " + str(valueAtRisk)) print("CVaR 5%: " + str(conditionalValueAtRisk)) varConfidenceInterval = bootstrappedConfidenceInterval(trials, fivePercentVaR, 100, 0.05) cvarConfidenceInterval = bootstrappedConfidenceInterval(trials, fivePercentCVaR, 100, 0.05) print("VaR confidence interval: " + str(varConfidenceInterval))
import os
import sys
from pyspark import SparkConf, SparkContext, SQLContext
import logger

MODULE_NAME = os.path.basename(sys.modules['__main__'].__file__)
TEST_NAME = os.path.splitext(MODULE_NAME)[0]
LOGGER = logger.get_logger(TEST_NAME)

URLPATH = "s3a://dask-avro-data/application-data/app-100*.avro"

# Start
LOGGER.info('START: Creating spark conf')
Sconf = SparkConf()
sc = SparkContext(appName="my_test", conf=Sconf)
try:
    sqlContext = SQLContext(sparkContext=sc)
    LOGGER.info('FINISH: Finished creating spark conf')

    LOGGER.info('START: Creating spark dataframe')
    df = sqlContext.read.format("com.databricks.spark.avro").load(URLPATH)
    LOGGER.info('FINISH: Spark dataframe created')

    LOGGER.info('START: Starting filtered count')
    cnt = df.filter(df.payload.originationCountryCode == 'CAN').count()
    # Closes the 'START: Starting filtered count' step, so it is labelled
    # FINISH like the other steps (it was logged as START).
    LOGGER.info('FINISH: Count is %s', cnt)
finally:
    # Release the Spark context even when loading or counting fails.
    sc.stop()
# import numpy as np
#import pandas as pd
# d=pd.read_csv("cnt_51000000_7_20_20_1_relu_no_header.csv", delim_whitespace=True)
# d["HC"] = d[0].map(lambda x: hamming_comp(inputs_str,x,2))
# d.to_csv("cnt_51000000_7_20_20_1_relu.csv", sep="\t")

input_dim = 7

# All 2**input_dim binary inputs as zero-padded bit strings.  The width now
# follows input_dim instead of a hard-coded "07", so changing input_dim can
# no longer produce strings of the wrong length.
inputs_str = ["{0:0{1}b}".format(i, input_dim) for i in range(0, 2**input_dim)]
# The same inputs as lists of 0/1 integers.
inputs = [[int(l) for l in bits] for bits in inputs_str]

from pyspark import SparkContext
sc = SparkContext.getOrCreate()
print(sc._jsc.sc().getExecutorMemoryStatus())
print(sc)
print("Ready to go!")

# data = sc.textFile("cnt_51000000_7_20_20_1_relu_no_header.csv")
#data = sc.textFile("test.txt")
data = sc.textFile(filename)
#data.take(15)
# Each line is tab-separated.
data = data.map(lambda x: x.split("\t"))
# data2 = data.map(lambda x: x[0]).take(15)
# rdd = sc.parallelize(data2)
# rdd.map(lambda x: hamming_comp(inputs_str,x,2)).collect()
stopWordsPath = sys.argv[1]
delimitersPath = sys.argv[2]
delimiters = ""

with open(stopWordsPath) as f:
    data = f.read()
    # Raw string: same pattern as before, without relying on escape handling.
    stopwords = re.split(r"\n|\s", data)
#TODO

with open(delimitersPath) as f:
    # The delimiters file is only opened, not read; the pattern is fixed here.
    # Raw string yields exactly the same pattern text as the previous literal
    # but avoids invalid-escape warnings for sequences such as "\." and "\[".
    delimiters = r",|;|\.|\?|!|-|:|@|\[|\]|\(|\)|\{|\}|_|\*|\/|\n| "
#TODO

conf = SparkConf().setMaster("local").setAppName("TitleCount")
conf.set("spark.driver.bindAddress", "127.0.0.1")
sc = SparkContext(conf=conf)

lines = sc.textFile(sys.argv[3], 1)

# From here on everything printed goes to the output file.
outputFile = open(sys.argv[4], "w")
sys.stdout = outputFile

# Prepared once instead of per input line: compiled splitter and a set for
# constant-time stop-word checks.
_delimiterPattern = re.compile(delimiters)
_stopwordSet = frozenset(stopwords)


def titlecountmap(line):
    """Return the words of *line* that should be counted.

    The line is lower-cased and split on the delimiter pattern; empty strings
    and stop words are dropped.  Order and duplicates are preserved.
    """
    return [word
            for word in _delimiterPattern.split(line.lower())
            if word != '' and word not in _stopwordSet]
from pyspark.mllib.classification import LogisticRegressionWithSGD import numpy as np from math import log from math import exp # exp(-t) = e^-t from operator import add import sys # input_file = sys.argv[1] # input_file = "/data/scratch/vw/criteo-display-advertising-dataset/train.txt" # Should be some file on your system input_file = "/tmp/datasets/train.txt" # Should be some file on your system print ("--------------------creating context.. ------------") conf = SparkConf().setAppName('Click Prediction') conf.set("spark.storage.memoryFraction", "0.40") sc = SparkContext(conf=conf) # sc = SparkContext("local[4]", "ClickRatePrediction") ##run on local with 4 cores, named it "ClickRatePrediction" print ("-------------------Finished creating context..------------") print ("--------------------Creating parse text file-----------") # input_file = open(input_file) # dacData = [unicode(x.replace('\n', '').replace('\t', ',')) for x in input_file] dacData = sc.textFile(input_file).map(lambda x: unicode(x.replace('\n', '').replace('\t', ',')) for x in input_file) print ("-------------------Parse text was created!-----------") print ("-------------------Creating RDD!! ------------------------") rawData = (sc .parallelize(dacData, 4) # Create an RDD .zipWithIndex() # Enumerate lines
''' Source of school list: http://schools.nyc.gov/schoolsearch/ ''' from __future__ import print_function import sys import os from operator import add from pyspark import SparkContext from csv import reader sc = SparkContext() sc.addFile("src/helper/assign_basetype.py") from assign_basetype import * school_lines = sc.textFile("/user/ac5901/school_name.csv", 1) schools = school_lines.map(lambda x: x).collect() def check_school(val): basetype = get_basetype(val) if basetype == 'TEXT': if val is None or len(val.strip()) == 0 or val in [ 'Unspecified', 'NA', 'N/A', 'N?A', 'NA/' ]: return 'NULL' elif val in schools: return 'VALID' else:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import to_timestamp
import sys
import time
import math
from operator import add
import logging

# The HTM helper package lives outside the default module search path.
sys.path.append('/app/htm')
import settings
import htmCircle

# Connect to the standalone Spark master and ship the HTM module together
# with its compiled extension to the workers.
conf = SparkConf().setMaster('spark://spark-master:7077').setAppName('spark-basic')
sc = SparkContext(conf=conf)
for _htm_dependency in ("/app/htm/htmCircle.py", "/app/htm/_htmCircle.so"):
    sc.addPyFile(_htm_dependency)

# Two-element coordinate used by the script.
point = [336.14, 0.13]

# JDBC connection settings assembled from the project settings module.
_jdbc_host = 'jdbc:mysql://' + settings.DB_HOST + ':3306'
properties = dict(
    user=settings.DB_USER,
    password=settings.DB_PASS,
    host=_jdbc_host,
    database=settings.DB_NAME,
    driver='com.mysql.jdbc.Driver',
    url=_jdbc_host + '/' + settings.DB_NAME,
)

working_directory = '/app/'
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating


def _to_rating(fields):
    """Build a Rating from the first three fields of a split ratings row."""
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]))


def _key_by_pair(record):
    """Re-key a three-element record as ((first, second), third)."""
    return ((record[0], record[1]), record[2])


def _squared_error(joined):
    """Squared difference between the two values joined under one key."""
    actual, predicted = joined[1][0], joined[1][1]
    return (actual - predicted)**2


sc = SparkContext(master="local", appName="first app")

# Each line of ratings.dat is '::'-separated.
df_rdd = sc.textFile('./data/ml-1m/ratings.dat').map(lambda x: x.split("::"))
ratings = df_rdd.map(_to_rating)

# 80/20 train/test split.
X_train, X_test = ratings.randomSplit([0.8, 0.2])

# Train the ALS model.
rank = 10
numIterations = 10
model = ALS.train(X_train, rank, numIterations)

# Predict a rating for every pair in the test set, then join the
# predictions with the actual ratings on that pair.
testdata = X_test.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(_key_by_pair)
ratesAndPreds = X_test.map(_key_by_pair).join(predictions)

MSE = ratesAndPreds.map(_squared_error).mean()
print("Mean Squared Error = " + str(MSE))
from pyspark import SparkContext
import sys
import time
from itertools import combinations
from collections import defaultdict
import queue as Q  # For BFS Implementation
# import copy

sc = SparkContext("local[*]", "Task1")
sc.setLogLevel("OFF")

start = time.time()
input_file_path = sys.argv[1]

# Each input line is split on a single space; the first two fields are the
# endpoints of an edge. Every line is emitted in both orientations.
RDD_inter = sc.textFile(input_file_path)
result_RDD1 = RDD_inter.map(lambda a: a.split(" "))
result_RDD2 = RDD_inter.map(lambda a: a.split(" ")[::-1])
result_RDD = result_RDD1.union(result_RDD2)
#print(user_pairs_RDD.take(5))

# Distinct endpoint ids across both orientations.
nodes_RDD = result_RDD.flatMap(lambda a: [(a[0]), (a[1])]).distinct()
nodes_list = nodes_RDD.collect()
print(len(nodes_list))

# Edge tuples collected to the driver.
# NOTE(review): the second map repeats the first and has no further effect.
edges_RDD = result_RDD.map(lambda a: (a[0], a[1])).map(lambda a: (a[0], a[1]))
edges_list = edges_RDD.collect()
print(len(edges_list))

# Edges between users based on threshold #
# Gathers the first element of every collected edge whose second element
# equals `user`, by scanning the whole edge list.
def user_edges(user):
    user_edge_list = []
    for i in edges_list:
        if (i[1] == user):
            user_edge_list.append(i[0])
# In[2]: get_ipython().system('pip install pyspark==2.4.5') # In[3]: get_ipython().system('pip install systemml') # In[4]: from pyspark import SparkContext, SparkConf from pyspark.sql import SQLContext, SparkSession from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]")) from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() # In[5]: get_ipython().system('mkdir -p /home/dsxuser/work/systemml') # In[6]: from systemml import MLContext, dml import numpy as np import time ml = MLContext(spark)
# Spark RDD functions 2
import sys, re
from pyspark import SparkConf, SparkContext, RDD

conf = SparkConf().setAppName('RDD Func')
sc = SparkContext(conf=conf)

############################################################
# station-id: station ID number
# name: station name
# lat: latitude
# long: longitude
# dockcount: number of docks which embeds station
# landmark: city
# installation: date station was embedded
# bikes_available: number of available bicycle
# docks_available: number of available docks
# time: time and date, PST
############################################################
stations = sc.textFile('/opt/spark/data/bike-share/stations')
status = sc.textFile('/opt/spark/data/bike-share/status')

# Parse each status line into
#   (field 0, field 1, year, month, day, hour) as ints.
# The fourth CSV field is a quoted timestamp. After stripping the quotes and
# splitting on the space it is [date, time]; the date and time parts must
# themselves be split before the final int() conversion. Without that step
# the last map indexed x[4] on a 4-tuple and called int() on the whole date
# string.
# NOTE(review): assumes the timestamp looks like "YYYY-MM-DD HH:MM:SS"
# (date split on '-', time split on ':') - confirm against the data.
status2 = status.map(lambda x: x.split(',')) \
    .map(lambda x: (x[0], x[1], x[2], x[3].replace('"', ''))) \
    .map(lambda x: (x[0], x[1], x[2], x[3].split(' '))) \
    .map(lambda x: (x[0], x[1], x[2], x[3][0].split('-'), x[3][1].split(':'))) \
    .map(lambda x: (int(x[0]), int(x[1]), int(x[3][0]), int(x[3][1]), int(x[3][2]), int(x[4][0])))
status2.first()

# Keep records from 2015-02 on or after day 22 and project them down to
# (field 0, field 1, hour).
status3 = status2.filter(lambda x: x[2] == 2015 and x[3] == 2 and x[4] >= 22) \
    .map(lambda x: (x[0], x[1], x[5]))
def print(*arg):
    """Append the concatenated string form of all arguments to log.txt.

    This deliberately replaces the built-in ``print`` for the rest of the
    module: the arguments are joined with no separator and written as one
    line. The file is opened with a context manager so the handle is
    released even if the write fails.
    """
    mystring = "".join(str(argument) for argument in arg)
    with open('log.txt', 'a') as f:
        f.write(mystring + "\n")


# Initialize SparkContext
import sys
from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext()

import os
import sys
import re
from pyspark import SparkContext
from pyspark import SparkContext
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

from pyspark.sql import types
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pandas as pd
import numpy as np
# and then run the example # `$ bin/spark-submit examples/src/main/python/streaming/stateful_network_wordcount.py \ # localhost 9999` ### from __future__ import print_function import sys from pyspark import SparkContext from pyspark.streaming import StreamingContext if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: stateful_network_wordcount.py <hostname> <port>", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonStreamingStatefulNetworkWordCount") ssc = StreamingContext(sc, 3) ssc.checkpoint("checkpoint") # RDD with initial state (key, value) pairs # initialStateRDD = sc.parallelize([(u'hello', 1), (u'world', 1)]) def updateFunc(new_values, last_sum): return sum(new_values) + (last_sum or 0) lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2])) running_counts = lines.flatMap(lambda line: line.split(" "))\ .map(lambda word: (word, 1))\ .updateStateByKey(updateFunc) #.updateStateByKey(updateFunc,initialStateRDD)
from __future__ import print_function

import sys
from datetime import timedelta, datetime, tzinfo
import numpy as np
from pyspark import SparkContext
import sparkmpi

sc = SparkContext(appName="SparkMPI")
print("\nHello SparkMPI\n")

# Start the sparkmpi address server for `partitions` participants and print
# the address it reports.
partitions = 2
srv = sparkmpi.AddressServer.createServer()
addr = srv.start(partitions)
print("address: ", addr)


# Worker routine. `args` is indexed with the keys 'rank' and 'addr'.
# NOTE(review): the second argument to createCommunicator (2) presumably
# mirrors `partitions` above - confirm.
def f(args):
    comm = sparkmpi.Communicator.createCommunicator(args['rank'], 2)
    imageSize = 2 * 1000000
    # presumably a size in bytes: imageSize float32 values at 4 bytes each - TODO confirm
    comm.allocate(imageSize * 4)
    comm.connect(args['addr'])
    a = np.zeros(imageSize, dtype=np.float32)
# -*- coding: utf-8 -*-
import time
import csv
import sys
import json
from pyspark import SparkConf, SparkContext

#%%
conf = SparkConf()
conf = conf.setAppName("Task-1-ground-truth-generator")
conf = conf.set("spark.executor.memory", "4g")
sc = SparkContext(conf=conf)

#%%
input_file = 'data/test_review.json'
reqd_jaccard_similarity = 0.05
output_file = 'data/true_similarity_pairs_small.csv'

#%%
def _business_user_pair(row):
    """Reduce one decoded review record to (business_id, user_id)."""
    return (row["business_id"], row["user_id"])


# One JSON document per input line.
input_data = sc.textFile(input_file)
input_rdd = input_data.map(json.loads).map(_business_user_pair).cache()

# Collect the set of users per business, gathered into a single partition.
input_rdd_grouped = input_rdd.groupByKey().map(lambda pair: (pair[0], set(pair[1])))
input_rdd_grouped = input_rdd_grouped.repartition(1)
num_partitions = input_rdd_grouped.getNumPartitions()

# Bring the grouped data to the driver in sorted order.
business_bucket = sorted(input_rdd_grouped.collect())

#%%
from pyspark import SparkConf, SparkContext
import math


def format0(rec):
    """Split a pipe-delimited record into its list of fields."""
    return rec.split("|")


def format1(rec):
    """Split a tab-delimited record into its list of fields."""
    return rec.split("\t")


con = SparkConf()
sc = SparkContext(conf=con)

movie = sc.textFile("file:///home/cloudera/imdb/Movies.item", use_unicode=True)
rating = sc.textFile("file:///home/cloudera/imdb/Movie-Ratings-Done.data")

movieFormatted = movie.map(format0)
ratingFormatted = rating.map(format1)

# collect() returns every record in one job; the previous take(count())
# produced the same list but ran a separate count job over the data first.
dataM = movieFormatted.collect()
dataR = ratingFormatted.collect()

#golden = movieFormatted.filter(findMovie)
#match = rdd.union
#out = dataM.collect()

# First field of every movie record whose second field equals the title.
movietitle = "GoldenEye (1995)"
movieID = movieFormatted.filter(lambda n: n[1] == movietitle).map(
    lambda x: x[0]).collect()
# Parse the pipe-delimited item file into a Python dictionary
# mapping movie IDs (int) to movie names
def loadMovieNames():
    with open("ml-100k/u.ITEM") as handle:
        split_rows = (row.split('|') for row in handle)
        return {int(parts[0]): parts[1] for parts in split_rows}


# Spark boilerplate
conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf=conf)

# Broadcast the ID -> name dictionary
nameDict = sc.broadcast(loadMovieNames())


def _movie_hit(line):
    # The second whitespace-separated field is the movie ID; emit (id, 1).
    return (int(line.split()[1]), 1)


def _swap(pair):
    # (id, count) -> (count, id)
    return (pair[1], pair[0])


# Load the data, key every line by movie ID and count occurrences per movie
lines = sc.textFile("file:///SparkCourse/ml-100k/u.data")
movies = lines.map(_movie_hit)
movieCounts = movies.reduceByKey(lambda left, right: left + right)

# Flip to (count, id) so that sorting by key orders by count
flipped = movieCounts.map(_swap)
sortedMovies = flipped.sortByKey()

# use the broadcast object nameDict to transform each line to (name, count)
#sortedMoviesWithNames = sortedMovies.map(lambda (count, movie) : (nameDict.value[movie], count))
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession

# Configure the application against the standalone master.
conf = SparkConf()
conf.setMaster("spark://Masteru:7077")
conf.setAppName("My App")

sc = SparkContext(conf=conf)
hivectx = HiveContext(sc)

# Read the CSV from HDFS as plain text lines and print all of them.
a = sc.textFile('hdfs://Masteru:9000/RLCPP.csv')
print(a.collect())
from pyspark import SparkContext, SparkConf, SQLContext
import logging, sys
import numpy as np

# spark-submit --packages com.databricks:spark-csv_2.10:1.4.0 --py-files master/hadoop/stemmer.py,master/hadoop/filter.py --master yarn --deploy-mode cluster master/hadoop/distances.py

# Log to stdout with timestamp and level.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

# Driver / executor resource settings for the job.
conf = SparkConf()\
    .set("spark.driver.maxResultSize", "0")\
    .set("spark.driver.memory", "12g")\
    .set("spark.executor.memory", "12g")\
    .set("spark.executor.instances", "400")
sc = SparkContext(appName='distances', conf=conf)


# Loads the text under a fixed path, tokenizes each line and loads a saved
# Word2Vec model.
# NOTE(review): `path` is not used in the visible part of this function.
def write_data(path):
    # `filter` here is the project module shipped via --py-files (see the
    # spark-submit line above); inside this function the name hides the
    # built-in filter().
    import filter
    from pyspark.mllib.feature import Word2Vec, Word2VecModel

    # load data
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    # Run each line through the project filter, then split it on single spaces.
    data = text_file.map(lambda line: filter.filter(line).split(" "))

    # load model
    word2vec = Word2Vec()
    model = Word2VecModel.load(sc, '/user/rmusters/2015model99')
def setUp(self):
    """Create the SparkContext for the test on a local[*] master."""
    spark_conf = SparkConf()
    spark_conf.setMaster("local[*]")
    spark_conf.setAppName('read_sequence_file')
    self.sc = SparkContext(conf=spark_conf)