def main(): sc = SparkContext(appName="MyApp") sc.setLogLevel('ERROR') # Parse data train_labels, train_data = load_data('train.csv') dummy_labels, test_data = load_data('test.csv', use_labels=False) # Map each data point's label to its features train_set = reformatData(train_data, train_labels) test_set = reformatData(test_data, dummy_labels) # Parallelize the data parallelized_train_set = sc.parallelize(train_set) parallelized_test_set = sc.parallelize(test_set) # Split the data trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42) # Train the models decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2) # Test the model testDecisionTree(decisionTreeModel, parallelized_test_set)
def longest_common_substring(strands): pass # create the Spark context conf = SparkConf().setAppName("longest_common_substring") sc = SparkContext(conf=conf) # create an accumulator for key-value pairs, where each key is a substring, and each value is the set of strings where the substring can be found class ArrayAccumulatorParam(AccumulatorParam): def zero(self, initialValue): return initialValue def addInPlace(self, v1, v2): if type(v2) is list: v1.extend(v2) elif type(v2) is tuple: v1.append(v2) return v1 acc = sc.accumulator([], ArrayAccumulatorParam()) def generate_substrings(data_element): k, v = data_element i = 0 while i < len(v): j = i + 1 while j < len(v): acc.add((v[i:j],k)) j += 1 i += 1 sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings) all_substrings = sc.parallelize(acc.value) return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
def main(): sc = SparkContext(appName="MyApp") sc.setLogLevel('ERROR') # Parse data train_labels, train_data = load_data('train.csv') dummy_labels, test_data = load_data('test.csv', use_labels=False) # Truncate the last 2 features of the data for dataPoint in train_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) for dataPoint in test_data: len = np.size(dataPoint) dataPoint = np.delete(dataPoint, [len - 2, len - 1]) # Map each data point's label to its features train_set = reformatData(train_data, train_labels) test_set = reformatData(test_data, dummy_labels) # Parallelize the data parallelized_train_set = sc.parallelize(train_set) parallelized_test_set = sc.parallelize(test_set) # Split the data trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42) # Train the models randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={}, numTrees=750, seed=42, maxDepth=30, maxBins=32) # Test the model testRandomForest(randomForestModel, parallelized_test_set)
def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon
    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")
    if persisted_tiles == True:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")
        tileintervalslist = tileintervalsf.read().split("\n")
        #print "tileintervalslist=",tileintervalslist
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        #spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print "factors_accum.value = ", factors_accum.value
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
def SparkBroadcastAccumulator(n):
    global broadcast_var
    global accumulator_var
    spcon = SparkContext("local[2]", "SparkBroadcastAccumulator")
    broadcast_var = spcon.broadcast("broadcast_message")
    accumulator_var = spcon.accumulator(0)
    spcon.parallelize(xrange(1, n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
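# The snippet above relies on a broadcast_accumulator_receiver callback defined elsewhere.
# Below is a minimal, self-contained sketch of the same broadcast-plus-accumulator pattern;
# the names (factor, total, consume) are hypothetical and only illustrate the idea of
# shipping a read-only value to executors while folding results back into the driver.
from pyspark import SparkContext

sc = SparkContext("local[2]", "BroadcastAccumulatorSketch")
factor = sc.broadcast(10)      # read-only on executors
total = sc.accumulator(0)      # write-only on executors, readable on the driver

def consume(x):
    total.add(x * factor.value)  # hypothetical per-element work

sc.parallelize(range(1, 5)).foreach(consume)
print(total.value)               # 100 = (1 + 2 + 3 + 4) * 10
sc.stop()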
class LookAlikeTest(unittest.TestCase): def setUp(self): conf = SparkConf().setAppName("Tests").setMaster("local") self.sc = SparkContext(conf=conf) def tearDown(self): self.sc.stop() def test_ratings_calculation(self): data = [("u1", 123), ("u1", 123), ("u1", 132), ("u2", 123), ("u2", 111), ("u2", 111), ("u2", 111), ("u2", 111), ("u3", 123), ("u3", 123), ("u3", 125), ("u3", 125), ("u3", 111)] input_data = self.sc.parallelize(data) ratings = calculate_ratings(input_data).collectAsMap() self.assertEqual(ratings["u1"][123], 1.0) self.assertEqual(ratings["u1"][132], 0.5) self.assertEqual(ratings["u2"][111], 1.0) self.assertEqual(ratings["u2"][123], 0.25) self.assertEqual(ratings["u3"][123], 1.0) self.assertEqual(ratings["u3"][125], 1.0) self.assertEqual(ratings["u3"][111], 0.5) def test_correlations_calculation(self): ratings = [("u1", {1: 0.5, 2: 1.0, 3: 0.1}), ("u2", {1: 0.25, 3: 1.0}), ("u3", {2: 0.25, 3: 1.0})] ratings_data = self.sc.parallelize(ratings) correlations = calculate_correlations(ratings_data, 3).collectAsMap() self.assertEqual(round(correlations[1], 2), -1.0) self.assertEqual(round(correlations[2], 2), -1.0)
class TestWordCounter(unittest.TestCase): def setUp(self): conf = SparkConf().setAppName("appTest").setMaster("local[*]") self.sc = SparkContext(conf=conf) self.counter = WordCounter() def tearDown(self): self.sc.stop() def test_when_exist_one_movie_and_counter(self): movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy", "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"] result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']), ('Toy', ['::Toy Story Toy (1995)::'])) movies = self.sc.parallelize(movieList) self.assertEqual(self.counter.getMaxValues(movies),result) def test_when_exist_one_movie_and_counter_moreMovies(self): movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy", "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy", "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"] result = (('ToyA', ['::ToyA StoryB ToyA (1995)::','::ToyA StoryA ToyA (1995)::'])) movies = self.sc.parallelize(movieList) self.assertEqual(self.counter.getMaxValues(movies),result)
def parallelDisassembler(matrix,groups): def splitGroup(g): b,n1,n2 = isAFalse(g,matrix) if b: g1 = [n1] g2 = [n2] for nid in g: if nid != n1 and nid != n2: sim1 = 0.0 sim2 = 0.0 for tmp in g1: sim1 += jcSig(matrix[tmp],matrix[nid]) for tmp in g2: sim2 += jcSig(matrix[tmp],matrix[nid]) if sim1 / len(g1) > sim2 / len(g2): g1 += [nid] else: g2 += [nid] return g1,g2 return ([],g) tmp = len(groups) sc = SparkContext(appName="Splitter") parrGroup = sc.parallelize(groups) groups = parrGroup.map(splitGroup).collect() tmpgrp = [] for g1,g2 in groups: tmpgrp += [g1] tmpgrp += [g2] groups = tmpgrp while len(groups) != tmp : tmp = len(groups) #print(tmp) parrGroup = sc.parallelize(groups) groups = parrGroup.map(splitGroup).collect() tmpgrp = [] for g1,g2 in groups: if g1 != []: tmpgrp += [g1] tmpgrp += [g2] groups = tmpgrp sc.stop() return groups
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10, init='k-means++', random_state=None, tol=1e-4): from pyspark import SparkContext, SparkConf conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]'%worker_nums) sc = SparkContext(conf=conf) data = sc.parallelize(X) data.cache() random_state = check_random_state(random_state) best_labels, best_inertia, best_centers = None, None, None x_squared_norms = row_norms(X, squared=True) # x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect() # x_squared_norms = np.array(x_squared_norms, dtype='float64') centers = _init_centroids(X, n_clusters, init, random_state, x_squared_norms=x_squared_norms) bs = X.shape[0]/worker_nums data_temp = [] for i in range(worker_nums-1): data_temp.append(X[i*bs:(i+1)*bs]) data_temp.append(X[(worker_nums-1)*bs:]) data_temp = np.array(data_temp, dtype='float64') data_temp = sc.parallelize(data_temp) data_temp.cache() for i in range(max_iter): centers_old = centers.copy() all_distances = data_temp.map(lambda x: euclidean_distances(centers, x, squared=True)).collect() temp_all_distances = all_distances[0] for i in range(1, worker_nums): temp_all_distances = np.hstack((temp_all_distances, all_distances[i])) all_distances = temp_all_distances # all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect() # # reshape, from (1, n_samples, k) to (k, n_samples) # all_distances = np.asarray(all_distances, dtype="float64").T[0] # Assignment, also called E-step of EM labels, inertia = _labels_inertia(X, x_squared_norms, centers, all_distances=all_distances) # re-computation of the centroids, also called M-step of EM centers = _centers(X, labels, n_clusters) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia shift = squared_norm(centers_old - centers) if shift <= tol: break return best_centers, best_labels, best_inertia
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca): pca_n = 2 sc = SparkContext(master) data = sc.textFile(dataPath) # not RDD data ndata = data.map(lambda line: line.split(character)).map(lambda part: (map(lambda x: float(x) ,part[0: len(part)]))) if label == 0: ndata = ndata.map(lambda line: line[::-1]) if normalize == 1: test_data = norm(ndata.collect()) norm_data = sc.parallelize(test_data) train_data = norm_data.map(lambda part: lbp(part[0], part[1])) #raw_data = data.map(lambda line: line.split(character)) else: test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect() train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1])) if ispca == 1: pca = PCA(n_components = pca_n) pca_train = [test_data[i][1] for i in range(len(test_data))] pca_data = pca.fit(pca_train).transform(pca_train) test = [] for i in range(len(pca_data)): test.append([test_data[i][0], pca_data[i]]) train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1])) test_data = test model_lr = lr.train(train_data) err_lr = 0.0 size = len(train_data.collect()) for i in range(size): err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0]) print "result:", err_lr/size String = "Linear Regression Result:\n" String = String + str(model_lr.weights) + '\n' String = String + "Error: " + str(err_lr / size) sc.stop() return String
def main(num_factors, num_workers, num_iterations, beta_value, lambda_value, Wm_value, \ V_filename, output_W_filename, output_H_filename): # Conf conf = SparkConf().setAppName("Spark SGD MF") sc = SparkContext(conf=conf) user_movie_ratings = sc.textFile(V_filename).map(line_to_movie_user_ratings) user_movie_ratings.persist() #global user_nonzero, movie_nonzero #user_nonzero = user_movie_ratings.keyBy(first_element).countByKey() #movie_nonzero = user_movie_ratings.keyBy(second_element).countByKey() num_users = int(user_movie_ratings.map(first_element).reduce(max)) num_movies = int(user_movie_ratings.map(second_element).reduce(max)) global updates_total updates_total = 0 # Begin iterations iter = 0 global seed while iter < num_iterations: # Initialize W and H if iter == 0: W = sc.parallelize(range(num_users+1)).map(key_to_entry_rand).persist()#(user_id,rand(num_factors)) H = sc.parallelize(range(num_movies+1)).map(key_to_entry_rand).persist()#(movie_id,rand(num_factors) # Set random seed seed = random.randrange(MAXSEED) # Partition parameters W_blocks = W.keyBy(lambda W_entry: item_to_block(W_entry[0]))#key:worker_id,value:(user_id,rand(num_factors)) H_blocks = H.keyBy(lambda H_entry: item_to_block(H_entry[0]))#key:worker_id,value:(movie_id,rand(num_factors) # Filter diagonal blocks V_diagonal = user_movie_ratings.filter(filter_diagonal).persist()#(user_id,movie_id,rating) where worker_id(user_id) == worker_id(movie_id) V_blocks = V_diagonal.keyBy(lambda t : item_to_block(t[0]))#key:worker_id,value:(user_id,movie_id,rating) where user_id == movie_id updates_curr = V_diagonal.count() V_diagonal.unpersist() V_group = V_blocks.groupWith(W_blocks, H_blocks).coalesce(num_workers)#key:worker_id,value:seq[V],seq[W],seq[H] # Perform SGD updatedWH = V_group.map(SGD_update).persist() W = updatedWH.flatMap(first_element).persist() H = updatedWH.flatMap(second_element).persist() updates_total += updates_curr iter += 1 W_result = numpy.vstack(W.sortByKey().map(second_element).collect()[1:]) H_result = numpy.vstack(H.sortByKey().map(second_element).collect()[1:]) # Save W and H savetxt(output_W_filename, W_result, delimiter=',') savetxt(output_H_filename, H_result, delimiter=',') sc.stop
def main(image_files):
    sc = SparkContext(appName="Resize Images")
    sc.parallelize(image_files).map(resize_image_file).count()

    #read all the resized images into an array to save as a pickled object
    #out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    #save_images(out_dir)

    #read all the resized images into an array to save as a csv file
    out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    save_images_csv(out_dir)
def model(classifier, ftrain, fvalid, fprediction): startTime = time.time() ctx = SparkContext(appName="model_on_Spark") sqlContext = SQLContext(ctx) logger = SparkLogger(ctx) logger.set_level('ERROR') # load and prepare training and validation data rawTrain, train = prepData(sqlContext, ctx, ftrain) rawValid, valid = prepData(sqlContext, ctx, fvalid) # is needed to join columns valid = indexData(valid) rawValid = indexData(rawValid) classifiers = { "RandomForestClassifier" : RFC } clf = classifiers[classifier]() labelIndexer = StringIndexer(inputCol="label", outputCol="indexed") featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures") # train and predict pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf]) model = pipeline.fit(train) predictions = model.transform(valid) # write to file: subsetPrediction = predictions.select("prediction", "index") subsetValidData = rawValid.select("dataset", "index") output = (subsetValidData .join(subsetPrediction, subsetPrediction.index == subsetValidData.index) .drop("index") .drop("index")) lines = output.map(toCSVLine) lines.saveAsTextFile('output') evaluator = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="precision") accuracy = evaluator.evaluate(predictions) print "Test Error = %g" % (1.0 - accuracy) executionTime = time.time() - startTime row=classifier+','+str(executionTime) ctx.parallelize([row]).saveAsTextFile("timing")
def main():
    HDFS_URI = "hdfs://hdfs.domain.cc/folder"
    sc = SparkContext()

    rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
    rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/01",
                               "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
    rdd = sc.parallelize([("d", 4), ("e", 5), ("f", 6)])
    rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/02",
                               "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")

    folder = TwoDHDFSMap(sc, HDFS_URI)
    print("hdfsURI test: ", folder.hdfsURI == HDFS_URI)
    print("folder[\"01\"][\"a\"] test: ", folder["01"]["a"] == 1)
    print("\"01\" in folder test: ", "01" in folder)
    print("\"02\" in folder test: ", "02" in folder)
    print("folder[\"02\"][\"d\"] test: ", folder["02"]["d"] == 4)
class TestCalculator (unittest.TestCase): def setUp(self): conf = SparkConf().setAppName("appTest").setMaster("local[*]") self.sc = SparkContext(conf=conf) self.setMovies = SetMovies() def tearDown(self): self.sc.stop() def test_when_calculate_set_word_most_repeater(self): entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])), ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])), ('Story', (1, ['::Toy Story Toy (1995)::'])), ('StoryA', (3, ['::ToyA StoryA ToyA (1995)::']))] result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']), ("StoryA",["::ToyA StoryA ToyA (1995)::"])) funcReverseTuple = lambda value :((value[1][0],(value[0],value[1][1]))) rdd = self.sc.parallelize(entry) self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result) def test_when_calculate_set_word_most_repeater_one(self): entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])), ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])), ('Story', (1, ['::Toy Story Toy (1995)::'])), ('StoryA', (1, ['::ToyA StoryA ToyA (1995)::']))] result = (('ToyA', ['::ToyA StoryA ToyA (1995)::'])) funcReverseTuple = lambda value :(value[1][0],(value[0],value[1][1])) rdd = self.sc.parallelize(entry) self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result) def test_when_calculate_maximum_year(self): entry = [('(1996)',2), ('(1998)',2), ('(1997)',1)] result = ('(1996)','(1998)') rdd = self.sc.parallelize(entry) funcReverseTuple = lambda value :(value[1],value[0]) self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result) def test_when_calculate_maximum_year_with_only_one(self): entry = [('(1996)',2), ('(1998)',1), ('(1997)',1), ('(1999)',1)] result = ('(1996)') rdd = self.sc.parallelize(entry) funcReverseTuple = lambda value :(value[1],value[0]) self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result)
def log_mapreducer(logfilename, pattern, filt="None"): spcon=SparkContext() if filt == "None": input=open(logfilename,'r') paralleldata=spcon.parallelize(input.readlines()) patternlines=paralleldata.filter(lambda patternline: pattern in patternline) print "pattern lines",patternlines.collect() matches=patternlines.map(mapFunction).reduceByKey(reduceFunction) else: input=spcon.textFile(logfilename) matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction) matches_collected=matches.collect() print "matches_collected:",matches_collected if len(matches_collected) > 0: sqlContext=SQLContext(spcon) bytes_stream_schema=sqlContext.createDataFrame(matches_collected) bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream") query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream") dict_query_results=dict(query_results.collect()) print "----------------------------------------------------------------------------------" print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]" print "----------------------------------------------------------------------------------" dict_matches=dict(matches_collected) sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True) print "pattern matching lines:",sorted_dict_matches print "----------------------------------------------------------------------------------" print "SparkSQL DataFrame query results:" print "----------------------------------------------------------------------------------" pprint.pprint(dict_query_results) print "----------------------------------------------------------------------------------" print "Cardinality of Stream Dataset:" print "----------------------------------------------------------------------------------" print len(dict_query_results) spcon.stop() return sorted_dict_matches
def solve_puzzle(master, output, height, width, slaves): global HEIGHT, WIDTH, level HEIGHT=height WIDTH=width level = 0 sc = SparkContext(master, "python") """ YOUR CODE HERE """ """ YOUR MAP REDUCE PROCESSING CODE HERE """ solution=Sliding.solution(WIDTH, HEIGHT) sol = Sliding.board_to_hash(WIDTH, HEIGHT, solution) data = sc.parallelize([(sol,level),]) counter = 0 curLen = 1 while(counter < curLen): level += 1 data = data.flatMap(bfs_flat_map) if (level% 12 == 0): data = data.partitionBy(PARTITION_COUNT) data = data.reduceByKey(bfs_reduce) if (level% 6 == 0): counter = curLen curLen = data.count() """ YOUR OUTPUT CODE HERE """ data.coalesce(slaves).saveAsTextFile(output) sc.stop()
def solve_puzzle(master, output, height, width, slaves): global HEIGHT, WIDTH, level HEIGHT=height WIDTH=width level = 0 sc = SparkContext(master, "python") """ YOUR CODE HERE """ sol = Sliding.board_to_hash(WIDTH, HEIGHT, Sliding.solution(WIDTH, HEIGHT)) RDD = sc.parallelize([(sol,level)]) count = RDD.count() RDD_count = 0 search = True k = 1 """ YOUR MAP REDUCE PROCESSING CODE HERE """ while search: if k % 3== 0: RDD = RDD.flatMap(bfs_map).partitionBy(PARTITION_COUNT).reduceByKey(bfs_reduce) #PUT PARTITION_COUNT FOR 16 else: RDD = RDD.flatMap(bfs_map).reduceByKey(bfs_reduce) if k % 2 == 0: RDD_count = RDD.count() if RDD_count == count: search = False count = RDD_count k = k + 1 level = level + 1 """ YOUR OUTPUT CODE HERE """ RDD = RDD.map(swap_map) RDD.coalesce(slaves).saveAsTextFile(output) #outputLst = RDD.collect() #for elem in outputLst: #output(str(elem[0]) + " " + str(elem[1])) #output the elements sc.stop()
def run(self): sc = SparkContext() sqlContext = SQLContext(sc) #sqlContext = HiveContext(sc) start_scrape = datetime.now() begin, begin_parts = scrape.get_boundary(self.begin) end, end_parts = scrape.get_boundary(self.end) print "here" all_years_months_days = self.getYearsMonths() print "all_years=", all_years_months_days game_ids = scrape.get_games(all_years_months_days, source=scrape.filesystem_scraper) print "games=", game_ids gamesRDD = sc.parallelize(game_ids) gamesRDD.cache() print "fileRDD=", gamesRDD print "# parttions:", gamesRDD.getNumPartitions() print "count=", gamesRDD.count() # create RDDs self.createRawParquet(sc, sqlContext, gamesRDD) # Hitter Stats batter_games = self.createHitterStats(sqlContext) # create Pitcher Stats self.createPitcherStats(sqlContext) print "STOPPING" sc.stop()
def raw_files_to_labeled_features(raw_files, label_file): # Initialize spark conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]") sc = SparkContext(conf=conf) # Get the set of words that we will be accepting as valid features valid_words = set(w.lower() for w in words.words()) # Load training data and convert to our desired format raw_files = sc.wholeTextFiles(raw_files) # Extract a document of filtered words from each text file documents = raw_files.map(lambda x: (x[0], extract_words(x[1], valid_words))) # Calculate TF-IDF values for each document tfidf = calculate_tfidf(documents) # Load labels labels = sc.parallelize(load_labels(label_file)).map(lambda x: x[0]) # Append indexes to features and labels indexed_labels = labels.zipWithIndex().map(lambda x: (x[1],x[0])) indexed_features = tfidf.zipWithIndex().map(lambda x: (x[1],x[0])) # Join labels and features into tuples and return return indexed_labels.join(indexed_features).map(lambda x: x[1]).collect()
def MakeDriveResultFrames(): try: sc = SparkContext() print('Making sc') except: print('Spark Context already exists') NumCores=MP.cpu_count(); yr=range(2009,2015) weekRange=range(1,18) gameRange=range(0,16) GameID=np.zeros([len(yr)*len(weekRange)*len(gameRange),3]) GameID[:,0]=np.repeat(yr,272) GameID[:,1]=np.tile(np.sort(np.tile(weekRange,16)),6) GameID[:,2]=np.tile(gameRange,17*len(yr)) count1=sc.parallelize(range(len(GameID)),NumCores) count2=count1.map(lambda x: DriveResult(GameID[x,:])) R=count2.collect() FrameR=pd.DataFrame(columns=('Field Position','Result')) for i in range(len(R)): if len(np.shape(R[i]))==2: FrameR=FrameR.append(R[i]) #ResFrame=DriveStartResult(FrameR) sc.stop() FrameR.to_csv('DriveResults.csv') return FrameR
def solve_puzzle(master, output, height, width, slaves): global HEIGHT, WIDTH, level HEIGHT=height WIDTH=width level = 0 sc = SparkContext(master, "python") sol = Sliding.solution(WIDTH, HEIGHT) """ YOUR CODE HERE """ sol = Sliding.board_to_hash(WIDTH, HEIGHT, sol) new_visited = [(sol, level)] new_visited = sc.parallelize(new_visited) num = 1 #while there are still (k, v) pairs at the current level while num: #use += as we do not retain board sets not at the global level #in our mapping function new_visited += new_visited.flatMap(bfs_map) if level % 4 == 3: # only reduce and filter every other iteration for performance reasons new_visited = new_visited.reduceByKey(bfs_reduce) new_visited = new_visited.partitionBy(PARTITION_COUNT) #figure out how to use hash num = new_visited.filter(filter_func).count() # count the number of elements in the RDD at the current level level += 1 # Debuggin purposes print("\n\n\nLevel " + str(level) + '\n\n\n') """ YOUR OUTPUT CODE HERE """ new_visited.coalesce(slaves).saveAsTextFile(output) sc.stop()
def main(training_file, n):
    epochs = int(n)
    x, y, tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i], y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0, epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x, 1)),
                                                   (lambda x, y: (x[0] + y, x[1] + 1)),
                                                   (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()
        for (feat, (a, b)) in feat_vec_list:
            v[feat] = float(a) / float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    #trainer.perceptron_algorithm(5)
    print "iteration %d in %f seconds" % (epochs, time.time() - time0)
    # Write out the final weight vector
    write_weight_vector(v)
def main():
    master = 'local[1]'
    app_name = 'reduce_demo1'
    # print(range(0, 3))

    sc = SparkContext(master, app_name)

    # Test 1: works as expected
    # rdd_list = [sc.parallelize(range(i * 3, (i+1) * 3)) for i in range(0, 3)]
    # rdd_union = sc.union(rdd_list)
    # print(rdd_union.getNumPartitions())
    # result = rdd_union.map(fun_map_print)
    # result.count()

    # Test 2: two rounds of union
    rdd_list_outer = []
    for x in ['a', 'b', 'c']:
        rdd_list_inner = [sc.parallelize(map(lambda j: x + str(j), range(i * 3, (i+1) * 3))) for i in range(0, 3)]
        rdd_union_inner = sc.union(rdd_list_inner)
        rdd_list_outer.append(rdd_union_inner)
    rdd_union_outer = reduce(lambda rddx, rddy: rddx.union(rddy), rdd_list_outer)
    result = rdd_union_outer.map(fun_map_print)
    result.count()

    sc.stop()
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example", conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
def solve_puzzle(master, output, height, width, slaves): global HEIGHT, WIDTH, level HEIGHT=height WIDTH=width level = 0 sc = SparkContext(master, "python") """ YOUR CODE HERE """ NUM_WORKERS = slaves sol = Sliding.solution(WIDTH, HEIGHT) """ MAP REDUCE PROCESSING CODE HERE """ level_pos = sc.parallelize((make_state(level, sol),)) prev_size, size = 0, 1 while prev_size != size: level += 1 if level % 10 == 0: level_pos = level_pos.partitionBy(PARTITION_COUNT) level_pos = level_pos.flatMap(bfs_flat_map).reduceByKey(bfs_reduce) prev_size = size size = level_pos.count() """ OUTPUT CODE HERE """ level_pos = level_pos.map(unhash_board) level_pos.coalesce(NUM_WORKERS).saveAsTextFile(output) sc.stop()
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs, n_iter_search):
    # Spark configuration.
    conf = (SparkConf()
            .setMaster("local[" + str(n_jobs) + "]")
            .setAppName("FDD")
            .set("spark.executor.memory", "512mb")
            .set("spark.cores.max", str(n_jobs)))
    sc = SparkContext(conf=conf)

    # Build hyperparameter vectors.
    parameters = cartesian((n_components, n_pc, covar_types))

    # Distribute the hyperparameters vector.
    parameters_rdd = sc.parallelize(parameters, 96)

    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)

    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(lambda param: train_with_parameters(param, data_broadcast))

    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))

    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])

    # The first is the best model.
    best_model = sorted_models.collect()[0][1]

    sc.stop()
    return best_model
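# The function above broadcasts the training data once and ships only the hyperparameter
# grid through parallelize(). A stripped-down sketch of that pattern follows; the scorer
# and parameter names are made up (they stand in for train_with_parameters and the FDD
# model search) and only illustrate "broadcast the data, parallelize the grid, keep the best".
from pyspark import SparkContext

sc = SparkContext("local[2]", "GridSearchSketch")
data = sc.broadcast([1.0, 2.0, 3.0, 4.0])
param_grid = [(k, reg) for k in (2, 3, 4) for reg in (0.1, 1.0)]

def score(params):
    k, reg = params
    # stand-in objective: lower is better, mirroring the BIC sort above
    return (abs(sum(data.value) / k - reg), params)

best_score, best_params = sc.parallelize(param_grid).map(score).sortBy(lambda t: t[0]).first()
print(best_params, best_score)
sc.stop()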
def solve_puzzle(width, height, output_path, slave_number): def hash_to_board(state): return Sliding.hash_to_board(width, height, state) def board_to_hash(board): return Sliding.board_to_hash(width, height, board) def get_children_boards(board): return Sliding.children(width, height, board) def get_solution_hash(): return Sliding.board_to_hash(width, height, Sliding.solution(width, height)) sc = SparkContext("local", "Slide") boards_rdd = sc.parallelize([(get_solution_hash(), 0)]) current_level = 0 while True: current_level += 1 frontier_rdd = boards_rdd.filter(lambda (state, level): level == current_level - 1) frontier_rdd.persist() if frontier_rdd.isEmpty(): break boards_rdd = frontier_rdd\ .flatMap(lambda (state, level): get_children_boards(hash_to_board(state)))\ .map(lambda state_board: (get_children_boards(state_board), current_level))\ .union(boards_rdd)\ .reduceByKey(lambda step_level_a, step_level_b: min(step_level_a, step_level_b))\ .partitionBy(slave_number) boards_rdd\ .map(lambda (state, level): (level, hash_to_board(state)))\ .sortByKey()\ .coalesce(1)\ .saveAsTextFile(output_path) sc.stop()
class Stack(object): def __init__(self, target): self.target = target def connect(self, spark_host, job_name): self.spark_host = spark_host self.job_name = job_name self.spark_context = SparkContext(spark_host, job_name) @staticmethod def addJobTreeOptions(parser): parser.add_option("--batchSystem", dest="batchSystem", help="This is an old flag that is kept to maintain compatibility default=%default", default="spark") parser.add_option("--jobTree", dest="jobTree", help="This is an old flag that it is maintained for compatibility", default=None) def startJobTree(self, options): self.options = options extra_path = os.path.dirname(os.path.abspath(sys.argv[0])) os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH', "") + ":" + extra_path #print "Starting" sm = StackManager(self.spark_context) targets = self.spark_context.parallelize([('start', self.target)]) sm.runTargetList(targets)
def distribute(k, primeList):
    from pyspark import SparkContext
    sc = SparkContext(appName="bern_spark")
    rdd = sc.parallelize(primeList)
    rp = rdd.map(lambda p: (computeBkModP(p, k), p)).collect()
    return rp
from pyspark import SparkConf, SparkContext

conf1 = SparkConf()
sc = SparkContext(conf=conf1)

List1 = [1, 2, 3, 4, 5]

def double(x):
    return x * 2

Rdd = sc.parallelize(List1)
Rdd2 = Rdd.map(double)

Data1 = Rdd2.collect()
Data2 = Rdd.collect()
print(Data1)
print(Data2)
master = os.environ["SPARK_MASTER"] master = "spark://{}:7077".format(master) conf = SparkConf().setAppName("SpotTrawl").setMaster(master) spark = SparkContext(conf=conf) #===========DEFINE SAMPLING FUNCTION======= numSamples = 10**7 def sample(p): x, y = np.random.random(), np.random.random() return 1 if x * x + y * y < 1 else 0 #==========TAKE SAMPLES====================== count = spark.parallelize(xrange(0, numSamples)).map(sample) \ .reduce(lambda a, b: a + b) #==========ESTIMATE 4pi====================== piEst = 4.0 * count / numSamples #==========FIND NUMBER OF MATCHING DIGITS==== n, pi = 0, np.pi while int(pi * 10**n) == int(piEst * 10**n): n += 1 #=========PRINT RESULTS FOR OBSERVATION====== print "Pi is roughly {}".format(piEst) print "Error: {}%".format((piEst - pi) / pi) print "Matching Digits: {}".format(n) print("DESIRED OUTPUT LENGTH: 3") time.sleep(1000)
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
from random import random
from operator import add

from pyspark import SparkContext

if __name__ == "__main__":
    """
        Usage: pi [partitions]
    """
    sc = SparkContext(appName="PythonPi")
    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x**2 + y**2 < 1 else 0

    count = sc.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))
    print("-------------------------")
    print(sc.master)
    print("----------")
    sc.stop()
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import functions as functions
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

spark = SparkContext()
spark.setLogLevel("ERROR")
sql_context = SQLContext(spark)

data = spark.parallelize([
    ("company1", 2, 2.0),
    ("company2", 2, 4.0),
    ("company3", 1, 1.0),
    ("company4", 1, 0.0),
    ("company5", 1, 2.0),
])

schema = StructType([
    StructField("id", StringType(), True),
    StructField("degree", IntegerType(), True),
    StructField("nnd", FloatType(), True)
])

df = sql_context.createDataFrame(data, schema)
df.show()

# average nearest neighbour degree from nearest neighbour degree
annd = df\
"FROM `test1.subredditMembershipv2`") # API request - fetches results # Row values can be accessed by field name or index query_job = query_generator(query) # Writes QueryJob rows to a list to parallelize into Spark RDD query_job_list = list() for row in query_job: row = list(row) row.append(1) query_job_list.append(tuple(i for i in row)) # Convert output from QueryJob (list of tuples) into Spark RDD partitions = 5000 user_sub = sc.parallelize(query_job_list, partitions) # # Test RDD Data # user_sub_count = sc.parallelize([(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1), (4, 1), (5, 1)]) # sub_user = sc.parallelize([('a',[2]), ('a',[1]), ('b',[1]), ('c',[3]), ('c',[4]), ('c',[5]), ('c',[1]), ('c',[2])]) # # print('sub_user') # # print(sub_user.collect()) # sub_members = sub_user.reduceByKey(lambda a, b: a+b) # # print('sub_members complete') # # print(sub_members.collect()) user_sub_count = user_sub.map(lambda x: (x[0], 1)) sub_user = user_sub.map(lambda x: (x[1], [x[0]])) sub_members = sub_user.reduceByKey(lambda a, b: a + b) print('sub_members complete: ' + str(datetime.datetime.now()))
RDD_file = sc.textFile("input.txt") data_file = RDD_file.collect() existing_items = [] #Removing Punctuations for i in data_file: items = str(i) aux_words = string.punctuation existing_items.append( items.translate(None, digits).translate(None, aux_words).lower().replace( " ", " ").strip()) print(existing_items) RDD_file = sc.parallelize(existing_items) #Function for word pair in same line def sample(items): condition = [] output = [] for i in range(len(items)): if items[i] not in condition: condition.append(items[i]) for j in range(len(items)): if i != j: output.append((items[i], items[j])) return output
sys.stdout.write('\r' + \ 'Processing data point ' + str(i) + ' hcid ' + thiswell) allwells[thiswell] = wellent # make some pandas dataframes to compute the results print(" ...done") print("computing summaries") alldfs = [] for w in allwells: cols = allwells[w] pdf1 = pd.DataFrame( { "Pressure": cols[1], "Temp": cols[2], "Oil Pct": cols[3] }, index=cols[0]) pdf2 = pd.DataFrame({ "Prod": cols[4], "Inject Vol": cols[5] }, index=cols[0]) alldfs.append((w, pdf1, pdf2)) return (alldfs) print("writing graphs...") sc = SparkContext() s_rdd = sc.parallelize(load_well_data_from_maprdb()) fnames = s_rdd.map(lambda x: output_graph(x[0], x[1], x[2])).collect()
def mapToFormat(line): overall = line[0] result = line[1] splited = overall.split('/') vid = splited[0] country = splited[1] category = splited[2] key = vid value = str(result) + ',' + category + ',' + country return key, value if __name__ == "__main__": sc = SparkContext(appName='part-2') parser = argparse.ArgumentParser() parser.add_argument("--input", help="Input path", default='~/') parser.add_argument("--output", help="Output path", default='~/') args = parser.parse_args() input_path = args.input output_path = args.output csv = sc.textFile(input_path + 'AllVideos_short.csv') csvNoHead = csv.zipWithIndex().filter(lambda tup: tup[1] > 0).keys() after_map = csvNoHead.map(mapper) after_calculate = after_map.groupByKey().mapValues(calculate_difference) sortedByResult = after_calculate.sortBy(lambda a: a[1], 0) answer = sortedByResult.map(mapToFormat) output = answer.collect()[:10] output1 = sc.parallelize(output) output1.saveAsTextFile(output_path)
read_duration = read_stop - read_start # Get important parts d_keyAndText = d_corpus.map(lambda x: (x[x.index('id="') + 4:x.index( '" url=')], x[x.index('">') + 2:][:-6])) regex = re.compile('[^a-zA-Z]') # Split in to words. d_keyAndListOfWords = d_keyAndText.map( lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split())) # Flat to all words in corpas allWords = d_keyAndListOfWords.flatMap(lambda x: x[1]).map(lambda x: (x, 1)) # Count each words. allCounts = allWords.reduceByKey(add) # Take to 20000 topWords = allCounts.top(20000, key=lambda x: x[1]) twentyK = sc.parallelize(range(20000)) # Create dictionary. dictionary = twentyK.map(lambda x: (topWords[x][0], x)) # Get only words in dictionally for each document allWords = d_keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1])) allDictionaryWords = dictionary.join(allWords) justDocAndPos = allDictionaryWords.map(lambda x: (x[1][1], x[1][0])) allDictionaryWordsInEachDoc = justDocAndPos.groupByKey() # Calculate term frequence. tfs = allDictionaryWordsInEachDoc.map(lambda x: (x[0], buildArray(x[1]))) # Make label 0 or 1. data = tfs.map(lambda x: (oneHotEncoding(x[0]), x[1])) # Get sample number of train data num_train = data.count()
return (vector[0], (0, vector[1] / h_max_val[1])) #normalize by making the maximum 1 def a_normalize(vector): return (vector[0], (0, vector[1] / a_max_val[1])) if __name__ == "__main__": arr = [(i + 1, 0, 1.0) for i in range(1000)] #initial h conf = SparkConf() sc = SparkContext(conf=conf) lines = sc.textFile(sys.argv[1]) link = lines.map(parser).distinct() #obtain distict inputs and create matrix link_t = lines.map(parser_t).distinct() h = sc.parallelize(arr) h_pair = h.map(mapper) a_pair = None for j in range(50): h_matmul_pair = link.join(h_pair) #get pairs to multiply h_matmul = h_matmul_pair.map(lambda x: (x[1][0][0], x[1][0][1] * x[1][1][1])).reduceByKey(lambda a, b: a + b) #matrix multiplication h_max_val = h_matmul.max(key= lambda x: x[1]) a_pair = h_matmul.map(h_normalize) #normalize vector a_matmul_pair = link_t.join(a_pair) #get pairs to multiply a_matmul = a_matmul_pair.map(lambda x: (x[1][0][0], x[1][0][1] * x[1][1][1])).reduceByKey(lambda a, b: a + b) #matrix multiplication a_max_val = a_matmul.max(key= lambda x: x[1]) h_pair = a_matmul.map(a_normalize) #normalize vector h_top_10 = sorted(h_pair.collect(), key=lambda x: -x[1][1])[:10] #get top 10
class TaskContextTests(PySparkTestCase): def setUp(self): self._old_sys_path = list(sys.path) class_name = self.__class__.__name__ # Allow retries even though they are normally disabled in local mode self.sc = SparkContext('local[4, 2]', class_name) def test_stage_id(self): """Test the stage ids are available and incrementing as expected.""" rdd = self.sc.parallelize(range(10)) stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0] stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0] # Test using the constructor directly rather than the get() stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0] self.assertEqual(stage1 + 1, stage2) self.assertEqual(stage1 + 2, stage3) self.assertEqual(stage2 + 1, stage3) def test_resources(self): """Test the resources are empty by default.""" rdd = self.sc.parallelize(range(10)) resources1 = rdd.map(lambda x: TaskContext.get().resources()).take( 1)[0] # Test using the constructor directly rather than the get() resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0] self.assertEqual(len(resources1), 0) self.assertEqual(len(resources2), 0) def test_partition_id(self): """Test the partition id.""" rdd1 = self.sc.parallelize(range(10), 1) rdd2 = self.sc.parallelize(range(10), 2) pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect() pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect() self.assertEqual(0, pids1[0]) self.assertEqual(0, pids1[9]) self.assertEqual(0, pids2[0]) self.assertEqual(1, pids2[9]) def test_attempt_number(self): """Verify the attempt numbers are correctly reported.""" rdd = self.sc.parallelize(range(10)) # Verify a simple job with no failures attempt_numbers = rdd.map( lambda x: TaskContext.get().attemptNumber()).collect() map(lambda attempt: self.assertEqual(0, attempt), attempt_numbers) def fail_on_first(x): """Fail on the first attempt so we get a positive attempt number""" tc = TaskContext.get() attempt_number = tc.attemptNumber() partition_id = tc.partitionId() attempt_id = tc.taskAttemptId() if attempt_number == 0 and partition_id == 0: raise Exception("Failing on first attempt") else: return [x, partition_id, attempt_number, attempt_id] result = rdd.map(fail_on_first).collect() # We should re-submit the first partition to it but other partitions should be attempt 0 self.assertEqual([0, 0, 1], result[0][0:3]) self.assertEqual([9, 3, 0], result[9][0:3]) first_partition = filter(lambda x: x[1] == 0, result) map(lambda x: self.assertEqual(1, x[2]), first_partition) other_partitions = filter(lambda x: x[1] != 0, result) map(lambda x: self.assertEqual(0, x[2]), other_partitions) # The task attempt id should be different self.assertTrue(result[0][3] != result[9][3]) def test_tc_on_driver(self): """Verify that getting the TaskContext on the driver returns None.""" tc = TaskContext.get() self.assertTrue(tc is None) def test_get_local_property(self): """Verify that local properties set on the driver are available in TaskContext.""" key = "testkey" value = "testvalue" self.sc.setLocalProperty(key, value) try: rdd = self.sc.parallelize(range(1), 1) prop1 = rdd.map( lambda _: TaskContext.get().getLocalProperty(key)).collect()[0] self.assertEqual(prop1, value) prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty( "otherkey")).collect()[0] self.assertTrue(prop2 is None) finally: self.sc.setLocalProperty(key, None) def test_barrier(self): """ Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks within a stage. 
""" rdd = self.sc.parallelize(range(10), 4) def f(iterator): yield sum(iterator) def context_barrier(x): tc = BarrierTaskContext.get() time.sleep(random.randint(1, 10)) tc.barrier() return time.time() times = rdd.barrier().mapPartitions(f).map(context_barrier).collect() self.assertTrue(max(times) - min(times) < 1) def test_barrier_infos(self): """ Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the barrier stage. """ rdd = self.sc.parallelize(range(10), 4) def f(iterator): yield sum(iterator) taskInfos = rdd.barrier().mapPartitions(f).map( lambda x: BarrierTaskContext.get().getTaskInfos()).collect() self.assertTrue(len(taskInfos) == 4) self.assertTrue(len(taskInfos[0]) == 4) def test_context_get(self): """ Verify that TaskContext.get() works both in or not in a barrier stage. """ rdd = self.sc.parallelize(range(10), 4) def f(iterator): taskContext = TaskContext.get() if isinstance(taskContext, BarrierTaskContext): yield taskContext.partitionId() + 1 elif isinstance(taskContext, TaskContext): yield taskContext.partitionId() + 2 else: yield -1 # for normal stage result1 = rdd.mapPartitions(f).collect() self.assertTrue(result1 == [2, 3, 4, 5]) # for barrier stage result2 = rdd.barrier().mapPartitions(f).collect() self.assertTrue(result2 == [1, 2, 3, 4]) def test_barrier_context_get(self): """ Verify that BarrierTaskContext.get() should only works in a barrier stage. """ rdd = self.sc.parallelize(range(10), 4) def f(iterator): try: taskContext = BarrierTaskContext.get() except Exception: yield -1 else: yield taskContext.partitionId() # for normal stage result1 = rdd.mapPartitions(f).collect() self.assertTrue(result1 == [-1, -1, -1, -1]) # for barrier stage result2 = rdd.barrier().mapPartitions(f).collect() self.assertTrue(result2 == [0, 1, 2, 3])
class TaskContextTestsWithWorkerReuse(unittest.TestCase): def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.python.worker.reuse", "true") self.sc = SparkContext('local[2]', class_name, conf=conf) def test_barrier_with_python_worker_reuse(self): """ Regression test for SPARK-25921: verify that BarrierTaskContext.barrier() with reused python worker. """ # start a normal job first to start all workers and get all worker pids worker_pids = self.sc.parallelize( range(2), 2).map(lambda x: os.getpid()).collect() # the worker will reuse in this barrier job rdd = self.sc.parallelize(range(10), 2) def f(iterator): yield sum(iterator) def context_barrier(x): tc = BarrierTaskContext.get() time.sleep(random.randint(1, 10)) tc.barrier() return (time.time(), os.getpid()) result = rdd.barrier().mapPartitions(f).map(context_barrier).collect() times = list(map(lambda x: x[0], result)) pids = list(map(lambda x: x[1], result)) # check both barrier and worker reuse effect self.assertTrue(max(times) - min(times) < 1) for pid in pids: self.assertTrue(pid in worker_pids) def test_task_context_correct_with_python_worker_reuse(self): """Verify the task context correct when reused python worker""" # start a normal job first to start all workers and get all worker pids worker_pids = self.sc.parallelize( xrange(2), 2).map(lambda x: os.getpid()).collect() # the worker will reuse in this barrier job rdd = self.sc.parallelize(xrange(10), 2) def context(iterator): tp = TaskContext.get().partitionId() try: bp = BarrierTaskContext.get().partitionId() except Exception: bp = -1 yield (tp, bp, os.getpid()) # normal stage after normal stage normal_result = rdd.mapPartitions(context).collect() tps, bps, pids = zip(*normal_result) print(tps) self.assertTrue(tps == (0, 1)) self.assertTrue(bps == (-1, -1)) for pid in pids: self.assertTrue(pid in worker_pids) # barrier stage after normal stage barrier_result = rdd.barrier().mapPartitions(context).collect() tps, bps, pids = zip(*barrier_result) self.assertTrue(tps == (0, 1)) self.assertTrue(bps == (0, 1)) for pid in pids: self.assertTrue(pid in worker_pids) # normal stage after barrier stage normal_result2 = rdd.mapPartitions(context).collect() tps, bps, pids = zip(*normal_result2) self.assertTrue(tps == (0, 1)) self.assertTrue(bps == (-1, -1)) for pid in pids: self.assertTrue(pid in worker_pids) def tearDown(self): self.sc.stop()
for filename in filenames: f = open("/Users/panpan/Desktop/linkedin/followings/group3/%s" % filename, "r") files.append(f.readline()) #initialize mutual_list mutual_list = numpy.zeros((len(filenames), len(filenames))) #pick two users each time, and calculate their common freinds for i in range(0, len(files)): if i + 1 >= len(files): continue for j in range(i, len(files)): file_1 = files[i].split(",") file_2 = files[j].split(",") file1 = sc.parallelize(file_1) file2 = sc.parallelize(file_2) #common friends of the two users file_12 = file1.intersection(file2) mutual = len(file_12.collect()) #define a way to cauculate how much percent they are similar to each other mutual_proportion = 1.0 / 2 * mutual * (1.0 / len(file_1) + 1.0 / len(file_2)) mutual_list[i][j] = mutual_list[j][i] = mutual_proportion ###Cluster the models model = cl.KMeans.train(sc.parallelize(mutual_list), 5, maxIterations=15, runs=20, initializationMode="random",
def compute_final_svd(sc: SparkContext, y: List[DenseVector], k: int) -> SingularValueDecomposition:
    Y: RowMatrix = RowMatrix(sc.parallelize(y))
    svd_model = Y.computeSVD(k=k, computeU=True)
    return svd_model
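# A usage sketch for compute_final_svd above; the vectors and k are made-up example
# values, and the printed attributes are the standard fields of the mllib
# SingularValueDecomposition returned by RowMatrix.computeSVD.
from pyspark import SparkContext
from pyspark.mllib.linalg import DenseVector

sc = SparkContext("local[2]", "SVDExample")
vectors = [DenseVector([1.0, 2.0, 3.0]),
           DenseVector([4.0, 5.0, 6.0]),
           DenseVector([7.0, 8.0, 9.0])]
svd = compute_final_svd(sc, vectors, k=2)
print(svd.s)            # singular values
print(svd.V)            # right singular vectors (local matrix)
print(svd.U.numRows())  # U is a RowMatrix because computeU=True
sc.stop()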
def run(): conf = SparkConf() #conf.set('spark.shuffle.blockTransferService', 'nio') conf.set('spark.files.fetchTimeout', '180') conf.set('spark.files.overwrite', 'yes') conf.set('spark.akka.timeout', '180') #conf.set('spark.task.maxFailures', '30000') conf.set('spark.akka.frameSize', '500') conf.set('spark.network.timeout', '180') myClassifierOnevsOne = pickle.load(open('myClassifierOnevsOne.p', 'rb')) dataSetMaker = DataSetMakerV2(n=200000) feed = FeedNewsFromGoogleFinance() def sendRecord(rdd): print('new try...') if (not rdd.isEmpty()): newsRDD = dataSetMaker.processKeepNews(rdd) res = newsRDD.map( lambda x: (x[0], myClassifierOnevsOne.predict(x[1].features))) print('for each result...') for result in res.collect(): symbole = result[0].symbole r = requests.put('http://wtun.mooo.com:5000', data={ 'jdata': NewsPrediction(result[0], str(result[1])).json(), 'symbole': symbole, 'label': str(result[1]) }) print('send ok') print('receive %s' % str(r.text)) else: print('empty!') sc = SparkContext(conf=conf) symbolesRDD = sc.parallelize([('NASDAQ:GOOGL', ['GOOG', 'GOOGL', 'GOOGLE']), ('NASDAQ:NVDA', ['NVIDIA']), ('VTX:SCMN', ['SWISSCOM'])]) taskdt = 600 running = True oldNewsRDD = None firstTime = True intersectRDD = None dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2' cpt = 0 while (running): today = datetime.datetime.now() yesterday = today - datetime.timedelta(days=1) tomorrow = today + datetime.timedelta(days=1) newsRDD = symbolesRDD.flatMap( lambda x: feed.lookingAt(x[0], yesterday, tomorrow, x[1])) if (firstTime): firstTime = False intersectRDD = newsRDD else: try: intersectRDD = oldNewsRDD.intersection(newsRDD) except: pass # empty rdd oldNewsRDD = newsRDD try: sendRecord(intersectRDD) intersectRDD.saveAsPickleFile( dataDirectory + '/' + datetime.datetime.now().strftime('%Y-%m-%d--') + str(cpt)) cpt += 1 except: pass # empty rdd time.sleep(taskdt) running = False # TODO remove it
# Read all edge inputs, reverse edges and union as graph is undirected edgeInputs = bFile.map(lambda x: struct.unpack("<qq", x)) allEdgeList = sc.union( [edgeInputs, edgeInputs.map(lambda x: (x[1], x[0]))]) totalEdgeCount = allEdgeList.count() # Reduce edge list to tuple of vertex and array of child vertices inputGraph = allEdgeList.map( lambda edge: (edge[0], [edge[1]])).reduceByKey(lambda a, b: a + b) inputGraph.cache() totalVertexCount = inputGraph.count() distances = inputGraph.map(lambda x: (x[0], -1)) distances.cache() currentLevel = 0 currentLevelQueue = sc.parallelize([(root, currentLevel)]) currentLevelQueue = currentLevelQueue.join(distances).filter( lambda x: x[1][1] == -1).map(lambda x: (x[0], x[1][0])) currentLevelQueue.cache() #print("CurrentLevel: {}\n".format(currentLevel)) while (not (currentLevelQueue.isEmpty())): distances = distances.leftOuterJoin(currentLevelQueue).map( lambda x: (x[0], x[1][0]) if x[1][1] is None else (x[0], x[1][1])) distances.cache() currentLevel += 1 nextLevelQueue = inputGraph.join(currentLevelQueue).flatMap( lambda node: map((lambda child: child), node[1][0])) currentLevelQueue = nextLevelQueue.distinct().map(lambda x:
# -*- coding: utf-8 -*-

import findspark
findspark.init()

from pyspark import SparkContext

data_path = "C:\\PySpark\\data"

sc = SparkContext("local", "repartition")

rdd1 = sc.parallelize(range(1, 100), 2)
print("rdd1 partitions: {}".format(rdd1.getNumPartitions()))

rdd2 = rdd1.repartition(5)
print("rdd2 partitions: {}".format(rdd2.getNumPartitions()))

rdd3 = rdd2.coalesce(3)
print("rdd3 partitions: {}".format(rdd3.getNumPartitions()))

sc.stop()
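# A minimal sketch (separate from the script above, assumed values) that makes the
# difference visible: repartition() performs a full shuffle to the requested number of
# partitions, while coalesce() only merges existing partitions without a shuffle.
# glom() turns each partition into a list so per-partition sizes can be inspected.
from pyspark import SparkContext

sc = SparkContext("local", "repartition-sketch")
rdd = sc.parallelize(range(1, 100), 2)
print(rdd.glom().map(len).collect())                 # two partitions, e.g. [49, 50]
print(rdd.repartition(5).glom().map(len).collect())  # five partitions after a shuffle
print(rdd.coalesce(1).glom().map(len).collect())     # [99] -- merged without a shuffle
sc.stop()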
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)

block_matrix = IndexedRowMatrix( \
    rows \
    .map(lambda row: IndexedRow(row[1], row[0])) \
).toBlockMatrix()

mat_product = block_matrix.multiply(block_matrix)
result = mat_product.toLocalMatrix()
print("Matrix Product \n", result)

mat_sum = block_matrix.add(block_matrix)
result = mat_sum.toLocalMatrix()
print("Matrix Sum \n", result)

mat_transpose = block_matrix.transpose()
result = mat_transpose.toLocalMatrix()
print("Matrix Transpose \n", result)
engineStr = cred.getEngineStr() engine = create_engine(engineStr) Session = sessionmaker() Session.configure(bind=engine) session = Session() return session session = makeSession() def addVersions(records): session = makeSession() for row in records: curId = row[0] elem = session.query(RawXML).get(curId) root = etree.fromstring(elem.XML) elem.Version = root.attrib["returnVersion"] session.add(elem) session.commit() sc = SparkContext() records = session.query(RawXML.id, RawXML.Version)\ .filter(RawXML.Version == None) session.close() sc.parallelize(records)\ .foreachPartition(addVersions)
def updateStockInfo(stockFile, iniFile): config = configparser.ConfigParser() config.read(iniFile) localeStr = config['stats']['locale'] locale.setlocale(locale.LC_ALL, localeStr) configStore = config['store'] numJobs = config['spark']['numJobs'] stocks = [] with open(stockFile, 'r') as stockFile: for stock in stockFile: stock = stock.strip(' \n\r') if (stock != ''): stocks.append(stock) if (stocks is None or len(stocks) == 0): print(f"Failed to read any stocks from file: {stockFile}") exit conf = SparkConf().setAppName("LatestStockInfo") sc = SparkContext(conf=conf) broadCastConfig = sc.broadcast(config) tries = 5 attempts = 1 startStocksNum = len(stocks) lastCount = startStocksNum # processedCount = 0 while tries > 0: #Parallise the stock list - one spark process per stock print( f"****************Attempt {attempts}: Parallelising stock processing job for {len(stocks)} stocks....standby" ) rdd = sc.parallelize(stocks, numSlices=numJobs) #This does the actual work of retrieving the stock data and working out the metrics and scores #It returns a dict of scores mrdd = rdd.map( lambda stock: retrieveStockInfoSpark(broadCastConfig, stock)) #Collect the info by combining the returned dicts holding the info, this triggers the map operation infos = mrdd.collect() infos = [s for s in infos if s] print( f"*************Attempt {attempts}: Collected {len(infos)} stocks out of {len(stocks)}" ) #Check that we have all the info print( f"***************Attempt {attempts}: Checking all info retreived") mrdd = rdd.map(lambda stock: checkStockSpark(broadCastConfig, stock)) stocks = mrdd.collect() #Remove Nones stocks = [s for s in stocks if s] if (len(stocks) == 0): print( f"***************Attempt {attempts}: All stocks info check out apparently" ) #Done break if (len(stocks) == lastCount): print( f"***************Attempt {attempts}: Number of stocks left {len(stocks)} is the same as last attempt - aborting" ) if (len(stocks) < 20): print(f"***************Failed stocks: {stocks}") #Done break tries -= 1 attempts += 1 print( f"***************Attempt {attempts}: Retrying for stocks: {stocks}" ) print( f"***************Job complete: Processed {startStocksNum - len(stocks)} out of {startStocksNum} stocks" )
#
#
# Tutorialspoint - PySpark; Learn Pyspark
#
#----------------------------------------foreach.py---------------------------------------

from pyspark import SparkContext

sc = SparkContext("local", "ForEach app")
words = sc.parallelize(
    ["scala",
     "java",
     "hadoop",
     "spark",
     "akka",
     "spark vs hadoop",
     "pyspark",
     "pyspark and spark"]
)

def f(x):
    print(x)

fore = words.foreach(f)
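# Note: foreach() runs f on the executors, so outside local mode the print output above
# lands in the executor logs rather than the driver console. A small sketch (an assumed
# continuation of the script above, not part of the original tutorial) that surfaces a
# result on the driver instead, via an accumulator:
counted = sc.accumulator(0)

def count_word(_):
    counted.add(1)

words.foreach(count_word)
print("number of words: %d" % counted.value)  # 8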
#!/usr/bin/env python
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf().setMaster("local").setAppName("My app")
sc = SparkContext(conf=conf)

lines = sc.textFile("ch01.py")
inputRDD = lines.filter(lambda x: "sc" in x)
for line in inputRDD.take(10):
    print line

lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda line: line.split(" "))
print words.first()

data = sc.parallelize([1, 2, 3, 4, 1, 3])
print data.reduce(lambda x, y: x + y)

def printall(rdd):
    print("----------")
    for r in list(rdd.collect()):
        print r

printall(data.distinct())

for d in list(data.distinct().collect()):
    print d

d = sc.parallelize(["1, hello", "2, hi", "3, how are you"])
for _ in list(d.map(lambda x: (x.split(",")[0], x)).collect()):
    print _
def _process_wav(self, record: WavRecord):
    wav = self.audio.load_wav(record.wav_path)
    wav = self.audio.trim(wav)
    file_path = os.path.join(self.out_dir, f"{record.key}.tfrecord")
    write_preprocessed_data(record.key, wav, record.speaker_info.id,
                            record.speaker_info.age, record.speaker_info.gender,
                            file_path)
    return record.key


if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    default_params.parse(args["--hparams"])
    instance = VCTK(in_dir, out_dir, default_params)
    sc = SparkContext()
    rdd = instance.process_wavs(
        sc.parallelize(instance.list_wav_files()))

    data_file_paths = rdd.collect()

    with open(os.path.join(out_dir, 'list.csv'), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for path in data_file_paths:
            writer.writerow([path])
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two working threads and a batch interval of 20 seconds
sc = SparkContext("local[2]", "Sensor")
ssc = StreamingContext(sc, 20)

# Create a DStream
lines = ssc.socketTextStream("sandbox-hdp.hortonworks.com", 3333)

# Basic reduceByKey example in python
# creating PairRDD x with key value pairs
xx = sc.parallelize([("a", 1), ("b", 1), ("a", 1), ("a", 1),
                     ("b", 1), ("b", 1), ("b", 1), ("b", 1)], 3)

# Applying reduceByKey operation on x
y = xx.reduceByKey(lambda accum, n: accum + n)
print(y.collect())
# [('b', 5), ('a', 3)]

# Define associative function separately
def sumFunc(accum, n):
    return accum + n

y = xx.reduceByKey(sumFunc)
print(y.collect())
# [('b', 5), ('a', 3)]
page_num = 0 total_pages = 1 batch_size = 100 while page_num < total_pages: url_query = url_cve + "/pages/" + str(page_num) + "?limit=" + str( batch_size) results_json = apiDownloader.download_api(url_query, "isi", args.password) if results_json is not None and "results" in results_json: results = results_json["results"] num_results = len(results) total_pages = results_json["total_pages"] print "Downloaded ", num_results, " new CVE data rows. Adding them to CDR. Page:", ( page_num + 1), " of ", total_pages if num_results > 0: apiDownloader.load_into_cdr(results, "hg_cve", args.team, "hg-cve") print "Done loading into CDR" print "Taking backup on S3" rdd = sc.parallelize(results) rdd.map(lambda x: ("hg-cve", json.dumps(x)) ).saveAsSequenceFile(args.outputFolder + "/hg-cve/" + str(page_num)) print "Done taking backing on S3" else: print "No data found:", results_json page_num += 1
if len(sys.argv) > 3 and sys.argv[3] == 'local': spark = SparkContext('local', appName = 'SparkLCA') else: spark = SparkContext(appName = 'SparkLCA') tstart = time.time() N = int(sys.argv[2]) g = load_graph(sys.argv[1]) if len(sys.argv) > 3 and sys.argv[3] != 'local': out_hdfs = sys.argv[3] else: out_hdfs = None print 'finish loading graph data %f secs elapsed' % (time.time()-tstart) seeds = spark.parallelize([p for p in g.nodes() if p <= N]) gtuple = spark.broadcast(g.get_tuple()) print 'finish broadcasting, %f secs elapsed' % (time.time()-tstart) cite_depth = seeds.flatMap(lambda k: shortest_path(gtuple.value, k)) dist_root = cite_depth.groupByKey() pairs_rdd = dist_root.flatMap(lambda x: map_pairs(x[1], get_year(gtuple.value, x[0]), x[0])) lca_rdd = pairs_rdd.reduceByKey(lambda x, y: x if cmp_key(x) < cmp_key(y) else y) lca = lca_rdd.map(lambda x: x[0] + x[1]) print 'finish calculation, %f secs elapsed' % (time.time()-tstart) if out_hdfs is None: lca = lca.collect() with open(sys.argv[1]+'/result-%d.csv' % N, 'wb') as resultsfile: writer = csv.writer(resultsfile)
print (" ") print (" ") print ("matriz de correlacion:") print (" ") print(Statistics.corr(rows, method="pearson") ''' file = sc.textFile("Process_Data/SuperFile/superfile.dat") row = file.map(lambda line: line.split(' ')[1:len(line)]).map( lambda xs: [float(x) for x in xs]) row_list = row.collect() #transforms to list print(row_list) #matrix w, h = 1, 38 new_list = [[0 for x in range(w)] for y in range(h)] for i in range(0, len(row_list)): new_list[i][:] = Vectors.dense(row_list[i]) i += 1 rows = sc.parallelize([new_list]) print(rows) summary = Statistics.colStats(rows) print("media:"), (summary.mean()) print("varianza:"), (summary.variance()) print("max:"), (summary.max()) print("min:"), (summary.min()) print("non Zeros:"), (summary.numNonzeros())
levelIndex=2 levelSets=[tuple([i]) for i in range(1,16)]+[tuple(i for i in range(5*j+16, 5*j+21)) for j in range(7)] normedRows=[] normedLevelSets={} denormalizers={} for levelSet in levelSets: levelRows=[row for row in archmageList if row[levelIndex] in levelSet] normalized=normalize(levelRows) normedRows+=normalized for i in range(len(normalized)): row=normalized[i] denormalizers[tuple(row)]=levelRows[i] normalArchmage = sc.parallelize(normedRows).persist() numClusters= int(sys.argv[1]) numIterations= int(sys.argv[2]) model = KMeans.train(normalArchmage, numClusters, maxIterations=numIterations) randomRow=normedRows[0] # print("row:", randomRow) # print("denormed:", denormalizers[tuple(randomRow)] ) # print("cluster:", model.predict(randomRow)) #print("labeled cluster:", columnToArcher(model.centers[model.predict(randomRow)])) #print("\n\n")
tokenizer = RegexpTokenizer(r'\w+') tokens1 = tokenizer.tokenize(j) tokens1 = [w.lower() for w in tokens1] if (i in tokens1): sst = ' '.join(tokens1) ss = str(list(parser.raw_parse(sst))) wor.append(calc1(ss, tokens1)) di[i] = wor implicit = [] explicit = [] tokenizer = RegexpTokenizer(r'\w+') for key in di: for ele in di[key]: if (key in tokenizer.tokenize(ele)): explicit.append((key, ele)) else: implicit.append((key, ele)) rdd = sc.parallelize(explicit) rdd1 = sc.parallelize(implicit) #implicit reviews #explicit reviews rdd.coalesce( 1, shuffle=True).saveAsTextFile("hdfs://localhost:9000/output/" + iiii.split('/')[-1].split(".txt")[0] + "/exp") rdd1.coalesce( 1, shuffle=True).saveAsTextFile("hdfs://localhost:9000/output/" + iiii.split('/')[-1].split(".txt")[0] + "/imp") print("new file new file new file new file new file") # #############################################################################
# HDFS Client hdfs_client = None if cluster_execution: hdfs_client = InsecureClient(hdfs_address, user=hdfs_user) # searching for data dataset_files = list() for dataset_path in list_dir(output_dir, hdfs_client, cluster_execution): for f in list_dir(os.path.join(output_dir, dataset_path), hdfs_client, cluster_execution): if 'learningData.csv' in f: dataset_files.append(os.path.join(output_dir, dataset_path, f)) # computing stats files = sc.parallelize(dataset_files, 365) stats = files.flatMap( lambda x: generate_stats_from_dataset(x, params)).persist( StorageLevel.MEMORY_AND_DISK) n_rows = stats.map(lambda x: x[0]).collect() n_columns = stats.map(lambda x: x[1]).collect() size_bytes = stats.map(lambda x: x[2]).collect() hist_n_rows = np.histogram(n_rows, bins=500) hist_n_columns = np.histogram(n_columns, bins=500) hist_size_bytes = np.histogram(size_bytes, bins=500) print(' -- N. Rows:')