def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

    # Train the models
    decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={},
                                         impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2)

    # Test the model
    testDecisionTree(decisionTreeModel, parallelized_test_set)
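# load_data, reformatData and testDecisionTree are assumed to be helpers defined
# elsewhere in this script; reformatData presumably pairs each label with its
# feature vector (e.g. as MLlib LabeledPoint objects) before parallelizing.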
def longest_common_substring(strands):
	# create the Spark context
	conf = SparkConf().setAppName("longest_common_substring")
	sc = SparkContext(conf=conf)

	# create an accumulator for key-value pairs, where each key is a substring, and each value is the set of strings where the substring can be found
	class ArrayAccumulatorParam(AccumulatorParam):
		def zero(self, initialValue):
			return initialValue

		def addInPlace(self, v1, v2):
			if type(v2) is list:
				v1.extend(v2)
			elif type(v2) is tuple:
				v1.append(v2)

			return v1

	acc = sc.accumulator([], ArrayAccumulatorParam())

	def generate_substrings(data_element):
		k, v = data_element
		i = 0
		while i < len(v):
			j = i + 1
			while j <= len(v):  # include substrings that end at the last character
				acc.add((v[i:j],k))
				j += 1
			i += 1

	sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings)

	all_substrings = sc.parallelize(acc.value)
	return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
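# Hypothetical usage sketch (Python 2, matching the .iteritems() call above);
# the strand values are made-up illustration data:
#   strands = {"s1": "GATTACA", "s2": "TACAGA", "s3": "ATTACA"}
#   print longest_common_substring(strands)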
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of each data point
    # (assign back by index so the stored arrays are actually replaced)
    for i, dataPoint in enumerate(train_data):
        n = np.size(dataPoint)
        train_data[i] = np.delete(dataPoint, [n - 2, n - 1])

    for i, dataPoint in enumerate(test_data):
        n = np.size(dataPoint)
        test_data[i] = np.delete(dataPoint, [n - 2, n - 1])

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the models
    randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={},
                                         numTrees=750, seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
def SearchTiles_and_Factorize(n): 
	global globalmergedtiles
	global globalcoordinates
	global factors_accum 
	global spcon

	spcon = SparkContext("local[4]","Spark_TileSearch_Optimized")

	if persisted_tiles == True:
		tileintervalsf=open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals","r")

		tileintervalslist=tileintervalsf.read().split("\n")
		#print "tileintervalslist=",tileintervalslist
		tileintervalslist_accum=spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
		paralleltileintervals=spcon.parallelize(tileintervalslist)
		paralleltileintervals.foreach(tilesearch)
	else:
		factorsfile=open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors","w")
		hardy_ramanujan_ray_shooting_queries(n)
		hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
		baker_harman_pintz_ray_shooting_queries(n)
		cramer_ray_shooting_queries(n)
		zhang_ray_shooting_queries(n)
		factors_accum=spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
		#spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
		spcon.parallelize(spcon.range(1,n).collect()).foreach(tilesearch_nonpersistent)
		print "factors_accum.value = ", factors_accum.value
		factors=[]
		factordict={}
		for f in factors_accum.value:
			factors += f
		factordict[n]=factors
		json.dump(factordict,factorsfile)
		return factors
def SparkBroadcastAccumulator(n): 
	global broadcast_var
	global accumulator_var
	spcon = SparkContext("local[2]","SparkBroadcastAccumulator")
	broadcast_var=spcon.broadcast("broadcast_message")
	accumulator_var=spcon.accumulator(0)
	spcon.parallelize(xrange(1,n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
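# broadcast_accumulator_receiver is assumed to be defined elsewhere in the
# original module; a minimal placeholder consistent with the call above might be:
#   def broadcast_accumulator_receiver(_):
#       print broadcast_var.value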
Example #6
class LookAlikeTest(unittest.TestCase):
    def setUp(self):
        conf = SparkConf().setAppName("Tests").setMaster("local")
        self.sc = SparkContext(conf=conf)

    def tearDown(self):
        self.sc.stop()

    def test_ratings_calculation(self):
        data = [("u1", 123), ("u1", 123), ("u1", 132),
                ("u2", 123), ("u2", 111), ("u2", 111), ("u2", 111), ("u2", 111),
                ("u3", 123), ("u3", 123), ("u3", 125), ("u3", 125), ("u3", 111)]
        input_data = self.sc.parallelize(data)
        ratings = calculate_ratings(input_data).collectAsMap()
        self.assertEqual(ratings["u1"][123], 1.0)
        self.assertEqual(ratings["u1"][132], 0.5)
        self.assertEqual(ratings["u2"][111], 1.0)
        self.assertEqual(ratings["u2"][123], 0.25)
        self.assertEqual(ratings["u3"][123], 1.0)
        self.assertEqual(ratings["u3"][125], 1.0)
        self.assertEqual(ratings["u3"][111], 0.5)

    def test_correlations_calculation(self):
        ratings = [("u1", {1: 0.5, 2: 1.0, 3: 0.1}),
                   ("u2", {1: 0.25, 3: 1.0}),
                   ("u3", {2: 0.25, 3: 1.0})]
        ratings_data = self.sc.parallelize(ratings)
        correlations = calculate_correlations(ratings_data, 3).collectAsMap()
        self.assertEqual(round(correlations[1], 2), -1.0)
        self.assertEqual(round(correlations[2], 2), -1.0)
class TestWordCounter(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=conf)
        self.counter = WordCounter()

    def tearDown(self):
        self.sc.stop()

    def test_when_exist_one_movie_and_counter(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
                  ('Toy', ['::Toy Story Toy (1995)::']))
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)

    def test_when_exist_one_movie_and_counter_moreMovies(self):
        movieList = ["1993::Toy Story Toy (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryB ToyA (1995)::Animation|Children's|Comedy",
                     "1993::ToyA StoryA ToyA (1995)::Animation|Children's|Comedy"]
        result = ('ToyA', ['::ToyA StoryB ToyA (1995)::', '::ToyA StoryA ToyA (1995)::'])
        movies = self.sc.parallelize(movieList)
        self.assertEqual(self.counter.getMaxValues(movies), result)
Example #8
def parallelDisassembler(matrix,groups):


	def splitGroup(g):
		b,n1,n2 = isAFalse(g,matrix)
		if b:
			g1 = [n1]
			g2 = [n2]

			for nid in g:
				if nid != n1 and nid != n2:
					sim1 = 0.0
					sim2 = 0.0
					for tmp in g1:
						sim1 += jcSig(matrix[tmp],matrix[nid])
					for tmp in g2:
						sim2 += jcSig(matrix[tmp],matrix[nid])


					if sim1 / len(g1) > sim2 / len(g2):
						g1 += [nid]
					else:
						g2 += [nid]

			return g1,g2

		return ([],g)

	tmp = len(groups)
	sc = SparkContext(appName="Splitter")
	parrGroup = sc.parallelize(groups)
	groups = parrGroup.map(splitGroup).collect()
	tmpgrp = []
	for g1,g2 in groups:
		tmpgrp += [g1]
		tmpgrp += [g2]

	groups = tmpgrp

	while len(groups) != tmp :

		tmp = len(groups)
		#print(tmp)

		parrGroup = sc.parallelize(groups)
		groups = parrGroup.map(splitGroup).collect()
		tmpgrp = []
		for g1,g2 in groups:
			if g1 != []:
				tmpgrp += [g1]
			tmpgrp += [g2]

		groups = tmpgrp

	sc.stop()

	return groups
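# isAFalse and jcSig are assumed to be defined elsewhere: judging from their use
# above, isAFalse(g, matrix) returns a flag plus two seed node ids for splitting
# a group, and jcSig(sig_a, sig_b) returns a pairwise signature similarity.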
Example #9
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10, init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]'%worker_nums)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    #  x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    #  x_squared_norms = np.array(x_squared_norms, dtype='float64')

    centers = _init_centroids(X, n_clusters, init, random_state, x_squared_norms=x_squared_norms)

    # split X into one chunk per worker and parallelize the list of chunks
    bs = X.shape[0] // worker_nums
    data_temp = []
    for i in range(worker_nums-1):
        data_temp.append(X[i*bs:(i+1)*bs])
    data_temp.append(X[(worker_nums-1)*bs:])
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()


    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        temp_all_distances = all_distances[0]
        for j in range(1, worker_nums):
            temp_all_distances = np.hstack((temp_all_distances, all_distances[j]))
        all_distances = temp_all_distances

        #  all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        #  # reshape, from (1, n_samples, k) to (k, n_samples)
        #  all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers, all_distances=all_distances)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels  = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
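# A minimal usage sketch (hypothetical data; assumes the scikit-learn style
# helpers used above, e.g. check_random_state, _init_centroids, _labels_inertia
# and _centers, are importable in this module):
#   import numpy as np
#   X = np.random.rand(1000, 16)
#   centers, labels, inertia = _kmeans_spark(X, n_clusters=8, worker_nums=4)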
Example #10
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):

    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)

    # parse each line into a list of float features
    ndata = data.map(lambda line: line.split(character)).map(lambda part: [float(x) for x in part])

    if label == 0:
        ndata = ndata.map(lambda line: line[::-1])

    if normalize == 1:
        test_data = norm(ndata.collect())    
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))   
     #raw_data = data.map(lambda line: line.split(character))


    else:
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1]))
    
    
    if ispca == 1:
        pca = PCA(n_components = pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)

        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])

        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
            

    model_lr = lr.train(train_data)
    err_lr = 0.0
    size = len(train_data.collect())
   
    for i in range(size):
        err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0])
           

    print "result:", err_lr/size

    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size) 
    
    sc.stop()

    return String
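# Note: lbp, lr and norm above are assumed to be imported elsewhere; hypothetical
# aliases consistent with their use would be:
#   from pyspark.mllib.regression import LabeledPoint as lbp
#   from pyspark.mllib.regression import LinearRegressionWithSGD as lr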
def main(num_factors, num_workers, num_iterations, beta_value, lambda_value, Wm_value, \
        V_filename, output_W_filename, output_H_filename):
    # Conf
    conf = SparkConf().setAppName("Spark SGD MF")
    sc = SparkContext(conf=conf)
    
    user_movie_ratings = sc.textFile(V_filename).map(line_to_movie_user_ratings)
    user_movie_ratings.persist()

    #global user_nonzero, movie_nonzero
    #user_nonzero = user_movie_ratings.keyBy(first_element).countByKey()
    #movie_nonzero = user_movie_ratings.keyBy(second_element).countByKey()

    num_users = int(user_movie_ratings.map(first_element).reduce(max))
    num_movies = int(user_movie_ratings.map(second_element).reduce(max))

    global updates_total
    updates_total = 0
   
    # Begin iterations
    iter = 0
    global seed
    while iter < num_iterations:
        # Initialize W and H
        if iter == 0:
            W = sc.parallelize(range(num_users+1)).map(key_to_entry_rand).persist()#(user_id,rand(num_factors))
            H = sc.parallelize(range(num_movies+1)).map(key_to_entry_rand).persist()#(movie_id,rand(num_factors)

        # Set random seed
        seed = random.randrange(MAXSEED)

        # Partition parameters
        W_blocks = W.keyBy(lambda W_entry: item_to_block(W_entry[0]))#key:worker_id,value:(user_id,rand(num_factors))
        H_blocks = H.keyBy(lambda H_entry: item_to_block(H_entry[0]))#key:worker_id,value:(movie_id,rand(num_factors)

        # Filter diagonal blocks
        V_diagonal = user_movie_ratings.filter(filter_diagonal).persist()#(user_id,movie_id,rating) where worker_id(user_id) == worker_id(movie_id)
        V_blocks = V_diagonal.keyBy(lambda t : item_to_block(t[0]))#key:worker_id,value:(user_id,movie_id,rating) where user_id == movie_id
        updates_curr = V_diagonal.count()
        V_diagonal.unpersist()    
        V_group = V_blocks.groupWith(W_blocks, H_blocks).coalesce(num_workers)#key:worker_id,value:seq[V],seq[W],seq[H]

        # Perform SGD
        updatedWH = V_group.map(SGD_update).persist()
        W = updatedWH.flatMap(first_element).persist()
        H = updatedWH.flatMap(second_element).persist()
        updates_total += updates_curr
        iter += 1
   
    W_result = numpy.vstack(W.sortByKey().map(second_element).collect()[1:])
    H_result = numpy.vstack(H.sortByKey().map(second_element).collect()[1:])
    # Save W and H
    savetxt(output_W_filename, W_result, delimiter=',')
    savetxt(output_H_filename, H_result, delimiter=',')
    sc.stop()
def main(image_files):
    sc = SparkContext( appName="Resize Images")
    sc.parallelize(image_files).map(resize_image_file).count()

    #read all the resized images into an array to save as a pickled object
    #out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    #save_images(out_dir)

    #read all the resized images into an array to save as a csv file
    out_dir = CUR_DIR + TEST_OR_TRAIN + '_' + str(IMAGE_SIZE)
    save_images_csv(out_dir)
def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # an index column is needed to join predictions back to the raw data
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
Example #14
def main():
  HDFS_URI = "hdfs://hdfs.domain.cc/folder"
  sc = SparkContext()
  rdd = sc.parallelize([("a", 1), ("b", 2), ("c", 3)])
  rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/01", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
  rdd = sc.parallelize([("d", 4), ("e", 5), ("f", 6)])
  rdd.saveAsNewAPIHadoopFile(HDFS_URI + "/02", "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat")
  folder = TwoDHDFSMap(sc, HDFS_URI)
  print("hdfsURI test: ", folder.hdfsURI == HDFS_URI)
  print("folder[\"01\"][\"a\"] test: ", folder["01"]["a"] == 1)
  print("\"01\" in folder test: ", "01" in folder)
  print("\"02\" in folder test: ", "02" in folder)
  print("folder[\"02\"][\"d\"] test: ", folder["02"]["d"] == 4)
class TestCalculator (unittest.TestCase):

	def setUp(self):
	   conf = SparkConf().setAppName("appTest").setMaster("local[*]")
	   self.sc = SparkContext(conf=conf)
	   self.setMovies = SetMovies() 

	def tearDown(self):
	   self.sc.stop()

	def test_when_calculate_set_word_most_repeater(self):
	   entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])),
	            ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])),
	            ('Story', (1, ['::Toy Story Toy (1995)::'])),
	            ('StoryA', (3, ['::ToyA StoryA ToyA (1995)::']))]
	   result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']),
	             ("StoryA",["::ToyA StoryA ToyA (1995)::"]))
	   funcReverseTuple = lambda value :((value[1][0],(value[0],value[1][1])))
	   rdd = self.sc.parallelize(entry)		              
	   self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result)

	def test_when_calculate_set_word_most_repeater_one(self):
	   entry = [('Toy', (1, ['::Toy Story Toy (1995)::'])),
	            ('ToyA', (3, ['::ToyA StoryA ToyA (1995)::'])),
	            ('Story', (1, ['::Toy Story Toy (1995)::'])),
	            ('StoryA', (1, ['::ToyA StoryA ToyA (1995)::']))]
	   result = (('ToyA', ['::ToyA StoryA ToyA (1995)::']))
	   funcReverseTuple = lambda value :(value[1][0],(value[0],value[1][1]))
	   rdd = self.sc.parallelize(entry)		              
	   self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result)	   

	def test_when_calculate_maximum_year(self):
	   entry = [('(1996)',2),
	            ('(1998)',2),
	            ('(1997)',1)]  
	   result = ('(1996)','(1998)')
	   rdd = self.sc.parallelize(entry)	
	   funcReverseTuple = lambda value :(value[1],value[0])
	   self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result)

	def test_when_calculate_maximum_year_with_only_one(self):
	   entry = [('(1996)',2),
	            ('(1998)',1),
	            ('(1997)',1),
	            ('(1999)',1)]  
	   result = ('(1996)')
	   rdd = self.sc.parallelize(entry)	
	   funcReverseTuple = lambda value :(value[1],value[0])
	   self.assertEqual(self.setMovies.setWithMaxValues(rdd,funcReverseTuple),result)        
def log_mapreducer(logfilename, pattern, filt="None"):
	spcon=SparkContext()
	if filt == "None":
		input=open(logfilename,'r')
		paralleldata=spcon.parallelize(input.readlines())
		patternlines=paralleldata.filter(lambda patternline: pattern in patternline)
		print "pattern lines",patternlines.collect()
		matches=patternlines.map(mapFunction).reduceByKey(reduceFunction)
	else:
		input=spcon.textFile(logfilename)
		matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
	matches_collected=matches.collect()
	print "matches_collected:",matches_collected
	if len(matches_collected) > 0:
		sqlContext=SQLContext(spcon)
		bytes_stream_schema=sqlContext.createDataFrame(matches_collected)
		bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
		query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
		dict_query_results=dict(query_results.collect())
		print "----------------------------------------------------------------------------------"
		print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]"
		print "----------------------------------------------------------------------------------"
		dict_matches=dict(matches_collected)
		sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True)
		print "pattern matching lines:",sorted_dict_matches
		print "----------------------------------------------------------------------------------"
		print "SparkSQL DataFrame query results:"
		print "----------------------------------------------------------------------------------"
		pprint.pprint(dict_query_results)
		print "----------------------------------------------------------------------------------"
		print "Cardinality of Stream Dataset:"
		print "----------------------------------------------------------------------------------"
		print len(dict_query_results)
		spcon.stop()
		return sorted_dict_matches
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    """ YOUR MAP REDUCE PROCESSING CODE HERE """
    solution=Sliding.solution(WIDTH, HEIGHT)
    sol = Sliding.board_to_hash(WIDTH, HEIGHT, solution)
    data = sc.parallelize([(sol,level),])
    counter = 0
    curLen = 1 
    while(counter < curLen):
        level += 1
        data = data.flatMap(bfs_flat_map)
        

        if (level% 12 == 0):
            data = data.partitionBy(PARTITION_COUNT)
        data = data.reduceByKey(bfs_reduce)
        if (level% 6 == 0):
            counter = curLen
            curLen = data.count()
        
        
    """ YOUR OUTPUT CODE HERE """
    data.coalesce(slaves).saveAsTextFile(output)
    sc.stop()
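# Sliding, bfs_flat_map, bfs_reduce and PARTITION_COUNT are assumed to be
# provided by the surrounding assignment framework: bfs_flat_map presumably
# expands a (board_hash, level) pair into its children and bfs_reduce keeps the
# smallest level seen for each board.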
Example #18
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    sol = Sliding.board_to_hash(WIDTH, HEIGHT, Sliding.solution(WIDTH, HEIGHT))    
    RDD = sc.parallelize([(sol,level)]) 
    count = RDD.count()
    RDD_count = 0
    search = True
    k = 1
    """ YOUR MAP REDUCE PROCESSING CODE HERE """
    while search:
        if k % 3== 0:
            RDD = RDD.flatMap(bfs_map).partitionBy(PARTITION_COUNT).reduceByKey(bfs_reduce) #PUT PARTITION_COUNT FOR 16
        else:
            RDD = RDD.flatMap(bfs_map).reduceByKey(bfs_reduce) 
        if k % 2 == 0:
            RDD_count = RDD.count() 
            if RDD_count == count: 
                search = False
            count = RDD_count
        k = k + 1
        level = level + 1
    """ YOUR OUTPUT CODE HERE """
    RDD = RDD.map(swap_map)  
    RDD.coalesce(slaves).saveAsTextFile(output)    
    #outputLst = RDD.collect()
    #for elem in outputLst:
       #output(str(elem[0]) + " " + str(elem[1])) #output the elements
    sc.stop()
    def run(self):

        sc = SparkContext()
        sqlContext = SQLContext(sc)
        #sqlContext = HiveContext(sc)

        start_scrape = datetime.now()
        begin, begin_parts = scrape.get_boundary(self.begin)
        end, end_parts = scrape.get_boundary(self.end)

        print "here"
        all_years_months_days = self.getYearsMonths()
        print "all_years=", all_years_months_days

        game_ids = scrape.get_games(all_years_months_days, source=scrape.filesystem_scraper)
        print "games=", game_ids

        gamesRDD = sc.parallelize(game_ids)
        gamesRDD.cache()
        print "fileRDD=", gamesRDD

        print "# parttions:", gamesRDD.getNumPartitions()
        print "count=", gamesRDD.count()

        # create RDDs
        self.createRawParquet(sc, sqlContext, gamesRDD)
    
        # Hitter Stats
        batter_games = self.createHitterStats(sqlContext)

        # create Pitcher Stats
        self.createPitcherStats(sqlContext)
        
        print "STOPPING"
        sc.stop()
def raw_files_to_labeled_features(raw_files, label_file):
    # Initialize spark
    conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # Get the set of words that we will be accepting as valid features
    valid_words = set(w.lower() for w in words.words())

    # Load training data and convert to our desired format
    raw_files = sc.wholeTextFiles(raw_files)

    # Extract a document of filtered words from each text file
    documents = raw_files.map(lambda x: (x[0], extract_words(x[1], valid_words)))

    # Calculate TF-IDF values for each document
    tfidf = calculate_tfidf(documents)

    # Load labels
    labels = sc.parallelize(load_labels(label_file)).map(lambda x: x[0])

    # Append indexes to features and labels
    indexed_labels = labels.zipWithIndex().map(lambda x: (x[1],x[0]))
    indexed_features = tfidf.zipWithIndex().map(lambda x: (x[1],x[0]))

    # Join labels and features into tuples and return
    return indexed_labels.join(indexed_features).map(lambda x: x[1]).collect()
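# Hypothetical usage (paths are placeholders; extract_words, calculate_tfidf and
# load_labels are assumed to be defined elsewhere in the project):
#   labeled = raw_files_to_labeled_features("data/raw/*.txt", "data/labels.csv")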
def MakeDriveResultFrames():

    try:
        sc = SparkContext()
        print('Making sc')
    except:
        print('Spark Context already exists')
    NumCores=MP.cpu_count();
    yr=range(2009,2015)
    weekRange=range(1,18)
    gameRange=range(0,16)
    GameID=np.zeros([len(yr)*len(weekRange)*len(gameRange),3])
    GameID[:,0]=np.repeat(yr,272)
    GameID[:,1]=np.tile(np.sort(np.tile(weekRange,16)),6)
    GameID[:,2]=np.tile(gameRange,17*len(yr))

    count1=sc.parallelize(range(len(GameID)),NumCores)
    count2=count1.map(lambda x: DriveResult(GameID[x,:]))
    R=count2.collect()
    FrameR=pd.DataFrame(columns=('Field Position','Result'))
    for i in range(len(R)):
        if len(np.shape(R[i]))==2:
            FrameR=FrameR.append(R[i])
    #ResFrame=DriveStartResult(FrameR)
    sc.stop()
    FrameR.to_csv('DriveResults.csv')

    return FrameR
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    sol = Sliding.solution(WIDTH, HEIGHT)

    """ YOUR CODE HERE """
    sol = Sliding.board_to_hash(WIDTH, HEIGHT, sol)
    new_visited = [(sol, level)]
    
    new_visited = sc.parallelize(new_visited)
    num = 1

    #while there are still (k, v) pairs at the current level
    while num:
        #use += as we do not retain board sets not at the global level
        #in our mapping function
        new_visited += new_visited.flatMap(bfs_map)
        if level % 4 == 3: # only reduce and filter every other iteration for performance reasons
            new_visited = new_visited.reduceByKey(bfs_reduce)
            new_visited = new_visited.partitionBy(PARTITION_COUNT) #figure out how to use hash
            num = new_visited.filter(filter_func).count() # count the number of elements in the RDD at the current level
        level += 1
        # Debuggin purposes print("\n\n\nLevel " + str(level) + '\n\n\n')

    """ YOUR OUTPUT CODE HERE """
    new_visited.coalesce(slaves).saveAsTextFile(output)

    sc.stop()
Example #23
def main(training_file,n):

    epochs = int(n)
    x,y,tags = read_training_data(training_file)
    v = {}
    sc = SparkContext(appName="parameterMixing")
    tags = sc.broadcast(tags)
    time0 = time.time()
    training_data = []
    for i in range(len(x)):
        training_data.append((x[i],y[i]))
    train_data = sc.parallelize(training_data).cache()
    for round in range(0,epochs):
        fv = sc.broadcast(v)
        feat_vec_list = train_data.mapPartitions(lambda t: perc_train(t, tags.value, fv.value))
        feat_vec_list = feat_vec_list.combineByKey((lambda x: (x,1)),
                             (lambda x, y: (x[0] + y, x[1] + 1)),
                             (lambda x, y: (x[0] + y[0], x[1] + y[1]))).collect()

        for (feat, (a,b)) in feat_vec_list:
            v[feat] = float(a)/float(b)
    sc.stop()
    # Compute the weight vector using the Perceptron algorithm
    #trainer.perceptron_algorithm(5)
    print "iteration %d in %f seconds" % (epochs, time.time() - time0)
    # Write out the final weight vector
    write_weight_vector(v)
def main():

    master = 'local[1]'
    app_name = 'reduce_demo1'

    # print(range(0, 3))

    sc = SparkContext(master, app_name)

    # Test 1: normal case
    # rdd_list = [sc.parallelize(range(i * 3, (i+1) * 3)) for i in range(0,3)]
    # rdd_union = sc.union(rdd_list)
    # print(rdd_union.getNumPartitions())
    # result = rdd_union.map(fun_map_print)
    # result.count()

    # Test 2: union twice (nested unions)
    rdd_list_outer = []
    for x in ['a', 'b', 'c']:
        rdd_list_inner = [sc.parallelize(map(lambda j: x + str(j),range(i * 3, (i+1) * 3))) for i in range(0,3)]
        rdd_union_inner = sc.union(rdd_list_inner)
        rdd_list_outer.append(rdd_union_inner)

    rdd_union_outer = reduce(lambda rddx, rddy: rddx.union(rddy), rdd_list_outer)
    result = rdd_union_outer.map(fun_map_print)
    result.count()

    sc.stop()
def main():
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: example <keyspace_name> <column_family_name>"
        sys.exit(-1)

    keyspace_name = sys.argv[1]
    column_family_name = sys.argv[2]

    # Valid config options here https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md
    conf = SparkConf().set("spark.cassandra.connection.host", "127.0.0.1")

    sc = SparkContext(appName="Spark + Cassandra Example",
                      conf=conf)

    # import time; time.sleep(30)
    java_import(sc._gateway.jvm, "com.datastax.spark.connector.CassandraJavaUtil")
    print sc._jvm.CassandraJavaUtil

    users = (
        ["Mike", "Sukmanowsky"],
        ["Andrew", "Montalenti"],
        ["Keith", "Bourgoin"],
    )
    rdd = sc.parallelize(users)
    print rdd.collect()
Example #26
def solve_puzzle(master, output, height, width, slaves):
    global HEIGHT, WIDTH, level
    HEIGHT=height
    WIDTH=width
    level = 0

    sc = SparkContext(master, "python")

    """ YOUR CODE HERE """
    NUM_WORKERS = slaves

    sol = Sliding.solution(WIDTH, HEIGHT)
    """ MAP REDUCE PROCESSING CODE HERE """
    level_pos = sc.parallelize((make_state(level, sol),))
    prev_size, size = 0, 1

    while prev_size != size:
        level += 1
        if level % 10 == 0:
            level_pos = level_pos.partitionBy(PARTITION_COUNT)
        level_pos = level_pos.flatMap(bfs_flat_map).reduceByKey(bfs_reduce)
        prev_size = size
        size = level_pos.count()

    """ OUTPUT CODE HERE """
    level_pos = level_pos.map(unhash_board)
    level_pos.coalesce(NUM_WORKERS).saveAsTextFile(output)

    sc.stop()
Example #27
def _train_spark(data, n_components, n_pc, covar_types, verbose, n_jobs, n_iter_search):
    # Spark configuration.
    conf = (SparkConf()
             .setMaster("local[" + str(n_jobs) + "]")
             .setAppName("FDD")
             .set("spark.executor.memory", "512mb")
             .set("spark.cores.max", str(n_jobs)))
    sc = SparkContext(conf=conf)
    # Build hyperparameter vectors.
    parameters = cartesian((n_components,
                            n_pc,
                            covar_types))
    # Distribute the hyperparameters vector.
    parameters_rdd = sc.parallelize(parameters, 96)
    # Broadcast the data to all workers.
    data_broadcast = sc.broadcast(data)
    # Train a model for each hyperparameter set.
    models = parameters_rdd.map(lambda param: train_with_parameters(param, data_broadcast))
    # Persist the models to avoid re-computation.
    models.persist(StorageLevel(True, True, False, True, 1))
    # Sort by BIC.
    sorted_models = models.sortBy(lambda model: model[0])
    # The first is the best model.
    best_model = sorted_models.collect()[0][1]
    sc.stop()
    return best_model
def solve_puzzle(width, height, output_path, slave_number):
	def hash_to_board(state):
		return Sliding.hash_to_board(width, height, state)
	def board_to_hash(board):
		return Sliding.board_to_hash(width, height, board)
	def get_children_boards(board):
		return Sliding.children(width, height, board)
	def get_solution_hash():
		return Sliding.board_to_hash(width, height, Sliding.solution(width, height))

	sc = SparkContext("local", "Slide")
	boards_rdd = sc.parallelize([(get_solution_hash(), 0)])
	current_level = 0
	while True:
		current_level += 1
		frontier_rdd = boards_rdd.filter(lambda (state, level): level == current_level - 1)
		frontier_rdd.persist()
		if frontier_rdd.isEmpty():
			break
		boards_rdd = frontier_rdd\
			.flatMap(lambda (state, level): get_children_boards(hash_to_board(state)))\
			.map(lambda state_board: (board_to_hash(state_board), current_level))\
			.union(boards_rdd)\
			.reduceByKey(lambda step_level_a, step_level_b: min(step_level_a, step_level_b))\
			.partitionBy(slave_number)

	boards_rdd\
		.map(lambda (state, level): (level, hash_to_board(state)))\
		.sortByKey()\
		.coalesce(1)\
		.saveAsTextFile(output_path)

	sc.stop()
Example #29
class Stack(object):

    def __init__(self, target):
        self.target = target

    def connect(self, spark_host, job_name):
        self.spark_host = spark_host
        self.job_name = job_name
        self.spark_context = SparkContext(spark_host, job_name)

    @staticmethod
    def addJobTreeOptions(parser):
        parser.add_option("--batchSystem", dest="batchSystem",
                      help="This is an old flag that is kept to maintain compatibility default=%default",
                      default="spark")


        parser.add_option("--jobTree", dest="jobTree", 
                      help="This is an old flag that it is maintained for compatibility",
                      default=None)
    


    def startJobTree(self, options):
        self.options = options
        extra_path = os.path.dirname(os.path.abspath(sys.argv[0]))
        os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH', "") + ":" + extra_path

        #print "Starting"

        sm = StackManager(self.spark_context)
        targets = self.spark_context.parallelize([('start', self.target)]) 
        sm.runTargetList(targets)
Example #30
def distribute(k, primeList):
    from pyspark import SparkContext
    sc = SparkContext(appName="bern_spark")

    rdd = sc.parallelize(primeList)
    rp = rdd.map(lambda p: (computeBkModP(p, k), p)).collect()
    return rp
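# computeBkModP is assumed to be defined elsewhere; given the appName it
# presumably computes the k-th Bernoulli number modulo the prime p.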
Example #31
from pyspark import SparkConf, SparkContext

conf1 = SparkConf()
sc = SparkContext(conf=conf1)

List1 = [1, 2, 3, 4, 5]


def double(x):
    return x * 2


Rdd = sc.parallelize(List1)

Rdd2 = Rdd.map(double)
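# Note: map() is lazy; the doubling only runs when collect() below triggers a job.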
Data1 = Rdd2.collect()
Data2 = Rdd.collect()

print(Data1)
print(Data2)
Example #32
master = os.environ["SPARK_MASTER"]
master = "spark://{}:7077".format(master)
conf = SparkConf().setAppName("SpotTrawl").setMaster(master)
spark = SparkContext(conf=conf)

#===========DEFINE SAMPLING FUNCTION=======
numSamples = 10**7


def sample(p):
    x, y = np.random.random(), np.random.random()
    return 1 if x * x + y * y < 1 else 0


#==========TAKE SAMPLES======================
count = spark.parallelize(xrange(0, numSamples)).map(sample) \
             .reduce(lambda a, b: a + b)

#==========ESTIMATE 4pi======================
piEst = 4.0 * count / numSamples

#==========FIND NUMBER OF MATCHING DIGITS====
n, pi = 0, np.pi
while int(pi * 10**n) == int(piEst * 10**n):
    n += 1

#=========PRINT RESULTS FOR OBSERVATION======
print "Pi is roughly {}".format(piEst)
print "Error: {}%".format(100.0 * (piEst - pi) / pi)
print "Matching Digits: {}".format(n)
print("DESIRED OUTPUT LENGTH: 3")
time.sleep(1000)
Example #33

import sys
from random import random
from operator import add

from pyspark import SparkContext

if __name__ == "__main__":
    """
        Usage: pi [partitions]
    """
    sc = SparkContext(appName="PythonPi")
    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x**2 + y**2 < 1 else 0

    count = sc.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))
    print("-------------------------")
    print(sc.master)
    print("----------")

    sc.stop()
Example #34
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

spark = SparkContext()
spark.setLogLevel("ERROR")
sql_context = SQLContext(spark)

data = spark.parallelize([
    ("company1", 2, 2.0),
    ("company2", 2, 4.0),
    ("company3", 1, 1.0),
    ("company4", 1, 0.0),
    ("company5", 1, 2.0),
])

schema = StructType([
    StructField("id", StringType(), True),
    StructField("degree", IntegerType(), True),
    StructField("nnd", FloatType(), True)
])

df = sql_context.createDataFrame(data, schema)
df.show()

# average nearest neighbour degree from nearest neighbour degree
# (annd computation truncated in the source)

query = ("..."
         "FROM `test1.subredditMembershipv2`")

# API request - fetches results
# Row values can be accessed by field name or index
query_job = query_generator(query)

# Writes QueryJob rows to a list to parallelize into Spark RDD
query_job_list = list()
for row in query_job:
    row = list(row)
    row.append(1)
    query_job_list.append(tuple(i for i in row))

# Convert output from QueryJob (list of tuples) into Spark RDD
partitions = 5000
user_sub = sc.parallelize(query_job_list, partitions)

# # Test RDD Data
# user_sub_count = sc.parallelize([(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1), (4, 1), (5, 1)])
# sub_user = sc.parallelize([('a',[2]), ('a',[1]), ('b',[1]), ('c',[3]), ('c',[4]), ('c',[5]), ('c',[1]), ('c',[2])])
# # print('sub_user')
# # print(sub_user.collect())
# sub_members = sub_user.reduceByKey(lambda a, b: a+b)
# # print('sub_members complete')
# # print(sub_members.collect())

user_sub_count = user_sub.map(lambda x: (x[0], 1))
sub_user = user_sub.map(lambda x: (x[1], [x[0]]))
sub_members = sub_user.reduceByKey(lambda a, b: a + b)
print('sub_members complete: ' + str(datetime.datetime.now()))
Example #36
RDD_file = sc.textFile("input.txt")
data_file = RDD_file.collect()
existing_items = []

#Removing Punctuations
for i in data_file:
    items = str(i)

    aux_words = string.punctuation
    existing_items.append(
        items.translate(None, digits).translate(None,
                                                aux_words).lower().replace(
                                                    "  ", " ").strip())

print(existing_items)
RDD_file = sc.parallelize(existing_items)


#Function for word pair in same line
def sample(items):
    condition = []
    output = []
    for i in range(len(items)):
        if items[i] not in condition:
            condition.append(items[i])
            for j in range(len(items)):
                if i != j:
                    output.append((items[i], items[j]))
    return output
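# For example, sample(["a", "b", "c"]) returns
# [("a", "b"), ("a", "c"), ("b", "a"), ("b", "c"), ("c", "a"), ("c", "b")].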

Example #37
        sys.stdout.write('\r' + \
            'Processing data point ' + str(i) + ' hcid ' + thiswell)
        allwells[thiswell] = wellent

    # make some pandas dataframes to compute the results
    print(" ...done")
    print("computing summaries")
    alldfs = []
    for w in allwells:
        cols = allwells[w]
        pdf1 = pd.DataFrame(
            {
                "Pressure": cols[1],
                "Temp": cols[2],
                "Oil Pct": cols[3]
            },
            index=cols[0])
        pdf2 = pd.DataFrame({
            "Prod": cols[4],
            "Inject Vol": cols[5]
        },
                            index=cols[0])
        alldfs.append((w, pdf1, pdf2))
    return (alldfs)


print("writing graphs...")
sc = SparkContext()
s_rdd = sc.parallelize(load_well_data_from_maprdb())
fnames = s_rdd.map(lambda x: output_graph(x[0], x[1], x[2])).collect()
Example #38
def mapToFormat(line):
    overall = line[0]
    result = line[1]
    splited = overall.split('/')
    vid = splited[0]
    country = splited[1]
    category = splited[2]
    key = vid
    value = str(result) + ',' + category + ',' + country
    return key, value


if __name__ == "__main__":
    sc = SparkContext(appName='part-2')
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", help="Input path", default='~/')
    parser.add_argument("--output", help="Output path", default='~/')
    args = parser.parse_args()
    input_path = args.input
    output_path = args.output
    csv = sc.textFile(input_path + 'AllVideos_short.csv')
    csvNoHead = csv.zipWithIndex().filter(lambda tup: tup[1] > 0).keys()
    after_map = csvNoHead.map(mapper)
    after_calculate = after_map.groupByKey().mapValues(calculate_difference)
    sortedByResult = after_calculate.sortBy(lambda a: a[1], 0)
    answer = sortedByResult.map(mapToFormat)
    output = answer.collect()[:10]
    output1 = sc.parallelize(output)
    output1.saveAsTextFile(output_path)

Example #39
    read_duration = read_stop - read_start
    # Get important parts
    d_keyAndText = d_corpus.map(lambda x: (x[x.index('id="') + 4:x.index(
        '" url=')], x[x.index('">') + 2:][:-6]))
    regex = re.compile('[^a-zA-Z]')
    # Split in to words.
    d_keyAndListOfWords = d_keyAndText.map(
        lambda x: (str(x[0]), regex.sub(' ', x[1]).lower().split()))
    # Flat to all words in corpas
    allWords = d_keyAndListOfWords.flatMap(lambda x: x[1]).map(lambda x:
                                                               (x, 1))
    # Count each words.
    allCounts = allWords.reduceByKey(add)
    # Take to 20000
    topWords = allCounts.top(20000, key=lambda x: x[1])
    twentyK = sc.parallelize(range(20000))
    # Create dictionary.
    dictionary = twentyK.map(lambda x: (topWords[x][0], x))
    # Get only words in dictionally for each document
    allWords = d_keyAndListOfWords.flatMap(lambda x: ((j, x[0]) for j in x[1]))
    allDictionaryWords = dictionary.join(allWords)
    justDocAndPos = allDictionaryWords.map(lambda x: (x[1][1], x[1][0]))
    allDictionaryWordsInEachDoc = justDocAndPos.groupByKey()
    # Calculate term frequence.
    tfs = allDictionaryWordsInEachDoc.map(lambda x: (x[0], buildArray(x[1])))

    # Make label 0 or 1.
    data = tfs.map(lambda x: (oneHotEncoding(x[0]), x[1]))

    # Get sample number of train data
    num_train = data.count()
Example #40
#normalize by making the maximum 1
def h_normalize(vector):
    return (vector[0], (0, vector[1] / h_max_val[1]))

def a_normalize(vector):
    return (vector[0], (0, vector[1] / a_max_val[1]))

if __name__ == "__main__":
    arr = [(i + 1, 0, 1.0) for i in range(1000)] #initial h

    conf = SparkConf()
    sc = SparkContext(conf=conf)
    lines = sc.textFile(sys.argv[1])

    link = lines.map(parser).distinct() #obtain distict inputs and create matrix
    link_t = lines.map(parser_t).distinct()

    h = sc.parallelize(arr)
    h_pair = h.map(mapper)
    a_pair = None
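    # Each pass of the loop below is one HITS-style power iteration: multiply the
    # current hub scores by the link matrix and rescale so the largest entry is 1
    # to get authority scores, then multiply by the transposed links for new hubs.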

    for j in range(50):
        h_matmul_pair = link.join(h_pair) #get pairs to multiply
        h_matmul = h_matmul_pair.map(lambda x: (x[1][0][0], x[1][0][1] * x[1][1][1])).reduceByKey(lambda a, b: a + b)  #matrix multiplication
        h_max_val = h_matmul.max(key= lambda x: x[1])
        a_pair = h_matmul.map(h_normalize) #normalize vector

        a_matmul_pair = link_t.join(a_pair) #get pairs to multiply
        a_matmul = a_matmul_pair.map(lambda x: (x[1][0][0], x[1][0][1] * x[1][1][1])).reduceByKey(lambda a, b: a + b)  #matrix multiplication
        a_max_val = a_matmul.max(key= lambda x: x[1])
        h_pair = a_matmul.map(a_normalize) #normalize vector

    h_top_10 = sorted(h_pair.collect(), key=lambda x: -x[1][1])[:10] #get top 10
Example #41
class TaskContextTests(PySparkTestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        self.sc = SparkContext('local[4, 2]', class_name)

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_resources(self):
        """Test the resources are empty by default."""
        rdd = self.sc.parallelize(range(10))
        resources1 = rdd.map(lambda x: TaskContext.get().resources()).take(
            1)[0]
        # Test using the constructor directly rather than the get()
        resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0]
        self.assertEqual(len(resources1), 0)
        self.assertEqual(len(resources2), 0)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(
            lambda x: TaskContext.get().attemptNumber()).collect()
        map(lambda attempt: self.assertEqual(0, attempt), attempt_numbers)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]

        result = rdd.map(fail_on_first).collect()
        # We should re-submit the first partition to it but other partitions should be attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        first_partition = filter(lambda x: x[1] == 0, result)
        map(lambda x: self.assertEqual(1, x[2]), first_partition)
        other_partitions = filter(lambda x: x[1] != 0, result)
        map(lambda x: self.assertEqual(0, x[2]), other_partitions)
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertTrue(tc is None)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(
                lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty(
                "otherkey")).collect()[0]
            self.assertTrue(prop2 is None)
        finally:
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertTrue(max(times) - min(times) < 1)

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = rdd.barrier().mapPartitions(f).map(
            lambda x: BarrierTaskContext.get().getTaskInfos()).collect()
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)

    def test_context_get(self):
        """
        Verify that TaskContext.get() works both in or not in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            taskContext = TaskContext.get()
            if isinstance(taskContext, BarrierTaskContext):
                yield taskContext.partitionId() + 1
            elif isinstance(taskContext, TaskContext):
                yield taskContext.partitionId() + 2
            else:
                yield -1

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [2, 3, 4, 5])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [1, 2, 3, 4])

    def test_barrier_context_get(self):
        """
        Verify that BarrierTaskContext.get() should only works in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            try:
                taskContext = BarrierTaskContext.get()
            except Exception:
                yield -1
            else:
                yield taskContext.partitionId()

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [-1, -1, -1, -1])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [0, 1, 2, 3])
Example #42
class TaskContextTestsWithWorkerReuse(unittest.TestCase):
    def setUp(self):
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.worker.reuse", "true")
        self.sc = SparkContext('local[2]', class_name, conf=conf)

    def test_barrier_with_python_worker_reuse(self):
        """
        Regression test for SPARK-25921: verify that BarrierTaskContext.barrier() with
        reused python worker.
        """
        # start a normal job first to start all workers and get all worker pids
        worker_pids = self.sc.parallelize(
            range(2), 2).map(lambda x: os.getpid()).collect()
        # the worker will reuse in this barrier job
        rdd = self.sc.parallelize(range(10), 2)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return (time.time(), os.getpid())

        result = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        times = list(map(lambda x: x[0], result))
        pids = list(map(lambda x: x[1], result))
        # check both barrier and worker reuse effect
        self.assertTrue(max(times) - min(times) < 1)
        for pid in pids:
            self.assertTrue(pid in worker_pids)

    def test_task_context_correct_with_python_worker_reuse(self):
        """Verify the task context correct when reused python worker"""
        # start a normal job first to start all workers and get all worker pids
        worker_pids = self.sc.parallelize(
            xrange(2), 2).map(lambda x: os.getpid()).collect()
        # the worker will reuse in this barrier job
        rdd = self.sc.parallelize(xrange(10), 2)

        def context(iterator):
            tp = TaskContext.get().partitionId()
            try:
                bp = BarrierTaskContext.get().partitionId()
            except Exception:
                bp = -1

            yield (tp, bp, os.getpid())

        # normal stage after normal stage
        normal_result = rdd.mapPartitions(context).collect()
        tps, bps, pids = zip(*normal_result)
        print(tps)
        self.assertTrue(tps == (0, 1))
        self.assertTrue(bps == (-1, -1))
        for pid in pids:
            self.assertTrue(pid in worker_pids)
        # barrier stage after normal stage
        barrier_result = rdd.barrier().mapPartitions(context).collect()
        tps, bps, pids = zip(*barrier_result)
        self.assertTrue(tps == (0, 1))
        self.assertTrue(bps == (0, 1))
        for pid in pids:
            self.assertTrue(pid in worker_pids)
        # normal stage after barrier stage
        normal_result2 = rdd.mapPartitions(context).collect()
        tps, bps, pids = zip(*normal_result2)
        self.assertTrue(tps == (0, 1))
        self.assertTrue(bps == (-1, -1))
        for pid in pids:
            self.assertTrue(pid in worker_pids)

    def tearDown(self):
        self.sc.stop()
for filename in filenames:
    f = open("/Users/panpan/Desktop/linkedin/followings/group3/%s" % filename,
             "r")
    files.append(f.readline())

    #initialize mutual_list
mutual_list = numpy.zeros((len(filenames), len(filenames)))

#pick two users each time, and calculate their common freinds
for i in range(0, len(files)):
    if i + 1 >= len(files):
        continue
    for j in range(i, len(files)):
        file_1 = files[i].split(",")
        file_2 = files[j].split(",")
        file1 = sc.parallelize(file_1)
        file2 = sc.parallelize(file_2)
        #common friends of the two users
        file_12 = file1.intersection(file2)
        mutual = len(file_12.collect())
        #define a way to cauculate how much percent they are similar to each other
        mutual_proportion = 1.0 / 2 * mutual * (1.0 / len(file_1) +
                                                1.0 / len(file_2))
        mutual_list[i][j] = mutual_list[j][i] = mutual_proportion

###Cluster the models
model = cl.KMeans.train(sc.parallelize(mutual_list),
                        5,
                        maxIterations=15,
                        runs=20,
                        initializationMode="random")

def compute_final_svd(sc: SparkContext, y: List[DenseVector],
                      k: int) -> SingularValueDecomposition:
    Y: RowMatrix = RowMatrix(sc.parallelize(y))
    svd_model = Y.computeSVD(k=k, computeU=True)
    return svd_model
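# Hypothetical usage (assumes the usual pyspark.mllib.linalg imports for
# DenseVector and RowMatrix used in the signature above):
#   vecs = [DenseVector([1.0, 2.0]), DenseVector([3.0, 4.0]), DenseVector([5.0, 6.0])]
#   svd = compute_final_svd(sc, vecs, k=2)
#   print(svd.s)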
Example #45
def run():
    conf = SparkConf()
    #conf.set('spark.shuffle.blockTransferService', 'nio')
    conf.set('spark.files.fetchTimeout', '180')
    conf.set('spark.files.overwrite', 'yes')
    conf.set('spark.akka.timeout', '180')
    #conf.set('spark.task.maxFailures', '30000')
    conf.set('spark.akka.frameSize', '500')
    conf.set('spark.network.timeout', '180')

    myClassifierOnevsOne = pickle.load(open('myClassifierOnevsOne.p', 'rb'))

    dataSetMaker = DataSetMakerV2(n=200000)

    feed = FeedNewsFromGoogleFinance()

    def sendRecord(rdd):
        print('new try...')
        if (not rdd.isEmpty()):
            newsRDD = dataSetMaker.processKeepNews(rdd)
            res = newsRDD.map(
                lambda x: (x[0], myClassifierOnevsOne.predict(x[1].features)))
            print('for each result...')
            for result in res.collect():
                symbole = result[0].symbole
                r = requests.put('http://wtun.mooo.com:5000',
                                 data={
                                     'jdata':
                                     NewsPrediction(result[0],
                                                    str(result[1])).json(),
                                     'symbole':
                                     symbole,
                                     'label':
                                     str(result[1])
                                 })
                print('send ok')
                print('receive %s' % str(r.text))
        else:
            print('empty!')

    sc = SparkContext(conf=conf)

    symbolesRDD = sc.parallelize([('NASDAQ:GOOGL', ['GOOG', 'GOOGL',
                                                    'GOOGLE']),
                                  ('NASDAQ:NVDA', ['NVIDIA']),
                                  ('VTX:SCMN', ['SWISSCOM'])])
    taskdt = 600
    running = True
    oldNewsRDD = None
    firstTime = True
    intersectRDD = None
    dataDirectory = 'hdfs://157.26.83.52/user/wdroz/stream2'
    cpt = 0
    while (running):
        today = datetime.datetime.now()
        yesterday = today - datetime.timedelta(days=1)
        tomorrow = today + datetime.timedelta(days=1)
        newsRDD = symbolesRDD.flatMap(
            lambda x: feed.lookingAt(x[0], yesterday, tomorrow, x[1]))
        if (firstTime):
            firstTime = False
            intersectRDD = newsRDD
        else:
            try:
                intersectRDD = oldNewsRDD.intersection(newsRDD)
            except Exception:
                pass  # empty rdd

        oldNewsRDD = newsRDD

        try:
            sendRecord(intersectRDD)
            intersectRDD.saveAsPickleFile(
                dataDirectory + '/' +
                datetime.datetime.now().strftime('%Y-%m-%d--') + str(cpt))
            cpt += 1
        except Exception:
            pass  # empty rdd

        time.sleep(taskdt)

    running = False  # TODO remove it
Example #46
    # Read all edge inputs, reverse edges and union as graph is undirected
    edgeInputs = bFile.map(lambda x: struct.unpack("<qq", x))
    allEdgeList = sc.union(
        [edgeInputs, edgeInputs.map(lambda x: (x[1], x[0]))])
    totalEdgeCount = allEdgeList.count()

    # Reduce edge list to tuple of vertex and array of child vertices
    inputGraph = allEdgeList.map(
        lambda edge: (edge[0], [edge[1]])).reduceByKey(lambda a, b: a + b)
    inputGraph.cache()
    totalVertexCount = inputGraph.count()

    distances = inputGraph.map(lambda x: (x[0], -1))
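    # a distance of -1 marks a vertex that has not been visited by the BFS yet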
    distances.cache()
    currentLevel = 0
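    # the BFS frontier is an RDD of (vertex, level) pairs, seeded with the root at level 0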
    currentLevelQueue = sc.parallelize([(root, currentLevel)])

    currentLevelQueue = currentLevelQueue.join(distances).filter(
        lambda x: x[1][1] == -1).map(lambda x: (x[0], x[1][0]))
    currentLevelQueue.cache()
    #print("CurrentLevel: {}\n".format(currentLevel))

    while (not (currentLevelQueue.isEmpty())):
        distances = distances.leftOuterJoin(currentLevelQueue).map(
            lambda x: (x[0], x[1][0]) if x[1][1] is None else (x[0], x[1][1]))
        distances.cache()
        currentLevel += 1

        nextLevelQueue = inputGraph.join(currentLevelQueue).flatMap(
            lambda node: map((lambda child: child), node[1][0]))
        # keep only children that are still unvisited (distance -1), paired with the new level
        currentLevelQueue = nextLevelQueue.distinct().map(
            lambda x: (x, currentLevel)).join(distances).filter(
                lambda x: x[1][1] == -1).map(lambda x: (x[0], x[1][0]))
        currentLevelQueue.cache()
# -*- coding: utf-8 -*-
import findspark
findspark.init()

from pyspark import SparkContext

data_path = "C:\\PySpark\\data"

sc = SparkContext("local", "repartition")

rdd1 = sc.parallelize(range(1, 100), 2)
print("rdd1 partitions: {}".format(rdd1.getNumPartitions()))

rdd2 = rdd1.repartition(5)
print("rdd2 partitions: {}".format(rdd2.getNumPartitions()))

rdd3 = rdd2.coalesce(3)
print("rdd3 partitions: {}".format(rdd3.getNumPartitions()))

sc.stop()
Example #48
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).zipWithIndex()
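# zipWithIndex() pairs each row with its index: ([1, 2, 3], 0), ([4, 5, 6], 1), ...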

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)
block_matrix = IndexedRowMatrix( \
    rows \
    .map(lambda row: IndexedRow(row[1], row[0])) \
    ).toBlockMatrix()

mat_product = block_matrix.multiply(block_matrix)
result = mat_product.toLocalMatrix()
print("Matrix Product \n", result)
mat_sum = block_matrix.add(block_matrix)
result = mat_sum.toLocalMatrix()
print("Matrix Sum \n", result)

mat_transpose = block_matrix.transpose()
result = mat_transpose.toLocalMatrix()
print("Matrix Transpose \n", result)
Example #49
    engineStr = cred.getEngineStr()
    engine = create_engine(engineStr)
    Session = sessionmaker()
    Session.configure(bind=engine)
    session = Session()
    return session


session = makeSession()


def addVersions(records):
    session = makeSession()
    for row in records:
        curId = row[0]
        elem = session.query(RawXML).get(curId)
        root = etree.fromstring(elem.XML)
        elem.Version = root.attrib["returnVersion"]
        session.add(elem)
        session.commit()


sc = SparkContext()
# materialise the query results before the session is closed below
records = session.query(RawXML.id, RawXML.Version)\
        .filter(RawXML.Version == None).all()

session.close()

sc.parallelize(records)\
        .foreachPartition(addVersions)
Example #50
def updateStockInfo(stockFile, iniFile):
    config = configparser.ConfigParser()
    config.read(iniFile)
    localeStr = config['stats']['locale']
    locale.setlocale(locale.LC_ALL, localeStr)
    configStore = config['store']
    numJobs = int(config['spark']['numJobs'])

    stocks = []
    with open(stockFile, 'r') as stockListFile:
        for stock in stockListFile:
            stock = stock.strip(' \n\r')
            if (stock != ''):
                stocks.append(stock)

    if (len(stocks) == 0):
        print(f"Failed to read any stocks from file: {stockFile}")
        return

    conf = SparkConf().setAppName("LatestStockInfo")
    sc = SparkContext(conf=conf)

    broadCastConfig = sc.broadcast(config)
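    # the broadcast ships the parsed config to every executor once, instead of once per task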

    tries = 5
    attempts = 1
    startStocksNum = len(stocks)
    lastCount = startStocksNum
    # processedCount = 0
    while tries > 0:
        # Parallelise the stock list - one Spark task per stock
        print(
            f"****************Attempt {attempts}: Parallelising stock processing job for {len(stocks)} stocks....standby"
        )
        rdd = sc.parallelize(stocks, numSlices=numJobs)
        #This does the actual work of retrieving the stock data and working out the metrics and scores
        #It returns a dict of scores
        mrdd = rdd.map(
            lambda stock: retrieveStockInfoSpark(broadCastConfig, stock))
        #Collect the info by combining the returned dicts holding the info, this triggers the map operation
        infos = mrdd.collect()
        infos = [s for s in infos if s]
        print(
            f"*************Attempt {attempts}: Collected {len(infos)} stocks out of {len(stocks)}"
        )
        #Check that we have all the info
        print(
            f"***************Attempt {attempts}: Checking all info retrieved")
        mrdd = rdd.map(lambda stock: checkStockSpark(broadCastConfig, stock))
        stocks = mrdd.collect()
        #Remove Nones
        stocks = [s for s in stocks if s]
        if (len(stocks) == 0):
            print(
                f"***************Attempt {attempts}: All stocks info check out apparently"
            )
            #Done
            break
        if (len(stocks) == lastCount):
            print(
                f"***************Attempt {attempts}: Number of stocks left {len(stocks)} is the same as last attempt - aborting"
            )
            if (len(stocks) < 20):
                print(f"***************Failed stocks: {stocks}")
            #Done
            break
        lastCount = len(stocks)
        tries -= 1
        attempts += 1
        print(
            f"***************Attempt {attempts}: Retrying for stocks: {stocks}"
        )
    print(
        f"***************Job complete: Processed {startStocksNum - len(stocks)} out of {startStocksNum} stocks"
    )
Example #51
#
#
# Tutorialspoint - PySpark; Learn Pyspark
#
#
#----------------------------------------foreach.py---------------------------------------
from pyspark import SparkContext
sc = SparkContext("local", "ForEach app")
words = sc.parallelize(
    ["scala",
     "java",
     "hadoop",
     "spark",
     "akka",
     "spark vs hadoop",
     "pyspark",
     "pyspark and spark"]
)
def f(x): print(x)
fore = words.foreach(f) 
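# foreach() is an action; f runs on the executors, so on a non-local master the printed
# words show up in the worker logs rather than on the driver, and foreach() returns None.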


Example #52
#!/usr/bin/env python
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf
conf = SparkConf().setMaster("local").setAppName("My app")
sc = SparkContext(conf = conf)
lines = sc.textFile("ch01.py")
inputRDD = lines.filter(lambda x:"sc" in x)
for line in inputRDD.take(10):
    print(line)

lines = sc.parallelize(["hello world", "hi"])
words = lines.flatMap(lambda line:line.split(" "))
print(words.first())

data = sc.parallelize([1, 2, 3, 4, 1, 3])
print(data.reduce(lambda x, y: x + y))

def printall(rdd):
    print("----------")
    for r in rdd.collect():
        print(r)


printall(data.distinct())
for d in data.distinct().collect():
    print(d)

d = sc.parallelize(["1, hello", "2, hi", "3, how are you"])
for _ in list(d.map(lambda x:(x.split(",")[0], x)).collect()):
    print _
Example #53
    def _process_wav(self, record: WavRecord):
        wav = self.audio.load_wav(record.wav_path)
        wav = self.audio.trim(wav)
        file_path = os.path.join(self.out_dir, f"{record.key}.tfrecord")
        write_preprocessed_data(record.key, wav, record.speaker_info.id, record.speaker_info.age,
                                record.speaker_info.gender, file_path)
        return record.key


if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]

    default_params.parse(args["--hparams"])

    instance = VCTK(in_dir, out_dir, default_params)

    sc = SparkContext()

    rdd = instance.process_wavs(
        sc.parallelize(instance.list_wav_files()))

    data_file_paths = rdd.collect()

    with open(os.path.join(out_dir, 'list.csv'), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for path in data_file_paths:
            writer.writerow([path])
Example #54
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a local StreamingContext with two worker threads and a batch interval of 20 seconds
sc = SparkContext("local[2]", "Sensor")
ssc = StreamingContext(sc, 20)

# Create a DStream
lines = ssc.socketTextStream("sandbox-hdp.hortonworks.com", 3333)

# Basic reduceByKey example in Python
# creating a pair RDD xx with key-value pairs
xx = sc.parallelize([("a", 1), ("b", 1), ("a", 1), ("a", 1), ("b", 1),
                     ("b", 1), ("b", 1), ("b", 1)], 3)

# Applying the reduceByKey operation on xx
y = xx.reduceByKey(lambda accum, n: accum + n)
print(y.collect())

# [('b', 5), ('a', 3)]


# Define associative function separately
def sumFunc(accum, n):
    return accum + n


y = xx.reduceByKey(sumFunc)
print(y.collect())
# [('b', 5), ('a', 3)]
    page_num = 0
    total_pages = 1
    batch_size = 100

    while page_num < total_pages:
        url_query = url_cve + "/pages/" + str(page_num) + "?limit=" + str(
            batch_size)
        results_json = apiDownloader.download_api(url_query, "isi",
                                                  args.password)

        if results_json is not None and "results" in results_json:
            results = results_json["results"]
            num_results = len(results)
            total_pages = results_json["total_pages"]
            print "Downloaded ", num_results, " new CVE data rows. Adding them to CDR. Page:", (
                page_num + 1), " of ", total_pages
            if num_results > 0:
                apiDownloader.load_into_cdr(results, "hg_cve", args.team,
                                            "hg-cve")
                print "Done loading into CDR"
                print "Taking backup on S3"

                rdd = sc.parallelize(results)
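                # wrap each record as a ("hg-cve", json string) pair so it can be stored as a Hadoop SequenceFile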
                rdd.map(lambda x: ("hg-cve", json.dumps(x))
                        ).saveAsSequenceFile(args.outputFolder + "/hg-cve/" +
                                             str(page_num))
                print "Done taking backing on S3"
        else:
            print "No data found:", results_json
        page_num += 1
Example #56
if len(sys.argv) > 3 and sys.argv[3] == 'local':
    spark = SparkContext('local', appName = 'SparkLCA')
else:
    spark = SparkContext(appName = 'SparkLCA')

tstart = time.time()
N = int(sys.argv[2])
g = load_graph(sys.argv[1])

if len(sys.argv) > 3 and sys.argv[3] != 'local':
    out_hdfs = sys.argv[3]
else:
    out_hdfs = None

print('finished loading graph data, %f secs elapsed' % (time.time() - tstart))
seeds = spark.parallelize([p for p in g.nodes() if p <= N])
gtuple = spark.broadcast(g.get_tuple())
print('finished broadcasting, %f secs elapsed' % (time.time() - tstart))

cite_depth = seeds.flatMap(lambda k: shortest_path(gtuple.value, k))
dist_root = cite_depth.groupByKey()
pairs_rdd = dist_root.flatMap(lambda x: map_pairs(x[1], get_year(gtuple.value, x[0]), x[0]))
lca_rdd = pairs_rdd.reduceByKey(lambda x, y: x if cmp_key(x) < cmp_key(y) else y)
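# for each key, keep the value with the smallest cmp_key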
lca = lca_rdd.map(lambda x: x[0] + x[1])

print('finished calculation, %f secs elapsed' % (time.time() - tstart))

if out_hdfs is None:
    lca = lca.collect()
    with open(sys.argv[1] + '/result-%d.csv' % N, 'w', newline='') as resultsfile:
        writer = csv.writer(resultsfile)
Example #57
    print (" ")
    print (" ")
    print ("matriz de correlacion:")
    print (" ")
    print(Statistics.corr(rows, method="pearson")

    '''

    file = sc.textFile("Process_Data/SuperFile/superfile.dat")

    row = file.map(lambda line: line.split(' ')[1:]).map(
        lambda xs: [float(x) for x in xs])
    row_list = row.collect()  # collect to a local list of rows
    print(row_list)

    #matrix
    w, h = 1, 38
    new_list = [[0 for x in range(w)] for y in range(h)]

    for i in range(0, len(row_list)):
        new_list[i][:] = Vectors.dense(row_list[i])
    # parallelize the rows directly so colStats sees an RDD of vectors, one per row
    rows = sc.parallelize(new_list)
    print(rows)
    summary = Statistics.colStats(rows)

    print("media:"), (summary.mean())
    print("varianza:"), (summary.variance())
    print("max:"), (summary.max())
    print("min:"), (summary.min())
    print("non Zeros:"), (summary.numNonzeros())
Example #58
    levelIndex=2
    levelSets=[tuple([i]) for i in range(1,16)]+[tuple(i for i in range(5*j+16, 5*j+21)) for j in range(7)]

    normedRows=[]
    normedLevelSets={}
    denormalizers={}
    for levelSet in levelSets:
        levelRows=[row for row in archmageList if row[levelIndex] in levelSet]
        normalized=normalize(levelRows)
        normedRows+=normalized
        for i in range(len(normalized)):
            row=normalized[i]
            denormalizers[tuple(row)]=levelRows[i]


    normalArchmage = sc.parallelize(normedRows).persist()

    numClusters= int(sys.argv[1])
    numIterations= int(sys.argv[2])
    model = KMeans.train(normalArchmage, numClusters, maxIterations=numIterations)
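    # model.clusterCenters holds the centroids in the normalized feature space;
    # model.predict(row) returns the index of the centroid nearest to a normalized row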

    randomRow=normedRows[0]
    # print("row:", randomRow)
    # print("denormed:", denormalizers[tuple(randomRow)] )
    # print("cluster:", model.predict(randomRow))


    #print("labeled cluster:", columnToArcher(model.centers[model.predict(randomRow)]))
    #print("\n\n")

            tokenizer = RegexpTokenizer(r'\w+')
            tokens1 = tokenizer.tokenize(j)
            tokens1 = [w.lower() for w in tokens1]
            if (i in tokens1):
                sst = ' '.join(tokens1)
                ss = str(list(parser.raw_parse(sst)))
                wor.append(calc1(ss, tokens1))
        di[i] = wor
    implicit = []
    explicit = []
    tokenizer = RegexpTokenizer(r'\w+')
    for key in di:
        for ele in di[key]:
            if (key in tokenizer.tokenize(ele)):
                explicit.append((key, ele))
            else:
                implicit.append((key, ele))
    rdd = sc.parallelize(explicit)
    rdd1 = sc.parallelize(implicit)
    # explicit reviews are saved to /exp, implicit reviews to /imp
    rdd.coalesce(
        1, shuffle=True).saveAsTextFile("hdfs://localhost:9000/output/" +
                                        iiii.split('/')[-1].split(".txt")[0] +
                                        "/exp")
    rdd1.coalesce(
        1, shuffle=True).saveAsTextFile("hdfs://localhost:9000/output/" +
                                        iiii.split('/')[-1].split(".txt")[0] +
                                        "/imp")
    print("new file new file new file new file new  file")
# #############################################################################
    # HDFS Client
    hdfs_client = None
    if cluster_execution:
        hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)

    # searching for data
    dataset_files = list()
    for dataset_path in list_dir(output_dir, hdfs_client, cluster_execution):
        for f in list_dir(os.path.join(output_dir, dataset_path), hdfs_client,
                          cluster_execution):
            if 'learningData.csv' in f:
                dataset_files.append(os.path.join(output_dir, dataset_path, f))

    # computing stats
    files = sc.parallelize(dataset_files, 365)
    stats = files.flatMap(
        lambda x: generate_stats_from_dataset(x, params)).persist(
            StorageLevel.MEMORY_AND_DISK)
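    # persist so the flatMap is not recomputed for each of the three collect() calls below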

    n_rows = stats.map(lambda x: x[0]).collect()

    n_columns = stats.map(lambda x: x[1]).collect()

    size_bytes = stats.map(lambda x: x[2]).collect()

    hist_n_rows = np.histogram(n_rows, bins=500)
    hist_n_columns = np.histogram(n_columns, bins=500)
    hist_size_bytes = np.histogram(size_bytes, bins=500)

    print(' -- N. Rows:')