def main(argv):

    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences

    # User Hash Lookup stored into cassandra
    user_hash = rawDF.map(lambda (a,b,c): (a,hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser,["user","hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table="userhash", keyspace=keyspace).save(mode="append")
    

    # Product Hash Lookup stored into cassandra
    product_hash = rawDF.map(lambda (a,b,c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct,["product","hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table="producthash", keyspace=keyspace).save(mode="append")

    # Ratings for training
    # ALS requires numeric ids, so the user and product strings are hashed (Java-style
    # string hash) and wrapped in Rating objects for the algorithm to consume
    ratings = rawDF.map(lambda (a,b,c) : Rating(hashFunction(a),hashFunction(b),float(c)))

    
    model = ALS.trainImplicit(ratings,10,10,alpha=0.01,seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")

    sc.stop()
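# Hedged follow-up sketch (not part of the original job): one way the implicit ALS model
# saved above could be loaded and queried for top-N products. The raw user id and the
# count of 10 are illustrative assumptions; hashFunction is the helper used above.
def recommend_for_user(sc, model_path, raw_user_id, n=10):
    from pyspark.mllib.recommendation import MatrixFactorizationModel
    # The model was trained on hashed ids, so look the user up with the same hash.
    model = MatrixFactorizationModel.load(sc, model_path)
    return model.recommendProducts(hashFunction(raw_user_id), n)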
def _rdd_to_df(rdd, schema):
    """convert rdd to dataframe using schema."""
    spark_context = rdd.context
    sql_context = SQLContext(spark_context)
    if schema is None:
        df = sql_context.createDataFrame(rdd)
    else:
        df = sql_context.createDataFrame(rdd, schema)
    return df
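# Hedged usage sketch for _rdd_to_df above; the field names and toy rows are
# illustrative assumptions rather than data from the surrounding project.
def _rdd_to_df_example(sc):
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType
    rdd = sc.parallelize([("alice", 1), ("bob", 2)])
    schema = StructType([StructField("name", StringType(), True),
                         StructField("count", IntegerType(), True)])
    # Passing schema=None instead would fall back to Spark's schema inference.
    return _rdd_to_df(rdd, schema)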
def main():
	inputs = sys.argv[1]
	output = sys.argv[2] 
	ntlk_path = sys.argv[3]

	conf = SparkConf().setAppName('TF-IDF Representation')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'

	sqlContext = SQLContext(sc)

	'''sbaronia - get 3 fields from json file and filter those with empty review'''
	review = sqlContext.read.json(inputs).select('reviewText','overall','reviewTime').cache()
	review_df = review.filter(review.reviewText != "").cache()

	'''sbaronia - get year and rating and zip them with index'''
	year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
	year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

	rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
	rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

	stop_words = stop_words_func(ntlk_path)

	'''sbaronia - rdd containing unique words from review'''
	clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText,stop_words)).filter(lambda x: x[0] != 'null').cache()

	'''sbaronia - finding tf-idf and zipping it with index'''
	tfidf_rdd = rdd_zip(tf_idf_cal(clean_words_rdd).cache()).cache()

	tfidf_df = sqlContext.createDataFrame(tfidf_rdd, ['tfidf', 'index']).cache()

	'''sbaronia - making dataframe with only rating and tfidf'''
	year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
	tfyrrating_df = tfidf_df.join(year_rating_df, tfidf_df.index == year_rating_df.index, 'inner').drop(tfidf_df.index).cache()
	
	'''sbaronia - making training and testing rdd with <2014 and =2014 condition
	in a splitable format with :: '''
	train_rdd = tfyrrating_df.filter(tfyrrating_df.year < 2014) \
	                        .select('rating','tfidf') \
	                        .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
	                        .coalesce(1) \
	                        .cache()
	
	test_rdd = tfyrrating_df.filter(tfyrrating_df.year == 2014) \
	                       .select('rating','tfidf') \
	                       .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
	                       .coalesce(1) \
	                       .cache()
	
	'''sbaronia - save rdds to text''' 
	train_rdd.saveAsTextFile(output + '/train-text')
	test_rdd.saveAsTextFile(output + '/test-text')
def split_data(rev2, sc):
    # Split train and test set.
    data = rev2.copy()
    train, test = train_test_split(data)
    X_test = test.copy()
    y_test = X_test.pop("rating")

    sql_context = SQLContext(sc)
    train_df = sql_context.createDataFrame(train).rdd
    X_test_df = sql_context.createDataFrame(X_test).rdd
    test_df = sql_context.createDataFrame(test).rdd
    return train_df, train, test, test_df, X_test_df, y_test
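# Hedged usage sketch for split_data; rev2 is assumed to be a pandas DataFrame with a
# "rating" column (train_test_split and .pop both point at pandas), and the toy frame
# below is an illustrative assumption.
def split_data_example(sc):
    import pandas as pd
    rev2 = pd.DataFrame({"user": [1, 2, 3, 4],
                         "item": [10, 20, 30, 40],
                         "rating": [4.0, 3.5, 5.0, 2.0]})
    train_df, train, test, test_df, X_test_df, y_test = split_data(rev2, sc)
    return train_df.count(), test_df.count()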
def main():

	sc = SparkContext()
	sqlCtx = SQLContext(sc)

	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path that contains all files for analysis
	path_analysis = config.get('DEFAULT', 'path_analysis')	
	#Ligand Database file
	ligand_database  = config.get('DEFAULT', 'ligand_database_path_file')
	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	#Adding Python Source file
	sc.addPyFile(os.path.join(path_spark_drugdesign,"vina_utils.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"json_utils.py"))

	start_time = datetime.now()

#**************** Loading file that contains all scores
	score_file_name = os.path.join(path_analysis,get_file_name_sorted_energy())
	text_file = sc.textFile(score_file_name)

	#Splitting score file by \t
	header = text_file.first() #extract header
	rdd_vs_score_sorted_split = text_file.filter(lambda x:x !=header)    #filter out header
	rdd_vs_score_sorted_split = rdd_vs_score_sorted_split.map(lambda line: line.split("\t"))
	rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(lambda p: Row(energy=float(p[0]), pose=str(p[1]), ligand=get_ligand_from_receptor_ligand_model(p[1]) )) 
	#Creating Vina DataFrame based on score file
	vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)	
	vina_table.registerTempTable("vina")	
#**************** Finish 

#**************** Loading Ligand Database
	rdd_database = load_database(sc, ligand_database)	
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 
	
	#Computing ligand efficiency
	ligand_efficiencyRDD = sqlCtx.sql("SELECT vina.pose, vina.energy as affinity, (vina.energy / database.heavyAtom) as lig_efficiency FROM database JOIN  vina ON vina.ligand = database.ligand ORDER BY vina.energy") 
	ligand_efficiencyRDD = ligand_efficiencyRDD.map(lambda p: (p.pose, p.affinity, p.lig_efficiency) ).collect()

	#Saving ligand efficiency file
	save_ligand_efficiency(path_analysis, ligand_efficiencyRDD)

	finish_time = datetime.now()

	save_ligand_efficiency_log(finish_time, start_time)
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered",
                        outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))

    # Use the size of the target data to keep only similarity products between
    # target items and the rest of the data, and to avoid products of the
    # target data with itself
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
def main(dataFile, outputPath):

    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)

    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))

    interaction_df = sqlContext.createDataFrame(row_data)

    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)

    interaction_df.registerTempTable("interactions")

    tcp_interactions = sqlContext.sql( """
        SELECT duration, dst_bytes, protocol_type FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes=0
    """)

    tcp_interactions.show()

    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()

    sc.stop()
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
	#tokensofprevlevelkeyword=tokensofprevlevel
	#tokensofprevlevelkeyword.append(keyword)
	md5hashparents = hashlib.md5(keyword).hexdigest()

	#md5hashparents = keyword
	md5hashparents = md5hashparents + "$parents"

	picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
	asfer_pickle_string_dump(keyword,picklef_keyword)
	picklef_keyword.close()
	cachevalue=graphcache.get(md5hashparents)
	if cachevalue:
		print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
		return cachevalue 
	else:	
		#picklelock.acquire()
		spcon = SparkContext("local[2]","Spark_MapReduce_Parents")
		#picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
		#asfer_pickle_string_dump(keyword,picklef_keyword)
		#picklef_keyword.close()
		paralleldata = spcon.parallelize(tokensofprevlevel).cache()
		#k=paralleldata.map(lambda keyword: mapFunction_Parents(keyword,tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
		k=paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
		sqlContext=SQLContext(spcon)
		parents_schema=sqlContext.createDataFrame(k.collect())
		parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
		#picklelock.release()
		graphcache.set(md5hashparents,dict_query_results[1])
		spcon.stop()
		print "graphcache_mapreduce_parents updated:", graphcache
		return dict_query_results[1]
 def mock_data(self):
     """Mock data to imitate read from database."""
     sqlContext = SQLContext(self.sc)
     mock_data_rdd = self.sc.parallelize([("A", 1, 1), ("B", 1, 0), ("C", 0, 2), ("D", 2, 4), ("E", 3, 5) ])
     schema = ["id", "x", "y"]
     mock_data_df = sqlContext.createDataFrame(mock_data_rdd, schema)
     return mock_data_df
def main(sc):
    	path = "events"
    	#text_file = sc.textFile(path)
    	sqlContext = SQLContext(sc)
    	events = sqlContext.jsonFile(path)

	events = events.select(events["events.event"]).flatMap(lambda p: p.event)
	events = events.map(lambda p: Row(
		id=p.id,\
		title=p.title, \
		lat=p.latitude, \
		long=p.longitude, \
		postal_code=p.postal_code, \
		start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"), \
		stop_time=p.stop_time)) 	
	events_df = sqlContext.createDataFrame(events)
	
	events_df.registerTempTable("events")

	sqlContext.registerFunction("to_hour", lambda x: x.hour)
	sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))

	e = sqlContext.sql("select title, str_date(start_time) as event_date,
	to_hour(start_time) as hour, postal_code from events where postal_code is not null and start_time is not null")

	events_grouped = sqlContext.sql("select event_date, hour, postal_code, 
	count(*) from events_filtered group by event_date,hour,postal_code order by postal_code,hour")

	grouped_csv = events_grouped.map(toCSV)
	grouped_csv.saveAsTextFile('events_cluster')
def main():
	
	sc = SparkContext()
	sqlCtx = SQLContext(sc)
	config = configparser.ConfigParser()
	config.read('config.ini')

	#Path where docking list file will be saved
	path_to_save = str(sys.argv[1])

	#Path for drugdesign project
	path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_crud.py"))
	sc.addPyFile(os.path.join(path_spark_drugdesign,"database_io.py"))


#**************** Loading Ligand Database
	ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
	rdd_database = load_database(sc, ligand_database)	
	#Creating Dataframe
	database_table = sqlCtx.createDataFrame(rdd_database)	
	database_table.registerTempTable("database")
#**************** Finish 

	#Creating input files for performing virtual screening
	creating_docking_list(path_to_save, config, sqlCtx)
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print 'Accuracy of RandomForest = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def main(n_part, hdfs_path):
    print "********************\n*"
    print "* Start main\n*"
    print "********************"
    conf = SparkConf().setAppName("Benchmark Spark SQL")
    sc = SparkContext(conf = conf)
    sqlContext = SQLContext(sc)
    rowsRDD = sc.textFile(hdfs_path).repartition(n_part).map(lambda x: recordToRows(x)).cache()
    df = sqlContext.createDataFrame(rowsRDD).cache()
    df.count()
    df.registerTempTable("msd_table")
    print "********************\n*"
    print "* Start querres\n*"
    print "********************"
    [ave_t1, std1, dt1, n1] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext)
    [ave_t2, std2, dt2, n2] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext, method=1)
    [ave_t3, std3, dt3, n3] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext)
    [ave_t4, std4, dt4, n4] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext, method=1)
    if n1 != n2:
        print "\t!!!!Error, counts disagree for the number of T.S. songs!"
    if n3 != n4:
        print "\t!!!!Error, counts disagree for the number of high paced songs!"
    print "********************\n*"
    print "* Results"
    print "\t".join(map(lambda x: str(x), [ave_t1, std1, dt1, ave_t2, std2, dt2, ave_t3, std3, dt3, ave_t4, std4, dt4]))
    print "********************"
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     evaluator = BinaryClassificationEvaluator()
     cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     cvModel = cv.fit(dataset)
     cvPath = temp_path + "/cv"
     cv.save(cvPath)
     loadedCV = CrossValidator.load(cvPath)
     self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
     self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
     self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
     cvModelPath = temp_path + "/cvModel"
     cvModel.save(cvModelPath)
     loadedModel = CrossValidatorModel.load(cvModelPath)
     self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def log_mapreducer(logfilename, pattern, filt="None"):
	spcon=SparkContext()
	if filt == "None":
		input=open(logfilename,'r')
		paralleldata=spcon.parallelize(input.readlines())
		patternlines=paralleldata.filter(lambda patternline: pattern in patternline)
		print "pattern lines",patternlines.collect()
		matches=patternlines.map(mapFunction).reduceByKey(reduceFunction)
	else:
		input=spcon.textFile(logfilename)
		matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
	matches_collected=matches.collect()
	print "matches_collected:",matches_collected
	if len(matches_collected) > 0:
		sqlContext=SQLContext(spcon)
		bytes_stream_schema=sqlContext.createDataFrame(matches_collected)
		bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
		query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
		dict_query_results=dict(query_results.collect())
		print "----------------------------------------------------------------------------------"
		print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]"
		print "----------------------------------------------------------------------------------"
		dict_matches=dict(matches_collected)
		sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True)
		print "pattern matching lines:",sorted_dict_matches
		print "----------------------------------------------------------------------------------"
		print "SparkSQL DataFrame query results:"
		print "----------------------------------------------------------------------------------"
		pprint.pprint(dict_query_results)
		print "----------------------------------------------------------------------------------"
		print "Cardinality of Stream Dataset:"
		print "----------------------------------------------------------------------------------"
		print len(dict_query_results)
		spcon.stop()
		return sorted_dict_matches
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvsModel = tvs.fit(dataset)
     tvsPath = temp_path + "/tvs"
     tvs.save(tvsPath)
     loadedTvs = TrainValidationSplit.load(tvsPath)
     self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
     self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
     self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
     tvsModelPath = temp_path + "/tvsModel"
     tvsModel.save(tvsModelPath)
     loadedModel = TrainValidationSplitModel.load(tvsModelPath)
     self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
 def __init__(self, predictionAndLabels):
     sc = predictionAndLabels.ctx
     sql_ctx = SQLContext(sc)
     df = sql_ctx.createDataFrame(predictionAndLabels,
                                  schema=sql_ctx._inferSchema(predictionAndLabels))
     java_model = callMLlibFunc("newRankingMetrics", df._jdf)
     super(RankingMetrics, self).__init__(java_model)
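# Hedged usage sketch: RankingMetrics is normally constructed through its public
# constructor with an RDD of (predicted items, relevant items) pairs, which is what
# feeds the __init__ above. The item ids below are illustrative assumptions.
def ranking_metrics_example(sc):
    from pyspark.mllib.evaluation import RankingMetrics
    prediction_and_labels = sc.parallelize([
        ([1, 6, 2, 7, 8], [1, 2, 3, 5]),
        ([4, 1, 5, 6, 2], [4, 5, 6]),
    ])
    metrics = RankingMetrics(prediction_and_labels)
    return metrics.precisionAt(3), metrics.meanAveragePrecision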
def index(request):
    string  = u'template显示字符串变量'
    list = ['第一','第二','第三']
    tuple = ('q','w','e','r','t')
    dict = {'a':1,'b':2,'c':3,'d':4}
    conf = SparkConf().setAppName("djangotest").setMaster("spark://HP-Pavilion:7077")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    url='jdbc:mysql://127.0.0.1:3306?user=root&password=raymon'
    dbtable='networkPublicOpinionAnalysisSystem.test'
    df = sqlContext.read.format('jdbc').options(url=url,dbtable=dbtable).load()
    lines = sc.textFile(settings.BASE_DIR+'/system/data/roll_news_sina_com_cn.csv')
    parts = lines.map(lambda l:l.split(','))
    schemaNews = parts.map(lambda p : Row(category=p[0],title=p[1],url=p[2],time=p[3]))
    news = sqlContext.createDataFrame(schemaNews)
    # news.registerTempTable('test')
    # dbtable = 'networkPublicOpinionAnalysisSystem.test'
    # news.write.format('jdbc').options(url=url).insertInto(tableName=dbtable)
    # string = news.count()
    row = news.first()
    a = Row()
    print(type(news))
    print(type(row))
    # print(type(a))
    # dict = row.asDict()
    # string = dict['title']

    # news.write.jdbc(url,table=dbtable)
    return render(request,'index.html',{'string':string,'list':list,'tuple':tuple,'dict':dict})
class ZeppelinReporterTest(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext()
        self.sql = SQLContext(self.sc)
        self.df = self.sql.createDataFrame([(1, "a"), (1, None), (3, "c")])

    def tearDown(self):
        self.sc.stop()

    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm

            get_field.return_value = baos.jvm_obj
            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])
            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">&#10060;</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">&#9989;</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        sqlContext = SQLContext(self.sc)
        temp_path = tempfile.mkdtemp()

        try:
            df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
def Spark_MapReduce(level, wordsatthislevel, graphcache):
	freqterms1_local=wordsatthislevel
	md5hash = hashlib.md5(",".join(wordsatthislevel)).hexdigest()
	#md5hash = ",".join(wordsatthislevel)
	cachevalue=graphcache.get(md5hash)
	if cachevalue: 
		print "Spark_MapReduce(): hash = ", md5hash, "; returning from cache"
		return cachevalue 
	else:	
		spcon=SparkContext("local[2]","Spark_MapReduce")
		print "Spark_MapReduce(): wordsatthislevel:",wordsatthislevel
		paralleldata=spcon.parallelize(wordsatthislevel).cache()
		#k=paralleldata.map(lambda wordsatthislevel: mapFunction(wordsatthislevel)).reduceByKey(reduceFunction)
		k=paralleldata.map(mapFunction2).reduceByKey(reduceFunction)
		#k=paralleldata.map(mapFunction).reduceByKey(reduceFunction)

		#dict_k=k.collect()
		#s = sorted(dict_k.items(),key=operator.itemgetter(1), reverse=True)
		#print "Spark MapReduce results:"
		#print s
		############################
		sqlContext=SQLContext(spcon)
		recursiveglossoverlap_schema=sqlContext.createDataFrame(k.collect())
		recursiveglossoverlap_schema.registerTempTable("Interview_RecursiveGlossOverlap")
		query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap")
		dict_query_results=dict(query_results.collect())
		#print "Spark_MapReduce() - SparkSQL DataFrame query results:"
		#print dict_query_results[1]
		graphcache.set(md5hash, dict_query_results[1])
		print "graphcache_mapreduce updated:", graphcache
		spcon.stop()
		return dict_query_results[1]
    def get_top_restaurants(self, c, food1):
        
        try:
            sc = self.sc
            sqlc = SQLContext(sc)
            food = food1
            city = c

            #Check if the Database already has the results of this request
            df = sqlc.read.format("jdbc").options(url="jdbc:postgresql://localhost:5432/foodninjadb?user=w205&password=postgres", dbtable="toprestaurants",driver="org.postgresql.Driver").load()
            resultrdd = df.rdd.filter(lambda x: city in x[0]).filter(lambda x: food in x[1])

            #If the Database has the data then display the result
            if(resultrdd.count() > 0):
                return sqlc.createDataFrame(resultrdd).collect()

            #Else run the pyspark job to get the result and display it
            else:
                
                # Read the Yelp Business data JSON file from HDFS and store the required columns in a dataframe (COUNT OF RECORDS = 77445)
                bdf = sqlc.read.json("/user/w205/project1/yelpbusinessdata").select('business_id', 'name', 'categories', 'city', 'full_address', 'review_count', 'state', 'type')
     
                # Store the restaurant business data in a temp table for querying
                bdf.registerTempTable("business")
                
                # Read the Yelp Review data JSON file from HDFS and store all the columns in a dataframe
                rdf = sqlc.read.json("/user/w205/project1/yelpreviewdata")
                #rdf.printSchema()

                # Filter only the rows where the user rating is greater than 2 
                # Since we are interested only in the top rated restaurants, we filter out lower rated reviews
                #reviewrdd = rdf.rdd.filter(lambda x: x[3] >2)

                # Store the review data in a temp table for querying
                #sqlc.createDataFrame(reviewrdd).registerTempTable("review")

                rdf.registerTempTable("review")
                
                #Run the query to get the top 5 restaurants 
                query = "select b.city as City, '"+ food + "' as FOOD, b.business_id as Bid, b.name as Name, count(*) as NReviews, avg(r.stars) as AvgRating  from review r, business b where r.text like '%"+food+"%' AND r.stars>2 and r.business_id=b.business_id AND b.city='"+city+"' group by b.business_id, b.name, b.city order by NReviews desc limit 5"
                toprestaurants = sqlc.sql(query)
                #toprestaurants is a dataframe
                #toprestaurants.show()
                #print("The query was "+query)

                #Since this request was not in the Database, write it now to the database
                props = {
                    "user": "******",
                    "password": "******"
                }
                toprestaurants.write.jdbc(url="jdbc:postgresql://localhost:5432/foodninjadb", table="toprestaurants", mode="append", properties=props)

                return toprestaurants.collect()

                        
        except Exception as inst:
            logger.info(inst.args)
            logger.info(inst)
            return "There was an error in the execution of this request. Please check the input and place the request again."
 def __init__(self, predictionAndLabels):
     sc = predictionAndLabels.ctx
     sql_ctx = SQLContext(sc)
     df = sql_ctx.createDataFrame(predictionAndLabels,
                                  schema=sql_ctx._inferSchema(predictionAndLabels))
     java_class = sc._jvm.org.apache.spark.mllib.evaluation.MultilabelMetrics
     java_model = java_class(df._jdf)
     super(MultilabelMetrics, self).__init__(java_model)
def to_data_frame(sc, features, labels, categorical=False):
    '''
    Convert numpy arrays of features and labels into Spark DataFrame
    '''
    lp_rdd = to_labeled_point(sc, features, labels, categorical)
    sql_context = SQLContext(sc)
    df = sql_context.createDataFrame(lp_rdd)
    return df
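# Hedged usage sketch for to_data_frame; to_labeled_point is assumed to be defined in
# the same module as the function above, and the toy numpy arrays are illustrative
# assumptions.
def to_data_frame_example(sc):
    import numpy as np
    features = np.array([[0.0, 1.0], [1.0, 0.0], [0.5, 0.5]])
    labels = np.array([0.0, 1.0, 0.0])
    df = to_data_frame(sc, features, labels, categorical=False)
    df.show()
    return df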
def SurvivalIndexTimeout(timeoutpidsmap):
	global spcon
	sqlcon = SQLContext(spcon)
	timeoutdf=sqlcon.createDataFrame(timeoutpidsmap,['index','process_ids'])
	fpGrowth=FPGrowth(itemsCol="process_ids",minSupport=0.5,minConfidence=0.5)
	fpModel=fpGrowth.fit(timeoutdf)
	fpModel.freqItemsets.show()
	fpModel.associationRules.show()
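# Hedged usage sketch: timeoutpidsmap is assumed to be a list of (index, [process ids])
# pairs, matching the ['index','process_ids'] column names used above; the values are
# illustrative, and the call relies on the module's global spcon SparkContext.
def survival_index_timeout_example():
    timeoutpidsmap = [(0, ["1234", "5678"]), (1, ["1234", "9012"]), (2, ["1234", "5678", "9012"])]
    SurvivalIndexTimeout(timeoutpidsmap)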
def get_summary_statistics(sc, rdd_vs_energies_sorted):

	sqlCtx = SQLContext(sc)
	vs_energies_sorted_table = sqlCtx.createDataFrame(rdd_vs_energies_sorted)
	vs_energies_sorted_table.registerTempTable("vs_energies_sorted")

	summary_statistics = sqlCtx.sql("SELECT count(energy) as total, min(energy) as min_e, max(energy) as max_e, avg(energy) as avg_e FROM vs_energies_sorted")
	return summary_statistics	
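# Hedged usage sketch: collecting the single summary row produced above; the field
# names follow the SQL aliases (total, min_e, max_e, avg_e).
def print_summary_statistics(sc, rdd_vs_energies_sorted):
    row = get_summary_statistics(sc, rdd_vs_energies_sorted).collect()[0]
    print("energies: total=%d min=%f max=%f avg=%f" % (row.total, row.min_e, row.max_e, row.avg_e))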
 def test_fit_maximize_metric(self):
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame([
         (10, 10.0),
         (50, 50.0),
         (100, 100.0),
         (500, 500.0)] * 10,
         ["feature", "label"])
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
def closeness(g):
    # vertexList (all vertex ids used as landmarks) is an assumed reconstruction;
    # the original fragment starts mid-function without this setup line.
    vertexList = [row['id'] for row in g.vertices.collect()]
    pathLength = g.shortestPaths(landmarks=vertexList)
    # Break up the map and group by ID for summing
    pathLength = pathLength.select('id', explode('distances'))
    # Sum by ID
    distance_df = pathLength.groupBy('id').sum('value')
    # Get the inverses and generate desired dataframe.
    centrality_df = distance_df.rdd.map(lambda x: (x[0], 1 / float(x[1])))
    final_df = sqlContext.createDataFrame(centrality_df, ['id', 'closeness'])
    #final_df.toPandas().to_csv("centrality_out.csv")
    return final_df


print("Reading in graph for problem 2.")
graph = sc.parallelize([('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'A'),
                        ('B', 'C'), ('B', 'D'), ('B', 'E'), ('C', 'A'),
                        ('C', 'B'), ('C', 'D'), ('C', 'F'), ('C', 'H'),
                        ('D', 'A'), ('D', 'B'), ('D', 'C'), ('D', 'E'),
                        ('D', 'F'), ('D', 'G'), ('E', 'B'), ('E', 'D'),
                        ('E', 'F'), ('E', 'G'), ('F', 'C'), ('F', 'D'),
                        ('F', 'E'), ('F', 'G'), ('F', 'H'), ('G', 'D'),
                        ('G', 'E'), ('G', 'F'), ('H', 'C'), ('H', 'F'),
                        ('H', 'I'), ('I', 'H'), ('I', 'J'), ('J', 'I')])

e = sqlContext.createDataFrame(graph, ['src', 'dst'])
v = e.selectExpr('src as id').unionAll(e.selectExpr('dst as id')).distinct()
print("Generating GraphFrame.")
g = GraphFrame(v, e)

print("Calculating closeness.")
closeness(g).sort('closeness', ascending=False).show()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("spark_sql_dataframe_select")

sc = SparkContext(conf=conf)

sqlCtx = SQLContext(sc)

lines = sc.parallelize(["a,1", "b,2", "c,3"])

people = lines.map(lambda line: line.split(",")).map(
    lambda words: Row(name=words[0], age=words[1]))

schemaPeople = sqlCtx.createDataFrame(people)

schemaPeople.select("*").show()

schemaPeople.select("name", "age").show()

schemaPeople.select("name", schemaPeople["age"]).show()

# error schemaPeople.select("name", schemaPeople2["age"]).show()

# error schemaPeople.select("name", "age * 2").show()

schemaPeople.select(schemaPeople["name"].alias("name2"),
                    schemaPeople.age.cast("int").alias("age2")).show()

sc.stop()
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext()
sqlcontext = SQLContext(sc)

# creating dataframe from RDD
l = [('Ankit', 25), ('Jalfaizy', 22), ('saurabh', 20), ('Bala', 26)]
rdd = sc.parallelize(l)
# print rdd.take(2)

people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
print 'People RDD: \n', people, '\n'

schemaPeople = sqlcontext.createDataFrame(people)
print 'Schemapeople collect(): \n', schemaPeople.collect(), '\n'
print 'Schemapeople show(): \n', schemaPeople.show(), '\n'

print 'Type of schemaPeople: \n', type(schemaPeople), '\n'
rdd = lines.filter(lambda line: line != header).map(parseLine)

totalByMax = rdd.map(lambda x: (x[0] + ',' + x[1] + ',' + x[2], x[3])) \
    .mapValues(lambda x: (x, x, x, 1)) \
    .reduceByKey(lambda x, y: (max(x[0], y[0]), min(x[1], y[1]), (x[2] + y[2]), (x[3] + y[3]))) \
    .cache().sortByKey(True, 1)
averagesByMax = totalByMax.mapValues(lambda x: (x[0], x[1], (x[2] / x[3])))

result_list = []

results = averagesByMax.sortByKey(True, 1).collect()

for result in results:
    node1 = result[0].split(',')
    date1 = node1[0]
    device1 = node1[1]
    sensor1 = node1[2]
    node2 = result[1]
    max1 = result[1][0]
    min1 = result[1][1]
    avg = result[1][2]
    result_tuple = (date1, device1, sensor1, max1, min1, avg)
    #print(result_tuple)
    result_list.append(result_tuple)

df = sqlContext.createDataFrame(
    result_list, ["date", "deviceid", "sensor", "max", "min", "avg"])

df.write.jdbc(url=url,
              table="limo_max_min",
              mode="append",
              properties=properties)
def split_words(line): return line.split()
def create_pair(word): return (word,1)
pairs_RDD=text_RDD.flatMap(split_words).map(create_pair)

students = sc.parallelize([[100, "Alice", 8.5, "Computer Science"],
                          [101, "Bob", 7.1, "Engineering"],
                          [102, "Carl", 6.2, "Engineering"]
                          ])
def extract_grade(row): return row[2]
students.map(extract_grade).mean()
def extract_degree_grade(row): return (row[3], row[2])
degree_grade_RDD = students.map(extract_degree_grade)
degree_grade_RDD.collect()
degree_grade_RDD.reduceByKey(max).collect()
#"phoneNumbers": [{"type": "home","number": "212 555-1234"},{"type": "office","number": "646 555-4567"}],"children": [],"spouse": null}
students_df = sqlCtx.createDataFrame(students, ["id", "name", "grade", "degree"])
students_df.printSchema()
students_df.agg({"grade":"mean"}).collect()
students_df.groupBy("degree").max("grade").collect()
students_df.groupBy("degree").max("grade").show()

from pyspark.sql.types import *

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("grade", DoubleType(), True),
    StructField("degree", StringType(), True)])
students_df = sqlCtx.createDataFrame(students, schema)
students_json = [ '{"id":100, "name":"Alice", "grade":8.5, "degree":"Computer Science"}', '{"id":101, "name":"Bob", "grade":7.1, "degree":"Engineering"}']
with open("students.json", "w") as f: