def main(argv):
    # argv[1] is the dump of training data in HDFS
    # argv[2] is the user preferences
    conf = SparkConf().setAppName("recommendation")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)

    # User hash lookup stored into Cassandra
    user_hash = rawDF.map(lambda (a, b, c): (a, hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser, ["user", "hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table="userhash", keyspace=keyspace).save(mode="append")

    # Product hash lookup stored into Cassandra
    product_hash = rawDF.map(lambda (a, b, c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct, ["product", "hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table="producthash", keyspace=keyspace).save(mode="append")

    # Ratings for training.
    # ALS requires a numeric (Java hash) id for each string; hashFunction does that and
    # the result is stored as a Rating object for the algorithm to consume.
    ratings = rawDF.map(lambda (a, b, c): Rating(hashFunction(a), hashFunction(b), float(c)))

    model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")

    sc.stop()
def _rdd_to_df(rdd, schema):
    """Convert an RDD to a DataFrame, using the schema if one is given."""
    spark_context = rdd.context
    sql_context = SQLContext(spark_context)
    if schema is None:
        df = sql_context.createDataFrame(rdd)
    else:
        df = sql_context.createDataFrame(rdd, schema)
    return df
def main():
    inputs = sys.argv[1]
    output = sys.argv[2]
    ntlk_path = sys.argv[3]

    conf = SparkConf().setAppName('TF-IDF Representation')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)

    # sbaronia - get 3 fields from the JSON file and filter out rows with an empty review
    review = sqlContext.read.json(inputs).select('reviewText', 'overall', 'reviewTime').cache()
    review_df = review.filter(review.reviewText != "").cache()

    # sbaronia - get year and rating and zip them with an index
    year_rdd = rdd_zip(review_df.map(extract_year).cache()).cache()
    year_df = sqlContext.createDataFrame(year_rdd, ['year', 'index']).cache()

    rating_rdd = rdd_zip(review_df.map(lambda line: float(line.overall)).cache()).cache()
    rating_df = sqlContext.createDataFrame(rating_rdd, ['rating', 'index']).cache()

    stop_words = stop_words_func(ntlk_path)

    # sbaronia - RDD containing unique words from each review
    clean_words_rdd = review_df.map(lambda review: clean_string_to_words(review.reviewText, stop_words)) \
                               .filter(lambda x: x[0] != 'null').cache()

    # sbaronia - compute tf-idf and zip it with an index
    tfidf_rdd = rdd_zip(tf_idf_cal(clean_words_rdd).cache()).cache()
    tfidf_df = sqlContext.createDataFrame(tfidf_rdd, ['tfidf', 'index']).cache()

    # sbaronia - build a dataframe with only rating, year and tf-idf
    year_rating_df = rating_df.join(year_df, rating_df.index == year_df.index, 'outer').drop(rating_df.index).cache()
    tfyrrating_df = tfidf_df.join(year_rating_df, tfidf_df.index == year_rating_df.index, 'inner').drop(tfidf_df.index).cache()

    # sbaronia - build training (< 2014) and testing (== 2014) RDDs in a
    # splittable format delimited by ' :: '
    train_rdd = tfyrrating_df.filter(tfyrrating_df.year < 2014) \
                             .select('rating', 'tfidf') \
                             .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
                             .coalesce(1) \
                             .cache()

    test_rdd = tfyrrating_df.filter(tfyrrating_df.year == 2014) \
                            .select('rating', 'tfidf') \
                            .map(lambda line: (str(line.rating) + ' :: ' + str(line.tfidf))) \
                            .coalesce(1) \
                            .cache()

    # sbaronia - save RDDs as text
    train_rdd.saveAsTextFile(output + '/train-text')
    test_rdd.saveAsTextFile(output + '/test-text')
def split_data(rev2, sc):
    # Split into train and test sets.
    data = rev2.copy()
    train, test = train_test_split(data)
    X_test = test.copy()
    y_test = X_test.pop("rating")

    sql_context = SQLContext(sc)
    train_df = sql_context.createDataFrame(train).rdd
    X_test_df = sql_context.createDataFrame(X_test).rdd
    test_df = sql_context.createDataFrame(test).rdd

    return train_df, train, test, test_df, X_test_df, y_test
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)

    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path that contains all files for analysis
    path_analysis = config.get('DEFAULT', 'path_analysis')
    # Ligand database file
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    # Path for the drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    # Adding Python source files
    sc.addPyFile(os.path.join(path_spark_drugdesign, "vina_utils.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "json_utils.py"))

    start_time = datetime.now()

    # **************** Loading the file that contains all scores
    score_file_name = os.path.join(path_analysis, get_file_name_sorted_energy())
    text_file = sc.textFile(score_file_name)

    # Splitting the score file by \t
    header = text_file.first()  # extract header
    rdd_vs_score_sorted_split = text_file.filter(lambda x: x != header)  # filter out header
    rdd_vs_score_sorted_split = rdd_vs_score_sorted_split.map(lambda line: line.split("\t"))
    rdd_vs_score_sorted = rdd_vs_score_sorted_split.map(
        lambda p: Row(energy=float(p[0]),
                      pose=str(p[1]),
                      ligand=get_ligand_from_receptor_ligand_model(p[1])))

    # Creating the Vina DataFrame based on the score file
    vina_table = sqlCtx.createDataFrame(rdd_vs_score_sorted)
    vina_table.registerTempTable("vina")
    # **************** Finish

    # **************** Loading the ligand database
    rdd_database = load_database(sc, ligand_database)
    # Creating DataFrame
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    # **************** Finish

    # Computing ligand efficiency
    ligand_efficiencyRDD = sqlCtx.sql("SELECT vina.pose, vina.energy as affinity, (vina.energy / database.heavyAtom) as lig_efficiency FROM database JOIN vina ON vina.ligand = database.ligand ORDER BY vina.energy")
    ligand_efficiencyRDD = ligand_efficiencyRDD.map(lambda p: (p.pose, p.affinity, p.lig_efficiency)).collect()

    # Saving the ligand efficiency file
    save_ligand_efficiency(path_analysis, ligand_efficiencyRDD)

    finish_time = datetime.now()
    save_ligand_efficiency_log(finish_time, start_time)
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()

    # Input data: each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])

    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)

    # Learn a mapping from words to vectors.
    word2vec = Word2Vec(vectorSize=len(training_data), inputCol="text_filtered", outputCol="result")
    model = word2vec.fit(cleaned_document)

    matrix = column_similarities(model.transform(cleaned_document))

    # Use the size of the target data to keep only similarities between the target
    # data and the rest, avoiding products of the target data with itself.
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)

    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
def main(dataFile, outputPath):
    conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)
    csv_data = raw_text.map(lambda l: l.split(","))
    row_data = csv_data.map(lambda p: dataIO.dataStruc(p))

    interaction_df = sqlContext.createDataFrame(row_data)
    # features.save_hdfs_parquet(interaction_df, outputPath)
    dataIO.save_hdfs_parquet(interaction_df, outputPath)

    interaction_df.registerTempTable("interactions")
    tcp_interactions = sqlContext.sql("""
        SELECT duration, dst_bytes, protocol_type FROM interactions
        WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes = 0
        """)
    tcp_interactions.show()

    features.print_tcp_interactions(tcp_interactions)
    dataIO.print_from_dataio()
    features.print_from_feature()

    sc.stop()
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
    # tokensofprevlevelkeyword = tokensofprevlevel
    # tokensofprevlevelkeyword.append(keyword)
    md5hashparents = hashlib.md5(keyword).hexdigest()
    # md5hashparents = keyword
    md5hashparents = md5hashparents + "$parents"

    picklef_keyword = open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt", "w")
    asfer_pickle_string_dump(keyword, picklef_keyword)
    picklef_keyword.close()

    cachevalue = graphcache.get(md5hashparents)
    if cachevalue:
        print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
        return cachevalue
    else:
        # picklelock.acquire()
        spcon = SparkContext("local[2]", "Spark_MapReduce_Parents")
        # picklef_keyword = open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt", "w")
        # asfer_pickle_string_dump(keyword, picklef_keyword)
        # picklef_keyword.close()
        paralleldata = spcon.parallelize(tokensofprevlevel).cache()
        # k = paralleldata.map(lambda keyword: mapFunction_Parents(keyword, tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
        k = paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)

        sqlContext = SQLContext(spcon)
        parents_schema = sqlContext.createDataFrame(k.collect())
        parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
        query_results = sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
        dict_query_results = dict(query_results.collect())
        # print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
        # picklelock.release()
        graphcache.set(md5hashparents, dict_query_results[1])
        spcon.stop()
        print "graphcache_mapreduce_parents updated:", graphcache
        return dict_query_results[1]
def mock_data(self):
    """Mock data to imitate a read from the database."""
    sqlContext = SQLContext(self.sc)
    mock_data_rdd = self.sc.parallelize([("A", 1, 1), ("B", 1, 0), ("C", 0, 2),
                                         ("D", 2, 4), ("E", 3, 5)])
    schema = ["id", "x", "y"]
    mock_data_df = sqlContext.createDataFrame(mock_data_rdd, schema)
    return mock_data_df
def main(sc): path = "events" #text_file = sc.textFile(path) sqlContext = SQLContext(sc) events = sqlContext.jsonFile(path) events = events.select(events["events.event"]).flatMap(lambda p: p.event) events = events.map(lambda p: Row( id=p.id,\ title=p.title, \ lat=p.latitude, \ long=p.longitude, \ postal_code=p.postal_code, \ start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"), \ stop_time=p.stop_time)) events_df = sqlContext.createDataFrame(events) events_df.registerTempTable("events") sqlContext.registerFunction("to_hour", lambda x: x.hour) sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year)) e = sqlContext.sql("select title, str_date(start_time) as event_date, to_hour(start_time) as hour, postal_code from events where postal_code is not null and start_time is not null") events_grouped = sqlContext.sql("select event_date, hour, postal_code, count(*) from events_filtered group by event_date,hour,postal_code order by postal_code,hour") grouped_csv = events_grouped.map(toCSV) grouped_csv.saveAsTextFile('events_cluster')
def main():
    sc = SparkContext()
    sqlCtx = SQLContext(sc)

    config = configparser.ConfigParser()
    config.read('config.ini')

    # Path where the docking list file will be saved
    path_to_save = str(sys.argv[1])
    # Path for the drugdesign project
    path_spark_drugdesign = config.get('DRUGDESIGN', 'path_spark_drugdesign')

    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_crud.py"))
    sc.addPyFile(os.path.join(path_spark_drugdesign, "database_io.py"))

    # **************** Loading the ligand database
    ligand_database = config.get('DEFAULT', 'ligand_database_path_file')
    rdd_database = load_database(sc, ligand_database)
    # Creating DataFrame
    database_table = sqlCtx.createDataFrame(rdd_database)
    database_table.registerTempTable("database")
    # **************** Finish

    # Creating input files for performing virtual screening
    creating_docking_list(path_to_save, config, sqlCtx)
def RunRandomForest(tf, ctx):
    sqlContext = SQLContext(ctx)
    rdd = tf.map(parseForRandomForest)
    # The schema is encoded in a string.
    schema = ['genre', 'track_id', 'features']
    # Apply the schema to the RDD.
    songDF = sqlContext.createDataFrame(rdd, schema)
    # Register the DataFrame as a table.
    songDF.registerTempTable("genclass")

    labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

    trainingData, testData = songDF.randomSplit([0.8, 0.2])

    labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

    rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
    # rfc = SVMModel([.5, 10, 20], 5)
    # rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

    pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
    model = pipeline.fit(trainingData)

    predictions = model.transform(testData)
    predictions.show()

    evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
    accuracy = evaluator.evaluate(predictions)
    print 'Accuracy of RandomForest = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def test_logistic_regression_summary(self):
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def main(n_part, hdfs_path):
    print "********************\n*"
    print "* Start main\n*"
    print "********************"

    conf = SparkConf().setAppName("Benchmark Spark SQL")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    rowsRDD = sc.textFile(hdfs_path).repartition(n_part).map(lambda x: recordToRows(x)).cache()
    df = sqlContext.createDataFrame(rowsRDD).cache()
    df.count()
    df.registerTempTable("msd_table")

    print "********************\n*"
    print "* Start queries\n*"
    print "********************"

    [ave_t1, std1, dt1, n1] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext)
    [ave_t2, std2, dt2, n2] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_name = 'Taylor Swift'", sqlContext, method=1)
    [ave_t3, std3, dt3, n3] = time_querry("SELECT * FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext)
    [ave_t4, std4, dt4, n4] = time_querry("SELECT COUNT(*) FROM msd_table WHERE msd_table.artist_hotness > 0.75", sqlContext, method=1)

    if n1 != n2:
        print "\t!!!!Error, counts disagree for the number of T.S. songs!"
    if n3 != n4:
        print "\t!!!!Error, counts disagree for the number of high paced songs!"

    print "********************\n*"
    print "* Results"
    print "\t".join(map(lambda x: str(x), [ave_t1, std1, dt1, ave_t2, std2, dt2, ave_t3, std3, dt3, ave_t4, std4, dt4]))
    print "********************"
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up
    try:
        rmtree(path)
    except OSError:
        pass
def test_save_load(self):
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def log_mapreducer(logfilename, pattern, filt="None"):
    spcon = SparkContext()
    if filt == "None":
        input = open(logfilename, 'r')
        paralleldata = spcon.parallelize(input.readlines())
        patternlines = paralleldata.filter(lambda patternline: pattern in patternline)
        print "pattern lines", patternlines.collect()
        matches = patternlines.map(mapFunction).reduceByKey(reduceFunction)
    else:
        input = spcon.textFile(logfilename)
        matches = input.flatMap(lambda line: line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)

    matches_collected = matches.collect()
    print "matches_collected:", matches_collected

    if len(matches_collected) > 0:
        sqlContext = SQLContext(spcon)
        bytes_stream_schema = sqlContext.createDataFrame(matches_collected)
        bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
        query_results = sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
        dict_query_results = dict(query_results.collect())
        print "----------------------------------------------------------------------------------"
        print "log_mapreducer(): pattern [", pattern, "] in [", logfilename, "] for filter [", filt, "]"
        print "----------------------------------------------------------------------------------"
        dict_matches = dict(matches_collected)
        sorted_dict_matches = sorted(dict_matches.items(), key=operator.itemgetter(1), reverse=True)
        print "pattern matching lines:", sorted_dict_matches
        print "----------------------------------------------------------------------------------"
        print "SparkSQL DataFrame query results:"
        print "----------------------------------------------------------------------------------"
        pprint.pprint(dict_query_results)
        print "----------------------------------------------------------------------------------"
        print "Cardinality of Stream Dataset:"
        print "----------------------------------------------------------------------------------"
        print len(dict_query_results)

    spcon.stop()
    return sorted_dict_matches
def test_save_load(self):
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels,
                                 schema=sql_ctx._inferSchema(predictionAndLabels))
    java_model = callMLlibFunc("newRankingMetrics", df._jdf)
    super(RankingMetrics, self).__init__(java_model)
def index(request):
    string = u'template显示字符串变量'
    list = ['第一', '第二', '第三']
    tuple = ('q', 'w', 'e', 'r', 't')
    dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4}

    conf = SparkConf().setAppName("djangotest").setMaster("spark://HP-Pavilion:7077")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    url = 'jdbc:mysql://127.0.0.1:3306?user=root&password=raymon'
    dbtable = 'networkPublicOpinionAnalysisSystem.test'
    df = sqlContext.read.format('jdbc').options(url=url, dbtable=dbtable).load()

    lines = sc.textFile(settings.BASE_DIR + '/system/data/roll_news_sina_com_cn.csv')
    parts = lines.map(lambda l: l.split(','))
    schemaNews = parts.map(lambda p: Row(category=p[0], title=p[1], url=p[2], time=p[3]))
    news = sqlContext.createDataFrame(schemaNews)
    # news.registerTempTable('test')
    # dbtable = 'networkPublicOpinionAnalysisSystem.test'
    # news.write.format('jdbc').options(url=url).insertInto(tableName=dbtable)
    # string = news.count()

    row = news.first()
    a = Row()
    print(type(news))
    print(type(row))
    # print(type(a))
    # dict = row.asDict()
    # string = dict['title']
    # news.write.jdbc(url, table=dbtable)

    return render(request, 'index.html', {'string': string, 'list': list, 'tuple': tuple, 'dict': dict})
class ZeppelinReporterTest(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext()
        self.sql = SQLContext(self.sc)
        self.df = self.sql.createDataFrame([(1, "a"), (1, None), (3, "c")])

    def tearDown(self):
        self.sc.stop()

    def test_output(self):
        with patch("pyddq.reporters.get_field") as get_field:
            baos = ByteArrayOutputStream()
            baos.jvm = self.df._sc._jvm
            get_field.return_value = baos.jvm_obj

            check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
            z = Mock()
            reporter = ZeppelinReporter(z)
            check.run([reporter])

            expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">❌</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">✅</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
            self.assertEqual(baos.get_output(), expected_output)
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()

    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def Spark_MapReduce(level, wordsatthislevel, graphcache):
    freqterms1_local = wordsatthislevel
    md5hash = hashlib.md5(",".join(wordsatthislevel)).hexdigest()
    # md5hash = ",".join(wordsatthislevel)
    cachevalue = graphcache.get(md5hash)
    if cachevalue:
        print "Spark_MapReduce(): hash = ", md5hash, "; returning from cache"
        return cachevalue
    else:
        spcon = SparkContext("local[2]", "Spark_MapReduce")
        print "Spark_MapReduce(): wordsatthislevel:", wordsatthislevel
        paralleldata = spcon.parallelize(wordsatthislevel).cache()
        # k = paralleldata.map(lambda wordsatthislevel: mapFunction(wordsatthislevel)).reduceByKey(reduceFunction)
        k = paralleldata.map(mapFunction2).reduceByKey(reduceFunction)
        # k = paralleldata.map(mapFunction).reduceByKey(reduceFunction)
        # dict_k = k.collect()
        # s = sorted(dict_k.items(), key=operator.itemgetter(1), reverse=True)
        # print "Spark MapReduce results:"
        # print s
        ############################
        sqlContext = SQLContext(spcon)
        recursiveglossoverlap_schema = sqlContext.createDataFrame(k.collect())
        recursiveglossoverlap_schema.registerTempTable("Interview_RecursiveGlossOverlap")
        query_results = sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap")
        dict_query_results = dict(query_results.collect())
        # print "Spark_MapReduce() - SparkSQL DataFrame query results:"
        # print dict_query_results[1]
        graphcache.set(md5hash, dict_query_results[1])
        print "graphcache_mapreduce updated:", graphcache
        spcon.stop()
        return dict_query_results[1]
def get_top_restaurants(self, c, food1):
    try:
        sc = self.sc
        sqlc = SQLContext(sc)
        food = food1
        city = c

        # Check whether the database already has the results of this request
        df = sqlc.read.format("jdbc").options(
            url="jdbc:postgresql://localhost:5432/foodninjadb?user=w205&password=postgres",
            dbtable="toprestaurants",
            driver="org.postgresql.Driver").load()
        resultrdd = df.rdd.filter(lambda x: city in x[0]).filter(lambda x: food in x[1])

        # If the database has the data, then display the result
        if resultrdd.count() > 0:
            return sqlc.createDataFrame(resultrdd).collect()
        # Otherwise run the PySpark job to get the result and display it
        else:
            # Read the Yelp business data JSON file from HDFS and store the required
            # columns in a dataframe (COUNT OF RECORDS = 77445)
            bdf = sqlc.read.json("/user/w205/project1/yelpbusinessdata").select(
                'business_id', 'name', 'categories', 'city', 'full_address',
                'review_count', 'state', 'type')
            # Store the restaurant business data in a temp table for querying
            bdf.registerTempTable("business")

            # Read the Yelp review data JSON file from HDFS and store all the columns in a dataframe
            rdf = sqlc.read.json("/user/w205/project1/yelpreviewdata")
            # rdf.printSchema()

            # Filter only the rows where the user rating is greater than 2.
            # Since we are interested only in the top rated restaurants, we filter out lower rated reviews.
            # reviewrdd = rdf.rdd.filter(lambda x: x[3] > 2)
            # Store the review data in a temp table for querying
            # sqlc.createDataFrame(reviewrdd).registerTempTable("review")
            rdf.registerTempTable("review")

            # Run the query to get the top 5 restaurants
            query = "select b.city as City, '" + food + "' as FOOD, b.business_id as Bid, b.name as Name, count(*) as NReviews, avg(r.stars) as AvgRating from review r, business b where r.text like '%" + food + "%' AND r.stars>2 and r.business_id=b.business_id AND b.city='" + city + "' group by b.business_id, b.name, b.city order by NReviews desc limit 5"
            toprestaurants = sqlc.sql(query)  # toprestaurants is a dataframe
            # toprestaurants.show()
            # print("The query was " + query)

            # Since this request was not in the database, write it now to the database
            props = {
                "user": "******",
                "password": "******"
            }
            toprestaurants.write.jdbc(url="jdbc:postgresql://localhost:5432/foodninjadb",
                                      table="toprestaurants", mode="append", properties=props)

            return toprestaurants.collect()
    except Exception as inst:
        logger.info(inst.args)
        logger.info(inst)
        return "There was an error in the execution of this request. Please check the input and place the request again."
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels,
                                 schema=sql_ctx._inferSchema(predictionAndLabels))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MultilabelMetrics
    java_model = java_class(df._jdf)
    super(MultilabelMetrics, self).__init__(java_model)
def to_data_frame(sc, features, labels, categorical=False):
    '''Convert numpy arrays of features and labels into a Spark DataFrame.'''
    lp_rdd = to_labeled_point(sc, features, labels, categorical)
    sql_context = SQLContext(sc)
    df = sql_context.createDataFrame(lp_rdd)
    return df
def SurvivalIndexTimeout(timeoutpidsmap):
    global spcon
    sqlcon = SQLContext(spcon)
    timeoutdf = sqlcon.createDataFrame(timeoutpidsmap, ['index', 'process_ids'])
    fpGrowth = FPGrowth(itemsCol="process_ids", minSupport=0.5, minConfidence=0.5)
    fpModel = fpGrowth.fit(timeoutdf)
    fpModel.freqItemsets.show()
    fpModel.associationRules.show()
def get_summary_statistics(sc, rdd_vs_energies_sorted):
    sqlCtx = SQLContext(sc)
    vs_energies_sorted_table = sqlCtx.createDataFrame(rdd_vs_energies_sorted)
    vs_energies_sorted_table.registerTempTable("vs_energies_sorted")
    summary_statistics = sqlCtx.sql("SELECT count(energy) as total, min(energy) as min_e, max(energy) as max_e, avg(energy) as avg_e FROM vs_energies_sorted")
    return summary_statistics
def test_fit_maximize_metric(self):
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Process the dataset and generate feature vectors
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    # Train the decision tree model
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    # Test the model
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Test with user data: a single news item
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    # Evaluate the model
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request, {'resultList': resultList})
def closeness(g):
    pathLength = g.shortestPaths(landmarks=vertexList)
    # Break up the map and group by ID for summing
    pathLength = pathLength.select('id', explode('distances'))
    # Sum by ID
    distance_df = pathLength.groupBy('id').sum('value')
    # Get the inverses and generate the desired dataframe.
    centrality_df = distance_df.rdd.map(lambda x: (x[0], 1 / float(x[1])))
    final_df = sqlContext.createDataFrame(centrality_df, ['id', 'closeness'])
    # final_df.toPandas().to_csv("centrality_out.csv")
    return final_df


print("Reading in graph for problem 2.")
graph = sc.parallelize([('A', 'B'), ('A', 'C'), ('A', 'D'),
                        ('B', 'A'), ('B', 'C'), ('B', 'D'), ('B', 'E'),
                        ('C', 'A'), ('C', 'B'), ('C', 'D'), ('C', 'F'), ('C', 'H'),
                        ('D', 'A'), ('D', 'B'), ('D', 'C'), ('D', 'E'), ('D', 'F'), ('D', 'G'),
                        ('E', 'B'), ('E', 'D'), ('E', 'F'), ('E', 'G'),
                        ('F', 'C'), ('F', 'D'), ('F', 'E'), ('F', 'G'), ('F', 'H'),
                        ('G', 'D'), ('G', 'E'), ('G', 'F'),
                        ('H', 'C'), ('H', 'F'), ('H', 'I'),
                        ('I', 'H'), ('I', 'J'),
                        ('J', 'I')])

e = sqlContext.createDataFrame(graph, ['src', 'dst'])
v = e.selectExpr('src as id').unionAll(e.selectExpr('dst as id')).distinct()

print("Generating GraphFrame.")
g = GraphFrame(v, e)

print("Calculating closeness.")
closeness(g).sort('closeness', ascending=False).show()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("spark_sql_dataframe_select")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

lines = sc.parallelize(["a,1", "b,2", "c,3"])
people = lines.map(lambda line: line.split(",")).map(
    lambda words: Row(name=words[0], age=words[1]))
schemaPeople = sqlCtx.createDataFrame(people)

schemaPeople.select("*").show()
schemaPeople.select("name", "age").show()
schemaPeople.select("name", schemaPeople["age"]).show()
# error: schemaPeople2 is not defined
# schemaPeople.select("name", schemaPeople2["age"]).show()
# error: "age * 2" is not a column name (use selectExpr for expressions)
# schemaPeople.select("name", "age * 2").show()
schemaPeople.select(schemaPeople["name"].alias("name2"),
                    schemaPeople.age.cast("int").alias("age2")).show()

sc.stop()
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row

sc = SparkContext()
sqlcontext = SQLContext(sc)

# creating a dataframe from an RDD
l = [('Ankit', 25), ('Jalfaizy', 22), ('saurabh', 20), ('Bala', 26)]
rdd = sc.parallelize(l)
# print rdd.take(2)
people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
print 'People RDD: \n', people, '\n'

schemaPeople = sqlcontext.createDataFrame(people)
print 'Schemapeople collect(): \n', schemaPeople.collect(), '\n'
print 'Schemapeople show(): \n', schemaPeople.show(), '\n'
print 'Type of schemaPeople: \n', type(schemaPeople), '\n'
rdd = lines.filter(lambda line: line != header).map(parseLine)

# Key by "date,deviceid,sensor" and track (max, min, sum, count) per key.
totalByMax = rdd.map(lambda x: (x[0] + ',' + x[1] + ',' + x[2], x[3])). \
    mapValues(lambda x: (x, x, x, 1)). \
    reduceByKey(lambda x, y: (max(x[0], y[0]), min(x[1], y[1]), (x[2] + y[2]), (x[3] + y[3]))). \
    cache().sortByKey(True, 1)

# Turn the running sum and count into an average.
averagesByMax = totalByMax.mapValues(lambda x: (x[0], x[1], (x[2] / x[3])))

result_list = []
results = averagesByMax.sortByKey(True, 1).collect()
for result in results:
    node1 = result[0].split(',')
    date1 = node1[0]
    device1 = node1[1]
    sensor1 = node1[2]
    node2 = result[1]
    max1 = result[1][0]
    min1 = result[1][1]
    avg = result[1][2]
    result_tuple = (date1, device1, sensor1, max1, min1, avg)
    # print(result_tuple)
    result_list.append(result_tuple)

df = sqlContext.createDataFrame(
    result_list, ["date", "deviceid", "sensor", "max", "min", "avg"])
df.write.jdbc(url=url, table="limo_max_min", mode="append", properties=properties)
def split_words(line):
    return line.split()

def create_pair(word):
    return (word, 1)

pairs_RDD = text_RDD.flatMap(split_words).map(create_pair)

students = sc.parallelize([[100, "Alice", 8.5, "Computer Science"],
                           [101, "Bob", 7.1, "Engineering"],
                           [102, "Carl", 6.2, "Engineering"]])

def extract_grade(row):
    return row[2]

students.map(extract_grade).mean()

def extract_degree_grade(row):
    return (row[3], row[2])

degree_grade_RDD = students.map(extract_degree_grade)
degree_grade_RDD.collect()
degree_grade_RDD.reduceByKey(max).collect()

students_df = sqlCtx.createDataFrame(students, ["id", "name", "grade", "degree"])
students_df.printSchema()
students_df.agg({"grade": "mean"}).collect()
students_df.groupBy("degree").max("grade").collect()
students_df.groupBy("degree").max("grade").show()

from pyspark.sql.types import *

schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("grade", DoubleType(), True),
    StructField("degree", StringType(), True)
])
students_df = sqlCtx.createDataFrame(students, schema)

students_json = [
    '{"id":100, "name":"Alice", "grade":8.5, "degree":"Computer Science"}',
    '{"id":101, "name":"Bob", "grade":7.1, "degree":"Engineering"}']

# Write the sample JSON records to a file, one object per line.
with open("students.json", "w") as f:
    f.write("\n".join(students_json))