def processing1(request):
    if request.method != 'POST':
        return HttpResponseRedirect('/sp/processing')
    else:
        progress = request.POST.get('progress')
        sc = SparkContext('local', 'test')
        spark = SparkSession.builder.getOrCreate()
        if progress == '1':
            city = request.POST.get('city')
            edu = request.POST.get('education')
            introduce = request.POST.get('introduce')
            position = request.POST.get('job')
            exp = request.POST.get('exp')
            print(city, edu, introduce, position, exp)
            explain1 = ('Step 1: build an RDD from the submitted fields, merge and convert them with map '
                        'operations, tokenize the free text, and turn the result into a DataFrame for the '
                        'next step. The data now looks like this:')
            dataRDD = sc.parallelize([[edu, city, position, exp, introduce]])
            dataDF = dataRDD.map(lambda i: Row(**{
                'education': i[0],
                'work_area': i[1],
                'work_lable': i[2],
                'work_exp': i[3],
                'work_desp': i[4]
            })).map(lambda i: Row(**{
                'education': str(new_edu_trans(i.education)),
                'city': [i.work_area],
                'work_desp': i.work_desp,
                'work_lable': [i.work_lable],
                'work_exp': [i.work_exp]
            })).map(lambda i: Row(**{
                'agg': [i.education] + i.city + i.work_lable + i.work_exp,
                'name_and_desp': desp_text_division(i.work_desp)
            })).toDF()
            dct = {}
            for i in dataDF.collect():
                dct['agg1'] = i[0]
                dct['nd1'] = i[1]
            # spark.stop()
            # sc.stop()
            agg_pro1 = 'Education, city, position and experience merged into one row:'
            nd_pro1 = 'Personal introduction kept as its own row:'
            agg_pro2 = 'Vector built from education, experience, position and city:'
            nd_pro2 = 'Vector built from the personal introduction:'
            explain2 = ('Step 2: convert the lists into feature vectors with the TF-IDF transformers from '
                        "Spark's ML feature package, ready for the machine-learning step.")
            nd_idf = IDFModel.load('hdfs://localhost:9000/nd_idf_test')
            agg_idf = IDFModel.load('hdfs://localhost:9000/agg_idf_test')
            ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
            aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
            data = ndtf.transform(dataDF)
            data = aggtf.transform(data)
            idfdata = nd_idf.transform(data)
            idfdata = agg_idf.transform(idfdata)
            for i in idfdata.collect():
                dct['agg2'] = i[3]
                dct['nd2'] = i[2]
            spark.stop()
            sc.stop()
            return render(request, 'processing1.html',
                          {'data': dct, 'explain1': explain1, 'agg_pro1': agg_pro1, 'nd_pro1': nd_pro1,
                           'agg_pro2': agg_pro2, 'nd_pro2': nd_pro2, 'explain2': explain2})
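# The view above mixes Django request handling with the Spark work. As a minimal standalone sketch of
# just the TF-IDF step it performs (HashingTF into a pre-fitted IDFModel), something like the following
# would do. The local session, sample row and save path are assumptions for illustration, not part of
# the original project, which loads its two IDF models from HDFS instead of fitting them in place.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, IDFModel

spark = SparkSession.builder.master('local').appName('tfidf-sketch').getOrCreate()
df = spark.createDataFrame([(['bachelor', 'beijing', 'data engineer', '3-5 years'],)], ['agg'])

# Hash the token list into a fixed-size term-frequency vector (same column names as the view above).
aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
tf_df = aggtf.transform(df)

# Fit once and save, then reload with IDFModel.load(), as the view does with its HDFS paths.
IDF(inputCol='Features_agg', outputCol='agg_tfidf').fit(tf_df).write().overwrite().save('/tmp/agg_idf_sketch')
reloaded = IDFModel.load('/tmp/agg_idf_sketch')
reloaded.transform(tf_df).select('agg_tfidf').show(truncate=False)

spark.stop()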
def _load_models(self):
    hf_path = self.params_path.format('hf')
    idf_path = self.params_path.format('idfmodel')
    rf_path = self.params_path.format('rf')
    self.hashingTF = HashingTF.load(hf_path)
    self.idfmodel = IDFModel.load(idf_path)
    self.rf = RandomForestClassificationModel.load(rf_path)
def load_model(sc, major, minor, model_path=config['DEFAULT']['small_model_path']):
    modelload = model_path + "small_cluster_data_" + str(major) + "_" + str(minor)
    model = IDFModel.load(modelload)
    return model
def loadIDFModel(path):
    '''
    Load an IDFModel.
    input :
        - path
    output:
        - model [IDFModel]
    '''
    model = IDFModel.load(path)
    return model
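# A short usage sketch for loadIDFModel, assuming an active SparkSession named `spark` and a model
# previously saved at the (hypothetical) path below with inputCol="rawFeatures". The loaded IDFModel
# keeps the inputCol/outputCol it was fitted with, so it can be applied directly to a matching
# term-frequency DataFrame.
from pyspark.ml.feature import HashingTF

tf_df = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=64).transform(
    spark.createDataFrame([(["load", "an", "idf", "model"],)], ["words"]))
model = loadIDFModel("hdfs://localhost:9000/models/idf_model")
model.transform(tf_df).show(truncate=False)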
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  # number of similar documents to return
    feature = "abstract"  # feature to compare
    df = sc.parallelize([(0, query)]).toDF(["id", feature])
    tokenizer = RegexTokenizer(inputCol=feature, outputCol="words", pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)
    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)
    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))

    # text to feature vector - TF-IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)
    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)

    # LDA model: output topics for the document -> topicDistribution
    lda_model = LocalLDAModel.load(output_path + "/lda_model")
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id", "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    # Load existing documents and rank them by cosine similarity to the query
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(lambda x_vector: cosineSimilarity(x_vector, feature_vector), FloatType())
    df_Similarity = df_Document.withColumn("similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select("_id", "title", "abstract", "url", "topicDistribution").collect()
def update_models():
    # Load the tokenizer, hashing_tf, idf_model and nb_model, then swap them in for the
    # module-level models used by the stream.
    global tokenizer, hashing_tf, idf_model, nb_model
    logger.debug(
        '===================================================Starting load models===================================================')
    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Load tokenizer model successfully')
    except:
        logger.debug('Fail to load tokenizer')
    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Load hashing_tf model successfully')
    except:
        logger.debug('Fail to load hashing_tf')
    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Load IDFModel successfully')
    except:
        logger.debug('Fail to load IDFModel')
    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Load NaiveBayesModel successfully')
    except:
        logger.debug('Fail to load NaiveBayesModel')
    try:
        logger.debug('Updating models')
        tokenizer = new_tokenizer
        hashing_tf = new_hashing_tf
        idf_model = new_idf_model
        nb_model = new_nb_model
        logger.debug('Update models successfully')
    except:
        logger.debug('Fail to update models')
    logger.debug(
        '===================================================Stopped load models===================================================')
Why save these values, and why store them in a database?
They are needed later when computing the TF-IDF profiles; persisting them avoids holding too much
in memory and makes the values reusable.

Create the table in Hive: idf_keywords_values
CREATE TABLE idf_keywords_values(
    keyword STRING comment "keyword",
    idf DOUBLE comment "idf",
    index INT comment "index");
'''
from pyspark.ml.feature import CountVectorizerModel
# cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/countVectorizerOfArticleWords.model")
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")
from pyspark.ml.feature import IDFModel
# idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDFOfArticleWords.model")
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")

keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

def func(data):
    # append each word's index and turn the idf value into a plain float
    for index in range(len(data)):
        data[index] = list(data[index])
        data[index].append(index)
        data[index][1] = float(data[index][1])

print(len(keywords_list_with_idf))
func(keywords_list_with_idf)

sc = ktt.spark.sparkContext
rdd = sc.parallelize(keywords_list_with_idf)
df = rdd.toDF(["keywords", "idf", "index"])
df.show()
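# The DDL above only creates the idf_keywords_values table. As a sketch of the persistence step the
# note describes (assuming the session behind ktt.spark is Hive-enabled and the table lives in the
# current database), the DataFrame built above could be written and read back like this; insertInto
# matches columns by position, so (keywords, idf, index) lines up with (keyword, idf, index).
df.write.insertInto("idf_keywords_values", overwrite=True)

# Later jobs can then pull the idf values from Hive instead of keeping them in memory:
ktt.spark.sql("SELECT keyword, idf, index FROM idf_keywords_values").show()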
# remove punctuation
pp_udf = udf(preprocess, ArrayType(StringType()))
words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

# remove stop words
remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
removed = remover.transform(words)

params_path = '../tmp/{}'

# Load trained hashing TF and transform
hf_path = params_path.format('hf')
hashingTF = HashingTF.load(hf_path)
featureized = hashingTF.transform(removed)

# Load trained IDF model and transform
idf_path = params_path.format('idfmodel')
idfmodel = IDFModel.load(idf_path)
result = idfmodel.transform(featureized)

# Load RF model and predict
rf_path = params_path.format('rf')
rf = RandomForestClassificationModel.load(rf_path)
prediction = rf.transform(result)

path_to_save = '../tmp/twitterstream_test_prediction.json'
prediction.write.json(path_to_save)

# test whether the JSON was written
test = spark.read.json(path_to_save)
            yield row.article_id, row.channel_id, words  # end of the segmentation() generator

# tokenization
sqlContext.sql("use article")
articleDF = sqlContext.sql("select * from article_data")
# words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(["article_id", "channel_id", "words"])
wordsDF = articleDF.rdd.mapPartitions(segmentation, 5).toDF(["article_id", "channel_id", "words"])

cv_model = CountVectorizerModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/CV.model")
idf_model = IDFModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/IDF.model")
cv_result = cv_model.transform(wordsDF)
tfidf_result = idf_model.transform(cv_result)

def func(partition):
    TOPK = 20
    for row in partition:
        # pair the term indices with their TF-IDF values and sort descending
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)
def salary_pre(request):
    sc = SparkContext('local', 'test')
    spark = SparkSession.builder.getOrCreate()
    hive_con = HiveContext(sc)
    nd_idf = IDFModel.load('hdfs://localhost:9000/ndidf')
    agg_idf = IDFModel.load('hdfs://localhost:9000/aggidf')
    model = NaiveBayesModel.load(sc, 'hdfs://localhost:9000/nymodel')
    # hive_con.sql('use zp')
    # testdata = hive_con.sql('select education,mon_wa,name,work_area,work_desp,work_exp,work_lable from `qtzp` where id=789')
    # testdataRDD = testdata.rdd.map(lambda i: Row(**{
    #     'education': new_edu_trans(i.education),
    #     'salary': mon_wa_trans(i.mon_wa),
    #     'name': i.name,
    #     'city': i.work_area,
    #     'work_desp': i.work_desp,
    #     'work_exp': i.work_exp,
    #     'work_lable': i.work_lable
    # }))
    # dataDF = testdataRDD.map(lambda i: Row(**{
    #     'salary': int(i.salary),
    #     'agg': [i.education] + [i.city] + [i.work_lable] + [i.work_exp],
    #     'name_and_desp': desp_text_division(i.name + ',' + i.work_desp)
    # })).toDF()
    # dataDF.show()
    city = request.POST.get('city')
    edu = request.POST.get('education')
    introduce = request.POST.get('introduce')
    position = request.POST.get('job')
    exp = request.POST.get('exp')
    dataRDD = sc.parallelize([[edu, city, position, exp, introduce]])
    # schema = StructType([StructField('education', StringType(), True), StructField('work_area', StringType(), True),
    #                      StructField('work_lable', StringType(), True), StructField('work_exp', StringType(), True),
    #                      StructField('work_desp', StringType(), True)])
    # rowRDD = dataRDD.map(lambda i: Row(i[0], i[1], i[2], i[3], i[4]))
    # dataDF = spark.createDataFrame(rowRDD, schema)
    # dataDF.show()
    dataDF = dataRDD.map(lambda i: Row(**{
        'education': i[0],
        'work_area': i[1],
        'work_lable': i[2],
        'work_exp': i[3],
        'work_desp': i[4]
    })).map(lambda i: Row(**{
        'education': str(new_edu_trans(i.education)),
        'city': [i.work_area],
        'work_desp': i.work_desp,
        'work_lable': [i.work_lable],
        'work_exp': [i.work_exp]
    })).map(lambda i: Row(**{
        'agg': [i.education] + i.city + i.work_lable + i.work_exp,
        'name_and_desp': desp_text_division(i.work_desp)
    })).toDF()
    dataDF.show()
    ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
    aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
    data = ndtf.transform(dataDF)
    data = aggtf.transform(data)
    idfdata = nd_idf.transform(data)
    idfdata = agg_idf.transform(idfdata)
    RDD = idfdata.rdd
    # featuresRDD = RDD.map(lambda i: (i.salary, i.ndFeatures.toArray().tolist() + i.Features_agg.toArray().tolist()))  # test
    featuresRDD = RDD.map(lambda i: i.ndFeatures.toArray().tolist() + i.Features_agg.toArray().tolist())  # production
    # featuresRDD = featuresRDD.map(lambda i: features_trans(i))  # test
    featuresRDD = featuresRDD.map(lambda i: DenseVector(i))  # production
    # result = featuresRDD.map(lambda i: model.predict(i.features)).collect()  # test
    result = featuresRDD.map(lambda i: model.predict(i)).collect()
    # result = result[0]
    spark.stop()
    sc.stop()
    city_mean = models.CSR.objects.using('db2').filter(city__contains=city)
    city_mean = city_mean[0].salary
    salary = result_trans(result[0])
    pos_mean = models.ITS.objects.using('db2').get(name=position)
    pos_mean = pos_mean.salary
    data_lst = [city_mean, pos_mean, salary]
    data_lst = json.dumps(data_lst)
    return render(request, 'salary.html',
                  {'result': result, 'position': position, 'city': city, 'edu': edu, 'exp': exp, 'data': data_lst})
def get_idf_model(self):
    from pyspark.ml.feature import IDFModel
    idf_model = IDFModel.load(self.idf_path)
    return idf_model
else:
    tfidf_dir = args.model
    documents = sc.textFile(text_file)
    docsDataFrame = spark.createDataFrame([(id, doc) for id, doc in enumerate(documents.collect())], ["id", "docs"])
    #stream = spark_streamer(sc, text_file)
    #docsDataFrame = spark.createDataFrame([(id, doc) for id, doc in stream], ["id", "docs"])
    #docsDataFrame = sc.parallelize([(id, doc) for id, doc in enumerate(documents.collect())]).toDF(["id", "docs"])
    regexTokenizer = RegexTokenizer(inputCol="docs", outputCol="words", pattern="\\W")
    tokenized = regexTokenizer.transform(docsDataFrame)
    cv = CountVectorizer(inputCol="words", outputCol="tf", minDF=2.0)
    cvTF = cv.fit(tokenized)
    tf = cvTF.transform(tokenized)
    if not args.predict:
        # fit a fresh IDF model, apply it, and save it for later prediction runs
        idf = IDF(inputCol="tf", outputCol="idf")
        idfModel = idf.fit(tf)
        tfidfData = idfModel.transform(tf)
        idfModel.save(tfidf_dir)
        print("The saved model:")
        tfidfData.show()
    else:
        # reuse the previously saved IDF model for prediction
        loadedModel = IDFModel.load(tfidf_dir)
        pred_idf_df = loadedModel.transform(tf)
        pred_idf_df.show()
    spark.stop()
try:
    # Create a dstream from the kafka topic
    directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic, {'metadata.broker.list': broker_ip})
    logger.debug('Create direct dstream from kafka successfully')
except:
    logger.debug('Unable to create dstream from kafka')

atexit.register(shutdown_hook, kafka_producer, spark)

# Load the tokenizer, hashing_tf, idf_model, nb_model and the tag/catId maps
try:
    logger.debug('Loading models')
    tokenizer = Tokenizer.load(tokenizer_file)
    hashing_tf = HashingTF.load(hashing_tf_file)
    idf_model = IDFModel.load(idf_model_file)
    nb_model = NaiveBayesModel.load(nb_model_file)
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
    local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
    catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
    logger.debug('Loaded models successfully')
except:
    logger.debug('Fail to load models')

logger.debug('Start to process data')
process_data(directKafkaStream, kafka_producer)
        # ... tail of get_prediction(rdd): apply the loaded IDF, scaler and logistic-regression models
        queries = idf_model.transform(queries)
        queries = scalerModel.transform(queries)
        preds = model.transform(queries)
        preds.select('payload', 'prediction').show()
    except:
        print('No data')


APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([('spark.app.name', APP_NAME), ('spark.executor.memory', '8g'),
                                   ('spark.cores.max', '2'), ('spark.driver.memory', '8g')])
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

ngrams = udf(to_ngram, StringType())
tokenizer = Tokenizer.load('models/Tokenizer')
vectorizer = CountVectorizerModel.load('models/Vectorizer')
idf_model = IDFModel.load('models/idf')
scalerModel = StandardScalerModel.load('models/scalerModel')
model = LogisticRegressionModel.load('models/Logistic_Regression_Model')

ssc = StreamingContext(sc, batchDuration=3)
lines = ssc.socketTextStream("localhost", 9999)
lines.foreachRDD(get_prediction)
ssc.start()
ssc.awaitTermination()
cv_model = CountVectorizerModel.load("models/CV.model") # 得出词频向量结果 cv_result = cv_model.transform(words_df) # idf from pyspark.ml.feature import IDF idf = IDF(inputCol="countFeatures", outputCol="idfFeatures") idf_model = idf.fit(cv_result) idf_model.write().overwrite().save("models/IDF.model") # tf-idf from pyspark.ml.feature import IDFModel idf_model = IDFModel.load("models/IDF.model") tfidf_result = idf_model.transform(cv_result) # 选取前20个作为关键词,此处仅为词索引 def sort_by_tfidf(partition): TOPK = 20 for row in partition: # 找到索引与IDF值并进行排序 _dict = list(zip(row.idfFeatures.indices, row.idfFeatures.values)) _dict = sorted(_dict, key=lambda x: x[1], reverse=True) result = _dict[:TOPK] for word_index, tfidf in result: yield row.article_id, row.channel_id, int(word_index), round( float(tfidf), 4)