Example #1
def processing1(request):
    if request.method!='POST':
        return HttpResponseRedirect('/sp/processing')
    else:
        progress=request.POST.get('progress')
        sc = SparkContext('local', 'test')
        spark = SparkSession.builder.getOrCreate()
        if progress=='1':
            city = request.POST.get('city')
            edu = request.POST.get('education')
            introduce = request.POST.get('introduce')
            position = request.POST.get('job')
            exp = request.POST.get('exp')
            print(city,edu,introduce,position,exp)
            explain1 = 'Step 1: form an RDD from the submitted fields, merge and transform them with map operations, tokenize the description text, and finally build a DataFrame for the next step; at this point the data looks like this'
            dataRDD = sc.parallelize([[edu, city, position, exp, introduce]])
            dataDF = dataRDD.map(lambda i: Row(**{
                'education': i[0],
                'work_area': i[1],
                'work_lable': i[2],
                'work_exp': i[3],
                'work_desp': i[4]
            })).map(lambda i: Row(**{
                'education': str(new_edu_trans(i.education)),
                'city': [i.work_area],
                'work_desp': i.work_desp,
                'work_lable': [i.work_lable],
                'work_exp': [i.work_exp]
            })).map(lambda i: Row(**{
                'agg': [i.education] + i.city + i.work_lable + i.work_exp,
                'name_and_desp': desp_text_division(i.work_desp)
            })).toDF()
            dct={}
            for i in dataDF.collect():
                dct['agg1']=i[0]
                dct['nd1']=i[1]
            # spark.stop()
            # sc.stop()
            agg_pro1 = 'Education, city, position and experience merged into one row:'
            nd_pro1 = 'Personal introduction kept as its own row:'
            agg_pro2 = 'Vector built from education, experience, position and city:'
            nd_pro2 = 'Vector built from the personal introduction:'
            explain2 = 'Step 2: convert the resulting lists into TF-IDF feature vectors with Spark ML, ready for use in the machine-learning step'
            nd_idf = IDFModel.load('hdfs://localhost:9000/nd_idf_test')
            agg_idf = IDFModel.load('hdfs://localhost:9000/agg_idf_test')
            ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
            aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
            data = ndtf.transform(dataDF)
            data = aggtf.transform(data)
            idfdata = nd_idf.transform(data)
            idfdata = agg_idf.transform(idfdata)
            for i in idfdata.collect():
                dct['agg2']=i[3]
                dct['nd2']=i[2]
            spark.stop()
            sc.stop()
            return render(request,'processing1.html',{'data':dct,'explain1':explain1,'agg_pro1':agg_pro1,'nd_pro1':nd_pro1,
                                                      'agg_pro2':agg_pro2,'nd_pro2':nd_pro2,'explain2':explain2})
Example #2
    def _load_models(self):
        hf_path = self.params_path.format('hf')
        idf_path = self.params_path.format('idfmodel')
        rf_path = self.params_path.format('rf')

        self.hashingTF = HashingTF.load(hf_path)
        self.idfmodel = IDFModel.load(idf_path)
        self.rf = RandomForestClassificationModel.load(rf_path)
Example #3
def load_model(sc,
               major,
               minor,
               model_path=config['DEFAULT']['small_model_path']):
    modelload = model_path + "small_cluster_data_" + str(major) + "_" + str(
        minor)
    model = IDFModel.load(modelload)
    return model
Example #4
def loadIDFModel(path):
    '''
    Load IDFModel
        input : - path
        output: - model [IDFModel]
    '''

    model = IDFModel.load(path)
    return model
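A possible way to use this helper; the HDFS path and the tf_df DataFrame are illustrative assumptions, not part of the original example:

# Hypothetical usage sketch: load a saved IDFModel and apply it to a DataFrame
# that already contains the term-frequency column the model was fitted on.
idf_model = loadIDFModel("hdfs://localhost:9000/models/idf_model")   # illustrative path
tfidf_df = idf_model.transform(tf_df)   # tf_df must expose the model's inputCol
tfidf_df.show()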
Example #5
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  # number of similar documents to return
    feature = "abstract"  #feature to compare

    df = sc.parallelize([(0, query)]).toDF(["id", feature])

    tokenizer = RegexTokenizer(inputCol=feature,
                               outputCol="words",
                               pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)

    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)

    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))

    # text to feature vector - TF_IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)

    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)

    # LDA Model
    lda_model = LocalLDAModel.load(output_path + "/lda_model")

    #output topics for document -> topicDistribution
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id",
                                       "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    #Load existing document
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(
        lambda x_vector: cosineSimilarity(x_vector, feature_vector),
        FloatType())
    df_Similarity = df_Document.withColumn(
        "similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select("_id", "title", "abstract",
                                                "url",
                                                "topicDistribution").collect()
Example #6
def update_models():
    # Reload the tokenizer, hashing_tf, idf_model and nb_model, then rebind
    # the module-level references so later requests use the new versions
    global tokenizer, hashing_tf, idf_model, nb_model
    logger.debug(
        '===================================================Starting load models==================================================='
    )
    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Load tokenizer model successfully')
    except:
        logger.debug('Fail to load tokenizer')

    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Load hashing_tf model successfully')
    except:
        logger.debug('Fail to load hashing_tf')

    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Load IDFModel successfully')
    except:
        logger.debug('Fail to load IDFModel')

    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Load NaiveBayesModel successfully')
    except:
        logger.debug('Fail to load NaiveBayesModel')

    try:
        logger.debug('Updating models')
        tokenizer = new_tokenizer
        hashing_tf = new_hashing_tf
        idf_model = new_idf_model
        nb_model = new_nb_model
        logger.debug('update model successfully')
    except:
        logger.debug('Fail to update models')
    logger.debug(
        '===================================================Stopped load models==================================================='
    )
Example #7
Why save these values, and why store them in a database?
They are needed later when computing the TF-IDF profiles; persisting them avoids keeping everything in memory.

Create the table in Hive: idf_keywords_values
CREATE TABLE idf_keywords_values(
keyword STRING comment "keyword",
idf DOUBLE comment "idf",
index INT comment "index");
'''
from pyspark.ml.feature import CountVectorizerModel
# cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/countVectorizerOfArticleWords.model")
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")

from pyspark.ml.feature import IDFModel
# idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDFOfArticleWords.model")
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")

keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

def func(data):
    for index in range(len(data)):
        data[index] = list(data[index])
        data[index].append(index)
        data[index][1] = float(data[index][1])

print(len(keywords_list_with_idf))
func(keywords_list_with_idf)
sc = ktt.spark.sparkContext
rdd = sc.parallelize(keywords_list_with_idf)
df = rdd.toDF(["keywords", "idf", "index"])
df.show()
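The snippet stops at df.show(); a minimal sketch of the persistence step described above, assuming ktt.spark is a Hive-enabled SparkSession and that idf_keywords_values lives in the article database:

# Hypothetical persistence step: write the (keyword, idf, index) rows into the
# Hive table created above so later TF-IDF jobs can reuse them without
# keeping everything in memory.
ktt.spark.sql("use article")               # database name is an assumption
df.write.insertInto("idf_keywords_values")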
Example #8
    #remove punctuation
    pp_udf = udf(preprocess, ArrayType(StringType()))
    words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

    #remove stop words
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    removed = remover.transform(words)

    params_path = '../tmp/{}'

    #Load trained hashing TF model and transform
    hf_path = params_path.format('hf')
    hashingTF = HashingTF.load(hf_path)
    featureized = hashingTF.transform(removed)

    #Load trained IDF model and transform
    idf_path = params_path.format('idfmodel')
    idfmodel = IDFModel.load(idf_path)
    result = idfmodel.transform(featureized)

    #load rf model and predict
    rf_path = params_path.format('rf')
    rf = RandomForestClassificationModel.load(rf_path)
    prediction = rf.transform(result)

    path_to_save = '../tmp/twitterstream_test_prediction.json'
    prediction.write.json(path_to_save)

    #test whether json is written
    test = spark.read.json(path_to_save)
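The preprocess function wrapped in pp_udf at the top of this example is not included; a minimal sketch, assuming it lower-cases the text, strips punctuation and splits on whitespace:

import re

def preprocess(text):
    # Hypothetical UDF body: remove punctuation and tokenize on whitespace,
    # returning the list of words expected by ArrayType(StringType()).
    if text is None:
        return []
    cleaned = re.sub(r"[^\w\s]", " ", text.lower())
    return cleaned.split()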
Example #9
        yield row.article_id, row.channel_id, words


# Word segmentation
sqlContext.sql("use article")
articleDF = sqlContext.sql("select * from article_data")

# words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(["article_id", "channel_id", "words"])
wordsDF = articleDF.rdd.mapPartitions(segmentation, 5).toDF(
    ["article_id", "channel_id", "words"])

cv_model = CountVectorizerModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/CV.model"
)
idf_model = IDFModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/IDF.model"
)

cv_result = cv_model.transform(wordsDF)
tfidf_result = idf_model.transform(cv_result)


def func(partition):
    TOPK = 20
    for row in partition:
        # Pair feature indices with TF-IDF values and sort by value
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(
                float(tfidf), 4)
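A likely next step, not shown in this snippet, is to apply func with mapPartitions in the same way segmentation is applied above; the output column names are assumptions:

# Hypothetical follow-up: materialize the top-K keyword rows per article.
keywordsByTFIDF = tfidf_result.rdd.mapPartitions(func).toDF(
    ["article_id", "channel_id", "index", "tfidf"])
keywordsByTFIDF.show()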
Example #10
def salary_pre(request):
    sc=SparkContext('local','test')
    spark = SparkSession.builder.getOrCreate()
    hive_con=HiveContext(sc)
    nd_idf=IDFModel.load('hdfs://localhost:9000/ndidf')
    agg_idf=IDFModel.load('hdfs://localhost:9000/aggidf')
    model=NaiveBayesModel.load(sc,'hdfs://localhost:9000/nymodel')
    # hive_con.sql('use zp')
    # testdata=hive_con.sql('select education,mon_wa,name,work_area,work_desp,work_exp,work_lable from `qtzp` where id=789')
    # testdataRDD = testdata.rdd.map(lambda i: Row(**{
    #     'education': new_edu_trans(i.education),
    #     'salary': mon_wa_trans(i.mon_wa),
    #     'name': i.name,
    #     'city': i.work_area,
    #     'work_desp': i.work_desp,
    #     'work_exp': i.work_exp,
    #     'work_lable': i.work_lable
    # }))
    # dataDF=testdataRDD.map(lambda i:Row(**{
    #     'salary': int(i.salary),
    #     'agg': [i.education] + [i.city] + [i.work_lable] + [i.work_exp],
    #     'name_and_desp': desp_text_division(i.name + ',' + i.work_desp)
    # })).toDF()
    # dataDF.show()
    city=request.POST.get('city')
    edu=request.POST.get('education')
    introduce=request.POST.get('introduce')
    position=request.POST.get('job')
    exp=request.POST.get('exp')
    dataRDD=sc.parallelize([[edu,city,position,exp,introduce]])
    # schema=StructType([StructField('education',StringType(),True),StructField('work_area',StringType(),True),StructField('work_lable',
    #         StringType(),True),StructField('work_exp',StringType(),True),StructField('work_desp',StringType(),True)])
    # rowRDD=dataRDD.map(lambda i:Row(i[0],i[1],i[2],i[3],i[4]))
    # dataDF=spark.createDataFrame(rowRDD,schema)
    # dataDF.show()
    dataDF=dataRDD.map(lambda i:Row(**{
        'education':i[0],
        'work_area':i[1],
        'work_lable':i[2],
        'work_exp':i[3],
        'work_desp':i[4]
    })).map(lambda i:Row(**{
        'education':str(new_edu_trans(i.education)),
        'city':[i.work_area],
        'work_desp':i.work_desp,
        'work_lable':[i.work_lable],
        'work_exp':[i.work_exp]
    })).map(lambda i:Row(**{
        'agg':[i.education] + i.city + i.work_lable + i.work_exp,
        'name_and_desp':desp_text_division(i.work_desp)
    })).toDF()
    dataDF.show()

    ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
    aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
    data = ndtf.transform(dataDF)
    data = aggtf.transform(data)
    idfdata = nd_idf.transform(data)
    idfdata = agg_idf.transform(idfdata)
    RDD = idfdata.rdd
    # featuresRDD = RDD.map(lambda i: (i.salary, i.ndfeatures.toArray().tolist() + i.features_agg.toArray().tolist()))  #test
    featuresRDD = RDD.map(lambda i: i.ndfeatures.toArray().tolist() + i.features_agg.toArray().tolist())      # production path
    # featuresRDD = featuresRDD.map(lambda i: features_trans(i))      #test
    featuresRDD = featuresRDD.map(lambda i: DenseVector(i))       # production path
    # result=featuresRDD.map(lambda i: model.predict(i.features)).collect()       #test
    result=featuresRDD.map(lambda i:model.predict(i)).collect()
    # result=result[0]
    spark.stop()
    sc.stop()
    city_mean=models.CSR.objects.using('db2').filter(city__contains=city)
    city_mean=city_mean[0].salary
    salary=result_trans(result[0])
    pos_mean=models.ITS.objects.using('db2').get(name=position)
    pos_mean=pos_mean.salary
    data_lst=[city_mean,pos_mean,salary]
    data_lst=json.dumps(data_lst)
    return render(request,'salary.html',{'result':result,'position':position,'city':city,'edu':edu,'exp':exp,'data':data_lst})
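Examples #1 and #10 call several helpers (new_edu_trans, desp_text_division, mon_wa_trans, result_trans) that are not part of this listing; as one illustration, a minimal sketch of what desp_text_division might look like, assuming jieba-style word segmentation of the description text:

import jieba

def desp_text_division(text):
    # Hypothetical helper: segment the free-text description into tokens so
    # HashingTF can consume it through the 'name_and_desp' column.
    return [w for w in jieba.cut(text) if w.strip()]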
Example #11
    def get_idf_model(self):
        from pyspark.ml.feature import IDFModel
        idf_model = IDFModel.load(self.idf_path)
        return idf_model
Example #12
    else:
        tfidf_dir=args.model

    documents = sc.textFile(text_file)
    docsDataFrame = spark.createDataFrame([(id, doc)
                                for id, doc in enumerate(documents.collect())],
                                ["id", "docs"])
    #stream=spark_streamer(sc,text_file)
    #docsDataFrame = spark.createDataFrame([(id, doc) for id, doc in stream], ["id", "docs"])
    #docsDataFrame=sc.parallelize([(id, doc) for id, doc in enumerate(documents.collect())]).toDF(["id", "docs"])
    regexTokenizer = RegexTokenizer(inputCol="docs", outputCol="words", pattern="\\W")
    tokenized = regexTokenizer.transform(docsDataFrame)

    cv = CountVectorizer(inputCol="words", outputCol="tf", minDF=2.0)
    cvTF=cv.fit(tokenized)
    tf=cvTF.transform(tokenized)

    if not args.predict:
        idf = IDF(inputCol="tf", outputCol="idf")
        idfModel = idf.fit(tf)
        tfidfData = idfModel.transform(tf)
        idfModel.save(tfidf_dir)
        print "The saved model:"
        print tfidfData.show()
    else:
        loadedModel = IDFModel.load(tfidf_dir)
        pred_idf_df = loadedModel.transform(tf)
        pred_idf_df.show()

    spark.stop()
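This example references sc, spark, text_file and args without defining them; a minimal sketch of that setup, where only the --model and --predict attributes are taken from the code above and everything else is an assumption:

import argparse
from pyspark.sql import SparkSession

parser = argparse.ArgumentParser()
parser.add_argument("--model", help="directory holding the saved IDFModel")
parser.add_argument("--predict", action="store_true",
                    help="load an existing model instead of fitting a new one")
parser.add_argument("--input", dest="text_file", help="input text file")  # hypothetical flag
args = parser.parse_args()

spark = SparkSession.builder.appName("tfidf-example").getOrCreate()
sc = spark.sparkContext
text_file = args.text_file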
Example #13
	try:
		# Create dstream from kafka topic
		directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic, {'metadata.broker.list': broker_ip})
		logger.debug('Create direct dstream from kafka successfully')
	except:
		logger.debug('Unable to create dstream from kafka')

	atexit.register(shutdown_hook, kafka_producer, spark)

	# Load the tokenizer, hashing_tf, idf_model, nb_model and the tag/catId maps
	try:
		logger.debug('Loading models')
		tokenizer = Tokenizer.load(tokenizer_file)
		hashing_tf = HashingTF.load(hashing_tf_file)
		idf_model = IDFModel.load(idf_model_file)
		nb_model = NaiveBayesModel.load(nb_model_file)
		selected_tags = pd.read_csv(selected_tags_file, header=None)
		local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
		local_tags_to_catId=dict(zip(selected_tags[0], list(selected_tags.index)))
		catId_to_tags = sc.broadcast(local_catId_to_tags)
		tags_to_catId = sc.broadcast(local_tags_to_catId)
		tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
		catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
		logger.debug('loaded models successfully')
	except:
		logger.debug('Fail to load models')


	logger.debug('Start to process data')
	process_data(directKafkaStream, kafka_producer)
Example #14
        queries = idf_model.transform(queries)
        queries = scalerModel.transform(queries)
        preds = model.transform(queries)
        preds.select('payload', 'prediction').show()

    except:
        print('No data')


APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([('spark.app.name', APP_NAME),
                                   ('spark.executor.memory', '8g'),
                                   ('spark.cores.max', '2'),
                                   ('spark.driver.memory', '8g')])
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

ngrams = udf(to_ngram, StringType())
tokenizer = Tokenizer.load('models/Tokenizer')
vectorizer = CountVectorizerModel.load('models/Vectorizer')
idf_model = IDFModel.load('models/idf')
scalerModel = StandardScalerModel.load('models/scalerModel')
model = LogisticRegressionModel.load('models/Logistic_Regression_Model')
ssc = StreamingContext(sc, batchDuration=3)
lines = ssc.socketTextStream("localhost", 9999)

lines.foreachRDD(get_prediction)

ssc.start()
ssc.awaitTermination()
Example #15
cv_model = CountVectorizerModel.load("models/CV.model")
# Compute the term-frequency vectors
cv_result = cv_model.transform(words_df)

# idf
from pyspark.ml.feature import IDF

idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idf_model = idf.fit(cv_result)
idf_model.write().overwrite().save("models/IDF.model")

# tf-idf
from pyspark.ml.feature import IDFModel

idf_model = IDFModel.load("models/IDF.model")
tfidf_result = idf_model.transform(cv_result)


# Take the top 20 as keywords; at this point these are only word indices
def sort_by_tfidf(partition):
    TOPK = 20
    for row in partition:
        # Pair feature indices with TF-IDF values and sort by value
        _dict = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _dict = sorted(_dict, key=lambda x: x[1], reverse=True)
        result = _dict[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(
                float(tfidf), 4)
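As the comment notes, sort_by_tfidf yields word indices rather than words; a minimal sketch of resolving those indices back to keywords through cv_model.vocabulary, assuming words_df carries article_id and channel_id columns as in the earlier examples:

# Hypothetical follow-up: map the top-K word indices back to the actual keywords.
vocabulary = cv_model.vocabulary
for article_id, channel_id, word_index, tfidf in \
        tfidf_result.rdd.mapPartitions(sort_by_tfidf).collect():
    print(article_id, channel_id, vocabulary[word_index], tfidf)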