Example No. 1
    def transform(self, train_data, is_test=False):
        model = Word2VecModel.load(self.model_path)
        item2vec = model.getVectors()
        train_data_seq = transform_trainable(train_data, test=is_test)

        train_data_seq = model.transform(train_data_seq)
        # step 5: get the click sequence
        train_data_click = train_data.filter(
            "action_type='clickout item'").select('user_id', "session_id",
                                                  'timestamp', 'step',
                                                  'reference', 'impressions')
        train_data_click = train_data_click.withColumn(
            'impressions', F.split(train_data.impressions,
                                   r'\|')).withColumn("impressions",
                                                     F.explode("impressions"))

        cond = train_data_click.impressions == item2vec.word
        df_out = train_data_click.join(item2vec, cond, how='left').select(
            'user_id', "session_id", 'timestamp', 'step', 'reference',
            'impressions', 'vector')
        df_out = df_out.join(train_data_seq,
                             df_out.session_id == train_data_seq.session_id,
                             how='left').drop(train_data_seq.session_id)

        # step 6: compute the similarity between the session vector and each impression vector, then rank
        df_out = df_out.withColumn('sim', getCosinDis('vector', 'item2vec'))
        df_out = df_out.withColumn("sim", df_out.sim.cast('float')).withColumn(
            "rank",
            F.rank().over(
                Window.partitionBy("session_id", 'timestamp',
                                   "step").orderBy("sim")))

        return df_out
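The snippet above relies on a getCosinDis UDF that is not shown. A minimal sketch of such a cosine-similarity UDF, assuming both input columns hold Spark ML vectors, could look like this:

# Hypothetical helper, not part of the original example.
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

@F.udf(returnType=DoubleType())
def getCosinDis(v1, v2):
    # Cosine similarity between two Spark ML vectors; returns None for missing rows.
    if v1 is None or v2 is None:
        return None
    a, b = v1.toArray(), v2.toArray()
    denom = (a.dot(a) ** 0.5) * (b.dot(b) ** 0.5)
    return float(a.dot(b) / denom) if denom else None
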
Example No. 2
def startup():
    # Download the model
    import findspark
    findspark.init()

    import pyspark # only run after findspark.init()
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    print("Spark session started\n\n")

    aws_model_cmd = f"aws s3 cp --recursive s3://{bucket_name}/{s3_dir} {local_dir}/{s3_dir}"
    # os.system(aws_model_cmd)

    # Download the vectors
    aws_vector_cmd = f"aws s3 cp --recursive s3://{bucket_name}/{title_vector} {local_dir}/{title_vector}"
    # os.system(aws_vector_cmd)

    figures_cmd = f"aws s3 cp --recursive s3://{bucket_name}/{figures} dashboard/templates/static"
    os.system(figures_cmd)

    # Load the Model and data
    saveword2vec_path = f"{local_dir}/{s3_dir}"
    model_word2vec = Word2VecModel.load(saveword2vec_path)
    title_vectors_df = spark.read.parquet(f"{local_dir}/{title_vector}")

    return spark, model_word2vec, title_vectors_df
Example No. 3
def main():
    db_con = init_sync()
    if not os.path.exists('model'):
        if not os.path.exists('data_text'):
            print("Папка создана")
            os.mkdir('data_text')
        save_txt.save_text_db_to_txt(db_con)

        word2vec.create_w2v_model()

    persons = get_persons(db_con)
    places = get_places(db_con)

    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .getOrCreate()

    model = Word2VecModel.load(PATH)

    pprint("Поиск контекстных синонимов персон:")
    persons_synonyms = get_synonyms(persons, 5, model, spark)
    insert_to_persons_synonyms(db_con, persons, persons_synonyms)
    print_elem(persons, persons_synonyms)

    pprint("Поиск контекстных синонимов достопримечательностей:")
    places_synonyms = get_synonyms(places, 5, model, spark)
    insert_to_places_synonyms(db_con, places, places_synonyms)
    print_elem(places, places_synonyms)

    spark.stop()
Example No. 4
def loadModel(path):
    '''
    Load Word2Vec model
        Input : - path
        Output : - model [Word2Vec model data frame]
    '''

    model = Word2VecModel.load(path)
    return model
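A hypothetical use of this helper (the path and query word are placeholders) might be:

# Illustration only; the path and word are placeholders.
model = loadModel("hdfs:///models/word2vec")
model.findSynonyms("coffee", 5).show(truncate=False)
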
Example No. 5
 def load_model(
     self,
     cloud_path: str,
     model_name: str,
 ) -> Word2VecModel:
     """
     Load a previously saved Word2Vec model object to memory.
     """
     if not model_name.endswith(".sparkml"):
         model_name += ".sparkml"
     self.model = Word2VecModel.load(cloud_path + "/" + model_name)
     return self.model
Example No. 6
 def get_vectors_for_df(self, df):
     """
     Note: This method is used to get the vectors from the text in a dataframe.
     Author: Aliabbas Bhojani
     """
     logging.info("Getting the vectors for the given dataframe")
     if os.path.exists(self.model_dir + self.model_name):
         model = Word2VecModel.load(self.model_dir + self.model_name + "/")
         output_df = model.transform(df)
         return output_df
     else:
         logging.info("No Models Found, retrain the model")
Example No. 7
    def configure_processing(self):
        """ A function to operate on input raw data generates report in desired format. 
        
        """
        # Load report making data
        print "============ NLP methods starts on report data ==================="
        df = helper_method_process_data(self.total_data, self.spark_context,
                                        self.sql_context)
        print "============ NLP methods finished on report data ==================="

        # load pre-generated model
        print "============= Loading pre-build model ======================="
        model_path = self.model_save_path + self.model_date + "/"
        model = Word2VecModel.load(model_path)
        print "====================== Model loading completed =============="

        report_data = df.rdd.map(lambda a: a.asDict()).collect()
        df.unpersist()
        del df

        ### report the result
        count_id = 1
        self.es = Elasticsearch([{
            'host': self.es_output_host,
            'port': self.es_output_port
        }])

        print "============ Report seeding to Elastic Search start ==========="
        output_index_name = self.es_output_index + '-' + datetime.now(
        ).strftime("%Y-%m-%d-%H-%M-%S")
        print "============ Result will be pushed in index:", output_index_name, "=============="

        for one_line_report in report_data:
            if len(one_line_report['features']) < 3:
                continue

            for word2vec_keyword in list(set(one_line_report['features'])):
                try:
                    synonyms = model.findSynonyms(word2vec_keyword, 10)

                    self.send_result_to_elasticsearch(word2vec_keyword,
                                                      synonyms, count_id,
                                                      output_index_name)
                    count_id = count_id + 1
                except Exception as e:
                    '''
                    Reached when the model does not contain the current word.'''
                    print e.message
                    pass
        print "================== Report seeding Ended============="
Example No. 8
def get_synonyms(word: str, count: int):
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .getOrCreate()

    model = Word2Vec.load('/home/pok/sem/project/models/model0mincount/model')
    model_fitted = Word2VecModel.load(
        '/home/pok/sem/project/models/model0mincount/fitted')
    synonyms = model_fitted.findSynonyms(word, count)
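    # fmt is presumably pyspark.sql.functions.format_number, imported elsewhere in the original module.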
    synonyms.select("word", fmt("similarity", 5).alias("similarity")).show()

    spark.stop()

    return synonyms
Example No. 9
def main():
    if not os.path.exists('model'):
        save_txt.save_text_db_to_txt()
        word2vec.create_w2v_model()

    with SparkSession.builder.appName(
            "SimpleApplication").getOrCreate() as spark_session:
        model = Word2VecModel.load(PATH)

        persons = mongo.selectAll('persons')
        places = mongo.selectAll('places')

        pprint("Поиск контекстных синонимов персон:")
        persons_synonyms = word2vec.find_synonyms(persons, model,
                                                  spark_session)
        pprint(persons_synonyms)

        pprint("Поиск контекстных синонимов достопримечательностей:")
        places_synonyms = word2vec.find_synonyms(places, model, spark_session)
        pprint(places_synonyms)
Example No. 10
def init():
    ''' Initialize libraries. '''
    print('Initializing...')
    sconf = pyspark.SparkConf().setAll([
        ('spark.executor.memory', config.spark_executor_memory),
        ('spark.executor.instances', config.spark_executor_instances),
        ('spark.executor.cores', config.spark_executor_cores),
        #('spark.cores.max', config.spark_cores_max),
        ('spark.driver.memory', config.spark_driver_memory),
        ('spark.master', config.spark_master),
    ])

    global spark, df_all, w2v_model

    spark = SparkSession.builder.appName('similarity2').config(
        conf=sconf, ).getOrCreate()
    spark.sparkContext.setLogLevel(config.spark_log_level)

    df_all = spark.read.parquet(config.input_dir).sample(
        withReplacement=False,
        fraction=config.spark_fraction,
        seed=config.spark_seed)
    w2v_model = Word2VecModel.load(config.model_file)
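A hypothetical use of init() afterwards, relying on the globals it sets (the query word is a placeholder):

# Illustration only; 'example' is a placeholder query word.
init()
w2v_model.findSynonyms('example', 5).show()
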
Example No. 11
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl")

    tokenizer = Tokenizer(inputCol="skillText", outputCol="words")
    tokenized = tokenizer.transform(df_categories)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    stripped = removed.select('filtered').rdd.map(lambda x: strip_punctuation(x[0]))\
    .map(lambda x: Row(filtered=x)).toDF(['filtered'])

    # word2vec = Word2Vec(vectorSize=100, inputCol="filtered", outputCol="result")
    # model = word2vec.fit(stripped)
    #model.save("word2vec-model")
    model = Word2VecModel.load("word2vec-model")
    synonyms = model.findSynonyms(sys.argv[1], 10)
    synonyms.show(truncate=False)
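The strip_punctuation helper used above is not included in the snippet. A minimal sketch, assuming it takes and returns a list of tokens, could be:

# Hypothetical helper, not part of the original example.
import string

def strip_punctuation(words):
    # Strip punctuation characters from each token and drop tokens that end up empty.
    table = str.maketrans('', '', string.punctuation)
    return [w.translate(table) for w in words if w.translate(table)]
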
Example No. 12
import math
from pyspark.ml.feature import Word2VecModel
from pyspark_ml.app_root import get_root_path
from pyspark.ml.feature import PCA
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans
from mpl_toolkits.mplot3d import Axes3D

project_root_path = get_root_path()
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('evel_word2vec').getOrCreate()

#load model
modelPath = project_root_path + "/models/word2vec-model"
loadedModel = Word2VecModel.load(modelPath)
# loadedModel.findSynonyms('beijing',15).show(truncate=False)
wordVectorsDF = loadedModel.getVectors()
vocabSize = wordVectorsDF.count()
print("Vocabulary Size: ", vocabSize)
loadedModel.findSynonyms('coffee', 3).show(truncate=False)

# loadedModel.getVectors().show(truncate=False)

dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector', 'features')

numComponents = 3
pca = PCA(k=numComponents, inputCol='features', outputCol='pcaFeatures')
model = pca.fit(dfW2V)
dfComp = model.transform(dfW2V).select("pcaFeatures")
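The imports of KMeans and matplotlib above suggest the original script goes on to cluster and plot the PCA-reduced vectors, but the snippet ends here. A rough continuation sketch, with k=8 as an assumption, might be:

# Hypothetical continuation; the cluster count is an assumption.
kmeans = KMeans(k=8, seed=1, featuresCol='pcaFeatures', predictionCol='cluster')
kmeansModel = kmeans.fit(dfComp)
clustered = kmeansModel.transform(dfComp)
clustered.groupBy('cluster').count().show()
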
Example No. 13
num_iterations = sys.argv[5]
vector_size = sys.argv[6]
debug_flag = sys.argv[7]
relations_result_file = config.get('DataSection', 'relations_result_file')


spark = SparkSession \
    .builder \
    .appName("WikiFindSynonyms") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.default.parallelism", "4") \
    .getOrCreate()

model = Word2VecModel.load(sys.argv[1])
modelDF = model.getVectors()
modelDF.show()
print("Total number of records in modelDF = %d" % modelDF.count())
modelDF = modelDF.repartition(1000, 'word')

testDataDF = spark.read.csv(sys.argv[2], header=True)
testDataDF.show()
print("Total number of records in testDataDF = %d" % testDataDF.count())

testDataDF = testDataDF.join(modelDF, testDataDF.word1 == modelDF.word,'inner')\
                    .select(testDataDF.word1, testDataDF.word2, \
                            testDataDF.word3, modelDF.vector)
testDataDF.show()
testDataDF = testDataDF.withColumnRenamed('vector', 'vec1')
Example No. 14
PATH = 'model/kurs_model/'

from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2VecModel
from pprint import pprint

test_words = [
    "дом", "бочаров", "документ", "деньги", "гараж", "волга", "дума", "закон",
    "полиция", "врач"
]

spark = SparkSession \
    .builder \
    .appName("SimpleApplication") \
    .getOrCreate()

model = Word2VecModel.load(PATH)

pprint(
    "Контекстные синонимы слов, полученные из модели, обученной на статьях:")

for test_word in test_words:
    pprint("-" * 20)
    pprint(test_word)
    result = model.findSynonyms(test_word, 5).collect()
    for el in result:
        pprint(el[0])

spark.stop()
Example No. 15
    def load_model(self, path_to_model):
        """Load Word2Vec model from model_path."""
        from pyspark.ml.feature import Word2VecModel

        w2vec_model = Word2VecModel.load(path_to_model)
        return w2vec_model
Example No. 16
end = time.time()
print("数据拆分用时:{}".format(end - start))
"""
对主治功能总字段进行词转向量
"""

start = time.time()
# Create a Word2Vec object and specify its configuration (vector size, input column, output column)
# word2Vec = Word2Vec(vectorSize=300, minCount=0, inputCol="d_func", outputCol="d_func_result")
# Use the Word2Vec object to transform the specified column
# hypertension_model = word2Vec.fit(manbing)

# Persist the model (a time-consuming operation, run only once)
# hypertension_model.save("./WV_model/")

model = Word2VecModel.load("./WV_model/")

# Apply the word-to-vector transformation to the training set
manbing = model.transform(train_set)
end = time.time()
print("词转向量用时:{}".format(end - start))
"""
训练数据构建模型
"""

start = time.time()
# Specify the feature columns from the source data
assembler = VectorAssembler(inputCols=["d_func_result"], outputCol="features")
# The assembler is a transformer that combines multiple columns into a single vector column (a type a decision tree can work with)
# train_set2 = assembler.transform(manbing)
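The comments point toward training a decision tree on the assembled features, but the snippet stops before that step. A rough sketch of the implied continuation, where DecisionTreeClassifier and the "label" column are assumptions, could be:

# Hypothetical continuation; the classifier choice and label column are assumptions.
from pyspark.ml.classification import DecisionTreeClassifier

train_set2 = assembler.transform(manbing)
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dt_model = dt.fit(train_set2)
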
Example No. 17
dataset = dataset.withColumn("month", (F.col("month") - 1) / (12 - 1))
dataset = dataset.withColumn("day", (F.col("day") - 1) / (31 - 1))
dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0))
dataset = dataset.withColumn("minute", (F.col("minute") - 0) / (59 - 0))
dataset = dataset.withColumn("second", (F.col("second") - 0) / (59 - 0))

# Word2Vec

dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'), F.array('msin'),
             F.array('tac'), F.array('snr')))

word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorAssembler

sizeHint = VectorSizeHint(inputCol="vcategorical",
                          handleInvalid="skip",
                          size=50)
dataset = sizeHint.transform(dataset)

vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(
    base_path)
vector_assembler = VectorAssembler.load(vector_assembler_output_path)
dataset = vector_assembler.transform(dataset)

# Classification
Example No. 18
    def compute_article_similar(self, articleProfile):
        """
        Compute the similarity between incremental (new) articles and historical articles using word2vec
        :return:
        """

        # Get the channel categories of the new articles to update (not used)
        # all_channel = set(articleProfile.rdd.map(lambda x: x.channel_id).collect())
        def avg(row):
            x = 0
            for v in row.vectors:
                x += v
            # Use the average vector as the article's vector
            return row.article_id, row.channel_id, x / len(row.vectors)

        for channel_id, channel_name in CHANNEL_INFO.items():

            profile = articleProfile.filter(
                'channel_id = {}'.format(channel_id))
            wv_model = Word2VecModel.load(
                "hdfs://hadoop-master:9000/headlines/models/channel_%d_%s.word2vec"
                % (channel_id, channel_name))
            vectors = wv_model.getVectors()

            # Compute the vectors
            profile.registerTempTable("incremental")
            articleKeywordsWeights = self.spark.sql(
                "select article_id, channel_id, keyword, weight from incremental LATERAL VIEW explode(keywords) AS keyword, weight where channel_id=%d"
                % channel_id)

            articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
                vectors, vectors.word == articleKeywordsWeights.keyword,
                "inner")
            articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
                lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.
                           vector)).toDF([
                               "article_id", "channel_id", "keyword",
                               "weightingVector"
                           ])

            articleKeywordVectors.registerTempTable("tempTable")
            articleVector = self.spark.sql(
                "select article_id, min(channel_id) channel_id, collect_set(weightingVector) vectors from tempTable group by article_id"
            ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

            # Write to the database
            def toArray(row):
                return row.article_id, row.channel_id, [
                    float(i) for i in row.articleVector.toArray()
                ]

            articleVector = articleVector.rdd.map(toArray).toDF(
                ['article_id', 'channel_id', 'articleVector'])
            articleVector.write.insertInto("article_vector")

            import gc
            del wv_model
            del vectors
            del articleKeywordsWeights
            del articleKeywordsWeightsAndVectors
            del articleKeywordVectors
            gc.collect()

            # Get historical data, convert it to a fixed format, and use LSH to find similar articles
            train = self.spark.sql(
                "select * from article_vector where channel_id=%d" %
                channel_id)

            def _array_to_vector(row):
                return row.article_id, Vectors.dense(row.articleVector)

            train = train.rdd.map(_array_to_vector).toDF(
                ['article_id', 'articleVector'])
            test = articleVector.rdd.map(_array_to_vector).toDF(
                ['article_id', 'articleVector'])

            brp = BucketedRandomProjectionLSH(inputCol='articleVector',
                                              outputCol='hashes',
                                              seed=12345,
                                              bucketLength=1.0)
            model = brp.fit(train)
            similar = model.approxSimilarityJoin(test,
                                                 train,
                                                 2.0,
                                                 distCol='EuclideanDistance')

            def save_hbase(partition):
                import happybase
                # HBase table article_similar: row key article_id, column similar:<article_id>, value sim
                pool = happybase.ConnectionPool(size=3, host='hadoop-master')
                with pool.connection() as conn:
                    table = conn.table("article_similar")
                    for row in partition:
                        if row.datasetA.article_id == row.datasetB.article_id:
                            continue
                        table.put(
                            str(row.datasetA.article_id).encode(), {
                                b"similar:%d" % row.datasetB.article_id:
                                b"%0.4f" % row.EuclideanDistance
                            })

            similar.foreachPartition(save_hbase)
        "cleaned_tweets",
        regexp_replace(col("tweets"), "http.+|@.|\n|RT|\d+", ' '))
    # All words are lowercase and tokenized
    tweets_df = RegexTokenizer(inputCol="cleaned_tweets",
                               outputCol="lowercase_tweets",
                               pattern="\\W").transform(tweets_df)
    # We remove the StopWords
    tweets_df = StopWordsRemover(
        inputCol="lowercase_tweets",
        outputCol="processed_tweets").transform(tweets_df)
    # We drop the unused columns
    tweets_df = tweets_df.drop("cleaned_tweets", "lowercase_tweets", "lang",
                               "date")
    # We load the language model
    model_path = "s3://" + bucket_name + "/models/w2v_model"
    loaded_model = Word2VecModel.load(model_path)
    # We add the output columns : it is the average of the words' vectors for each tweet
    tweets_df = loaded_model.transform(tweets_df)

    # We load the classifier
    clf_path = "s3://" + bucket_name + "/models/mpc_model"
    loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path)
    predictions = loaded_clf.transform(tweets_df)

    # We keep the probability only for the predicted sentiment
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
    predictions = predictions.withColumn("probability",
                                         to_array("probability"))
    predictions = predictions.withColumn("probability",
                                         array_max("probability"))
Example No. 20
    # step 1: convert item_id to a sequence per session
    train_data_seq = transform_trainable(train_data, test=False)

    # step 2: train and save the model
    word2Vec = Word2Vec(vectorSize=100,
                        seed=42,
                        minCount=2,
                        inputCol="reference_list",
                        outputCol="doc2vec_spark")
    model = word2Vec.fit(train_data_seq)

    model_path = '/team/cmp/hive_db/cmp_tmp/dl_model_template/recdata/rec_models'
    model.write().overwrite().save(os.path.join(model_path, "item2vec.model"))

    # step 3: load the model
    model = Word2VecModel.load(os.path.join(model_path, "item2vec.model"))
    # step 4: get the vector for each item and session

    item2vec = model.getVectors()
    train_data_seq = model.transform(train_data_seq)

    # step 5: get the click sequence
    train_data_click = train_data.filter("action_type='clickout item'").select(
        'user_id', "session_id", 'timestamp', 'step', 'reference',
        'impressions')
    train_data_click = train_data_click.withColumn(
        'impressions', F.split(train_data.impressions,
                               r'\|')).withColumn("impressions",
                                                 F.explode("impressions"))

    cond = train_data_click.impressions == item2vec.word
Example No. 21
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # it is a custom word or an English word
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)  # strip tag/markup data
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


'''
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])
words_df.show()

# train word2vec directly
w2v_model = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)

model = w2v_model.fit(words_df)
model.save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
'''

# 1. Load a channel's model and get the vector for each word
from pyspark.ml.feature import Word2VecModel
channel_id = 18
channel = "python"
wv_model = Word2VecModel.load(
    "hdfs://hadoop-master:9000/headlines/models/word2vec_model/channel_%d_%s.word2vec"
    % (channel_id, channel))
vectors = wv_model.getVectors()
vectors.show()
Example No. 22
def Validate(ngrams \
			, sampleSizes \
			, ctxSize \
			, sqc \
			, seqs \
			, outFile \
			, minval \
			, maxval \
			, avg \
			, nlines):

	accuracy = []
	gramSize = GramSize(ctxSize, lookahead)

	c1 = (((maxval - minval) * 1.0) / nlines) / avg
	c2 = ((minval * 1.0) / nlines) / avg
	print seqs.count()
				


	ngrams = ngrams.repartition(1 << nPartLog)
	ngrams.cache()

	#we will validate separately for each vector size
	for vecSize in vecSizes:
		print '======TESTING FOR VECTOR SIZE', vecSize
		#start fresh
		old_ngrams = ngrams
		ngrams = ngrams.withColumn('correct', lit(0))



		#use models from each sample
		modelId = 0
		for sampleSize in sampleSizes:

			w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
			lrmodels = []
			for dim in range(0, vecSize):
				lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))

			success = 0
			fail = 0
			unopt = 0

			#add columns to store model success and failure
			modelSucc = 'succ_' + str(modelId)
			modelFail = 'fail_' + str(modelId)
			modelUnopt = 'unopt_' + str(modelId)
			seqs = seqs.withColumn(modelSucc, lit(0)) \
						.withColumn(modelFail, lit(0)) \
						.withColumn(modelUnopt, lit(0))
			modelId = modelId + 1



			ngrams = ngrams \
				.withColumn('predSeq', lit(''))

			#create initial feature vector
			#transform each word into a cluster center
			words, d, centers = ClusterWords(w2v \
											, seqs \
											)
		
			#record correctness for this model only
			old_ngrams = ngrams
			ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0))

			for nextPos in range(0,lookahead):
				#build the feature vector
				ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,)

				#build the prediction vector
				ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)


			

				#now assign a cluster id to each prediction vector
				old_ngrams = ngrams
				ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector')
				
				
				#get the predicted word
				ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \
								.drop('cluster') #\

				#calculate the cosine similarity between prediction vector and center vector 
				epsilon = 0.0001
				def CosineSimi (v1, v2):
					d1 = DenseVector(v1)
					d2 = DenseVector(v2)
					n1 = d1.norm(2)
					n2 = d2.norm(2)
					return float(d1.dot(d2) / (n1 * n2))
				cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
				ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector'))
				ngrams = ngrams.drop('centerVector').drop('predictionVector')


				#update predicted sequence
				ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word')) 
				ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))


				#get actual sequence
				ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1)


				#now get the cluster id for the predicted word in the sentence
				ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams')
				ngrams = centers.transform(ngrams).drop('vector')

				#and host latency for actual word
				ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
						.drop('word') \
						.drop('centerVector') #\
				
				
			
				#record correctness
				ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
				ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct)) 




				#get overall correctness
				ngrams = ngrams.withColumn('correct', greatest('sample_correct', 'correct'))

				#get binary correctness
				ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
				ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
				ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi))


				ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq))
				ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
				ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
				ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0))
				ngrams = ngrams.drop('simi')

				#now summarize success and failure rates by predicted sequence
				seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'), sum('fail_wt').alias('fail_wt'), sum('unopt_wt').alias('unopt_wt'))

				#update sequences table
				seqs = seqWts.join(broadcast(seqs), seqWts.predSeq==seqs.word, 'right_outer').drop('predSeq').fillna(-c2/c1, ['succ_wt', 'fail_wt', 'unopt_wt'])


				scaleback = udf(lambda s: float(s*c1 + c2), DoubleType())
				seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
				seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
				seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
				seqs.cache()

				aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
				aggregated.cache()
				new_success = aggregated.head()['sum(' + modelSucc + ')']
				new_fail = aggregated.head()['sum(' + modelFail + ')']
				new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
				print nextPos, new_success - success, new_fail - fail, new_unopt - unopt 
				success = new_success
				fail = new_fail
				unopt = new_unopt


		#end for testing for each model for a particular vector size

	#end for each vector size


	seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))


	return accuracy
Example No. 23
    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # strip tag/markup data
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words

words_df = article_data.rdd.mapPartitions(segmentation).toDF(["article_id","channel_id","words"])
words_df.show()

# Train word2vec for channel #17
# w2v_model = Word2Vec(vectorSize=100,inputCol='words',outputCol='model',minCount=3)
# model = w2v_model.fit(words_df)
# model.save("hdfs://hadoop1:9000/headlines/models/word2vec_model_17")

# 1. Load the model and get the vector for each word
from pyspark.ml.feature import Word2VecModel
wv = Word2VecModel.load("hdfs://hadoop1:9000/headlines/models/word2vec_model_17")
vectors = wv.getVectors()
vectors.show()

# 2. Get the channel's article profiles, take the profile keywords, and fetch the word vectors for these 20 keywords
article_profile = w2v.spark.sql("select * from article_profile where channel_id=17 limit 10")

# 3. Compute the vector for each article keyword: use explode on the keywords map to get keyword and weight
article_profile.registerTempTable('profile')
keyword_weight = w2v.spark.sql("select article_id,channel_id,keyword,weight from profile "
                               "LATERAL VIEW explode(keywords) as keyword,weight")
keyword_weight.show()

# 4. Join the article profile's keyword_weight with the w2v vectors to get the word vectors of each article's 20 keywords
_keywords_vector = keyword_weight.join(vectors,vectors.word==keyword_weight.keyword,how='inner')
_keywords_vector.show()
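A likely next step, mirroring Examples 18 and 26 on this page, is to weight each keyword's vector by its profile weight; a sketch might be:

# Sketch of the follow-up step seen in the other examples; not part of this snippet.
keywords_vector = _keywords_vector.rdd.map(
    lambda row: (row.article_id, row.channel_id, row.keyword,
                 row.weight * row.vector)).toDF(
    ["article_id", "channel_id", "keyword", "weightingVector"])
keywords_vector.show()
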
Example No. 24
    ['article_id', 'channel_id', 'words'])
print("分词数据", words_df.take(10))

# 2. Train word2vec on the tokenized data
from pyspark.ml.feature import Word2Vec

w2v_model = Word2Vec(vectorSize=100,
                     inputCol='words',
                     outputCol='vector',
                     minCount=3)
model = w2v_model.fit(words_df)
model.write().overwrite().save("models/word2vec_model/python.word2vec")

from pyspark.ml.feature import Word2VecModel

w2v_model = Word2VecModel.load("models/word2vec_model/python.word2vec")
vectors = w2v_model.getVectors()
vectors.show()

# 3. Keyword extraction (TF-IDF)
# tf-idf
# term frequency, i.e. tf
from pyspark.ml.feature import CountVectorizer

# vocabSize is the total vocabulary size; minDF is the minimum number of documents a term must appear in
cv = CountVectorizer(inputCol="words",
                     outputCol="countFeatures",
                     vocabSize=200 * 10000,
                     minDF=1.0)
# Train the term-frequency (CountVectorizer) model
cv_model = cv.fit(words_df)
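The comments sketch a TF-IDF pipeline, but the snippet stops after fitting the CountVectorizer. A plausible continuation, with the IDF step and column names as assumptions, is:

# Hypothetical continuation of the TF-IDF pipeline; column names are assumptions.
from pyspark.ml.feature import IDF

cv_result = cv_model.transform(words_df)
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idf_model = idf.fit(cv_result)
tfidf_result = idf_model.transform(cv_result)
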
        .config("spark.rpc.message.maxSize", "2047") \
        .config("spark.sql.catalogImplementation", "in-memory") \
        .config("spark.dynamicAllocation.enabled", "false") \
        .getOrCreate()

    sc = spark.sparkContext
else:
    spark = None
    sc = None

# load model
if args.modelType == "glint":
    from ml_glintword2vec import ServerSideGlintWord2VecModel
    model = ServerSideGlintWord2VecModel.load(args.modelPath)
elif args.modelType == "ml":
    model = Word2VecModel.load(args.modelPath)
else:
    model = GensimWord2Vec.load(args.modelPath)

# get required vectors with model
words1, words2, wordvecs1, wordvecs2 = words_and_vecs_from_csv(
    spark, model, args.csvPath)
simlex_wordvecs = wordvecs_from_simlex(spark, model, args.language)
ws353_wordvecs = wordvecs_from_wordsim353(spark, model, args.language)
predicted_synonyms = word_synonyms(words1, model)
predicted_words2 = word_analogies(wordvecs1, wordvecs2, words1, words2)

# stop model
if args.modelType == "glint":
    model.stop()
Example No. 26
    def compute_article_similar(self, articleProfile):
        """
        Compute the similarity between incremental (new) articles and historical articles
        :param articleProfile:
        :return:
        """
        from pyspark.ml.feature import Word2VecModel

        def avg(row):
            x = 0
            for v in row.vectors:
                x += v
            return row.article_id, row.channel_id, x / len(row.vectors)

        for channel_id, channel_name in CHANNEL_INFO.items():
            profile = articleProfile.filter(
                'channel_id = {}'.format(channel_id))
            wv_model = Word2VecModel.load(
                "hdfs://hadoop1:9000/headlines/models/channel_%d_%s.word2vec" %
                (channel_id, channel_name))

            vectors = wv_model.getVectors()

            # Compute the vectors
            profile.registerTempTable("incremental")
            articleKeywordsWeights = self.spark.sql(
                "select article_id,channel_id,keyword,weight from incremental\
                                    LATERAL VIEW explode(keywords) as keyword,weight"
            )

            articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
                vectors, vectors.word == articleKeywordsWeights.keyword,
                "inner")
            articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
                lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.
                           vector)).toDF([
                               "article_id", "channel_id", "keyword",
                               "weightVector"
                           ])

            articleKeywordVectors.registerTempTable("tempTable")
            articleVector = self.spark.sql(
                "select article_id, min(channel_id) channel_id, collect_set(weightVector) vectors from temptable group by article_id"
            ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

            # Write to the Hive database
            def toArray(row):
                return row.article_id, row.channel_id, [
                    float(i) for i in row.articleVector.toArray()
                ]

            articleVector = articleVector.rdd.map(toArray).toDF(
                ["article_id", "channel_id", "articleVector"])
            articleVector.write.insertInto("article_vector")

            import gc
            del wv_model
            del vectors
            del articleKeywordsWeights
            del articleKeywordsWeightsAndVectors
            del articleKeywordVectors
            gc.collect()

            # Get historical article vectors, convert them to vector format, and use LSH to find similar articles
            from pyspark.ml.linalg import Vectors
            from pyspark.ml.feature import BucketedRandomProjectionLSH
            train = self.spark.sql(
                "select * from article_vector where channel_id=%d" %
                channel_id)

            def _array_to_vector(row):
                return row.article_id, Vectors.dense(row.articleVector)

            train = train.rdd.map(_array_to_vector).toDF(
                ["article_id", "articleVector"])
            test = articleVector.rdd.map(_array_to_vector).toDF(
                ["article_id", "articleVector"])

            brp = BucketedRandomProjectionLSH(inputCol="articleVector",
                                              outputCol="hashes",
                                              bucketLength=1.0,
                                              seed=12345)
            model = brp.fit(train)
            similar = model.approxSimilarityJoin(test,
                                                 train,
                                                 2.0,
                                                 distCol="EuclideanDistance")

            def save_hbase(partitions):
                import happybase
                pool = happybase.ConnectionPool(size=3, host='hadoop1')

                with pool.connection() as conn:
                    article_similar = conn.table('article_similar')
                    for row in partitions:
                        if row.datasetA.article_id == row.datasetB.article_id:
                            pass
                        else:
                            article_similar.put(
                                str(row.datasetA.article_id).encode(), {
                                    'similar:{}'.format(row.datasetB.article_id).encode(
                                    ):
                                    b'%0.4f' % (row.EuclideanDistance)
                                })

            similar.foreachPartition(save_hbase)
Example No. 27
    udf(preprocess_text, ArrayType(StringType()))(question_dataframe.question))

# In[9]:

#print(question_tokenized_df.take(1))

# In[10]:

###now we have to generate the vectors for this given question
from pyspark.ml.feature import Word2Vec, Word2VecModel

saveword2vec_path = os.getcwd() + '/dataset/word2vecmodel'

# In[11]:

model_word2vec = Word2VecModel.load(saveword2vec_path)

# In[12]:

question_with_vector_df = model_word2vec.transform(question_tokenized_df)

# In[13]:

#taking only the dense vector
question_dense_vec = question_with_vector_df.first()["features"]

# In[14]:

#Now that we have everything in place, we just need to calculate the similarity score
import numpy as np
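The remaining work hinted at above is a similarity score between question_dense_vec and the stored question vectors. A minimal sketch of that comparison (how the stored vectors are iterated is an assumption) might be:

# Hypothetical helper; np is imported above, question_dense_vec comes from the snippet.
def cosine_similarity(v1, v2):
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
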
Example No. 28
from pyspark.ml.feature import Word2VecModel, Tokenizer, StopWordsRemover
from pyspark.sql.functions import regexp_replace
import os

spark = SparkSession.builder \
      .appName("Sentiment") \
      .master("local[*]") \
      .config("spark.driver.memory","4g")\
      .config("spark.hadoop.yarn.resourcemanager.principal",os.getenv("HADOOP_USER_NAME"))\
      .getOrCreate()

storage = os.getenv("STORAGE")

tokenizer = Tokenizer(inputCol="spoken_words", outputCol="word_list")
remover = StopWordsRemover(inputCol="word_list", outputCol="wo_stop_words")
w2v_model_fitted = Word2VecModel.load(
    storage + "/datalake/data/sentiment/w2v_model_fitted")
lr_model = PipelineModel.load(storage + "/datalake/data/sentiment/lr_model")

#args = {"sentence":"I'm no dunce, I was born an oaf and I'll die an oaf"}


def predict_sentiment(args):
    input_sentence = args["sentence"]  #.split(",")
    sentence_df = spark.createDataFrame([(input_sentence, )], ['spoken_words'])
    sentence_df = sentence_df.select(
        regexp_replace('spoken_words', r'[_\"\'():;,.!?\\-]',
                       ' ').alias('spoken_words'))
    sentence_df = tokenizer.transform(sentence_df)
    sentence_df = remover.transform(sentence_df)
    sentence_df = w2v_model_fitted.transform(sentence_df)
    result = lr_model.transform(sentence_df).collect()[0]
Example No. 29
wordsDF = articleDF.rdd.mapPartitions(segmentation).toDF(
    ["article_id", "channel_id", "words"])

# Train word2vec model
word2vec = Word2Vec(vectorSize=50,
                    inputCol="words",
                    outputCol="model",
                    minCount=2)
model = word2vec.fit(wordsDF)
model.save(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/Word2Vec.model"
)

# Load the model
wv_model = Word2VecModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/Word2Vec.model"
)
vectors = wv_model.getVectors()

profile = sqlContext.sql("select * from article_profile")
profile.registerTempTable("incremental")
articleKeywordsWeights = sqlContext.sql(
    "select article_id, channel_id, keyword, weight "
    "from incremental LATERAL VIEW explode(keywords) AS keyword, weight")
_article_profile = articleKeywordsWeights.join(
    vectors, vectors.word == articleKeywordsWeights.keyword, "inner")

articleKeywordVectors = _article_profile.rdd.map(
    lambda row: (row.article_id, row.channel_id, row.keyword, row.weight * row.
                 vector)).toDF([
                     "article_id", "channel_id", "keyword", "weightingVector"