def embeddingLSH(spark, movieEmbDf):
    # movieEmbSeq = []
    # for key, embedding_list in movieEmbMap.items():
    #     embedding_list = [np.float64(embedding) for embedding in embedding_list]
    #     movieEmbSeq.append((key, Vectors.dense(embedding_list)))
    #
    # movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")

    # In essence, LSH learns random projection vectors: each embedding is reduced to a
    # scalar via a dot product, then bucketed by several hash functions (modulo the bucket width).
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="vector",
                                                      outputCol="bucketId",
                                                      bucketLength=0.1,
                                                      numHashTables=3)
    # Fit the model and assign bucket ids
    bucketModel = bucketProjectionLSH.fit(movieEmbDf)
    embBucketResult = bucketModel.transform(movieEmbDf)
    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)

    # Given an arbitrary embedding vector, compute its buckets and search within them
    # for the closest embeddings.
    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174,
                              -0.839, -0.0633, 0.249, 0.673, -0.237)
    bucketModel.approxNearestNeighbors(movieEmbDf, sampleEmb, 5).show(truncate=False)
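# Hedged illustration (not from the example above) of the bucketing idea described in its
# comments: bucketed random projection LSH projects an embedding onto a random vector and
# floors the scalar into a bucket of width bucketLength. Spark's internal implementation
# differs in details; `toy_bucket_id` is only a conceptual sketch with assumed names.
import numpy as np

def toy_bucket_id(emb, projection, bucket_length=0.1):
    # dot product turns the embedding into a scalar; floor-dividing by the bucket
    # width yields an integer bucket id for this one hash function
    return int(np.floor(np.dot(emb, projection) / bucket_length))

rng = np.random.default_rng(0)
projection = rng.standard_normal(10)   # one random hash function
emb = rng.standard_normal(10)          # one toy embedding
print(toy_bucket_id(emb, projection))  # bucket id under this hash function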
def embeddingLSH(spark, movieEmbMap):
    movieEmbSeq = []
    for key, embedding_list in movieEmbMap.items():
        embedding_list = [np.float64(embedding) for embedding in embedding_list]
        movieEmbSeq.append((key, Vectors.dense(embedding_list)))
    movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb",
                                                      outputCol="bucketId",
                                                      bucketLength=0.1,
                                                      numHashTables=3)
    bucketModel = bucketProjectionLSH.fit(movieEmbDF)
    embBucketResult = bucketModel.transform(movieEmbDF)
    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)
    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174,
                              -0.839, -0.0633, 0.249, 0.673, -0.237)
    bucketModel.approxNearestNeighbors(movieEmbDF, sampleEmb, 5).show(truncate=False)
def embedding_lsh(self, spark_session: SparkSession, movie_emb_map):
    movie_emb_seq = []
    for movieId, vector in movie_emb_map.items():
        movie_emb_seq.append((movieId, Vectors.dense(vector)))
    movie_emb_df = spark_session.createDataFrame(movie_emb_seq).toDF("movieId", "emb")
    bucket_projection_lsh = BucketedRandomProjectionLSH() \
        .setBucketLength(0.1) \
        .setNumHashTables(3) \
        .setInputCol("emb") \
        .setOutputCol("bucketId")
    bucket_model = bucket_projection_lsh.fit(movie_emb_df)
    emb_bucket_result = bucket_model.transform(movie_emb_df)
    print("movieId, emb, bucketId schema:")
    emb_bucket_result.printSchema()
    print("movieId, emb, bucketId data result:")
    emb_bucket_result.show(10, truncate=False)
    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174,
                              -0.839, -0.0633, 0.249, 0.673, -0.237)
    bucket_model.approxNearestNeighbors(movie_emb_df, sampleEmb, 5).show(truncate=False)
def LSH(spot, recommend_num):
    spark = SparkSession \
        .builder \
        .appName("BucketedRandomProjectionLSHExample") \
        .getOrCreate()
    # All scenic-spot feature vectors (already normalized)
    data = [('嵛山岛', Vectors.dense([0.2, 0.5, 0.7, 0.5]),),
            ('仙山牧场', Vectors.dense([0.4, 0.4, 0.1, 0.4]),),
            ('大洲岛', Vectors.dense([0.5, 0.1, 0.1, 0.5]),),
            ('御茶园', Vectors.dense([0.2, 0.4, 0.3, 0.6]),),
            ('洞宫山', Vectors.dense([0.3, 0.1, 0.2, 0.2]),),
            ('玉女峰', Vectors.dense([0.4, 0.4, 0.5, 0.4]),),
            ('翡翠谷', Vectors.dense([0.6, 0.1, 0.1, 0.5]),),
            ('白云寺', Vectors.dense([0.9, 0.1, 0.2, 0.1]),),
            ('泰宁地质博物苑', Vectors.dense([0.7, 0.1, 0.3, 0.7]),),
            ('晒布岩', Vectors.dense([1, 0.4, 0.5, 0.4]),)]
    df = spark.createDataFrame(data, ["name", "features"])
    brp = BucketedRandomProjectionLSH(inputCol="features",
                                      outputCol="hashes",
                                      bucketLength=2.0,
                                      numHashTables=3)
    model = brp.fit(df)
    # key = Vectors.dense([0.5, 0.8, 0.1, 0.5])  # spot to recommend for
    # model.approxNearestNeighbors(df, key, 3).show()  # recommend the 3 most similar spots
    # Look up the feature vector of the query spot and recommend the recommend_num
    # most similar spots; collect before stopping the session so the rows stay usable.
    spot_features = dict(data)
    result = model.approxNearestNeighbors(df, spot_features[spot], recommend_num).collect()
    spark.stop()
    return result
def train_forPMML(sparkUrl, dataForTrainPath, savePath):
    # Resolve the model storage paths
    brp_path, model_path = get_model_save_path(savePath)
    # Load the data
    sc = get_conf(sparkUrl, 'LSH_train', "8g")
    df = load_sentence_data_frame(sc, dataForTrainPath)
    # Build the LSH estimator
    brp = BucketedRandomProjectionLSH() \
        .setBucketLength(BUCKET_LENGTH) \
        .setNumHashTables(NUM_HASH_TABLES) \
        .setInputCol("vector") \
        .setOutputCol("hash")
    # Pipeline: extract features first, then train the model
    pipeline = Pipeline(stages=[brp])
    pipeline_model = pipeline.fit(df)
    # Preview the transformed result
    # pipeline_model.transform(df).show()
    # Export the fitted pipeline to PMML
    pmmlBuilder = PMMLBuilder(sc, df, pipeline_model)
    pmmlBuilder.buildFile("~/pmmlModels/SM.pmml")
    return
def test_bucketed_random_projection_lsh(self):
    data = self.spark.createDataFrame([
        (0, Vectors.dense([-1.0, -1.0]),),
        (1, Vectors.dense([-1.0, 1.0]),),
        (2, Vectors.dense([1.0, -1.0]),),
        (3, Vectors.dense([1.0, 1.0]),)
    ], ["id", "features"])
    mh = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes",
                                     seed=12345, bucketLength=1.0)
    model = mh.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml BucketedRandomProjectionLSH',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().hashes.apply(
            lambda x: pandas.Series(x).map(lambda y: y.values[0])
        ).values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBucketedRandomProjectionLSH")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['hashes'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def train_lsh_model():
    sf = SparkConf() \
        .setMaster("local") \
        .setAppName("Spark SVM tutorial") \
        .set("spark.executor.memory", "8g")
    sc = SparkContext(conf=sf)
    df = read_csv()
    sdf = SQLContext(sc).createDataFrame(df)
    brp = BucketedRandomProjectionLSH() \
        .setBucketLength(50.0) \
        .setNumHashTables(3) \
        .setInputCol("vector") \
        .setOutputCol("hash")
    model = brp.fit(sdf)
    model.transform(sdf).show()
    model.approxSimilarityJoin(sdf, sdf, 1.5)
def validForSpark(sparkUrl, dataForTrainPath, dataForVaildPath, savePath):
    # Resolve the model storage paths
    brp_path, model_path = get_model_save_path(savePath)
    # Load the data
    sc = get_conf(sparkUrl, 'LSH_valid', "8g")
    dft = load_sentence_data_frame(sc, dataForTrainPath)
    dfv = load_sentence_data_frame(sc, dataForVaildPath)
    dft.cache()
    # Load the previously saved estimator and model
    brp = BucketedRandomProjectionLSH.load(brp_path)
    model = BucketedRandomProjectionLSHModel.load(model_path)
    sets = dfv.rdd.map(lambda x: {x['sentence'], x['vector']}).collect()
    # write result to file
    for pair in sets:
        readFalse = False
        sent = None
        vect = None
        for element in pair:
            if isinstance(element, DenseVector) and vect is None:
                vect = element
            elif isinstance(element, DenseVector) and vect is not None:
                print('vect_error', pair)
                readFalse = True
            if isinstance(element, str) and sent is None:
                sent = element
            elif isinstance(element, str) and sent is not None:
                print('sent_error', pair)
                readFalse = True
        if sent is None or vect is None:
            readFalse = True
        if readFalse:
            print('read false')
            break
        print('=================================')
        print(sent)
        print('=================================')
        res = model.approxNearestNeighbors(dft, vect, 5)
        s_s = res.select('sentence').rdd.collect()
        for s in s_s:
            print(s['sentence'])
        print('************************************')
    return
def embeddingLSH(spark, movieEmbMap):
    movieEmbSeq = []
    for key, embedding_list in movieEmbMap.items():
        embedding_list = [np.float64(embedding) for embedding in embedding_list]
        movieEmbSeq.append((key, Vectors.dense(embedding_list)))
    # Prepare the dataset as a DataFrame
    movieEmbDF = spark.createDataFrame(movieEmbSeq).toDF("movieId", "emb")
    # Use Spark MLlib's built-in bucketed random projection LSH model; numHashTables sets
    # how many buckets each embedding is assigned to, i.e. the number of hash functions
    bucketProjectionLSH = BucketedRandomProjectionLSH(inputCol="emb", outputCol="bucketId",
                                                      bucketLength=0.1, numHashTables=3)
    bucketModel = bucketProjectionLSH.fit(movieEmbDF)
    embBucketResult = bucketModel.transform(movieEmbDF)
    print("movieId, emb, bucketId schema:")
    embBucketResult.printSchema()
    print("movieId, emb, bucketId data result:")
    embBucketResult.show(10, truncate=False)
    print("Approximately searching for 5 nearest neighbors of the given sample embedding:")
    # Build a sample embedding as a dense vector
    sampleEmb = Vectors.dense(0.795, 0.583, 1.120, 0.850, 0.174,
                              -0.839, -0.0633, 0.249, 0.673, -0.237)
    # Use the fitted LSH model to find its approximate nearest neighbors
    bucketModel.approxNearestNeighbors(dataset=movieEmbDF, key=sampleEmb,
                                       numNearestNeighbors=5).show(truncate=False)
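# Hedged usage sketch (not part of the snippets above): it only shows how the embeddingLSH
# helper defined above could be driven end to end. The local SparkSession and the toy
# 10-dimensional embeddings are illustrative assumptions, not real model output.
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[*]").appName("embeddingLSHDemo").getOrCreate()
    # Toy map of movieId -> embedding list; in practice this would come from a trained
    # embedding model (e.g. Word2Vec over watch sequences).
    movieEmbMap = {
        "1": np.random.rand(10).tolist(),
        "2": np.random.rand(10).tolist(),
        "3": np.random.rand(10).tolist(),
    }
    embeddingLSH(spark, movieEmbMap)
    spark.stop()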
def train(sparkUrl, dataForTrainPath, savePath):
    # Resolve the model storage paths
    brp_path, model_path = get_model_save_path(savePath)
    # Load the data
    sc = get_conf(sparkUrl, 'LSH_train', "8g")
    df = load_word_data_frame(sc, dataForTrainPath)
    # Train the model
    brp = BucketedRandomProjectionLSH() \
        .setBucketLength(BUCKET_LENGTH) \
        .setNumHashTables(NUM_HASH_TABLES) \
        .setInputCol("vector") \
        .setOutputCol("hash")
    model = brp.fit(df)
    # Save the estimator and the fitted model
    brp.save(brp_path)
    model.save(model_path)
    # Preview the transformed result
    model.transform(df).show()
def embeddingLSH(moviesEmb: DataFrame):
    '''
    Locality sensitive hashing over movie embeddings.
    :param moviesEmb: DataFrame with a "vector" column of movie embeddings
    :return:
    '''
    brp = BucketedRandomProjectionLSH(inputCol='vector', outputCol='bucketId',
                                      numHashTables=3, bucketLength=0.1)
    model = brp.fit(moviesEmb)
    moviesEmbResult = model.transform(moviesEmb)
    moviesEmbResult.printSchema()
    moviesEmbResult.show(5)
    print("Approximately searching for 5 nearest neighbors of the sample embedding:")
    sampleEmb = Vectors.dense([0.795, 0.583, 1.120, 0.850, 0.174,
                               -0.839, -0.0633, 0.249, 0.673, -0.237])
    model.approxNearestNeighbors(moviesEmb, sampleEmb, 5).show(5)
def similarities(self, graph, config):
    print("Similarity Analysis\t1\tComputing hashes of feature vectors")
    graph = graph.get_df()
    max_id = graph.agg({"dst": "max"}).collect()[0][0] + 1

    # create features as sparse vectors from col and val columns
    def to_sparse_vector(indices, values):
        indices, values = zip(*sorted(zip(indices, values)))
        return Vectors.sparse(max_id, indices, values)

    def non_zero(v):
        return v.numNonzeros()

    to_sparse_vector_udf = udf(
        lambda indices, values: to_sparse_vector(indices, values), VectorUDT())
    non_zero_udf = udf(lambda v: non_zero(v), LongType())

    df = graph.groupby("src").agg(
        to_sparse_vector_udf(collect_list("dst"),
                             collect_list("numberOfPaths")).alias("features"))

    # do not consider vectors smaller than this threshold
    df = df.filter(non_zero_udf("features") >= int(config["sim_min_values"]))

    # calculate bucket length, given the specified number of buckets
    total_records = df.count()
    buckets_length = math.ceil(math.sqrt(total_records))
    # buckets_length = math.pow(total_records, -1/max_id)
    # print(total_records)
    # print(buckets_length)

    brp = BucketedRandomProjectionLSH(inputCol="features",
                                      outputCol="hashes",
                                      bucketLength=buckets_length,
                                      numHashTables=int(config["t"]))
    model = brp.fit(df)
    df_t = model.transform(df)

    if ("Similarity Join" in config["analyses"]):
        df_t.cache()
        # Compute the locality sensitive hashes for the input rows, then perform
        # approximate similarity join.
        print("Similarity Analysis\t2\tCalculating Similarity Join")
        join_distance = 3
        while True:
            join_results = model.approxSimilarityJoin(df_t, df_t, join_distance,
                                                      distCol="EuclideanDistance") \
                .select(
                    col("datasetA.src").alias("idA"),
                    col("datasetB.src").alias("idB"),
                    col("EuclideanDistance")) \
                .filter(col("idA") != col("idB")) \
                .orderBy("EuclideanDistance", ascending=True) \
                .limit(int(config["searchK"]))

            # loop until we find as many results as requested
            if (join_results.count() >= int(config["searchK"])):
                break

            # else increase distance and try again
            join_distance *= 2

        join_results.coalesce(1).write.csv(config["sim_join_out"], sep='\t',
                                           mode='overwrite')

    if ("Similarity Search" in config["analyses"]):
        print("Similarity Analysis\t2\tCalculating Top-k Similarity Search results")
        target_id = int(config["target_id"])

        key_vector = df.filter(col("src") == target_id).select(col("features")).collect()
        if (len(key_vector) == 0):
            return
        key_vector = key_vector[0]["features"]

        search_results = model.approxNearestNeighbors(
            df, key_vector, int(config["searchK"]) + 1).filter(
                col("src") != target_id).select(lit(config["target_id"]), "src", "distCol")
        search_results.coalesce(1).write.csv(config["sim_search_out"], sep='\t',
                                             mode='overwrite')
def compute_article_similar(self, articleProfile):
    """
    Compute similarity between incremental articles and historical articles.
    :param articleProfile:
    :return:
    """
    from pyspark.ml.feature import Word2VecModel

    def avg(row):
        x = 0
        for v in row.vectors:
            x += v
        return row.article_id, row.channel_id, x / len(row.vectors)

    for channel_id, channel_name in CHANNEL_INFO.items():
        profile = articleProfile.filter('channel_id = {}'.format(channel_id))
        wv_model = Word2VecModel.load(
            "hdfs://hadoop1:9000/headlines/models/channel_%d_%s.word2vec" %
            (channel_id, channel_name))
        vectors = wv_model.getVectors()

        # Compute the article vectors
        profile.registerTempTable("incremental")
        articleKeywordsWeights = self.spark.sql(
            "select article_id, channel_id, keyword, weight from incremental "
            "LATERAL VIEW explode(keywords) as keyword, weight")
        articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
            vectors, vectors.word == articleKeywordsWeights.keyword, "inner")
        articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
            lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.vector)
        ).toDF(["article_id", "channel_id", "keyword", "weightVector"])

        articleKeywordVectors.registerTempTable("temptable")
        articleVector = self.spark.sql(
            "select article_id, min(channel_id) channel_id, "
            "collect_set(weightVector) vectors from temptable group by article_id"
        ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

        # Write the article vectors into Hive
        def toArray(row):
            return row.article_id, row.channel_id, [
                float(i) for i in row.articleVector.toArray()
            ]

        articleVector = articleVector.rdd.map(toArray).toDF(
            ["article_id", "channel_id", "articleVector"])
        articleVector.write.insertInto("article_vector")

        import gc
        del wv_model
        del vectors
        del articleKeywordsWeights
        del articleKeywordsWeightsAndVectors
        del articleKeywordVectors
        gc.collect()

        # Load the historical article vectors, convert them to dense vectors, and use
        # LSH to find similar articles.
        from pyspark.ml.linalg import Vectors
        from pyspark.ml.feature import BucketedRandomProjectionLSH

        train = self.spark.sql(
            "select * from article_vector where channel_id=%d" % channel_id)

        def _array_to_vector(row):
            return row.article_id, Vectors.dense(row.articleVector)

        train = train.rdd.map(_array_to_vector).toDF(["article_id", "articleVector"])
        test = articleVector.rdd.map(_array_to_vector).toDF(["article_id", "articleVector"])

        brp = BucketedRandomProjectionLSH(inputCol="articleVector",
                                          outputCol="hashes",
                                          bucketLength=1.0,
                                          seed=12345)
        model = brp.fit(train)
        similar = model.approxSimilarityJoin(test, train, 2.0,
                                             distCol="EuclideanDistance")

        def save_hbase(partitions):
            import happybase
            pool = happybase.ConnectionPool(size=3, host='hadoop1')
            with pool.connection() as conn:
                article_similar = conn.table('article_similar')
                for row in partitions:
                    if row.datasetA.article_id == row.datasetB.article_id:
                        pass
                    else:
                        article_similar.put(
                            str(row.datasetA.article_id).encode(), {
                                'similar:{}'.format(row.datasetB.article_id).encode():
                                b'%0.4f' % (row.EuclideanDistance)
                            })

        similar.foreachPartition(save_hbase)
article_vector = article_vector.rdd.map(toArray).toDF(["article_id", "channel_id", "vector"])
# article_vector.write.insertInto("article_vector")

# Use LSH to compute article similarity
# 1. Read the data and convert the article vectors from array to vector
from pyspark.ml.linalg import Vectors

def toVector(row):
    return row.article_id, Vectors.dense(row.vector)

train = article_vector.rdd.map(toVector).toDF(["article_id", "vector"])

# Compute similar articles
from pyspark.ml.feature import BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol='vector', outputCol='hashes',
                                  numHashTables=4, bucketLength=10.0)
model = brp.fit(train)
similar = model.approxSimilarityJoin(train, train, 2.5, distCol='EuclideanDistance')
# Sort by Euclidean distance ascending; the smaller the distance, the more similar
similar = similar.sort(['EuclideanDistance'])
similar.show()
# `similar` nests the joined rows in structs, so fields below are accessed as row.datasetA.article_id
# Store the similar-article results into HBase
# def save_hbase(partitions):
#     import happybase
#     pool = happybase.ConnectionPool(size=3, host='hadoop1')
#
#     with pool.connection() as conn:
#         article_similar = conn.table('article_similar')
#         for row in partitions:
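# Hedged follow-up sketch (assumed, not in the snippet above): since `similar` nests the
# joined rows under datasetA/datasetB structs, the ids and distance can be pulled out into
# flat columns before ranking or storing them; `similar_flat` is an illustrative name.
from pyspark.sql.functions import col

similar_flat = similar.select(
    col("datasetA.article_id").alias("article_id"),
    col("datasetB.article_id").alias("similar_article_id"),
    col("EuclideanDistance")
).filter(col("article_id") != col("similar_article_id")) \
 .orderBy("EuclideanDistance")
similar_flat.show()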
ddf = assembler.transform(
    df.select("*", *(df["scores"].getItem(i) for i in range(keywords_length)))
).select("user_score")

normalizer = Normalizer(inputCol="user_score", outputCol="normFeatures", p=2.0)
extended_user_df = normalizer.transform(ddf)
extended_user_df.cache()
seed_user_df = extended_user_df

# LSH Algorithm
brp = BucketedRandomProjectionLSH(inputCol="normFeatures",
                                  outputCol="hashes",
                                  bucketLength=bucketLength,
                                  numHashTables=numHashTables)
lsh = brp.fit(extended_user_df)
df_users = lsh.approxSimilarityJoin(seed_user_df, extended_user_df,
                                    1 - minimum_similarity_score,
                                    distCol="EuclideanDistance")
df_users = df_users.withColumn(
    'similarity_score',
    udf(lambda x: 1 - x, FloatType())(df_users.EuclideanDistance))
df_users.coalesce(1000).write.mode('overwrite').parquet(write_path)
sc.stop()
print("job is completed")
                        b'%0.4f' % (row.EuclideanDistance)
                    })

if __name__ == '__main__':
    ism = ItemSimilarModel()
    item_embeddings = pd.read_csv('./item_embeddings.csv')
    item_df = ism.spark.createDataFrame(item_embeddings)

    # Convert the 32-dimensional movie_id vectors exported by the YoutubeDNN model
    # into the vector format required by LSH
    embedding_vecAssembler = VectorAssembler(
        inputCols=cols, outputCol="embeddings").transform(item_df)
    embedding_vecAssembler.registerTempTable('temptable')
    embedding_Vectors = ism.spark.sql(
        "select movie_id, embeddings from temptable")

    # Compute similar items
    brp = BucketedRandomProjectionLSH(inputCol='embeddings',
                                      outputCol='similar',
                                      numHashTables=4,
                                      bucketLength=10.0)
    model = brp.fit(embedding_Vectors)
    similar = model.approxSimilarityJoin(embedding_Vectors, embedding_Vectors,
                                         2.0, distCol='EuclideanDistance')

    # Write the results to the database
    similar.foreachPartition(save_hbase)
def compute_article_similar(self, articleProfile):
    """
    Compute similarity between incremental articles and historical articles (word2vec based).
    :return:
    """
    # Channels of the new articles to update (not used)
    # all_channel = set(articleProfile.rdd.map(lambda x: x.channel_id).collect())
    def avg(row):
        x = 0
        for v in row.vectors:
            x += v
        # Use the average of the keyword vectors as the article vector
        return row.article_id, row.channel_id, x / len(row.vectors)

    for channel_id, channel_name in CHANNEL_INFO.items():
        profile = articleProfile.filter('channel_id = {}'.format(channel_id))
        wv_model = Word2VecModel.load(
            "hdfs://hadoop-master:9000/headlines/models/channel_%d_%s.word2vec" %
            (channel_id, channel_name))
        vectors = wv_model.getVectors()

        # Compute the article vectors
        profile.registerTempTable("incremental")
        articleKeywordsWeights = self.spark.sql(
            "select article_id, channel_id, keyword, weight from incremental "
            "LATERAL VIEW explode(keywords) AS keyword, weight where channel_id=%d"
            % channel_id)
        articleKeywordsWeightsAndVectors = articleKeywordsWeights.join(
            vectors, vectors.word == articleKeywordsWeights.keyword, "inner")
        articleKeywordVectors = articleKeywordsWeightsAndVectors.rdd.map(
            lambda r: (r.article_id, r.channel_id, r.keyword, r.weight * r.vector)
        ).toDF(["article_id", "channel_id", "keyword", "weightingVector"])

        articleKeywordVectors.registerTempTable("tempTable")
        articleVector = self.spark.sql(
            "select article_id, min(channel_id) channel_id, "
            "collect_set(weightingVector) vectors from tempTable group by article_id"
        ).rdd.map(avg).toDF(["article_id", "channel_id", "articleVector"])

        # Write the article vectors into the database
        def toArray(row):
            return row.article_id, row.channel_id, [
                float(i) for i in row.articleVector.toArray()
            ]

        articleVector = articleVector.rdd.map(toArray).toDF(
            ['article_id', 'channel_id', 'articleVector'])
        articleVector.write.insertInto("article_vector")

        import gc
        del wv_model
        del vectors
        del articleKeywordsWeights
        del articleKeywordsWeightsAndVectors
        del articleKeywordVectors
        gc.collect()

        # Load the historical vectors, convert them to a fixed format, and use LSH
        # to find similar articles
        train = self.spark.sql(
            "select * from article_vector where channel_id=%d" % channel_id)

        def _array_to_vector(row):
            return row.article_id, Vectors.dense(row.articleVector)

        train = train.rdd.map(_array_to_vector).toDF(['article_id', 'articleVector'])
        test = articleVector.rdd.map(_array_to_vector).toDF(['article_id', 'articleVector'])

        brp = BucketedRandomProjectionLSH(inputCol='articleVector',
                                          outputCol='hashes',
                                          seed=12345,
                                          bucketLength=1.0)
        model = brp.fit(train)
        similar = model.approxSimilarityJoin(test, train, 2.0,
                                             distCol='EuclideanDistance')

        def save_hbase(partition):
            import happybase
            pool = happybase.ConnectionPool(size=3, host='hadoop-master')
            # HBase table article_similar: rowkey=article_id, column=similar:article_id, value=sim
            with pool.connection() as conn:
                table = conn.table("article_similar")
                for row in partition:
                    if row.datasetA.article_id == row.datasetB.article_id:
                        pass
                    else:
                        table.put(
                            str(row.datasetA.article_id).encode(), {
                                b"similar:%d" % row.datasetB.article_id:
                                b"%0.4f" % row.EuclideanDistance
                            })
                conn.close()

        similar.foreachPartition(save_hbase)
spark = SparkSession \
    .builder \
    .getOrCreate()

df = spark.read.parquet("user_score")

normalizer = Normalizer(inputCol="user_score", outputCol="normFeatures", p=2.0)
extended_user_df = normalizer.transform(df)
extended_user_df.cache()

# seed_user_df = extended_user_df.sample(0.1, False)
# print("no seed users: ", seed_user_df.count(), " no of extended users: ", extended_user_df.count())

# LSH Algorithm
start_time = time.time()
brp = BucketedRandomProjectionLSH(inputCol="normFeatures",
                                  outputCol="hashes",
                                  bucketLength=10000.0,
                                  numHashTables=numHashTables)
brp.setSeed(random.randint(0, 2 ** 31 - 1))
model = brp.fit(extended_user_df)

# Get the hashes for the users and convert them into a cluster ID number.
df_users = model.transform(extended_user_df)
df_users = df_users.withColumn(
    'cluster_id',
    udf(lambda input: reduce(lambda x, y: x | y,
                             [0x1 << i if value[0] != 0.0 else 0
                              for i, value in enumerate(input)]),
        IntegerType())(df_users.hashes))
# df_users.select('hashes', 'cluster_id').show(50, truncate=False, vertical=True)

df_count = df_users.groupBy(['cluster_id', 'hashes']).count().cache()
df_count.show(100, truncate=False)
df_count.groupBy().max('count').show()

# df_users = model.approxSimilarityJoin(seed_user_df, extended_user_df, 0.99, distCol="EuclideanDistance")
# df_users.coalesce(100).write.mode('overwrite').parquet("user_similarity")

print("Time taken by the script: {} seconds".format(time.time() - start_time))
train_data = article_vector.select(['article_id', 'articlevector'])

def _array_to_vector(row):
    return row.article_id, Vectors.dense(row.articlevector)

train_data = train_data.rdd.map(_array_to_vector).toDF(['article_id', 'articleVector'])
# train_data.show()

from pyspark.ml.feature import BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol="articleVector",
                                  outputCol="hashes",
                                  numHashTables=4,
                                  bucketLength=10)
model = brp.fit(train_data)
similarity = model.approxSimilarityJoin(train_data, train_data, 5.0,
                                        distCol='EuclideanDistance')

def save_hbase(partition):
    import happybase
    conn = happybase.Connection('localhost')
    table = conn.table('article_similar')
    for row in partition:
        if row.datasetA.article_id == row.datasetB.article_id:
spark = SparkSession. \
    builder. \
    master('local[2]'). \
    appName('scholar'). \
    getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

df = sqlContext.read.format('parquet').load(
    'hdfs:/scholar_data/token_embeddings.parquet').select('entities', 'embeddings')

to_vector = F.udf(lambda x: Vectors.dense(x), VectorUDT())
to_normed_vector = F.udf(make_normed_vector, VectorUDT())
df = df.withColumn('normed_embeddings', to_normed_vector('embeddings'))
df = df.withColumn('embeddings', to_vector('embeddings'))

brpLSH = BucketedRandomProjectionLSH(inputCol="normed_embeddings",
                                     outputCol="hashes",
                                     seed=42,
                                     bucketLength=12.0,
                                     numHashTables=20)
brpLSHmodel = brpLSH.fit(df)
brpLSHmodel.save('hdfs:/scholar_model/brpLSH_model')
df.write.save('hdfs:/scholar_data/token_normed_vector_embeddings.parquet',
              format='parquet', mode='overwrite')
spark.stop()
# $example on$
dataA = [(0, Vectors.dense([1.0, 1.0]),),
         (1, Vectors.dense([1.0, -1.0]),),
         (2, Vectors.dense([-1.0, -1.0]),),
         (3, Vectors.dense([-1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(4, Vectors.dense([1.0, 0.0]),),
         (5, Vectors.dense([-1.0, 0.0]),),
         (6, Vectors.dense([0.0, 1.0]),),
         (7, Vectors.dense([0.0, -1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.dense([1.0, 0.0])

brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes",
                                  bucketLength=2.0, numHashTables=3)
model = brp.fit(dfA)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
print("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
model.approxSimilarityJoin(dfA, dfB, 1.5, distCol="EuclideanDistance") \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("EuclideanDistance")).show()
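# Hedged continuation sketch (assumed, not part of the example above): it reuses the
# `model`, `dfA`, `dfB`, and `key` defined there to show the two points its comments
# mention -- joining already-transformed datasets so hashes are not recomputed, and
# running an approximate nearest-neighbor search on the key vector.
transformedA = model.transform(dfA)
transformedB = model.transform(dfB)
# The 'hashes' column is already materialized, so the join can skip hashing.
model.approxSimilarityJoin(transformedA, transformedB, 1.5, distCol="EuclideanDistance") \
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("EuclideanDistance")).show()

print("Approximately searching dfA for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 2).show()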
# COMMAND ----------

def recommend_by_book(book_id):
    cluster = predictions.filter(predictions.book_id == book_id).select("prediction").collect()[0][0]
    titles = predictions.filter(predictions.prediction == cluster).select("title").collect()
    for title in titles:
        print(title[0])

recommend_by_book(100001)

# COMMAND ----------

# DBTITLE 1,Locality Sensitive Hashing: Bucketed Random Projection for Euclidean Distance
from pyspark.ml.feature import BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes",
                                  bucketLength=5, numHashTables=10)
model = brp.fit(data_pca)

# COMMAND ----------

def find_nearest_books(book_id, num):
    key = data_pca.filter(data_pca.book_id == book_id).select("features").collect()[0][0]
    res = model.approxNearestNeighbors(data_pca, key, num).select("book_id").collect()
    for r in res:
        print(get_book_title(r[0]))

find_nearest_books(100001, 10)

# COMMAND ----------

# DBTITLE 1,Latent Dirichlet allocation