import numpy as np
from scipy.sparse import coo_matrix
from pyspark.mllib.linalg.distributed import RowMatrix


def compute_similarities(X, sc, threshold=0):
    """
    Compute column similarities using Spark.

    Sparsity is handled efficiently: a non-zero threshold enables the
    approximate, sampling-based algorithm so that only similarities likely
    to exceed the threshold are computed.

    Parameters
    ----------
    X : 2-D array; similarities are computed between its columns
    sc : SparkContext
    threshold : the similarity threshold

    Returns
    -------
    Symmetric similarity matrix of shape (X.shape[1], X.shape[1])
    """
    n = X.shape[1]
    rows = sc.parallelize(X)
    mat = RowMatrix(rows)
    sims = mat.columnSimilarities(threshold)

    # Convert the upper-triangular CoordinateMatrix to a scipy sparse matrix.
    # Each element is a MatrixEntry object (i, j, value).
    rows_index = np.array(sims.entries.map(lambda x: x.i).collect()).astype(int)
    cols_index = np.array(sims.entries.map(lambda x: x.j).collect()).astype(int)
    values = np.array(sims.entries.map(lambda x: x.value).collect())

    # Mirror the upper triangle to obtain a symmetric matrix.
    triang_sup = coo_matrix((values, (rows_index, cols_index)), shape=(n, n))
    triang_inf = coo_matrix((values, (cols_index, rows_index)), shape=(n, n))
    return (triang_sup + triang_inf).tocsr()
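# A minimal usage sketch (not part of the original snippet): the toy data,
# Spark master, and threshold below are illustrative assumptions.
import numpy as np
from pyspark import SparkContext

sc = SparkContext("local[2]", "column-similarities")

# Toy matrix: 4 samples (rows) x 3 features (columns)
X = np.array([
    [1.0, 0.0, 2.0],
    [0.0, 3.0, 4.0],
    [5.0, 0.0, 0.0],
    [0.0, 6.0, 7.0],
])

# Returns a 3 x 3 symmetric scipy CSR matrix of cosine similarities
# between the columns of X
sim = compute_similarities(X, sc, threshold=0.1)
print(sim.toarray())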
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import RowMatrix


def similarity_processing(self, tag_path):
    # Spin up a local Spark context and session
    conf = SparkConf().setAppName("Test").setMaster("local")
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()

    # Load the tag CSV with header and inferred schema, then drop the id column
    df = spark.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(tag_path)
    df = df.drop("tagId")
    print(df.columns)

    # Build a RowMatrix from the remaining columns
    rdd = df.rdd.map(list)
    mat = RowMatrix(rdd)
    print(mat.numCols(), mat.numRows())

    # Compute and print pairwise column similarities
    cs = mat.columnSimilarities()
    for x in cs.entries.collect():
        print(x)
    print(cs.numRows(), cs.numCols())
outputCol="scaled_1") trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map( lambda x: (x, )).toDF() scalerModel = scaler.fit(trial_df) vec_df = spark.createDataFrame( scalerModel.transform(trial_df).select("scaled_1").rdd.map( lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2])))) # Create RowMatrix from the transpose of spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect()) mat = RowMatrix(vector_df) bun = mat.rows.collect() num_clusters = 3 pre = sc.parallelize(mat.columnSimilarities().entries.map( lambda e: (e.i, e.j, e.value)).collect()) model = PowerIterationClustering.train(pre, 3, 20, "random") err = model.assignments().map(lambda x: (Vectors.dense(bun[0][x.id], bun[1][ x.id], bun[2][x.id]), x.cluster)).collect() # Silhoutte value ag = 0 agi = 1700 for er in err: avg = [0] * num_clusters avgi = [0] * num_clusters for e in err: avg[e[1]] += Vectors.squared_distance(er[0], e[0]) avgi[e[1]] += 1 a = avg[er[1]] / avgi[er[1]] b = sys.maxint
# Normalize the null sentinel for the mean statistic, then use the mean and
# max to suggest a data type for the column
mean2 = 'NaN' if type_mapping2[1]['mean'] in (None, 'null', "None") else type_mapping2[1]['mean']
mx2 = type_mapping2[4]['max']
suggested_type2 = suggest_data_type(mean2, mx2)

# Encode the suggested data type as a numeric code
if suggested_type2 == 'FLOAT':
    d_type2 = 1
elif suggested_type2 == 'INT':
    d_type2 = 2
elif suggested_type2 == 'DATE':
    d_type2 = 3
else:
    d_type2 = 4

# Assemble and normalize the column statistics (L-infinity norm)
stats_feat2 = assembler.transform(features2)
stats_feat_results2 = stats_feat2.select("features")
l1NormData2 = normalizer.transform(stats_feat_results2, {normalizer.p: float("inf")})
normfeatures2 = l1NormData2.select("normFeatures").rdd.flatMap(list).collect()[0]

# Combine the normalized statistics with the data-type code into a final feature vector
features_combined2 = [(normfeatures2, d_type2)]
final_feat_frame2 = spark.createDataFrame(features_combined2, ["meta_features", "data_type"])
final_results2 = final_assembler.transform(final_feat_frame2)
combined_features2 = final_results2.select("final_features").rdd.flatMap(list).collect()

# Compute the cosine similarity between the two combined feature vectors and store it
combined_vectors = [combined_features, combined_features2]
vectors = spark.sparkContext.parallelize(combined_vectors)
matrix = RowMatrix(vectors)
similarity_score = matrix.columnSimilarities()
score_results = similarity_score.entries.first().value
store_features(table_name1, table_name2, col_v, col_v2, score_results)
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark.mllib.linalg import Vector
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


def main(sc, sqlContext, isHive=True):
    pass


if __name__ == "__main__":
    os.environ["SPARK_HOME"] = r"C:\spark-1.6.1-bin-hadoop2.6"
    sc = SparkContext('local[1]')

    # Parse each whitespace-separated line into a dense vector of floats
    rddRows = sc.parallelize(["1 0 2 0 0 1", "0 0 4 2 0 0"])
    rddVectors = rddRows.map(
        lambda x: Vectors.dense([float(each) for each in str(x).split(" ")]))

    # Compute pairwise cosine similarities between the columns
    mat = RowMatrix(rddVectors)
    simsPerfect = mat.columnSimilarities()
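    # Hedged addition (not in the original script): print the resulting
    # CoordinateMatrix entries, mirroring the other snippets in this section.
    # Each entry is a MatrixEntry(i, j, value) holding the cosine similarity
    # between column i and column j (upper triangle only).
    for entry in simsPerfect.entries.collect():
        print(entry.i, entry.j, entry.value)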
from pyspark.sql.functions import col
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

# One-hot encode the grouped package ids (rows are applications, columns are packages)
vectorizer = CountVectorizer(inputCol="package_ids", outputCol="packages_encoded")
vectorizer_model = vectorizer.fit(grouped)
transformedDf = vectorizer_model.transform(grouped)
transformedDf = transformedDf.drop(col("package_ids"))

# Extract vectors from the DataFrame in preparation for computing the similarity matrix
array = [
    Vectors.fromML(row.packages_encoded) for row in transformedDf.collect()
]

# Create a RowMatrix
matrix = RowMatrix(sc.parallelize(array))

# Compute column similarity matrix
similarity = matrix.columnSimilarities()

# Convert the matrix to a DataFrame
entries = similarity.entries.collect()
similarityDf = spark.createDataFrame(entries).toDF("package_a", "package_b", "similarity")

# Write to the database
url_connect = f"jdbc:postgresql://{host}/"
table = "similarity"
mode = "overwrite"
properties = {
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver"
}
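# Hedged sketch (not in the original snippet): the write step that the
# connection settings above are presumably prepared for, mirroring the
# full pipeline later in this section.
similarityDf.write.jdbc(url_connect, table, mode, properties)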
import os
from itertools import chain

import psycopg2
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, create_map, lit
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


def main():
    SC = SparkContext("local[1]", "pkgpkr")

    # Connect to the database
    USER = os.environ.get("DB_USER")
    PASSWORD = os.environ.get("DB_PASSWORD")
    HOST = os.environ.get("DB_HOST")
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Load the raw data into Spark
    CUR.execute("SELECT * FROM dependencies")
    DEPENDENCIES = CUR.fetchall()
    SPARK = SparkSession.builder.master("local[1]").appName("pkgpkr").getOrCreate()
    DF = SPARK.createDataFrame(DEPENDENCIES).toDF("application_id", "package_id")

    # Close the database connection
    CUR.close()
    DB.close()

    # Restructure the dataframe in preparation for one-hot encoding
    GROUPED = DF.groupBy("application_id").agg(collect_list("package_id"))
    GROUPED = GROUPED.withColumnRenamed("collect_list(package_id)", "package_ids")
    GROUPED = GROUPED.withColumn("package_ids", col("package_ids").cast("array<string>"))

    # One-hot encode the data (rows are applications, columns are packages)
    VECTORIZER = CountVectorizer(inputCol="package_ids", outputCol="packages_encoded")
    VECTORIZER_MODEL = VECTORIZER.fit(GROUPED)
    TRANSFORMED_DF = VECTORIZER_MODEL.transform(GROUPED)
    TRANSFORMED_DF = TRANSFORMED_DF.drop(col("package_ids"))

    # Extract vectors from the DataFrame in preparation for computing the similarity matrix
    ARRAY = [Vectors.fromML(row.packages_encoded) for row in TRANSFORMED_DF.collect()]

    # Create a RowMatrix
    MATRIX = RowMatrix(SC.parallelize(ARRAY, numSlices=100))

    # Compute column similarity matrix
    SIMILARITY = MATRIX.columnSimilarities()

    # Convert the matrix to a DataFrame
    ENTRIES = SIMILARITY.entries.collect()
    SIMILARITY_DF = SPARK.createDataFrame(ENTRIES).toDF("a", "b", "similarity")

    # Map the package identifiers back to their pre-vectorized values
    MAPPING = create_map([lit(x) for x in chain(*enumerate(VECTORIZER_MODEL.vocabulary))])
    SIMILARITY_DF = SIMILARITY_DF.withColumn("package_a", MAPPING.getItem(col("a")).cast("integer")) \
                                 .withColumn("package_b", MAPPING.getItem(col("b")).cast("integer"))
    SIMILARITY_DF = SIMILARITY_DF.drop(col("a")).drop(col("b"))

    # Mirror the columns and append to the existing dataframe so we need only query the first column
    SIMILARITY_DF = SIMILARITY_DF.select('package_a', 'package_b', 'similarity') \
        .union(SIMILARITY_DF.select('package_b', 'package_a', 'similarity'))

    # Write similarity scores to the database
    URL_CONNECT = f"jdbc:postgresql://{HOST}/"
    TABLE = "similarity"
    MODE = "overwrite"
    PROPERTIES = {"user": USER, "password": PASSWORD, "driver": "org.postgresql.Driver"}
    SIMILARITY_DF.write.jdbc(URL_CONNECT, TABLE, MODE, PROPERTIES)

    #
    # Update popularity scores
    #

    POPULARITY_UPDATE = """
    UPDATE packages
    SET popularity = s.popularity
    FROM (
      SELECT package_b, COUNT(package_b) AS popularity
      FROM similarity
      GROUP BY package_b
    ) s
    WHERE packages.id = s.package_b;
    """

    POPULARITY_NULL_TO_ZERO = """
    UPDATE packages
    SET popularity = 0
    WHERE popularity IS NULL;
    """

    BOUNDED_POPULARITY_UPDATE = """
    UPDATE packages
    SET bounded_popularity = s.popularity
    FROM (
      SELECT id,
             WIDTH_BUCKET(LOG(popularity + 1), 0,
                          (SELECT MAX(LOG(popularity + 1)) FROM packages), 9) AS popularity
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Connect to the database
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Execute popularity updates
    CUR.execute(POPULARITY_UPDATE)
    CUR.execute(POPULARITY_NULL_TO_ZERO)
    CUR.execute(BOUNDED_POPULARITY_UPDATE)

    #
    # Update trending scores
    #

    MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_last_month = 0
    WHERE monthly_downloads_last_month IS NULL;
    """

    MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_a_year_ago = 0
    WHERE monthly_downloads_a_year_ago IS NULL;
    """

    ABSOLUTE_TREND_UPDATE = """
    UPDATE packages
    SET absolute_trend = s.absolute_trend
    FROM (
      SELECT id,
             WIDTH_BUCKET(
               LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1),
               (SELECT MIN(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
               (SELECT MAX(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
               9
             ) AS absolute_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    RELATIVE_TREND_UPDATE = """
    UPDATE packages
    SET relative_trend = s.relative_trend
    FROM (
      SELECT id,
             WIDTH_BUCKET(
               LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1),
               (SELECT MIN(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
               (SELECT MAX(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
               9
             ) AS relative_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Execute trending updates
    CUR.execute(MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO)
    CUR.execute(MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO)
    CUR.execute(ABSOLUTE_TREND_UPDATE)
    CUR.execute(RELATIVE_TREND_UPDATE)

    # Commit changes and close the database connection
    DB.commit()
    CUR.close()
    DB.close()