Example #1
import numpy as np
from scipy.sparse import coo_matrix
from pyspark.mllib.linalg.distributed import RowMatrix


def compute_similarities(X, sc, threshold=0):
    """Compute column similarities using Spark.

    Sparsity is handled efficiently via a threshold that ensures
    only the relevant similarities are computed.

    Parameters
    ----------
    X: an array whose columns are the features to compare
    sc: SparkContext
    threshold: the similarity threshold

    Returns
    -------
    Symmetric similarity matrix of shape (X.shape[1], X.shape[1])
    """
    n = X.shape[1]
    rows = sc.parallelize(X)
    mat = RowMatrix(rows)

    sims = mat.columnSimilarities(threshold)
    # Convert to scipy sparse matrix
    # Each element is a Matrix entry object (i, j, value)
    rows_index = np.array(
        sims.entries.map(lambda x: x.i).collect()).astype(int)
    cols_index = np.array(
        sims.entries.map(lambda x: x.j).collect()).astype(int)
    values = np.array(sims.entries.map(lambda x: x.value).collect())
    triang_sup = coo_matrix((values, (rows_index, cols_index)), shape=(n, n))
    triang_inf = coo_matrix((values, (cols_index, rows_index)), shape=(n, n))

    return ((triang_sup + triang_inf).tocsr())
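A minimal usage sketch (assuming a local SparkContext and a small random NumPy array; the names below are illustrative, not part of the original example):

import numpy as np
from pyspark import SparkContext

sc = SparkContext("local[2]", "column-similarities")   # assumed local setup
X = np.random.rand(100, 5)                             # 100 rows, 5 columns
S = compute_similarities(X, sc, threshold=0.1)         # 5 x 5 sparse CSR matrix
print(S.toarray())
sc.stop()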
Example #2
def similarity_processing(self, tag_path):
    conf = SparkConf().setAppName("Test").setMaster("local")
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    df = spark.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(tag_path)
    df = df.drop("tagId")
    print(df.columns)
    rdd = df.rdd.map(list)
    mat = RowMatrix(rdd)
    print(mat.numCols(), mat.numRows())
    cs = mat.columnSimilarities()
    for x in cs.entries.collect():
        print(x)
    print(cs.numRows(), cs.numCols())
Example #3
# The scaler definition is truncated in the source listing; the import and the
# line below are an assumed reconstruction (a MinMaxScaler on the auto-generated
# column "_1", writing to "scaled_1").
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol="_1",
                      outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vec_df = spark.createDataFrame(
    scalerModel.transform(trial_df).select("scaled_1").rdd.map(
        lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2]))))

# Create a RowMatrix from the transpose of the scaled data
spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd
vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect())
mat = RowMatrix(vector_df)
bun = mat.rows.collect()
num_clusters = 3

pre = sc.parallelize(mat.columnSimilarities().entries.map(
    lambda e: (e.i, e.j, e.value)).collect())
model = PowerIterationClustering.train(pre, 3, 20, "random")
err = model.assignments().map(lambda x: (Vectors.dense(bun[0][x.id], bun[1][
    x.id], bun[2][x.id]), x.cluster)).collect()

# Silhouette value
ag = 0
agi = 1700
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    a = avg[er[1]] / avgi[er[1]]
    b = sys.maxsize  # sys.maxint exists only in Python 2
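    # The example is truncated here.  A hedged sketch of how the silhouette
    # computation would typically continue: b is the smallest mean distance to
    # any *other* cluster, and the per-point score is (b - a) / max(a, b).
    # `ag` is assumed to accumulate the scores and `agi` the number of points.
    for c in range(num_clusters):
        if c != er[1] and avgi[c] > 0:
            b = min(b, avg[c] / avgi[c])
    ag += (b - a) / max(a, b)

print("Average silhouette:", ag / agi)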
Example #4
        mean2 = 'NaN' if type_mapping2[1]['mean'] in (None, 'null', "None") else type_mapping2[1]['mean']
        mx2 = type_mapping2[4]['max']
        suggested_type2 = suggest_data_type(mean2, mx2)

        if suggested_type2 == 'FLOAT':
            d_type2 = 1
        elif suggested_type2 == 'INT':
            d_type2 = 2
        elif suggested_type2 == 'DATE':
            d_type2 = 3
        else:
            d_type2 = 4

        stats_feat2 = assembler.transform(features2)
        stats_feat_results2 = stats_feat2.select("features")

        l1NormData2 = normalizer.transform(stats_feat_results2, {normalizer.p: float("inf")})
        normfeatures2 = l1NormData2.select("normFeatures").rdd.flatMap(list).collect()[0]

        features_combined2 = [(normfeatures2, d_type2)]

        final_feat_frame2 = spark.createDataFrame(features_combined2, ["meta_features", "data_type"])
        final_results2 = final_assembler.transform(final_feat_frame2)
        combined_features2 = final_results2.select("final_features").rdd.flatMap(list).collect()

        combined_vectors = [combined_features, combined_features2]
        vectors = spark.sparkContext.parallelize(combined_vectors)
        matrix = RowMatrix(vectors)
        similarity_score = matrix.columnSimilarities()
        score_results = similarity_score.entries.first().value
        store_features(table_name1, table_name2, col_v, col_v2, score_results)
Example #5
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark.mllib.linalg import Vector
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


def main(sc, sqlContext, isHive=True):
    pass


if __name__ == "__main__":
    os.environ["SPARK_HOME"] = r"C:\spark-1.6.1-bin-hadoop2.6"
    sc = SparkContext('local[1]')
    rddRows = sc.parallelize(["1 0 2 0 0 1", "0 0 4 2 0 0"])

    # Parse each line into a dense vector; the result must be reassigned,
    # since RDD transformations return a new RDD rather than mutating it.
    rddVectors = rddRows.map(
        lambda x: Vectors.dense([float(each) for each in str(x).split(" ")]))
    mat = RowMatrix(rddVectors)

    simsPerfect = mat.columnSimilarities()
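    # The example stops here; a simple way to inspect the result (same pattern
    # as Example #2) is to collect the CoordinateMatrix entries:
    for entry in simsPerfect.entries.collect():
        print(entry)  # MatrixEntry(i, j, value), upper triangle only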
Example #6
vectorizer = CountVectorizer(inputCol="package_ids",
                             outputCol="packages_encoded")
vectorizer_model = vectorizer.fit(grouped)
transformedDf = vectorizer_model.transform(grouped)
transformedDf = transformedDf.drop(col("package_ids"))

# Extract vectors from the DataFrame in preparation for computing the similarity matrix
array = [
    Vectors.fromML(row.packages_encoded) for row in transformedDf.collect()
]

# Create a RowMatrix
matrix = RowMatrix(sc.parallelize(array))

# Compute column similarity matrix
similarity = matrix.columnSimilarities()

# Convert the matrix to a DataFrame
entries = similarity.entries.collect()
similarityDf = spark.createDataFrame(entries).toDF("package_a", "package_b",
                                                   "similarity")

# Write to the database
url_connect = f"jdbc:postgresql://{host}/"
table = "similarity"
mode = "overwrite"
properties = {
    "user": user,
    "password": password,
    "driver": "org.postgresql.Driver"
}
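The snippet stops before the actual write; assuming the similarityDf, url_connect, table, mode, and properties defined above, the final step would mirror the write.jdbc(...) call in Example #7:

# Persist the similarity scores to Postgres over JDBC (assumed continuation)
similarityDf.write.jdbc(url_connect, table, mode, properties)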
Example #7
import os
from itertools import chain

import psycopg2
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, create_map, lit
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


def main():
    SC = SparkContext("local[1]", "pkgpkr")

    # Connect to the database
    USER = os.environ.get("DB_USER")
    PASSWORD = os.environ.get("DB_PASSWORD")
    HOST = os.environ.get("DB_HOST")
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Load the raw data into Spark
    CUR.execute("SELECT * FROM dependencies")
    DEPENDENCIES = CUR.fetchall()
    SPARK = SparkSession.builder.master("local[1]").appName("pkgpkr").getOrCreate()
    DF = SPARK.createDataFrame(DEPENDENCIES).toDF("application_id", "package_id")

    # Close the database connection
    CUR.close()
    DB.close()

    # Restructure the dataframe in preparation for one-hot encoding
    GROUPED = DF.groupBy("application_id").agg(collect_list("package_id"))
    GROUPED = GROUPED.withColumnRenamed("collect_list(package_id)", "package_ids")
    GROUPED = GROUPED.withColumn("package_ids", col("package_ids").cast("array<string>"))

    # One-hot encode the data (rows are applications, columns are packages)
    VECTORIZER = CountVectorizer(inputCol="package_ids", outputCol="packages_encoded")
    VECTORIZER_MODEL = VECTORIZER.fit(GROUPED)
    TRANSFORMED_DF = VECTORIZER_MODEL.transform(GROUPED)
    TRANSFORMED_DF = TRANSFORMED_DF.drop(col("package_ids"))

    # Extract vectors from the DataFrame in preparation for computing the similarity matrix
    ARRAY = [Vectors.fromML(row.packages_encoded) for row in TRANSFORMED_DF.collect()]

    # Create a RowMatrix
    MATRIX = RowMatrix(SC.parallelize(ARRAY, numSlices=100))

    # Compute column similarity matrix
    SIMILARITY = MATRIX.columnSimilarities()

    # Convert the matrix to a DataFrame
    ENTRIES = SIMILARITY.entries.collect()
    SIMILARITY_DF = SPARK.createDataFrame(ENTRIES).toDF("a", "b", "similarity")

    # Map the package identifiers back to their pre-vectorized values
    MAPPING = create_map([lit(x) for x in chain(*enumerate(VECTORIZER_MODEL.vocabulary))])
    SIMILARITY_DF = SIMILARITY_DF.withColumn("package_a", MAPPING.getItem(col("a")).cast("integer")) \
                                 .withColumn("package_b", MAPPING.getItem(col("b")).cast("integer"))
    SIMILARITY_DF = SIMILARITY_DF.drop(col("a")).drop(col("b"))

    # Mirror the columns and append to the existing dataframe so we need only query the first column
    SIMILARITY_DF = SIMILARITY_DF.select('package_a', 'package_b', 'similarity') \
                                 .union(SIMILARITY_DF.select('package_b', 'package_a', 'similarity'))

    # Write similarity scores to the database
    URL_CONNECT = f"jdbc:postgresql://{HOST}/"
    TABLE = "similarity"
    MODE = "overwrite"
    PROPERTIES = {"user": USER, "password": PASSWORD, "driver": "org.postgresql.Driver"}
    SIMILARITY_DF.write.jdbc(URL_CONNECT, TABLE, MODE, PROPERTIES)

    #
    # Update popularity scores
    #

    POPULARITY_UPDATE = """
    UPDATE packages
    SET popularity = s.popularity
    FROM (
      SELECT package_b, COUNT(package_b) AS popularity
      FROM similarity
      GROUP BY package_b
    ) s
    WHERE packages.id = s.package_b;
    """

    POPULARITY_NULL_TO_ZERO = """
    UPDATE packages
    SET popularity = 0
    WHERE popularity IS NULL;
    """

    BOUNDED_POPULARITY_UPDATE = """
    UPDATE packages
    SET bounded_popularity = s.popularity
    FROM (
      SELECT id, WIDTH_BUCKET(LOG(popularity + 1), 0, (SELECT MAX(LOG(popularity + 1)) FROM packages), 9) AS popularity
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Connect to the database
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Execute popularity updates
    CUR.execute(POPULARITY_UPDATE)
    CUR.execute(POPULARITY_NULL_TO_ZERO)
    CUR.execute(BOUNDED_POPULARITY_UPDATE)

    #
    # Update trending scores
    #

    MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_last_month = 0
    WHERE monthly_downloads_last_month IS NULL;
    """

    MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_a_year_ago = 0
    WHERE monthly_downloads_a_year_ago IS NULL;
    """

    ABSOLUTE_TREND_UPDATE = """
    UPDATE packages
    SET absolute_trend = s.absolute_trend
    FROM (
      SELECT id, WIDTH_BUCKET(
        LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1),
        (SELECT MIN(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
        (SELECT MAX(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
        9
      ) AS absolute_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    RELATIVE_TREND_UPDATE = """
    UPDATE packages
    SET relative_trend = s.relative_trend
    FROM (
      SELECT id, WIDTH_BUCKET(
        LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1),
        (SELECT MIN(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
        (SELECT MAX(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
        9
      ) AS relative_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Execute trending updates
    CUR.execute(MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO)
    CUR.execute(MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO)
    CUR.execute(ABSOLUTE_TREND_UPDATE)
    CUR.execute(RELATIVE_TREND_UPDATE)

    # Commit changes and close the database connection
    DB.commit()
    CUR.close()
    DB.close()
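The listing defines main() but never invokes it; the usual entry-point guard (not shown in the original, matching the pattern in Example #5) would be:

if __name__ == "__main__":
    main()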