def run_minhash_lsh():
    df = util.read_all_json_from_bucket(sql_context,
                                        config.S3_BUCKET_BATCH_PREPROCESSED)

    mh = MinHashLSH(inputCol="text_body_vectorized",
                    outputCol="min_hash",
                    numHashTables=config.LSH_NUM_BANDS)

    # Vectorize so we can fit to MinHashLSH model

    htf = HashingTF(inputCol="text_body_stemmed",
                    outputCol="raw_features",
                    numFeatures=1000)
    htf_df = htf.transform(df)

    vectorizer = VectorAssembler(inputCols=["raw_features"],
                                 outputCol="text_body_vectorized")
    vdf = vectorizer.transform(htf_df)

    if (config.LOG_DEBUG):
        print(colored("[MLLIB BATCH]: Fitting MinHashLSH model...", "green"))
    model = mh.fit(vdf)

    # Compute pairwise LSH similarities for questions within tags
    if (config.LOG_DEBUG):
        print(
            colored(
                "[BATCH]: Fetching questions in same tag, comparing LSH and MinHash, uploading duplicate candidates back to Redis...",
                "cyan"))
    find_dup_cands_within_tags(model)
Example #2
def dedup_min_hash(df, column, id_col, min_distance=0.1):
    """
    Deduplicates a dataset using MinHash on a token count basis.

    Removes all items with a distance smaller than min_distance.
    """
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()

    df.cache()
    tokenizer = RegexTokenizer(inputCol=column, outputCol="tokens")
    tokens = tokenizer.transform(df)
    cv = CountVectorizer(inputCol="tokens", outputCol="token_ids")
    vectorizer_model = cv.fit(tokens)
    with_token_ids = vectorizer_model.transform(tokens).drop("tokens", column)
    with_token_ids = with_token_ids.where(
        num_nonzeros(with_token_ids.token_ids) > 0).cache()
    mh = MinHashLSH(inputCol="token_ids",
                    outputCol="hashes",
                    seed=1,
                    numHashTables=10)
    dedup_model = mh.fit(with_token_ids)
    joined = dedup_model.approxSimilarityJoin(with_token_ids, with_token_ids, 1 - min_distance, distCol="dist")\
        .drop("token_ids", "hashes")\
        .filter(f"datasetA.{id_col} < datasetB.{id_col}")
    duplicate_ids = joined.rdd.flatMap(lambda row: (row.datasetA[id_col], row.datasetB[id_col]))\
        .distinct()\
        .map(lambda el: [el])\
        .toDF()
    return df.join(duplicate_ids, duplicate_ids._1 == df[id_col], "left")\
        .where(duplicate_ids._1.isNotNull())\
        .drop(duplicate_ids._1)
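A minimal usage sketch for dedup_min_hash, assuming the function and its pyspark.ml.feature imports are in scope; the SparkSession, toy data, column names, and threshold below are illustrative only:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dedup-sketch").getOrCreate()

docs = spark.createDataFrame(
    [(1, "the quick brown fox jumps over the lazy dog"),
     (2, "the quick brown fox jumps over the lazy dog!"),  # near-duplicate of id 1
     (3, "a completely unrelated sentence about spark")],
    ["doc_id", "body"])

# Rows that participate in a near-duplicate pair (Jaccard distance within 1 - min_distance)
flagged = dedup_min_hash(docs, column="body", id_col="doc_id", min_distance=0.1)
flagged.show(truncate=False)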
Example #3
    def similarity_matrix(self, rdd1, rdd2):
        minhash = MinHashLSH(inputCol='vectors', outputCol='LSH')
        model = minhash.fit(rdd1)
        rdd1 = model.transform(rdd1)
        rdd2 = model.transform(rdd2)
        output = model.approxSimilarityJoin(rdd1, rdd2, threshold=0.8).filter(col('distCol') > 0) \
            .select(col('datasetB.pid').alias('sku_id_seller'),
                    col('datasetA.pid').alias('pid'),
                    col('distCol').alias('similarity_score'))
        print(output)
        return output  # dataframe with three columns: sku_id_seller (seller product ID), pid (our database product ID), similarity_score
Example #4
    def training(self, spark, df, df_events):

        logging.warning("  MinHash (Model) called -  ")

        ###############################################
        ## Locality Sensitive Hashing Model (Training)
        ###############################################

        try:
            mh = MinHashLSH(inputCol="scaled_features", \
                outputCol="hashes", \
                numHashTables=3)

            model = mh.fit(df)

            #Cache the transformed columns
            #df3_t = model.transform(df3).cache()

            df.registerTempTable("df_tbl")
            df_events.registerTempTable("df_events_tbl")

            df_events_new = spark.sql('''
							select d.*,e.score  from df_tbl d, df_events_tbl e
							where 1=1
							and e.item_id=d._id
							''')
            '''
			from pyspark.sql.functions import broadcast
			df_events_new = broadcast(spark.table("df_tbl")).join(spark.table("df_events_tbl"), "_id")
			'''

            df_events_t = model.transform(df_events_new)


            df_final=model.approxSimilarityJoin(df, \
                      df_events_t, \
                      P_MODEL_THRESHOLD, \
                      distCol="JaccardDistance")\
                .selectExpr("datasetA._id as id1", \
                   "datasetB._id as id2", \
                   "JaccardDistance as similarity_score", \
                   "datasetB.score as popularity_score")

            #.filter("datasetA._id != datasetB._id")\

        except Exception as e:
            print("Error in model training logic  -  " + str(e))
            raise e

        logging.warning("  MinHash (Model) finished... returning -  ")

        return df_final
Example #5
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ",
                       inputCol="instruments",
                       outputCol="instruments_tokenized",
                       minTokenLength=1),
        NGram(n=1,
              inputCol="instruments_tokenized",
              outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams",
                  outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors",
                   outputCol="instruments_lsh",
                   numHashTables=10)
    ]).fit(df)

    df_hashed = model.transform(df)
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
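write_df_to_pgsql is not shown in this snippet; a plausible sketch using Spark's generic JDBC writer might look like the following (the URL, credentials, and driver option are placeholders, not taken from the source):

def write_df_to_pgsql(df, table_name):
    # Hypothetical helper: append the DataFrame to a PostgreSQL table over JDBC.
    # Requires the PostgreSQL JDBC driver on the Spark classpath.
    (df.write
       .format("jdbc")
       .option("url", "jdbc:postgresql://localhost:5432/mydb")  # placeholder
       .option("dbtable", table_name)
       .option("user", "spark")          # placeholder
       .option("password", "secret")     # placeholder
       .option("driver", "org.postgresql.Driver")
       .mode("append")
       .save())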
Example #6
def run_minhash_lsh():
    df = util.read_all_json_from_bucket(sql_context,
                                        config.S3_BUCKET_BATCH_PREPROCESSED)
    mh = MinHashLSH(inputCol="text_body_vectorized",
                    outputCol="min_hash",
                    numHashTables=config.LSH_NUM_BANDS)

    # Vectorize so we can fit to MinHashLSH model

    htf = HashingTF(inputCol="text_body_stemmed",
                    outputCol="raw_features",
                    numFeatures=1000)
    htf_df = htf.transform(df)

    vectorizer = VectorAssembler(inputCols=["raw_features"],
                                 outputCol="text_body_vectorized")
    vdf = vectorizer.transform(htf_df)

    if (config.LOG_DEBUG):
        print(colored("[MLLIB BATCH]: Fitting MinHashLSH model...", "green"))
    model = mh.fit(vdf)
    model.transform(vdf).show()

    # Approximate similarity join between pairwise elements
    find_tag = udf(lambda x, y: util.common_tag(x, y), StringType())

    if (config.LOG_DEBUG):
        print(
            colored("[MLLIB BATCH]: Computing approximate similarity join...",
                    "green"))
    sim_join = model.approxSimilarityJoin(
        vdf,
        vdf,
        config.DUP_QUESTION_MIN_HASH_THRESHOLD,
        distCol="jaccard_sim").select(
            col("datasetA.id").alias("q1_id"),
            col("datasetB.id").alias("q2_id"),
            col("datasetA.title").alias("q1_title"),
            col("datasetB.title").alias("q2_title"),
            col("datasetA.text_body_vectorized").alias("q1_text_body"),
            col("datasetB.text_body_vectorized").alias("q2_text_body"),
            find_tag("datasetA.tags", "datasetB.tags").alias("tag"),
            col("jaccard_sim"))

    # Upload LSH similarities to Redis
    sim_join.foreachPartition(store_spark_mllib_sim_redis)
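store_spark_mllib_sim_redis is defined elsewhere in that project; a hedged sketch of a foreachPartition consumer writing candidate pairs to Redis with redis-py could look like this (host, port, and key layout are assumptions):

import redis

def store_spark_mllib_sim_redis(partition):
    # Hypothetical sketch: one connection per partition, one list entry per candidate pair.
    r = redis.Redis(host="localhost", port=6379, db=0)  # placeholder connection
    pipe = r.pipeline()
    for row in partition:
        key = "dup_candidates:{0}".format(row.tag)      # assumed key layout
        pipe.rpush(key, "{0}|{1}|{2}".format(row.q1_id, row.q2_id, row.jaccard_sim))
    pipe.execute()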
Example #7
    def test_min_hash_lsh(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),
        ), (
            1,
            Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),
        ), (
            2,
            Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),
        )], ["id", "features"])
        mh = MinHashLSH(inputCol="features",
                        outputCol="hashes",
                        numHashTables=5)
        model = mh.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(
            model,
            'Sparkml MinHashLSH',
            [('features', FloatTensorType([None, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data.limit(2))
        data_np = data.limit(2).toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().hashes.apply(lambda x: pandas.Series(x).map(
                lambda y: y.values[0])).values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlMinHashLSH")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['hashes'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #8
def main():
    input_dataset = sys.argv[1]
    output_dir = sys.argv[2]

    start_time = time.time()

    #stackoverflow_df = sqlContext.read.csv("../Datasource/stackOverFlow_ID_Title_SMALL.csv", header=True).toDF('id', 'text')

    stackoverflow_df = sqlContext.read.csv(input_dataset,
                                           header=True).toDF('id', 'text')

    # stackoverflow_df.show()

    # stackoverflow_df.head(10).show()

    # stack_df = stack_rdd.toDF(['id','text'])

    # stackoverflow_df.show()

    # stackoverflow_df.printSchema()

    model = Pipeline(stages=[
        RegexTokenizer(
            pattern="", inputCol="text", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(
            inputCol="vectors", outputCol="lsh"
        )  #MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(stackoverflow_df)

    db_hashed = model.transform(stackoverflow_df)

    # db_hashed.show()
    # query_hashed = model.transform(query)

    # db_hashed.show()
    # query_hashed.show()

    #res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.90).filter("datasetA.id < datasetB.id")

    res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed,
                                                0.70).filter("distCol > 0")

    #print res

    #print res.count()

    res.show()

    elapsed_time = time.time() - start_time

    print('Elapsed Time ==> ', elapsed_time)
Example #9
def jaccard_cross_join(
    input_col: str,
    output_col: str,
    df: DataFrame,
    primary_df: DataFrame,
    secondary_df: DataFrame,
):
    """Fit a jaccard index model based on all the docs in the corpus.
    Then take a subset of these (the primary docs) and cross join with a different
    subset (the secondary docs) to find any docs that are similar according to the
    minimum similarity specified."""

    hash_col = "hashes"
    min_hash_lsh = MinHashLSH(inputCol=input_col,
                              outputCol=hash_col,
                              seed=12345,
                              numHashTables=3)
    model = min_hash_lsh.fit(df)

    return model.approxSimilarityJoin(primary_df,
                                      secondary_df,
                                      distCol=output_col,
                                      threshold=1.0)
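A small usage sketch for jaccard_cross_join, assuming an active SparkSession named spark; the toy sparse vectors stand in for real document features:

from pyspark.ml.linalg import Vectors

corpus = spark.createDataFrame(
    [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0])),
     (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0])),
     (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]))],
    ["doc_id", "features"])

primary = corpus.filter("doc_id < 2")
secondary = corpus.filter("doc_id >= 2")

# Fit on the whole corpus, then join the primary subset against the secondary subset.
matches = jaccard_cross_join("features", "jaccard_distance", corpus, primary, secondary)
matches.show(truncate=False)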
Example #10
def main():
    potential_clones = sys.argv[1]
    outDir = sys.argv[2]

    start_time = time.time()

    potential_clones = '../Datasource/pc.xml'
    output_csv = 'csvCodes.csv'
    df = convertAndSaveAsCSV(potential_clones, output_csv, True)

    # spark context
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark_df = sqlContext.createDataFrame(df)

    transformed_spark_df = spark_df.rdd.map(distributedSourceTransform)

    pysparkdf_transformedClones = transformed_spark_df.toDF(
        ['filepath', 'startline', 'endline', 'source'])

    #pysparkdf_transformedClones.show()

    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ",
                       inputCol="source",
                       outputCol="tokens",
                       minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors", numFeatures=262144),
        MinHashLSH(
            inputCol="vectors", outputCol="lsh", numHashTables=105
        )  #MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(pysparkdf_transformedClones)

    hashed_clones = model.transform(pysparkdf_transformedClones)

    clone_pairs = model.stages[-1].approxSimilarityJoin(
        hashed_clones, hashed_clones, 0.70).filter("distCol > 0")

    clone_pairs.show()

    elapsed_time = time.time() - start_time

    print('Elapsed Time ==> ', elapsed_time)
Example #11
def match_names(df_1, df_2):

    pipeline = Pipeline(stages=[
        RegexTokenizer(
            pattern="", inputCol="name", outputCol="tokens", minTokenLength=1
        ),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(inputCol="vectors", outputCol="lsh")
    ])

    model = pipeline.fit(df_1)

    stored_hashed = model.transform(df_1)
    landed_hashed = model.transform(df_2)

    matched_df = model.stages[-1].approxSimilarityJoin(stored_hashed, landed_hashed, 1.0, "confidence").select(
        col("datasetA.name"), col("datasetB.name"), col("confidence"))
    matched_df.show(20, False)
Example #12
def vectorizeDF(raw):
    raw = spark.createDataFrame(raw, schema=['data', 'target'])
    raw = raw.withColumn('id', monotonically_increasing_id())

    tokenizer = Tokenizer(inputCol='data', outputCol='tokens')

    swremover = StopWordsRemover(inputCol='tokens', outputCol='words')

    cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=100)

    mh = MinHashLSH(inputCol='features',
                    outputCol='hashes',
                    numHashTables=NUM_HASH_TABLES,
                    seed=5123)

    pipeline = Pipeline(stages=[tokenizer, swremover, cv, mh])
    feat_data = pipeline.fit(dataset=raw).transform(raw)
    checkZero = udf(lambda V: V.numNonzeros() > 0, BooleanType())

    feat_data = feat_data.filter(checkZero(col('features')))
    return feat_data
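An illustrative call site for vectorizeDF; raw_groups (a list of (text, label) records), NUM_HASH_TABLES, and the active spark session are assumed to exist in the surrounding script:

feat_data = vectorizeDF(raw_groups)
feat_data.select("id", "features", "hashes").show(5, truncate=False)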
Example #13
        line = fp.readline().split(" ")
        cnt += 1
size = len(list(shingles))
cnt = 0
for key, value in tqdm(matrix.items()):
    aux = []
    for index, sh in value.items():
        aux.append(sh)
    data.append(
        (key, Vectors.sparse(size, sorted(list(aux)),
                             np.ones(len(list(aux))))))
next_prime = sieve_of_eratosthenes(size * 2, size)
sc = spark.sparkContext
distData = sc.parallelize(data)

#df = spark.createDataFrame(data, ["id", "features"])
df = spark.createDataFrame(distData, ["id", "features"])

key = Vectors.dense([1.0, 0.0])

mh = MinHashLSH(inputCol="features",
                outputCol="hashes",
                numHashTables=5,
                seed=next_prime)
model = mh.fit(df)
dft = model.transform(df)
model.approxSimilarityJoin(dft, dft, 0.6, distCol="JaccardDistance").select(
    col("datasetA.id").alias("idA"),
    col("datasetB.id").alias("idB"),
    col("JaccardDistance")).filter("idA != idB").show()
Example #14
model = pipeline.fit(df1)
df2 = model.transform(df1)
df2.show()


def getsparsesize(v):
    return v.values.size


getsize_udf = udf(getsparsesize, IntegerType())
df2_with_lengths = df2.select("value", "features", getsize_udf("features").alias("vec_size"))
df2_with_lengths.show()

df2NotNull = df2_with_lengths.filter(getsize_udf(df2["features"]) != 0)

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=128)
model2 = mh.fit(df2)
transformed_df2 = model2.transform(df2NotNull)
transformed_df2.show()

edges = []
for k in range(0, transformed_df2.count()):
    edges.append(k)
print(edges)

def getHashColumns(df0, x):
    sum_of_hashes = 0
    for y in range(x, x + 4):
        sum_of_hashes += int(df0[y][0])
    return sum_of_hashes
Example #15
    spark = SparkSession \
        .builder \
        .appName("MinHashLSHExample") \
        .getOrCreate()

    # $example on$
    dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
             (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
             (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
             (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
             (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
    dfB = spark.createDataFrame(dataB, ["id", "features"])

    key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    model = mh.fit(dfA)

    # Feature Transformation
    print("The hashed dataset where hashed values are stored in the column 'hashes':")
    model.transform(dfA).show()

    # Compute the locality sensitive hashes for the input rows, then perform approximate
    # similarity join.
    # We could avoid computing hashes by passing in the already-transformed dataset, e.g.
    # `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
    print("Approximately joining dfA and dfB on distance smaller than 0.6:")
    model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
        .select(col("datasetA.id").alias("idA"),
                col("datasetB.id").alias("idB"),
                col("JaccardDistance")).show()
Example #16
# In[22]:


f2 = featurevec.withColumn("vlen", vectorlength(featurevec.features))


# In[23]:


sparsevec = f2.where(f2.vlen > 1)


# In[24]:


mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)


# In[25]:


mhmodel = mh.fit(sparsevec)


# In[26]:


transform = mhmodel.transform(sparsevec)


# In[27]:
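The notebook fragment stops at this cell marker; purely as an illustrative sketch (not the original cell's content), a typical follow-up would be a self-similarity join on the hashed rows:

matches = mhmodel.approxSimilarityJoin(sparsevec, sparsevec, 0.6, distCol="JaccardDistance")
matches.filter("JaccardDistance > 0").show()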
Example #17

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

spDF = sqlContext.createDataFrame(df4)

X = spark.createDataFrame(df4['description_x'].tolist(), "string").toDF("text")
X_parallelize = sc.parallelize(X.collect())

Y = spark.createDataFrame(df4['description_y'].tolist(), "string").toDF("text")
Y_parallelize = sc.parallelize(Y.collect())

model = Pipeline(stages=[
    RegexTokenizer(
        pattern="", inputCol="text", outputCol="tokens", minTokenLength=1
    ),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh")
]).fit(X)



db_hashed = model.transform(X)
query_hashed = model.transform(Y)

model.stages[-1].approxSimilarityJoin(db_hashed, query_hashed, 0.75).show()


Example #18
dist_slangs = slangs_.selectExpr(
    "slangs").dropna().dropDuplicates().withColumn(
        "id", monotonically_increasing_id()).withColumn(
            'slangs_lower', lower_tokens_udf(col('slangs')))

#define model pipeline
#regex tokenizer to split into characters
#featurize characters - index them
#perform MinHashLSH - jaccardian similarity
model = Pipeline(stages=[
    RegexTokenizer(pattern="",
                   inputCol="slangs_lower",
                   outputCol="tokens",
                   minTokenLength=1),
    CountVectorizer(inputCol="tokens", outputCol="features"),
    MinHashLSH(inputCol="features", outputCol="hashValues", numHashTables=20)
]).fit(dist_slangs)

#actually perform the transformation
dist_slangs_hashed = model.transform(dist_slangs)

#perform similarity join; threshold set at 85% similarity, 15% refers to distance away from perfect match
self_join = model.stages[-1].approxSimilarityJoin(dist_slangs_hashed, dist_slangs_hashed, 0.15, distCol="JaccardDistance")\
    .select(col("datasetA.slangs").alias("slangsA"),
            col("datasetB.slangs").alias("slangsB"),
            col("JaccardDistance"))

#add Levenshtein and fuzzy-match distance columns; keep pairs with fuzzy similarity above 85
self_join = self_join.withColumn( 'LeviDistance', levenshtein(col('slangsA'),col('slangsB'))).withColumn( 'FuzzyDistance', fuzzy_wuzzy_udf(col('slangsA'),col('slangsB')))\
.where(col('FuzzyDistance')>85)
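fuzzy_wuzzy_udf is not defined in this fragment; a plausible stand-in built on the fuzzywuzzy package (its 0-100 ratio scale matches the > 85 filter above) could be:

from fuzzywuzzy import fuzz
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Hypothetical reconstruction: string similarity ratio between two values, 0-100.
fuzzy_wuzzy_udf = udf(lambda a, b: fuzz.ratio(a, b), IntegerType())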
Example #19
dHash_dict=img_hash.map(lambda url_dHash: (url_dHash[1], url_dHash[0]))    ### python 3 code

#dHash_dict.take(5).foreach(println)


#Pickles python hash dictionary
hs.pickleHash(dHash_dict.collectAsMap())

#Converts Image dHash into Sparse Vector (Required Input for LSH)
img_sparse=img_hash.map(lambda img: (img[0], str(img[1]), hs.sparse_vectorize(img[1])))

#Converts array of sparse img vectors into dataframe
df = spark.createDataFrame(img_sparse, ["url", "dHash", "sparseHash"])

#MinHashLSH
mh = MinHashLSH(inputCol="sparseHash", outputCol="minHash", numHashTables=4, seed=69)
model = mh.fit(df)

#BucketedRandomProjectionLSH
#brp = BucketedRandomProjectionLSH(inputCol="sparseHash", outputCol="minHash", bucketLength=20.0, numHashTables=5)
#model = brp.fit(df)

#KMeans
#kmeans=KMeans(featuresCol='denseHash', predictionCol='minHash', k=12, seed=69)
#model = kmeans.fit(df)

#Transform df to model
transformed_df = model.transform(df).select("url","dHash","minHash")

#Combines LSH_minHash Arrays into List
dense_to_array_udf = F.udf(hs.dense_to_array, T.ArrayType(T.FloatType()))
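hs.dense_to_array comes from the project's helper module and is not shown; a minimal stand-in that flattens the MinHash output (an array of one-element vectors) into a plain list of floats would be:

def dense_to_array(v):
    # Hypothetical helper: pull the single value out of each hash vector.
    return [float(x[0]) for x in v]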
Example #20
def jaccard_with_min_hashing(df_t_user,
                             to_compare,
                             regarding,
                             mode="dist",
                             minval=0.0,
                             maxval=1.0):
    df_t_user = df_t_user.distinct()
    #get regarding
    df_regarding = df_t_user.select(col(regarding)).distinct()
    print("regarding", df_regarding.count())

    if df_regarding == None or df_regarding.rdd.isEmpty():
        return None

    #create ids for each regarding element
    print("Creating ids")
    windowSpec = W.orderBy(regarding)
    df_regarding = df_regarding.withColumn("id",
                                           f.row_number().over(windowSpec))
    df_regarding.groupBy("id").count().orderBy(desc("count")).show()

    #window function moved df_regarding to a single partition --> repartition
    df_regarding = df_regarding.repartition(200)
    df_regarding.show()

    #join dataframes to get author/id pairs
    print("Joining...")
    df1 = df_t_user.alias("df1")
    df2 = df_regarding.alias("df2")
    df_joined = df1.join(df2,
                         col('df1.' +
                             regarding) == col('df2.' + regarding)).select(
                                 col('df1.' + to_compare).alias(to_compare),
                                 col('df2.id').alias("id"))
    df_joined.show()
    print("Join Complete")

    #create binary vectors
    print("Creating vectors")
    count = df_regarding.count() + 10
    tmp = df_regarding.select(col("id")).orderBy(desc("id")).first()
    print("max_id", tmp["id"])
    if tmp != None:
        max_index = int(tmp["id"]) + 10
    else:
        max_index = 0
    size = max(count, max_index)
    #df_joined = df_joined.rdd.map(lambda r: (r[to_compare], float(r['id']))).groupByKey().map(lambda r: sparse_vec(r, size)).toDF()
    df_joined = df_joined.groupBy(to_compare).agg(
        collect_set("id")).rdd.map(lambda r: sparse_vec(r, size)).toDF()
    print("df_joined", df_joined.count())

    df_res = df_joined.select(
        col('_1').alias(to_compare),
        col('_2').alias('features'))
    df_res.show()
    df_res = df_res.repartition(200)
    #df_res.cache()
    print("df_res", df_res.count())

    print("Creating model")
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=100)
    model = mh.fit(df_res)
    model.transform(df_res).show()

    print("Calculating Jaccard")
    df_jacc_dist = model.approxSimilarityJoin(df_res,
                                              df_res,
                                              1.0,
                                              distCol="jaccard")
    df_jacc_dist.cache()
    df_jacc_dist.show()

    print("Selecting needed columns")
    df_filtered = df_jacc_dist.select(
        col("datasetA." + to_compare).alias(to_compare + "1"),
        col("datasetB." + to_compare).alias(to_compare + "2"), col("jaccard"))
    df_filtered.show()
    df_filtered = df_filtered.where(
        col(to_compare + "1") < col(to_compare + "2"))
    df_filtered.show()
    #problem somewhere around here
    df_needed = df_filtered.where((col("jaccard") >= minval)
                                  & (col("jaccard") <= maxval))
    df_needed.show()

    if mode == "sim":
        df_needed = df_needed.withColumn("jaccard", 1.0 - col("jaccard"))

    return df_needed
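sparse_vec is referenced above but not defined in this snippet; a hedged reconstruction, consistent with how it is called on rows of (key, collected id set), might be:

from pyspark.ml.linalg import Vectors

def sparse_vec(row, size):
    # Hypothetical reconstruction: binary sparse vector over the collected id set.
    indices = sorted(int(i) for i in row[1])
    return (row[0], Vectors.sparse(size, indices, [1.0] * len(indices)))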
Example #21
def main():
    sc = SparkSession.builder.appName("SentencingAnalyzer")\
        .config("spark.driver.memory", "10G")\
        .getOrCreate()

    # main df
    cases = sc.read.json("../data/sentencingCases2.jsonl")
    df = cleanDf(cases)

    # read categorized csv
    categorizedCsv = sc.read.csv("../data/categorized.csv", header=True)
    categorizedCsv = categorizedCsv.select(
        'caseName',
        f.split(f.col("type"), " - ").alias('offenseType'), 'duration1',
        'sentenceType1')

    # create the search df
    df = extractOffenseKeywords(df)
    df.cache()
    dfSearch = sc.createDataFrame(searchData, ["term", "offenseKeywords"])

    # CLASSIFICATION OF OFFENSE
    hashingTF = HashingTF(inputCol="offenseKeywords",
                          outputCol="rawFeatures",
                          numFeatures=1000)
    result = hashingTF.transform(df)
    resultSearch = hashingTF.transform(dfSearch)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result).filter(
        f.size('offenseKeywords') > 0)
    idfModelSearch = idf.fit(resultSearch)
    rescaledDataSearch = idfModelSearch.transform(resultSearch)

    mh = MinHashLSH(inputCol="features",
                    outputCol="hashes",
                    seed=12345,
                    numHashTables=20)
    modelMH = mh.fit(rescaledData)
    transformedData = modelMH.transform(rescaledData)

    modelMHSearch = mh.fit(rescaledDataSearch)
    transformedDataSearch = modelMH.transform(rescaledDataSearch)

    categorizedDf = modelMHSearch.approxSimilarityJoin(
        transformedDataSearch,
        transformedData,
        0.89,
        distCol="JaccardDistance")
    distanceDf = categorizedDf.select([f.col('datasetA.term')] + [f.col('datasetB.caseID')] + [f.col("JaccardDistance")]) \
        .orderBy('caseID', 'JaccardDistance')
    distanceDf = distanceDf.groupBy('caseID').agg(
        f.collect_list('term').alias('predictedOffences'),
        f.collect_list('JaccardDistance').alias('JaccardDistances'))
    distanceDf.cache()
    distanceDf.show()

    # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION
    distanceDfEval = distanceDf.join(
        categorizedCsv, distanceDf.caseID == categorizedCsv.caseName)
    distanceDfEval = distanceDfEval.filter(
        distanceDfEval.offenseType[0] != "N/A").filter(
            distanceDfEval.offenseType[0] != "multiple party sentence")
    calcuateDifferenceInPredictedVsActualOffences_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffences, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "error",
        calcuateDifferenceInPredictedVsActualOffences_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "pctCorrect",
        calcuateDifferenceInPredictedVsActualOffencesPercentage_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    distanceDfEval.select('caseID', 'predictedOffences', 'offenseType',
                          'JaccardDistances', 'error',
                          'pctCorrect').show(200, truncate=False)
    rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] /
            distanceDfEval.count())**(1.0 / 2)
    print("Offense category RMSE:", rmse)
    pctCorrectOffense = (distanceDfEval.groupBy().agg(
        f.sum('pctCorrect')).collect()[0][0] / distanceDfEval.count()) * 100
    print("Percentage of offenses correctly categorized: ", pctCorrectOffense)
Example #22
df = adding_titles(df)
df = drop_values(df)

df.show()
df.cache()

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
import pyspark.sql.functions as f

model = Pipeline(stages=[
    RegexTokenizer(
        pattern="", inputCol="title", outputCol="tokens", minTokenLength=1),
    NGram(n=3, inputCol="tokens", outputCol="ngrams"),
    HashingTF(inputCol="ngrams", outputCol="vectors"),
    MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=10)
]).fit(df)

df_hashed = model.transform(df)

df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.9)

#show all matches (including duplicates)
df_matches.select(
    f.col('datasetA.id').alias('id_A'),
    f.col('datasetB.id').alias('id_B'), f.col('distCol')).show()

#show non-duplicate matches
df_matches.select(
    f.col('datasetA.id').alias('id_A'),
    f.col('datasetB.id').alias('id_B'),
    f.col('distCol')).filter('id_A < id_B').show()
Example #23
dataB = [(
    3,
    Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),
), (
    4,
    Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),
), (
    5,
    Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),
)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dfA)

# Feature Transformation
print(
    "The hashed dataset where hashed values are stored in the column 'hashes':"
)
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()
Example #24
ng = NGram(n=2, inputCol="words", outputCol="ngrams")
dataset = ng.transform(dataset)
dataset.show()

#[8]
#fitting the model to our dataset like we do in unsupervised learning
cvect = CountVectorizer(inputCol="ngrams",
                        outputCol="features",
                        vocabSize=100000,
                        minDF=2)
model = cvect.fit(dataset)
dataset = model.transform(dataset)

#[9]
#LSH class for Jaccard distance.
minhash = MinHashLSH(inputCol="features", outputCol="hashValues",
                     seed=12345).setNumHashTables(3)
model = minhash.fit(dataset)
model.transform(dataset)

#[10]
#Printing Values
print("Total no. of Files: ", dataset.count())
print("Column Data:  ", dataset.dtypes)
dataset.show()

#[11]

matrix = model.approxSimilarityJoin(dataset, dataset, 3.0).select(
    col("datasetA.title").alias("A"),
    col("datasetB.title").alias("B"),
    col("distCol")).sort(desc("distCol")).dropDuplicates(['distCol'])
Example #25
    def get_similar_word(self,
                         column,
                         text,
                         n_words=10,
                         n_hash=5,
                         verbose=True):
        """
        Get similar strings in a column by MinHash

        column: target column to search
        text: input string
        n_words: number of similar strings
        n_hash: number of hash functions for MinHash
        verbose:True if you want to see interactive output

        Output:
            DataFrame of Nearest Neighbours
        """
        rdd = self.data.rdd
        rdd = rdd.filter(lambda row: row[column] != None)
        rdd = rdd.filter(lambda row: row[column] != "")
        rdd = rdd.filter(lambda row: len(row[column]) > 1)
        cdf = self.ss.createDataFrame(
            rdd.map(lambda row: (row[column] if row[column] != None else " ",
                                 list(row[column].lower())
                                 if row[column] != None else [" "])))

        ngram = NGram(n=2, inputCol="_2", outputCol="ngrams")
        if verbose:
            print("Counting Ngram...")
        ngramDataFrame = ngram.transform(cdf)
        if verbose:
            print("Vectorizing...")
        # fit a CountVectorizerModel from the corpus.
        cv = CountVectorizer(inputCol="ngrams",
                             outputCol="features",
                             vocabSize=3000,
                             minDF=0)

        cv_model = cv.fit(ngramDataFrame)

        result = cv_model.transform(ngramDataFrame)

        mh = MinHashLSH(inputCol="features",
                        outputCol="hashes",
                        numHashTables=n_hash)
        if verbose:
            print("Min Hashing...")
        model = mh.fit(result)

        input_text = text
        input_df = [{'text': input_text, 'characters': list(input_text)}]
        input_df = self.ss.createDataFrame(input_df)

        ngram = NGram(n=2, inputCol="characters", outputCol="ngrams")
        input_df = ngram.transform(input_df)

        key = cv_model.transform(input_df).first()['features']

        if (key.toArray().sum() < 1):
            print("No Match! Please try another input..")
            return

        if verbose:
            print("Finding nearest neighbors...")

        NNs = model.approxNearestNeighbors(result, key, n_words)
        NNs.show()

        self.out = NNs
        #self.out=NNs.select('_1').distinct()
        return
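An illustrative call, assuming the enclosing class stores a SparkSession in self.ss and the source DataFrame in self.data (the instance name and column are assumptions, not from the source):

# matcher is an instance of the class that defines get_similar_word above.
matcher.get_similar_word(column="product_name", text="spark", n_words=5, n_hash=5, verbose=True)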
Example #26
	col("sponsoring_country").alias("country"),
	concat_string_arrays("hashtags", "urls", "related_tweetids").alias("combined"))

dfTrainRelatedUsers = dfTrain.select(col("userid"),
	col("sponsoring_country").alias("country"),
	col("related_userids"))

dfTestTweets = dfTest.select(col("userid"), 
	concat_string_arrays("hashtags", "urls", "related_tweetids").alias("combined"))

dfTestRelatedUsers = dfTest.select(col("userid"),
	col("related_userids"))

model = Pipeline(stages=[
	HashingTF(inputCol="combined", outputCol="vectors"),
	MinHashLSH(inputCol="vectors", outputCol="lsh")]).fit(dfTrainTweets)

trainTweetsHashed = model.transform(dfTrainTweets)
testTweetsHashed = model.transform(dfTestTweets)

combined = model.stages[-1].approxSimilarityJoin(trainTweetsHashed, testTweetsHashed, 0.9)

combined.write.parquet('combined_hashed.parquet')

model2 = Pipeline(stages=[
	HashingTF(inputCol="related_userids", outputCol="vectors"),
	MinHashLSH(inputCol="vectors", outputCol="lsh")]).fit(dfTrainRelatedUsers)

trainUsersHashed = model2.transform(dfTrainRelatedUsers)
testUsersHashed = model2.transform(dfTestRelatedUsers)
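concat_string_arrays used above is not shown; on Spark 2.4+ a minimal stand-in could concatenate the array<string> columns directly:

from pyspark.sql import functions as F

def concat_string_arrays(*cols):
    # Hypothetical helper: merge several array<string> columns into a single array column.
    return F.concat(*cols)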