Example #1
    dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
             (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
             (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
             (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
             (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
    dfB = spark.createDataFrame(dataB, ["id", "features"])

    key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    model = mh.fit(dfA)

    # Feature Transformation
    print("The hashed dataset where hashed values are stored in the column 'hashes':")
    model.transform(dfA).show()

    # Compute the locality sensitive hashes for the input rows, then perform approximate
    # similarity join.
    # We could avoid computing hashes by passing in the already-transformed dataset, e.g.
    # `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
    print("Approximately joining dfA and dfB on distance smaller than 0.6:")
    model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
        .select(col("datasetA.id").alias("idA"),
                col("datasetB.id").alias("idB"),
                col("JaccardDistance")).show()
Example #3
def jaccard_with_min_hashing(df_t_user,
                             to_compare,
                             regarding,
                             mode="dist",
                             minval=0.0,
                             maxval=1.0):
    df_t_user = df_t_user.distinct()
    # get the distinct values of the "regarding" column
    df_regarding = df_t_user.select(col(regarding)).distinct()
    if df_regarding is None or df_regarding.rdd.isEmpty():
        return None
    print("regarding", df_regarding.count())

    #create ids for each regarding element
    print("Creating ids")
    windowSpec = W.orderBy(regarding)
    df_regarding = df_regarding.withColumn("id",
                                           f.row_number().over(windowSpec))
    df_regarding.groupBy("id").count().orderBy(desc("count")).show()

    # the window function moved df_regarding to a single partition --> repartition
    df_regarding = df_regarding.repartition(200)
    df_regarding.show()

    # join the dataframes to get (to_compare, id) pairs
    print("Joining...")
    df1 = df_t_user.alias("df1")
    df2 = df_regarding.alias("df2")
    df_joined = df1.join(df2,
                         col('df1.' +
                             regarding) == col('df2.' + regarding)).select(
                                 col('df1.' + to_compare).alias(to_compare),
                                 col('df2.id').alias("id"))
    df_joined.show()
    print("Join Complete")

    #create binary vectors
    print("Creating vectors")
    count = df_regarding.count() + 10
    tmp = df_regarding.select(col("id")).orderBy(desc("id")).first()
    if tmp is not None:
        print("max_id", tmp["id"])
        max_index = int(tmp["id"]) + 10
    else:
        max_index = 0
    size = max(count, max_index)
    #df_joined = df_joined.rdd.map(lambda r: (r[to_compare], float(r['id']))).groupByKey().map(lambda r: sparse_vec(r, size)).toDF()
    df_joined = df_joined.groupBy(to_compare).agg(
        collect_set("id")).rdd.map(lambda r: sparse_vec(r, size)).toDF()
    print("df_joined", df_joined.count())

    df_res = df_joined.select(
        col('_1').alias(to_compare),
        col('_2').alias('features'))
    df_res.show()
    df_res = df_res.repartition(200)
    #df_res.cache()
    print("df_res", df_res.count())

    print("Creating model")
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=100)
    model = mh.fit(df_res)
    model.transform(df_res).show()

    print("Calculating Jaccard")
    df_jacc_dist = model.approxSimilarityJoin(df_res,
                                              df_res,
                                              1.0,
                                              distCol="jaccard")
    df_jacc_dist.cache()
    df_jacc_dist.show()

    print("Selecting needed columns")
    df_filtered = df_jacc_dist.select(
        col("datasetA." + to_compare).alias(to_compare + "1"),
        col("datasetB." + to_compare).alias(to_compare + "2"), col("jaccard"))
    df_filtered.show()
    df_filtered = df_filtered.where(
        col(to_compare + "1") < col(to_compare + "2"))
    df_filtered.show()
    # problem somewhere around here
    df_needed = df_filtered.where((col("jaccard") >= minval)
                                  & (col("jaccard") <= maxval))
    df_needed.show()

    if mode == "sim":
        df_needed = df_needed.withColumn("jaccard", 1.0 - col("jaccard"))

    return df_needed
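The function above relies on a `sparse_vec` helper that is not shown. Judging from its use (mapping a row of `(to_compare, collect_set(id))` to a `(label, features)` pair), a plausible sketch, with the helper's shape inferred rather than taken from the original project, is:

from pyspark.ml.linalg import Vectors

def sparse_vec(r, size):
    # r[0] is the `to_compare` value, r[1] the collected set of ids;
    # build a binary SparseVector with a 1.0 at every id (indices must be sorted)
    indices = sorted(int(i) for i in r[1])
    return (r[0], Vectors.sparse(size, indices, [1.0] * len(indices)))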
Example #4
    def get_similar_word(self,
                         column,
                         text,
                         n_words=10,
                         n_hash=5,
                         verbose=True):
        """
        Get similar strings in a column by MinHash

        column: target column to search
        text: input string
        n_words: number of similar strings
        n_hash: number of hash functions for MinHash
        verbose: True to print progress messages
        """
        rdd = self.data.rdd
        rdd = rdd.filter(lambda row: row[column] is not None)
        rdd = rdd.filter(lambda row: row[column] != "")
        rdd = rdd.filter(lambda row: len(row[column]) > 1)
        cdf = self.ss.createDataFrame(
            rdd.map(lambda row: (row[column], list(row[column].lower()))))

        ngram = NGram(n=2, inputCol="_2", outputCol="ngrams")
        if verbose:
            print("Counting Ngram...")
        ngramDataFrame = ngram.transform(cdf)
        if verbose:
            print("Vectorizing...")
        # fit a CountVectorizerModel from the corpus.
        cv = CountVectorizer(inputCol="ngrams",
                             outputCol="features",
                             vocabSize=3000,
                             minDF=0)

        cv_model = cv.fit(ngramDataFrame)

        result = cv_model.transform(ngramDataFrame)

        mh = MinHashLSH(inputCol="features",
                        outputCol="hashes",
                        numHashTables=n_hash)
        if verbose:
            print("Min Hashing...")
        model = mh.fit(result)

        input_text = text
        input_df = [{'text': input_text, 'characters': list(input_text)}]
        input_df = self.ss.createDataFrame(input_df)

        ngram = NGram(n=2, inputCol="characters", outputCol="ngrams")
        input_df = ngram.transform(input_df)

        key = cv_model.transform(input_df).first()['features']

        if (key.toArray().sum()) < 1:
            print("No Match! Try another input...")
            return

        if verbose:
            print("Finding nearest neighbors...")

        NNs = model.approxNearestNeighbors(result, key, n_words)
        NNs.show()
        #self.out=NNs.select('_1').distinct()
        return
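The enclosing class is not shown; assuming it exposes a SparkSession as `self.ss` and the source DataFrame as `self.data` (as the method body suggests), a hypothetical call could look like:

# hypothetical usage; `finder` is an instance of the class above and
# "name" is assumed to be a string column of finder.data
finder.get_similar_word(column="name", text="john smith", n_words=5)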
Example #5
    dataA.show()

    Item = Row('id', 'features')
    Item_seq = []
    for index, row in df.iterrows():
        print(index)
        feature = sparseify(users_num, row["user_index"], row["ratings"])
        row = Item(row['item_id'], feature)
        Item_seq.append(row)

    dataB = spark.createDataFrame(Item_seq)
    dataB.show()

    start = time.time()
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    model = mh.fit(dataB)

    print(
        "The hashed dataset where hashed values are stored in the column 'hashes':"
    )
    model.transform(dataB).show()

    # start experiment
    ratingdata = pd.read_csv('../users_items_100.csv')
    ratingdata['playtime_forever'] = round(
        np.log(ratingdata['playtime_forever'] + 1), 2)
    y = ratingdata['playtime_forever']
    X = ratingdata[['user_id', 'item_index']]
    print(X.shape)
    print(y.shape)
    traindata, testdata = train_test_split(ratingdata, train_size=0.9999)
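The loop above depends on a `sparseify` helper that is not part of the excerpt. From the call `sparseify(users_num, row["user_index"], row["ratings"])`, a plausible sketch (signature and semantics are assumptions, not the original code) is:

from pyspark.ml.linalg import Vectors

def sparseify(size, user_indices, ratings):
    # build a sparse vector of length `size` with one entry per user that
    # interacted with the item; MinHashLSH only looks at non-zero positions
    pairs = sorted(zip((int(i) for i in user_indices),
                       (float(r) for r in ratings)))
    return Vectors.sparse(size, [i for i, _ in pairs], [v for _, v in pairs])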
Example #6
df2 = model.transform(df1)
df2.show()


def getsparsesize(v):
    return v.values.size


getsize_udf = udf(getsparsesize, IntegerType())
df2_with_lengths = df2.select("value", "features", getsize_udf("features").alias("vec_size"))
df2_with_lengths.show()

df2NotNull = df2_with_lengths.filter("vec_size != 0")  # drop rows with empty feature vectors

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=128)
model2 = mh.fit(df2)
transformed_df2 = model2.transform(df2NotNull)
transformed_df2.show()

edges = list(range(transformed_df2.count()))
print(edges)

def getHashColumns(df0, x):
    sum_of_hashes = 0
    for y in range(x, x + 4):
        sum_of_hashes += int(df0[y][0])
    return sum_of_hashes
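`getHashColumns` is easiest to read against a concrete row: assuming `df0` is the `hashes` array produced by `model2.transform` (a list with one single-element DenseVector per hash table), a hypothetical call would be:

# hypothetical usage: sum the values of hash tables x..x+3 for one row
first_row = transformed_df2.first()
print(getHashColumns(first_row["hashes"], 0))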

def main():
    sc = SparkSession.builder.appName("SentencingAnalyzer")\
        .config("spark.driver.memory", "10G")\
        .getOrCreate()

    # main df
    cases = sc.read.json("../data/sentencingCases2.jsonl")
    df = cleanDf(cases)

    # read categorized csv
    categorizedCsv = sc.read.csv("../data/categorized.csv", header=True)
    categorizedCsv = categorizedCsv.select(
        'caseName',
        f.split(f.col("type"), " - ").alias('offenseType'), 'duration1',
        'sentenceType1')

    # create the search df
    df = extractOffenseKeywords(df)
    df.cache()
    dfSearch = sc.createDataFrame(searchData, ["term", "offenseKeywords"])

    # CLASSIFICATION OF OFFENSE
    hashingTF = HashingTF(inputCol="offenseKeywords",
                          outputCol="rawFeatures",
                          numFeatures=1000)
    result = hashingTF.transform(df)
    resultSearch = hashingTF.transform(dfSearch)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result).filter(
        f.size('offenseKeywords') > 0)
    idfModelSearch = idf.fit(resultSearch)
    rescaledDataSearch = idfModelSearch.transform(resultSearch)

    mh = MinHashLSH(inputCol="features",
                    outputCol="hashes",
                    seed=12345,
                    numHashTables=20)
    modelMH = mh.fit(rescaledData)
    transformedData = modelMH.transform(rescaledData)

    modelMHSearch = mh.fit(rescaledDataSearch)
    transformedDataSearch = modelMH.transform(rescaledDataSearch)

    categorizedDf = modelMHSearch.approxSimilarityJoin(
        transformedDataSearch,
        transformedData,
        0.89,
        distCol="JaccardDistance")
    distanceDf = categorizedDf.select([f.col('datasetA.term')] + [f.col('datasetB.caseID')] + [f.col("JaccardDistance")]) \
        .orderBy('caseID', 'JaccardDistance')
    distanceDf = distanceDf.groupBy('caseID').agg(
        f.collect_list('term').alias('predictedOffences'),
        f.collect_list('JaccardDistance').alias('JaccardDistances'))
    distanceDf.cache()
    distanceDf.show()

    # EVALUATE CATEGORIZATION AGAINST MANUAL CATEGORIZATION
    distanceDfEval = distanceDf.join(
        categorizedCsv, distanceDf.caseID == categorizedCsv.caseName)
    distanceDfEval = distanceDfEval.filter(
        distanceDfEval.offenseType[0] != "N/A").filter(
            distanceDfEval.offenseType[0] != "multiple party sentence")
    calcuateDifferenceInPredictedVsActualOffences_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffences, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "error",
        calcuateDifferenceInPredictedVsActualOffences_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    calcuateDifferenceInPredictedVsActualOffencesPercentage_udf = f.udf(
        calcuateDifferenceInPredictedVsActualOffencesPercentage, FloatType())
    distanceDfEval = distanceDfEval.withColumn(
        "pctCorrect",
        calcuateDifferenceInPredictedVsActualOffencesPercentage_udf(
            distanceDfEval.predictedOffences, distanceDfEval.offenseType))
    distanceDfEval.select('caseID', 'predictedOffences', 'offenseType',
                          'JaccardDistances', 'error',
                          'pctCorrect').show(200, truncate=False)
    rmse = (distanceDfEval.groupBy().agg(f.sum('error')).collect()[0][0] /
            distanceDfEval.count())**(1.0 / 2)
    print("Offense category RMSE:", rmse)
    pctCorrectOffense = (distanceDfEval.groupBy().agg(
        f.sum('pctCorrect')).collect()[0][0] / distanceDfEval.count()) * 100
    print("Percentage of offenses correctly categorized: ", pctCorrectOffense)
Example #8
df_joined = df1_a.join(df2_a,
                       col('df1_a.author') == col('df2_a.author')).select(
                           'df1_a.title', 'df2_a.id')
df_joined.show(20)

# create a binary vector

dfWithFeat = df_joined.rdd.map(lambda r: (r['title'], (float(r['id'])))).groupByKey()\
    .map(lambda r: sparse_vec(r)).toDF()
df_res = dfWithFeat.select(
    col("_1").alias("title"),
    col("_2").alias("features"))
df_res.show()

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(df_res)

# Feature Transformation
print(
    "The hashed dataset where hashed values are stored in the column 'hashes':"
)
model.transform(df_res).show()

print("Approximately distance smaller than 0.6:")
df_jacc_dist = model.approxSimilarityJoin(df_res, df_res, 0.6, distCol="JaccardDistance")\
    .select(col("datasetA.title").alias("title"),
            col("JaccardDistance")).filter("JaccardDistance != 0").orderBy(desc("JaccardDistance"))
df_jacc_dist.show()

df_hist = df_jacc_dist.select(col("JaccardDistance"))
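The final selection suggests a histogram of the Jaccard distances is built next; one way to bucket the values, sketched with the standard `RDD.histogram` API:

# sketch: 10-bucket histogram of the pairwise Jaccard distances
boundaries, counts = df_hist.rdd.map(lambda r: r["JaccardDistance"]).histogram(10)
print(list(zip(boundaries, counts)))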
Example #9
cv = CountVectorizer(inputCol="words",
                     outputCol="rawFeatures",
                     vocabSize=100000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# pipeline of the stages declared above for the training and test data
pipeline = Pipeline(stages=[tk, swr, cv, idf])
model_pipe = pipeline.fit(data_treino)
data_treino = model_pipe.transform(data_treino)

# reuse the pipeline fitted on the training data so the CountVectorizer vocabulary
# and IDF weights are shared between the training and test features
data_test = model_pipe.transform(data_test)

# model generation and testing
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(data_treino)
data_treino = model.transform(data_treino)
data_treino.show()
# trained data model
'''
te = data_test.select("features").collect()
tr = data_treino.select("features").collect()
'''
data_test.select("features").show()
dadosTef = data_test.select("features").rdd.flatMap(lambda x: x).collect()
print("	Features dos dados de teste")
dadosTr = data_treino.select("NewsGroup",
                             "features").rdd.flatMap(lambda x: x).collect()

#model.approxNearestNeighbors(SparseVector(str(tr)),SparseVector(str(te[4])),2),show()
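The commented-out last line hints at an approximate nearest-neighbor lookup between test and training documents. With `MinHashLSHModel.approxNearestNeighbors` the key must be a single feature vector, so a sketch (reusing `model`, `data_treino`, and `dadosTef` from above, and assuming index 4 exists) would be:

# sketch: the 2 approximate nearest training documents for one test document
key = dadosTef[4]
model.approxNearestNeighbors(data_treino, key, 2).show()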