Example No. 1
def levenshtein_matrix(df, col_name):
    """
    Create a couple of columns with all the string combinations
    :param df:
    :param col_name:
    :return:
    """
    df = keycollision.fingerprint(df, col_name)

    col_fingerprint = col_name + "_FINGERPRINT"
    col_distance = col_name + "_LEVENSHTEIN_DISTANCE"

    temp_col_1 = col_name + "_LEVENSHTEIN_1"
    temp_col_2 = col_name + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    df = df.select(col_fingerprint).distinct().select(
        F.col(col_fingerprint).alias(temp_col_1),
        F.col(col_fingerprint).alias(temp_col_2))

    # Create all the combinations between the strings to calculate the Levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(col_distance, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    return df
Example No. 2
def levenshtein_matrix(df, input_col):
    """
    Create a couple of columns with all the string combinations
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)
    # df.table()
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)

    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    df = df.select(
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the Levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    if Optimus.cache:
        df = df.cache()

    return df
Example No. 3
def leven_helper(df, ref_df, cut_off, type_str):
    #print("size of reference_df",ref_df.count())
    df_columns = df.columns
    # grab the non typed entries in the input df
    new_df = df.filter(df["true_type"].isNull())
    #crossjoin null values with reference columns
    ref_columns = ref_df.columns
    levy_df = new_df.crossJoin(ref_df)
    #compute levy distance
    levy_df = levy_df.withColumn("word1_word2_levenshtein",\
        levenshtein(lower(col(df_columns[0])), lower(col(ref_columns[0]))))
    #collect rows that were less than cutoff
    levy_df = levy_df.filter(levy_df["word1_word2_levenshtein"] <= cut_off)
    levy_columns = levy_df.columns
    levy_df = levy_df.groupBy(levy_columns[0]).min("word1_word2_levenshtein")
    levy_columns = levy_df.columns
    levy_df = levy_df.select(col(levy_columns[0]), \
        col(levy_columns[1]).alias("min"))
    levy_columns = levy_df.columns
    levy_df = levy_df.drop("min")
    #df = df.withColumn("true_type", when(col(df_columns[0]).isin(levy_df[levy_columns[0]]), type_str).otherwise(df["true_type"]))
    levy_df = levy_df.collect()
    levy_df = [x[0] for x in levy_df]
    rdf = df.withColumn(
        "true_type",
        when(df[df_columns[0]].isin(levy_df),
             type_str).otherwise(df["true_type"]))
    return rdf
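A hypothetical usage sketch for leven_helper (the toy data, the SparkSession, and the column names are assumptions; the function above is assumed to have col, lower, levenshtein and when imported from pyspark.sql.functions):

# Hypothetical usage of leven_helper(); data and session are assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

spark = SparkSession.builder.getOrCreate()
values_df = spark.createDataFrame([("brooklyn",), ("qeens",), ("42",)], ["value"]) \
    .withColumn("true_type", lit(None).cast(StringType()))
boroughs_df = spark.createDataFrame([("brooklyn",), ("queens",)], ["borough"])

# Tag untyped values that are within Levenshtein distance 1 of a known borough name
typed_df = leven_helper(values_df, boroughs_df, cut_off=1, type_str="borough")
typed_df.show()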
Example No. 4
def count_city(df_to_process):
    df_processed = df_to_process.join(
        df_pre_city,
        F.levenshtein(F.lower(df_to_process._c0), F.lower(df_pre_city._c0)) <
        3)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'city', df_left, df_processed.select(
        F.sum("_c1"),
        F.lit('city').alias("sem_type"))
Example No. 5
def get_neighbors_notes(song, featureDF):
    comparator_value = song[0]["notes"]
    df_merged = featureDF.withColumn("compare", lit(comparator_value))
    df_levenshtein = df_merged.withColumn(
        "distances_levenshtein", levenshtein(col("notes"), col("compare")))
    #df_levenshtein.sort(col("word1_word2_levenshtein").asc()).show()
    result = df_levenshtein.select("id", "key", "scale",
                                   "distances_levenshtein")
    return result
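A hypothetical usage sketch (the toy data and SparkSession are assumptions; the snippet above is assumed to have lit, levenshtein and col imported from pyspark.sql.functions):

# Hypothetical usage of get_neighbors_notes(); data and session are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
featureDF = spark.createDataFrame(
    [(1, "C", "major", "CDEFGAB"), (2, "A", "minor", "ABCDEFG")],
    ["id", "key", "scale", "notes"])
song = featureDF.filter(featureDF.id == 1).collect()
get_neighbors_notes(song, featureDF).show()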
Example No. 6
def count_car_make(df_to_process):
    df_processed = df_to_process.join(
        df_pre_car_make,
        F.levenshtein(F.lower(df_to_process._c0), F.lower(df_pre_car_make._c0))
        < 3)
    # df_processed = calc_jaccard_sim(df_to_process, df_pre_car_make)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'car_make', df_left, df_processed.select(
        F.sum("_c1"),
        F.lit('car_make').alias("sem_type"))
Example No. 7
def count_city_agency_abbrev(df_to_process):
    df_processed = df_to_process.join(
        df_pre_city_agency_abbrev,
        F.levenshtein(F.lower(df_to_process._c0),
                      F.lower(df_pre_city_agency_abbrev._c0)) < 1)
    # df_processed = calc_jaccard_sim(df_left, df_pre_city_agency_abbrev)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'city_agency', df_left, df_processed.select(
        F.sum("_c1"),
        F.lit('city_agency').alias("sem_type"))
Example No. 8
def count_school_level(df_to_process):
    df_processed = df_to_process.join(
        df_pre_school_level,
        F.levenshtein(F.lower(df_to_process._c0),
                      F.lower(df_pre_school_level._c0)) < 3)
    # df_processed = calc_jaccard_sim(df_to_process, df_pre_school_level)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'school_level', df_left, df_processed.select(
        F.sum("_c1"),
        F.lit('school_level').alias("sem_type"))
Example No. 9
def spark_ratio(left, right):
    # TODO: sparkify this function
    # Build a one-row DataFrame from the two input strings (assumes an active SparkSession named `spark`)
    df = spark.createDataFrame([(left, right)], ['left', 'right'])
    df = df.withColumn('len', F.least(F.length('left'), F.length('right')))
    df = df.withColumn('levenshtein', F.levenshtein('left', 'right'))
    df = df.withColumn('inv_edit_distance',
                       F.col('len') - F.col('levenshtein'))
    df = df.withColumn('ratio', F.col('inv_edit_distance') / F.col('len'))
    df = df.withColumnRenamed('ratio', 'fuzzy')
    df = df.select(['fuzzy'])
    return df
Example No. 10
def fuzzyspark(df, on, value):
    q_val = value
    df = df.select([on])
    # TODO: Rework that part
    df = df.withColumn('query', F.lit(q_val).cast(T.StringType()))
    df = df.withColumn(
        'len', F.least(F.length(on),
                       F.lit(len(q_val)).cast(T.IntegerType())))
    df = df.withColumn('levenshtein', F.levenshtein(on, 'query'))
    df = df.withColumn('score', F.col('levenshtein') / F.col('len'))
    df = df.select(['score'])
    return df
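A hypothetical usage sketch (the toy data and SparkSession are assumptions; F and T are assumed to be pyspark.sql.functions and pyspark.sql.types, as in the snippet above):

# Hypothetical usage of fuzzyspark(); data and session are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
names = spark.createDataFrame([("jonathan",), ("john",), ("mary",)], ["name"])
fuzzyspark(names, "name", "jon").show()  # a smaller score means a closer match to "jon"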
Example No. 11
def load_audio_id_text_id_mapping(spark, input_catalogue_path: str):
    audio_df, text_df = load_audio_and_text_dfs(spark, input_catalogue_path)

    joined_df = audio_df.join(text_df, "identifier")
    joined_df = joined_df.withColumn(
        "levenshtein",
        F.levenshtein(joined_df.audio_document_id, joined_df.text_document_id),
    )
    audio_to_text_mapping_df = joined_df.groupBy("identifier").applyInPandas(
        fuzzy_matching, schema=FUZZY_MATCHING_RETURN_TYPE
    )
    return audio_to_text_mapping_df
Example No. 12
 def assign_alternative_match_word_based_on_lavenshtein(
         self, not_matched_df, df_vector_filler):
     not_matched_df_x_filler = not_matched_df.crossJoin(
         df_vector_filler.select(col(self.word_col_name).alias('match')))
     df1_x_df2 = not_matched_df_x_filler.withColumn(
         "levenshtein", levenshtein(col('word'), col('match')))
     return df1_x_df2.withColumn('overall_min', min(col("levenshtein")).over(
         Window.partitionBy(self.sentence_col_id, 'word'))) \
         .where(col('overall_min') == col('levenshtein')) \
         .withColumn('rank', row_number().over(Window.partitionBy(self.sentence_col_id, 'word').orderBy('match'))) \
         .where(col('rank') == 1) \
         .drop('levenshtein', 'overall_min', 'rank')
Example No. 13
def anonimization(dataframe, marca):
    marca_control = marca.lower().replace(" ", "")
    stringDistanceDf = dataframe.\
        withColumn("marca_limpia", regexp_replace(lower(col("marca")), " ", "")).\
        withColumn("control_str", lit(marca_control)).\
        withColumn("string_distance", levenshtein(col("marca_limpia"), col("control_str")))

    new_column_2 = when(col("string_distance") <= 7,
                        lit("marca")).otherwise(lit("desconocido"))
    finalDf = stringDistanceDf.\
        withColumn("marca_anom", new_column_2).\
        drop("marca","marca_limpia","control_str","string_distance")
    return finalDf
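A hypothetical usage sketch (the toy data and SparkSession are assumptions; the function above is assumed to have lower, col, lit, regexp_replace, levenshtein and when imported from pyspark.sql.functions):

# Hypothetical usage of anonimization(); data and session are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Coca Cola",), ("Coca-Cola Light",), ("Pepsi",)], ["marca"])
# Brands within Levenshtein distance 7 of the cleaned control string are labelled
# "marca"; the rest are labelled "desconocido".
anonimization(df, "Coca Cola").show()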
Example No. 14
def user_ratings_match(df_movies, dfUserRatings):

    #function to find the closest match to user input movie title based on levenshtein distance

    myRatings = dfUserRatings.join(df_movies).select('*',levenshtein(dfUserRatings.User_Title,df_movies.Movies_Title).alias('distance')).cache()


    myRatings_best_title_match = myRatings.groupBy('User_Title').agg({'distance':'min'}).withColumnRenamed('min(distance)','min_dis')

    join_condition = [myRatings.User_Title == myRatings_best_title_match.User_Title  ,myRatings.distance == myRatings_best_title_match.min_dis  ]

    myRatings_movie_id = myRatings_best_title_match.join(myRatings,join_condition).select('movie_id','User_Ratings').withColumnRenamed('User_Ratings','Rating')
    myRatings_user_id  = myRatings_movie_id.withColumn('user_id',myRatings_movie_id.Rating - myRatings_movie_id.Rating)

    return(myRatings_user_id)
Example No. 15
def best_choice(dict, i, PG, seedArray, genome, sc):
    SC = []
    for z in range(len(PG)):
        for pos_gen in PG[z]:
            seq = (dict[i], genome[pos_gen - seedArray[z]: pos_gen - seedArray[z] + len(dict[i])], seedArray[z], pos_gen)
            SC.append(seq)
    rddSeq = sc.parallelize(SC)
    schemaSeqDF = rddSeq.map(lambda x: Row(SEQ=x[0], GEN=x[1], POS_SEQ=x[2], POS_GEN=x[3]))
    df = sqlContext.createDataFrame(schemaSeqDF)
    df = df.withColumn("dist", F.levenshtein(F.col("SEQ"), F.col("GEN")))
    val = (1 / float(len(dict[i]))) * 100
    df = df.withColumn("percentage", val*F.col( "dist")).drop("dist")
    minDF = df.agg(min(col("percentage")).alias("percentage"))
    min_percentage = [x["percentage"] for x in minDF.rdd.collect()]
    df = df.filter(df.percentage == min_percentage[0])
    return df,min_percentage
Example No. 16
def get_neighbors_notes(song):
    df = spark.createDataFrame(notes, ["id", "key", "scale", "notes"])
    filterDF = df.filter(df.id == song)
    comparator_value = filterDF.collect()[0][3]
    df_merged = df.withColumn("compare", lit(comparator_value))
    df_levenshtein = df_merged.withColumn(
        "distances_levenshtein", levenshtein(col("notes"), col("compare")))
    #df_levenshtein.sort(col("word1_word2_levenshtein").asc()).show()
    result = df_levenshtein.select("id", "key", "scale",
                                   "distances_levenshtein")
    aggregated = result.agg(F.min(result.distances_levenshtein),
                            F.max(result.distances_levenshtein))
    max_val = aggregated.collect()[0]["max(distances_levenshtein)"]
    min_val = aggregated.collect()[0]["min(distances_levenshtein)"]
    return result.withColumn('scaled_levenshtein',
                             (result.distances_levenshtein - min_val) /
                             (max_val - min_val)).select(
                                 "id", "key", "scale", "scaled_levenshtein")
Example No. 17
    def LEVEN(df):
        print("Computing Levenshtein for:", colName)
        types = {}
        df_columns = df.columns
        ###############
        # Cities
        ###############
        cities_columns = cities_df.columns
        cities_crossjoin = df.crossJoin(cities_df)
        cities_levy = cities_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('cities')))
        cities_count = cities_levy.filter(
            cities_levy["word1_word2_levenshtein"] <= 2)
        if len(cities_count.take(1)) > 0:
            cities_frequency = cities_count.groupBy().sum().collect()[0][0]
            types['cities'] = cities_frequency

        ###############
        # Neighborhoods
        ###############
        neighborhood_columns = neighborhood_df.columns
        neighborhood_crossjoin = df.crossJoin(neighborhood_df)
        neighborhood_levy = neighborhood_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('neighborhood')))
        neighborhood_count = neighborhood_levy.filter(
            neighborhood_levy["word1_word2_levenshtein"] <= 2)
        if len(neighborhood_count.take(1)) > 0:
            neighborhood_frequency = neighborhood_count.groupBy().sum(
            ).collect()[0][0]
            types['neighborhood'] = neighborhood_frequency

        ###############
        # Borough
        ###############
        borough_columns = borough_df.columns
        borough_crossjoin = df.crossJoin(borough_df)
        borough_levy = borough_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('borough')))
        borough_count = borough_levy.filter(
            borough_levy["word1_word2_levenshtein"] <= 2)
        if len(borough_count.take(1)) > 0:
            borough_frequency = borough_count.groupBy().sum().collect()[0][0]
            types['borough'] = borough_frequency

        ###############
        # School Name
        ###############
        schoolname_columns = schoolname_df.columns
        schoolname_crossjoin = df.crossJoin(schoolname_df)
        schoolname_levy = schoolname_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('schoolname')))
        schoolname_count = schoolname_levy.filter(
            schoolname_levy["word1_word2_levenshtein"] <= 2)
        if len(schoolname_count.take(1)) > 0:
            schoolname_frequency = schoolname_count.groupBy().sum().collect(
            )[0][0]
            types['schoolname'] = schoolname_frequency

        ###############
        # Color
        ###############
        color_columns = color_df.columns
        color_crossjoin = df.crossJoin(color_df)
        color_levy = color_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('color')))
        color_count = color_levy.filter(
            color_levy["word1_word2_levenshtein"] <= 2)
        if len(color_count.take(1)) > 0:
            color_frequency = color_count.groupBy().sum().collect()[0][0]
            types['color'] = color_frequency

        ###############
        # Carmake
        ###############
        carmake_columns = carmake_df.columns
        carmake_crossjoin = df.crossJoin(carmake_df)
        carmake_levy = carmake_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('carmake')))
        carmake_count = carmake_levy.filter(
            carmake_levy["word1_word2_levenshtein"] <= 2)
        if len(carmake_count.take(1)) > 0:
            carmake_frequency = carmake_count.groupBy().sum().collect()[0][0]
            types['carmake'] = carmake_frequency

        ###############
        # City Agency
        ###############
        cityagency_columns = cityagency_df.columns
        cityagency_crossjoin = df.crossJoin(cityagency_df)
        cityagency_levy = cityagency_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('cityagency')))
        cityagency_count = cityagency_levy.filter(
            cityagency_levy["word1_word2_levenshtein"] <= 2)
        if len(cityagency_count.take(1)) > 0:
            cityagency_frequency = cityagency_count.groupBy().sum().collect(
            )[0][0]
            types['cityagency'] = cityagency_frequency

        ##############
        # Area of Study
        ##############
        areastudy_columns = areastudy_df.columns
        areastudy_crossjoin = df.crossJoin(areastudy_df)
        areastudy_levy = areastudy_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('areastudy')))
        areastudy_count = areastudy_levy.filter(
            areastudy_levy["word1_word2_levenshtein"] <= 2)
        if len(areastudy_count.take(1)) > 0:
            areastudy_frequency = areastudy_count.groupBy().sum().collect(
            )[0][0]
            types['areastudy'] = areastudy_frequency

        ##############
        # Subjects
        ##############
        subjects_columns = subjects_df.columns
        subjects_crossjoin = df.crossJoin(subjects_df)
        subjects_levy = subjects_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('subjects')))
        subjects_count = subjects_levy.filter(
            subjects_levy["word1_word2_levenshtein"] <= 2)
        if len(subjects_count.take(1)) > 0:
            subjects_frequency = subjects_count.groupBy().sum().collect()[0][0]
            types['subjects'] = subjects_frequency

        ##############
        # School Levels
        ##############
        schoollevels_columns = schoollevels_df.columns
        schoollevels_crossjoin = df.crossJoin(schoollevels_df)
        schoollevels_levy = schoollevels_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('schoollevels')))
        schoollevels_count = schoollevels_levy.filter(
            schoollevels_levy["word1_word2_levenshtein"] <= 2)
        if len(schoollevels_count.take(1)) > 0:
            schoollevels_frequency = schoollevels_count.groupBy().sum(
            ).collect()[0][0]
            types['schoollevels'] = schoollevels_frequency

        ##############
        # Colleges
        ##############
        colleges_columns = college_df.columns
        college_crossjoin = df.crossJoin(college_df)
        college_levy = college_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('college')))
        college_counts = college_levy.filter(
            college_levy["word1_word2_levenshtein"] <= 2)
        if len(college_counts.take(1)) > 0:
            college_frequency = college_counts.groupBy().sum().collect()[0][0]
            types['college'] = college_frequency

        ##############
        # Vehicle Type
        ##############
        vehicletype_columns = vehicletype_df.columns
        vehicletype_crossjoin = df.crossJoin(vehicletype_df)
        vehicletype_levy = vehicletype_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('vehicletype')))
        vehicletype_counts = vehicletype_levy.filter(
            vehicletype_levy["word1_word2_levenshtein"] <= 2)
        if len(vehicletype_counts.take(1)) > 0:
            vehicletype_frequency = vehicletype_counts.groupBy().sum().collect(
            )[0][0]
            types['vehicletype'] = vehicletype_frequency

        ##############
        # Type of Location
        ##############
        typelocation_columns = typelocation_df.columns
        typelocation_crossjoin = df.crossJoin(typelocation_df)
        typelocation_levy = typelocation_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('typelocation')))
        typelocation_counts = typelocation_levy.filter(
            typelocation_levy["word1_word2_levenshtein"] <= 2)
        if len(typelocation_counts.take(1)) > 0:
            typelocation_frequency = typelocation_counts.groupBy().sum(
            ).collect()[0][0]
            types['typelocation'] = typelocation_frequency

        ##############
        # Parks
        ##############
        parks_columns = parks_df.columns
        parks_crossjoin = df.crossJoin(parks_df)
        parks_levy = parks_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('parks')))
        park_counts = parks_levy.filter(
            parks_levy['word1_word2_levenshtein'] <= 2)
        if len(park_counts.take(1)) > 0:
            #will this indexing cause issues if first column is integer schema?
            parks_frequency = park_counts.groupBy().sum().collect()[0][0]
            types['parks'] = parks_frequency

        ################
        # Building Codes
        ################
        building_columns = building_code_df.columns
        building_crossjoin = df.crossJoin(building_code_df)
        building_code_levy = building_crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col('building_codes')))
        building_counts = building_code_levy.filter(
            building_code_levy['word1_word2_levenshtein'] <= 1)
        if len(building_counts.take(1)) > 0:
            building_code_frequency = building_counts.groupBy().sum().collect(
            )[0][0]
            types['building_code'] = building_code_frequency
        return types
Example No. 18
#joining steps:
# 1 join both lv and yelp df
# 2 find leven. distance
# 3 find min leven distance group by lv id
# 4 join step 3 with step 2 based on leven distance
# 5 remove lv_ids where count is more than 1, since there is a tie in step 3
# 6 save result as parquet with lv_id and yelp_id

#step 1
combined_data = lv_df.join(yelp_df)

#step 2
combined_data = combined_data.select("yelp_id",\
                combined_data.id.alias("lv_id"), \
                levenshtein("lv_full_form", "yelp_full_form").alias("leven1"))

#step 3
min_leven  = combined_data.groupby("lv_id").min("leven1")\
             .select(col("lv_id").alias("lv_id2"), \
                     col("min(leven1)").alias("m_leven"))

#step 4
combined_data = combined_data.select(combined_data.lv_id, \
                                     combined_data.yelp_id, \
                                     combined_data.leven1.alias("lev"))

combined2 = min_leven.join(combined_data,\
                           [min_leven.lv_id2 == combined_data.lv_id, \
                            min_leven.m_leven == combined_data.lev])
#step 5
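The example is cut off at step 5; a hedged sketch of how steps 5 and 6 might continue, assuming the column names used above and a placeholder output path:

# Hypothetical continuation (steps 5-6); column names follow the snippet above, the path is a placeholder.
from pyspark.sql.functions import count

# step 5: drop lv_ids that have more than one best match (a tie on the minimum distance)
ties = combined2.groupBy("lv_id").agg(count("*").alias("n")).filter("n > 1")
matches = combined2.join(ties, "lv_id", "left_anti").select("lv_id", "yelp_id")

# step 6: save the lv_id / yelp_id mapping as parquet
matches.write.parquet("lv_yelp_id_mapping.parquet")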
Example No. 19
def levenshtein_json(df, input_col):
    """
    Output the Levenshtein distance clusters in JSON format
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)
    # df.table()
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)

    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    result = df.select(input_col,
                       F.col(fingerprint_col).alias(temp_col_1)).distinct()

    df = df.select(input_col,
                   F.col(fingerprint_col).alias(temp_col_1),
                   F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the Levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # if Optimus.cache:
    #     df = df.cache()

    # Select only the strings with the shortest distance
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    df_r = (df.rows.drop(F.col(distance_col) == 0).groupby(temp_col_1).agg(
        F.min(distance_col).alias(distance_r_col)).cols.rename(
            temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists

    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2)))

    kv_dict = {}
    for row in result.collect():
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = _row[0]

    kv_result_df = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        kv_result_df[_row[0]] = _row[1]

    result = {}
    for k, v in kv_result_df.items():
        a = result[kv_dict[k]] = []
        for iv in v:
            a.append(kv_dict[iv])

    return result
Example No. 20
def main():
	inputs = sys.argv[1]
	rating_file = sys.argv[2]
	output = sys.argv[3]

	conf = SparkConf().setAppName('movie recommendation')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'

	sqlContext = SQLContext(sc)
	
	""" sbaronia - getting files from directory and 
	reading from it and using parse_rating_movie and parse_my_input for parsing the
	content of the files to an rdd"""

	movies_path = join(inputs, "movies.dat")
	ratings_path = join(inputs, "ratings.dat")
	
	read_ratings = sc.textFile(ratings_path)
	read_movies  = sc.textFile(movies_path)
	read_mymovies = sc.textFile(rating_file)

	parse_ratings = read_ratings.map(lambda line : parse_rating_movie(line, "ratings.dat")).cache()
	parse_movies = read_movies.map(lambda line : parse_rating_movie(line, "movies.dat")).cache()
	parse_mymovies = read_mymovies.map(lambda line: parse_my_input(line)).cache()
	
	""" sbaronia - converting movie and rating data to dataframes """

	schema_movie = StructType([StructField('movie_id', IntegerType(), True),
								StructField('movie_name', StringType(), True)])

	movie_df = sqlContext.createDataFrame(parse_movies, schema=schema_movie).cache()


	schema_mymovie = StructType([StructField('ip_uid', IntegerType(), True),
								StructField('ip_mname', StringType(), True),
								StructField('ip_rating', IntegerType(), True),
								StructField('ldistance', IntegerType(), True)])

	mymovie_df = sqlContext.createDataFrame(parse_mymovies, schema=schema_mymovie).cache()

	""" sbaronia - combining user input movies with movies data
	then finding Levenshtein distance with every movie and then finding
	the one with minimum Levenshtein distance as our best match"""

	movie_plus_ip = movie_df.join(mymovie_df, None, 'inner').cache()
		
	movie_plus_ip_distance = movie_plus_ip.withColumn('ldistance', levenshtein('movie_name','ip_mname'))

	mymovie_distance = movie_plus_ip_distance \
							  .groupBy('ip_uid', 'ip_mname') \
							  .min('ldistance') \
							  .withColumnRenamed('min(ldistance)','ldistance') \
							  .cache()

	""" sbaronia - join the tables to get only those movies with minimum 
	Levenshtein distance and then from that table select columns 
	necessary. Then create a test data for all movies with new user 0"""
	refined_movies = movie_plus_ip_distance.join(mymovie_distance, ['ip_uid', 'ip_mname', 'ldistance'], 'inner').cache()
	
	input_rating = refined_movies.select('ip_uid', 'movie_id', 'ip_rating').cache()

	input_rating_rdd = input_rating.rdd.map(lambda row1: (row1.ip_uid, row1.movie_id, float(row1.ip_rating))).cache()
	
	input_with_train = sc.union([input_rating_rdd, parse_ratings]).cache()
	
	test_newuser = parse_movies.map(lambda line: (0, line[0])).cache()
	
	""" sbaronia - train on all data including new one and then 
	test on all movies for new user and sort them in descending 
	order of ratings"""
	model = ALS.train(input_with_train, 10, 10, 0.1)	
	predictions = model.predictAll(test_newuser) \
					   .map(lambda row1: (row1.rating, row1.product)) \
					   .sortByKey(ascending=False) \
					   .map(lambda row: (row[1], row[0])) \
					   .cache()

	final_rating = sqlContext.createDataFrame(predictions, ['movie_id', 'movie_rating']).cache()

	final_movie_rating = movie_df.join(final_rating, ['movie_id'], 'inner').sort("movie_rating", ascending=False).cache()

	final_movie_rating_rdd = final_movie_rating.rdd.map(lambda row: (str(row.movie_id) + ' :: ' + str(row.movie_name)) + ' :: ' + str(row.movie_rating)).coalesce(1).cache()
	final_movie_rating_rdd.saveAsTextFile(output)
Example No. 21
#joining steps:
# 1 join both lv and yelp df
# 2 find leven. distance on name of restaurant/business
# 3 find min leven distance group by lv id 
# 4 join step 3 with step 2 based on leven distance and lv_id
# 5 repeat steps 2-4 now with leven on address
# 6 remove lv_ids where count is more than 1, since there is a tie in step 5
# 7 save result as parquet with lv_id and yelp_id

#step 1
combined_data = lv_df.join(yelp_df)

#step 2, leven on names
combined_data = combined_data.withColumn("leven_name", 
                levenshtein(col("lv_name"), col("yelp_name")))

#step 3
min_leven  = combined_data.groupby("lv_id").min("leven_name")\
             .select(col("lv_id").alias("lv_id2"), \
             col("min(leven_name)").alias("m_leven_name")) 

combined_data = combined_data.select(combined_data.lv_id, \
                                     combined_data.yelp_id, \
                                     combined_data.lv_addr,\
                                     combined_data.yelp_addr,\
                                   combined_data.leven_name.alias("lev_name"))

#step 4
combined2 = min_leven.join(combined_data,\
                           [min_leven.lv_id2 == combined_data.lv_id, \
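The snippet breaks off mid-statement here; a hedged sketch of how steps 4 through 7 might continue, mirroring Example No. 18 and assuming the same column names and a placeholder output path:

# Hypothetical continuation (steps 4-7); column names and the output path are assumptions.
from pyspark.sql.functions import col, count, levenshtein

# step 4: keep only the rows with the minimum name distance per lv_id
combined2 = min_leven.join(combined_data,
                           [min_leven.lv_id2 == combined_data.lv_id,
                            min_leven.m_leven_name == combined_data.lev_name])

# step 5: repeat steps 2-4 with the Levenshtein distance on addresses
combined2 = combined2.withColumn("leven_addr",
                levenshtein(col("lv_addr"), col("yelp_addr")))
min_addr = combined2.groupby("lv_id").min("leven_addr") \
           .select(col("lv_id").alias("lv_id3"),
                   col("min(leven_addr)").alias("m_leven_addr"))
combined3 = min_addr.join(combined2,
                          [min_addr.lv_id3 == combined2.lv_id,
                           min_addr.m_leven_addr == combined2.leven_addr])

# step 6: drop lv_ids that still have more than one candidate (ties)
ties = combined3.groupBy("lv_id").agg(count("*").alias("n")).filter("n > 1")
result = combined3.join(ties, "lv_id", "left_anti").select("lv_id", "yelp_id")

# step 7: save the lv_id / yelp_id mapping as parquet
result.write.parquet("lv_yelp_id_mapping.parquet")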
Example No. 22
dist_udf = udf(tfidfDist, DoubleType())
res = res.withColumn('dist', dist_udf(res['idf1'], res['idf2']))

# Drop unnecessary columns from `data` and join in a new feature *"tfidfDistance"*.
data = data.drop('words1', 'words2')
data = data.join(res.selectExpr('id', 'dist as tfidfDistance'),
                 on='id',
                 how='inner')
#data.select('id','tfidfDistance').show(6)

print("created feature TF-IDF")

#Add Levenshtein Distance as last feature, for both lemmas and questions
from pyspark.sql.functions import levenshtein

data = data.withColumn('lemma_leven', levenshtein('lemma1', 'lemma2'))
data = data.withColumn('question_leven', levenshtein('question1', 'question2'))

print('All Features Created in %d Minutes' %
      (float(format(time.time() - start_time)) / 60))

#output features
outData = data.select(['id'] + featureNames + ['is_duplicate'])
outData = outData.cache()

print("Cached outData in %d Minutes" %
      (float(format(time.time() - start_time)) / 60))

outTrainFileName = "./AML_Project2_Data/train_features.csv"
outTestFileName = "./AML_Project2_Data/test_features.csv"
Example No. 23
# (5) Create a new column containing the full name for each driver.

from pyspark.sql.functions import concat_ws
drivers \
  .withColumn("full_name", concat_ws(" ", "first_name", "last_name")) \
  .select("first_name", "last_name", "full_name") \
  .show(5)

# (6) Create a new column containing the average star rating for each driver.

drivers \
  .withColumn("star_rating", round(col("stars") / col("rides"), 2)) \
  .select("rides", "stars", "star_rating") \
  .show(5)

# (7) Find the rider names that are most similar to `Brian`.  **Hint:** Use the
# Levenshtein function.

from pyspark.sql.functions import lit, levenshtein
riders \
  .select("first_name") \
  .distinct() \
  .withColumn("distance", levenshtein(col("first_name"), lit("Brian"))) \
  .sort("distance") \
  .show()

# ## Cleanup

# Stop the SparkSession:
spark.stop()
Example No. 24
def calculate_simillarity(new_df):
    new_df = new_df.withColumn(\
    "matching_levenshtein_dist",
    (levenshtein(col("description_x"), col("description_y"))))
    print(new_df.collect())
Example No. 25
 cur_dataset = cur_dataset.withColumn('sem_type',
                                      get_semantic_type(f.col('_c0')))
 # rule based mechanism
 cur_dataset = cur_dataset.withColumn(
     'sem_type',
     f.when(f.col('sem_type') == 'null',
            get_rule_based(f.col('_c0'))).otherwise(f.col('sem_type')))
 # based on soundex and edit distance
 cur_dataset = cur_dataset.withColumn(
     'soundex_phon',
     f.when(f.col('sem_type') == 'null', f.soundex(f.col('_c0'))).otherwise(
         f.lit('null').cast(StringType())))
 cur_dataset = cur_dataset.join(
     merged_df, [f.col('soundex_phon_cur') == f.col('soundex_phon')],
     'left_outer').withColumn(
         'edit_dist', f.levenshtein(f.col('column_value'), f.col('_c0')))
 min_dataset = cur_dataset.groupBy('_c0').agg(
     f.min(f.col('edit_dist')).alias('min_edit_dist')).filter(
         f.col('min_edit_dist') <= 3).withColumnRenamed('_c0', 'c_value')
 temp_dataset = cur_dataset.join(
     min_dataset, [cur_dataset._c0 == min_dataset.c_value],
     'left_outer').filter(
         cur_dataset.edit_dist == min_dataset.min_edit_dist)
 temp_dataset = temp_dataset.select(
     f.col('_c0').alias('c_value'),
     f.col('column_value').alias('col_value'),
     f.col('column_name').alias('col_name'), 'min_edit_dist')
 temp_dataset = temp_dataset.groupBy(f.col('c_value')).agg(
     f.first(f.col('col_name')).alias('col_name'))
 cur_dataset = cur_dataset.join(
     temp_dataset, [cur_dataset._c0 == temp_dataset.c_value],
Example No. 26
#sort the rest alphabetically
sortYT = sortYT.orderBy('title', ascending=True)

#rename columns
sortYT = sortYT.withColumnRenamed('asset_id', 'YT_ID')
sortYT = sortYT.withColumnRenamed('title', 'YT_Title')
sortYT = sortYT.withColumnRenamed('writers', 'YT_Writers')

#----------------------------------------------
#Merging by title
#----------------------------------------------
#Join DFs on titles with levenshtein distance

from pyspark.sql.functions import levenshtein
joinedDF = sortdt.join(sortYT,
                       levenshtein(sortdt["Title"], sortYT["YT_Title"]) < 3)
# 'ratio' does not exist yet at this point; it is computed below with levenshtein
YTDT = joinedDF[[
    'Downtown_ID', 'YT_ID', 'Title', 'Downtown_Composer', 'YT_Writers'
]]

#do levenshtein distance
from pyspark.sql.functions import levenshtein
ratioYTDT = YTDT.withColumn(
    'ratio', levenshtein(col('Downtown_Composer'), col('YT_Writers')))

#keep all rows with ratio >= 85
#YTDT = YTDT.filter(YTDT['ratio']<= 15) #whats a good ld to stop at?
#drop ratio column
#YTDT = YTDT.drop('ratio')
#save to output file
YTDT.write.csv("matches.csv")
Example No. 27
    StructField('movieid', IntegerType(), False),
    StructField('rating', StringType(), False)
])

userrating_sql = sqlContext.createDataFrame(userrating_split, userschema)
movies_sql = sqlContext.createDataFrame(movies_split, movieschema).cache()
rating_sql = sqlContext.createDataFrame(rating_split, ratingschema)

movie_prep = movies_sql.select('movieid')

movies_join_usersrating = userrating_sql.join(movies_sql)
rating_join_movies = rating_sql.join(movie_prep, ['movieid'])

distmovies = movies_join_usersrating.select(
    'movieid', 'tweetmovietitle', 'userrating', 'usermovietitle', 'userid',
    levenshtein('usermovietitle',
                'tweetmovietitle').alias('min-dist')).cache()

mindistmovies = distmovies.groupBy('usermovietitle').min(
    'min-dist').withColumnRenamed('min(min-dist)', 'min-dist')

user_joined = mindistmovies.join(distmovies,
                                 ['usermovietitle', 'min-dist']).select(
                                     'userid', 'movieid', 'userrating')

train_data = user_joined.unionAll(rating_join_movies).cache()
rank = 10
numIterations = 10
model = ALS.train(train_data, rank, numIterations)

movies = model.recommendProducts(0, 10)
Example No. 28
    StructField('userid', IntegerType(), False),
    StructField('movieid', IntegerType(), False),
    StructField('rating', StringType(), False)
])

userrating_sql = sqlContext.createDataFrame(userrating_split, userschema)
movies_sql = sqlContext.createDataFrame(movies_split, movieschema).cache()
rating_sql = sqlContext.createDataFrame(rating_split, ratingschema)

movie_prep = movies_sql.select('movieid')

movies_join_usersrating = userrating_sql.join(movies_sql)
rating_join_movies = rating_sql.join(movie_prep,['movieid'])


distmovies = movies_join_usersrating.select('movieid', 'tweetmovietitle', 'userrating', 'usermovietitle', 'userid',levenshtein('usermovietitle', 'tweetmovietitle').alias('min-dist')).cache()

mindistmovies =  distmovies.groupBy('usermovietitle').min('min-dist').withColumnRenamed('min(min-dist)', 'min-dist')

user_joined = mindistmovies.join(distmovies, ['usermovietitle', 'min-dist']).select('userid', 'movieid', 'userrating')

train_data = user_joined.unionAll(rating_join_movies).cache()
rank = 10
numIterations = 10
model = ALS.train(train_data, rank, numIterations)

movies = model.recommendProducts(0, 10)

movies_rdd = sc.parallelize(movies, 1)

moviespredict = sqlContext.createDataFrame(movies_rdd, ratingschema)
Example No. 29
def main(argv=None):
    if argv is None:
        inputs = sys.argv[1]
        user = sys.argv[2]
        output = sys.argv[3]
    
    # Initialize Spark
    os.environ['PYSPARK_PYTHON'] = "python2"
    os.environ['PYTHONPATH'] = ':'.join(sys.path)
    
    conf = SparkConf().setAppName('movie_recommendation')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    
    # Load ratings data
    ratings_data = sc.textFile(inputs+"/ratings.dat")
    ratings = ratings_data.map(lambda l: l.split('::'))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    
    # Load movies data
    movies_data = sc.textFile(inputs+"/movies.dat")
    movies = movies_data.map(lambda l: l.split('::'))\
        .map(lambda l: (int(l[0]), l[1].encode('utf-8').strip()))
    movies.cache()
    
    # Load user rating
    user_rate_data = sc.textFile(user)
    user_rate = user_rate_data.map(lambda l: l.split(' '))\
        .map(lambda l: (int(l[0]), list_to_string(l[1:]).encode('utf-8').strip()))
    user_rate_list = user_rate.collect()
    
    # Match movies name input by user to movie id in movies data
    user_rate_list_2 = []
    for item in user_rate_list:
        user_title = sc.broadcast(item[1])
        df_movie = sqlContext.createDataFrame(movies,['movieId','title'])
        df_movie.registerTempTable('movies')
        df_movie2 = sqlContext.sql("SELECT *, \"" + user_title.value + "\" as user_title FROM movies") \
            .select('movieId','title',levenshtein('title', 'user_title').alias('distance'))
        
        movie_id = df_movie2.rdd.map(lambda x: (x['movieId'], x['title'], x['distance'])) \
            .reduce(find_min)
        user_rate_list_2.append([0, movie_id[0], item[0]])
    user_rate_rdd = sc.parallelize(user_rate_list_2) \
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    ratings_all = user_rate_rdd.union(ratings)
    ratings_all.cache()

    # Build the recommendation model using Alternating Least Squares
    ranks = [8, 20]
    numIters = [10, 20]
    bestModel = None
    bestMSE = float("inf")
    bestRank = 0
    bestNumIter = -1

    for rank, numIter in itertools.product(ranks, numIters):
        #Train Model
        model = ALS.train(ratings_all, rank, numIter)
        
        #Evaluate MSE
        testdata = ratings.map(lambda x: (x[0], x[1]))
        predictions = model.predictAll(testdata).map(lambda x: ((x[0], x[1]), x[2]))
        ratesAndPreds = ratings.map(lambda x: ((x[0], x[1]), x[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda x: (x[1][0] - x[1][1])**2).mean()
        #print "MSE = %f for the model trained with " % MSE + \
        #      "rank = %d, and numIter = %d." % (rank, numIter)
        if (MSE < bestMSE):
            bestModel = model
            bestMSE = MSE
            bestRank = rank
            bestNumIter = numIter
   
    
    # Generate a list of recommended movies in descending order
    rated_movies = set([x[1] for x in user_rate_list_2])
    candidates = movies.filter(lambda x: x[0] not in rated_movies) #only recommend movie not rated yet
    predictions = bestModel.predictAll(candidates.map(lambda x: (0, x[0]))) \
        .map(lambda x: (x[1],x[2])).join(movies) \
        .sortBy(lambda (movieid,(score,title)): score, ascending=False)

    #write recommendation
    outdata = predictions.map(lambda (movieId, (score, title)): "Movies: %s - Score:%0.2f" % (title, score))
    outdata.saveAsTextFile(output)
Example No. 30
gdp = spark.read.csv("final.csv",
                     header=True,
                     mode="DROPMALFORMED",
                     schema=schema)

gdpCountry = gdp.select("country").distinct()

for country in pycountry.countries:
    countries.append([unicodedata.normalize('NFKD', country.name).encode('ascii','ignore'),\
                      unicodedata.normalize('NFKD', country.alpha_3).encode('ascii','ignore')])

countries = sc.parallelize(countries).toDF(['name', 'code'])

joinedDF = gdpCountry.join(countries)

joinedDF = joinedDF.select("country", "name", "code",
                           levenshtein("country", "name").alias("lev"))

cleanedData = joinedDF.filter(joinedDF.lev == 0)

arr = cleanedData.select("name").rdd.map(lambda data: data["name"]).collect()
con = cleanedData.select("country").rdd.map(
    lambda data: data["country"]).collect()

countries = countries.filter(countries.name.isin(*arr) == False)
gdpCountry = gdpCountry.filter(gdpCountry.country.isin(*con) == False)

cleanedData.coalesce(1).write.csv("countryClean.csv", header=True)
countries.coalesce(1).write.csv("ccode.csv", header=True)
gdpCountry.coalesce(1).write.csv("leftover.csv", header=True)
Example No. 31
    def colsNameSimilarity(self, df, category=None, df2=None):
        """
            :param df: A Spark Dataframe
            :param category: A string keyword to match
            :param df2: A second dataframe to match column names
            :return result_df: A dataframe having column_1, column_2, path similarity, levenshtein distance, soundex_equality
        """
        # Clean up column names so that we can prevent future errors
        for colName, dtype in df.dtypes:
            if '.' in colName or '`' in colName or colName.strip() != colName:
                df = df.withColumnRenamed(
                    colName,
                    colName.strip().replace(".", "", "_").replace("`", ""))
        if (df2 == None):
            result_df = pd.DataFrame(columns=['Column_1', 'Path Similarity'])
            category_sys = wn.synsets(category)
            if (category_sys != []):
                cnt = 0
                # put column names into appropriate bin
                for colName, dtype in df.dtypes:
                    colName_ = colName.split("_")
                    score = []
                    for i in range(len(colName_)):
                        colName_sys = wn.synsets(colName_[i])
                        if (colName_sys != []):
                            score.append(colName_sys[0].path_similarity(
                                category_sys[0]))
                    if (score != []):
                        score = max(score)
                    else:
                        score = 0
                    result_df.loc[cnt] = [colName, score]
                    cnt += 1
            else:
                print("Similarity cannot be calculated")
        else:
            for colName, dtype in df2.dtypes:
                if '.' in colName or '`' in colName or colName.strip(
                ) != colName:
                    df2 = df2.withColumnRenamed(
                        colName,
                        colName.strip().replace(".", "", "_").replace("`", ""))
            result_df = pd.DataFrame(
                columns=['Column_1', 'Column_2', 'Path Similarity'])
            cnt = 0
            # put column names into appropriate bin
            for colName1, dtype in df.dtypes:
                colName_1 = colName1.split("_")
                for colName2, dtype2 in df2.dtypes:
                    colName_2 = colName2.split("_")
                    score = []
                    #print(colName_1, colName_2, score)
                    for i in range(len(colName_1)):
                        colName_sys_1 = wn.synsets(colName_1[i])
                        for j in range(len(colName_2)):
                            colName_sys_2 = wn.synsets(colName_2[j])
                            if (colName_sys_1 != [] and colName_sys_2 != []):
                                score.append(colName_sys_1[0].path_similarity(
                                    colName_sys_2[0]))
                    score = [i for i in score if i != None]
                    if (score != []):
                        score = max(score)
                    else:
                        score = 0
                    result_df.loc[cnt] = [colName1, colName2, score]
                    cnt += 1
        result_df = result_df[result_df['Path Similarity'] > 0.5]
        if (result_df.empty is not True):
            result_df = self.spark.createDataFrame(result_df)
            if (category is None):
                result_df = result_df.withColumn("levenshtein distance", f.levenshtein(result_df["Column_1"],\
                                                                                       result_df["Column_2"]))
                result_df = result_df.withColumn("soundex_equality", f.soundex(result_df["Column_1"]) ==\
                                                 f.soundex(result_df["Column_2"]))
            else:
                result_df = result_df.withColumn("levenshtein distance", \
                                                 f.levenshtein(result_df["Column_1"],f.lit(category)))
                result_df = result_df.withColumn("soundex_equality", f.soundex(result_df["Column_1"]) ==\
                                                 f.soundex(f.lit(category)))

        else:
            schema = StructType([
                StructField("Column_1", StringType(), True),
                StructField("Path Similarity", DoubleType(), True),
                StructField("levenshtein distance", DoubleType(), True),
                StructField("soundex_equality", DoubleType(), True),
            ])
            result_df = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                   schema=schema)
        return result_df
Example No. 32
# Generate Dataframe 2 for testing
df2 = spark.createDataFrame(
    [
        (['dan',    'ocean',        '05/25/1983',   'medical code AAA']),
        (['danny',  'oceans11',     '04/26/1982',   'medical code BBB']),
        (['tess',   'ocean',        '02/10/1988',   'medical code CCC']),
        (['john',   'smith',        '01/30/1980',   'medical code DDD']),
        (['john',   'smith',        '09/30/1981',   'medical code EEE'])
    ], 
    ['firstname','lastname','dob','medical_code']
    )

df2.show(10,False)

# 1) Concat relevant fields used for fuzzy matching into a field called join_id
# 2) Apply levenshtein distance (which generates a score)
# 3) Use this score as a join criteria
# 4) Join on join_id

joinedDF = df.join(df2,
            levenshtein( concat(df.dob,df.firstname,df.lastname), concat(df2.dob,df2.firstname,df2.lastname) ) < 5,
            how='left_outer'
            )

joinedDF.show(10,False)




#ZEND