def ColorStats(rdd):
    # Wrap the raw RDD of strings in a single-column ('value') DataFrame.
    analyse_color_df = spark.createDataFrame(rdd, StringType())
    # soundex_lookup (a sketch follows below) flags soundex codes of known color names.
    soundex_udf = udf(lambda x: soundex_lookup(x), StringType())
    # Lowercase each value, soundex-encode it, then flag it as a color or not.
    analyse_color_df = analyse_color_df.withColumn(
        'color_lower', lower_udf('value')).withColumn('color_soundex',
                                                      soundex('color_lower'))
    analyse_color_df = analyse_color_df.withColumn(
        'isColor', soundex_udf('color_soundex'))
    # Index 3 is the 'isColor' flag added above.
    color_count = analyse_color_df.rdd.filter(lambda x: x[3] == 'true').count()
    return {'semantic_type': 'Color', 'count': color_count}
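
# A minimal sketch of the soundex_lookup helper ColorStats assumes; the
# membership test against w_soundex_vals (the Wikipedia color soundex codes
# built in the preprocessing further below) is an assumption, as is the
# 'true'/'false' string convention it shares with the rdd filter above.
def soundex_lookup(soundex_code):
    return 'true' if soundex_code in w_soundex_vals else 'false'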
Example #2
    def colsNameSimilarity(self, df, category=None, df2=None):
        """
            :param df: A Spark Dataframe
            :param category: A string keyword to match
            :df2 : A second dataframe to match column names
            :return result_df : A dataframe having column_1, column_2, path similarity, levenshtein distance,soundex_equality
        """
        # Clean up column names so that they do not break later operations
        for colName, dtype in df.dtypes:
            if '.' in colName or '`' in colName or colName.strip() != colName:
                df = df.withColumnRenamed(
                    colName,
                    colName.strip().replace(".", "_").replace("`", ""))
        if df2 is None:
            result_df = pd.DataFrame(columns=['Column_1', 'Path Similarity'])
            category_sys = wn.synsets(category)
            if (category_sys != []):
                cnt = 0
                # put column names into appropriate bin
                for colName, dtype in df.dtypes:
                    colName_ = colName.split("_")
                    score = []
                    for i in range(len(colName_)):
                        colName_sys = wn.synsets(colName_[i])
                        if (colName_sys != []):
                            score.append(colName_sys[0].path_similarity(
                                category_sys[0]))
                    if (score != []):
                        score = max(score)
                    else:
                        score = 0
                    result_df.loc[cnt] = [colName, score]
                    cnt += 1
            else:
                print("Similarity cannot be calculated")
        else:
            for colName, dtype in df2.dtypes:
                if '.' in colName or '`' in colName or colName.strip() != colName:
                    df2 = df2.withColumnRenamed(
                        colName,
                        colName.strip().replace(".", "_").replace("`", ""))
            result_df = pd.DataFrame(
                columns=['Column_1', 'Column_2', 'Path Similarity'])
            cnt = 0
            # put column names into appropriate bin
            for colName1, dtype in df.dtypes:
                colName_1 = colName1.split("_")
                for colName2, dtype2 in df2.dtypes:
                    colName_2 = colName2.split("_")
                    score = []
                    #print(colName_1, colName_2, score)
                    for i in range(len(colName_1)):
                        colName_sys_1 = wn.synsets(colName_1[i])
                        for j in range(len(colName_2)):
                            colName_sys_2 = wn.synsets(colName_2[j])
                            if (colName_sys_1 != [] and colName_sys_2 != []):
                                score.append(colName_sys_1[0].path_similarity(
                                    colName_sys_2[0]))
                    score = [s for s in score if s is not None]
                    if (score != []):
                        score = max(score)
                    else:
                        score = 0
                    result_df.loc[cnt] = [colName1, colName2, score]
                    cnt += 1
        result_df = result_df[result_df['Path Similarity'] > 0.5]
        if not result_df.empty:
            result_df = self.spark.createDataFrame(result_df)
            if category is None:
                result_df = result_df.withColumn(
                    "levenshtein distance",
                    f.levenshtein(result_df["Column_1"], result_df["Column_2"]))
                result_df = result_df.withColumn(
                    "soundex_equality",
                    f.soundex(result_df["Column_1"]) == f.soundex(result_df["Column_2"]))
            else:
                result_df = result_df.withColumn(
                    "levenshtein distance",
                    f.levenshtein(result_df["Column_1"], f.lit(category)))
                result_df = result_df.withColumn(
                    "soundex_equality",
                    f.soundex(result_df["Column_1"]) == f.soundex(f.lit(category)))

        else:
            # No pair cleared the similarity threshold: return an empty
            # placeholder frame with the expected column names (the types are
            # nominal, since no rows are returned).
            schema = StructType([
                StructField("Column_1", StringType(), True),
                StructField("Path Similarity", DoubleType(), True),
                StructField("levenshtein distance", DoubleType(), True),
                StructField("soundex_equality", DoubleType(), True),
            ])
            result_df = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                   schema=schema)
        return result_df
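
# Hedged usage sketch (not part of the original source): `profiler` stands in
# for an instance of the class that defines colsNameSimilarity, and the two
# calls below exercise the method's keyword and cross-frame branches.
# profiler.colsNameSimilarity(df, category="address").show()   # match against a keyword
# profiler.colsNameSimilarity(df, df2=other_df).show()         # match against another frame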
#all_filenames = ['jz4z-kudi.Violation_Location__City_.txt.gz', '2sps-j9st.PERSON_FIRST_NAME.txt.gz', 'w7w3-xahh.Address_City.txt.gz', '7crd-d9xh.website.txt.gz', 'erm2-nwe9.City.txt.gz']

# city preprocessing
us_cities_df = spark.read.load("/user/ts3813/uscities.csv",format="csv", delimiter=",", inferSchema="true", header="true")
nyc_cities_df = us_cities_df.filter(us_cities_df.state_id=='NY').select('city')

lower_udf = udf(lambda x: x.lower() if x else '', StringType())

# Constant flag udf: tags every lookup row with 1 so joins can test membership.
udf_1 = udf(lambda x: 1, IntegerType())
nyc_cities_df = nyc_cities_df.withColumn('city', lower_udf('city'))
nyc_cities_df = nyc_cities_df.withColumn('city_bool', udf_1('city'))
wiki_colors_df = spark.read.load("/user/ts3813/wikipedia_color_names.csv",format="csv", delimiter=",", inferSchema="true", header="true")     
w_name = wiki_colors_df.select("Name")
w_name_lower = w_name.withColumn('name_lower', lower_udf('Name'))
w_name_lower = w_name_lower.withColumn('w_name_soundex', soundex('name_lower'))
w_soundex_vals = [w_soundex[0] for w_soundex in w_name_lower.select('w_name_soundex').collect()]

phoneNumRegex = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
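
# A hedged sketch of how phoneNumRegex would plug into the same udf pattern
# used above; the udf name and the 'true'/'false' string convention are
# assumptions, not part of the original snippet.
is_phone_udf = udf(
    lambda x: 'true' if x and phoneNumRegex.search(x) else 'false',
    StringType())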

school_levels = ['Preschool','pre-kindergarten','elementary school','middle school','senior school','junior high school','high school','K-8','K-1','K-2','K-3','K-4','K-5','K-6','K-7']
school_levels_df = spark.createDataFrame(school_levels, StringType())
school_levels_df = school_levels_df.withColumn('levels', lower_udf('value'))
school_levels_df = school_levels_df.withColumn('school_level_bool', udf_1('levels'))


boroughs = ['the bronx','staten island','manhattan','brooklyn','queens','bronx','K','B','R','Q','M']
boroughs_df = spark.createDataFrame(boroughs, StringType())
boroughs_df = boroughs_df.withColumn('levels', lower_udf('value'))
boroughs_df = boroughs_df.withColumn('boroughs_bool', udf_1('levels'))
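
# Hedged sketch (an assumption, not from the source) of how these lookup
# frames would be used: left-join a candidate column against one of them and
# count the rows whose *_bool flag survives the join.
candidate_df = spark.createDataFrame(['Brooklyn', 'Albany', 'Paris'], StringType())
candidate_df = candidate_df.withColumn('value', lower_udf('value'))
city_matches = candidate_df.join(
    nyc_cities_df, candidate_df.value == nyc_cities_df.city, 'left')
print(city_matches.filter(city_matches.city_bool == 1).count())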
Example #4
    ['firstname','lastname','state','dob']
    )

df.show(10,False)

# Step 1: Resolve any known name aliases, states, etc. (e.g. dan, daniel, danny)
# For this POC code, I chose not to include this step, since it's straightforward to
# add a dictionary for matching and resolving known aliases; a hedged sketch follows.
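# A minimal sketch of that alias step; alias_map's contents are illustrative,
# not taken from the original code.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

alias_map = {'dan': 'daniel', 'danny': 'daniel'}
resolve_alias_udf = udf(lambda name: alias_map.get(name, name), StringType())
# df = df.withColumn('firstname', resolve_alias_udf('firstname'))  # would run here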

# Step 2: Clean & process other fields (e.g. convert dates)
df = df.withColumn('dob_formatted', from_unixtime(unix_timestamp('dob', 'MM/dd/yyyy'), format='yyyy_MMMMMMMM_dd') )

# Step 3: Concat all relevant matching fields
df = df.withColumn('record_uid', concat(df.state, df.dob_formatted, df.firstname, df.lastname))

# Step 4: Soundex encoding (score record_uid for similarities)
df.withColumn('score_soundex', soundex(df.record_uid)).show(10,False)
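
# Hedged follow-up (an assumption beyond the original snippet): records that
# share a soundex code are candidate duplicates, so grouping on the code
# surfaces the clusters worth reviewing.
from pyspark.sql.functions import col

dup_candidates = df.withColumn('score_soundex', soundex(df.record_uid))
dup_candidates.groupBy('score_soundex').count() \
    .filter(col('count') > 1).show(10, False)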


#####################################################################################################################
#
#   Fuzzy Matching Join using Levenshtein
#
#####################################################################################################################

# Generate Dataframe for testing
df = spark.createDataFrame(
    [
        (['dan',    'ocean',        'nc', '05/25/1983']),
        (['daniel', 'ocean',        'nc', '05/25/1983']),
        (['danny',  'ocean',        'nc', '05/26/1983']),
        (['danny',  'ocen',         'nc', '05/26/1983']),
    ],
    ['firstname','lastname','state','dob']
)
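
# Hedged sketch of the fuzzy join the banner above announces; the self-join
# shape, alias names, and distance threshold of 2 are assumptions.
from pyspark.sql.functions import levenshtein, col

left = df.alias('l')
right = df.alias('r')
fuzzy = left.join(right, levenshtein(col('l.lastname'), col('r.lastname')) <= 2)
fuzzy.select('l.firstname', 'l.lastname',
             'r.firstname', 'r.lastname').show(10, False)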
Example #5
import os

# The accumulator and loop header here are assumptions; only the loop body
# appeared in the original snippet.
list_of_all_datasets = []
for file in os.listdir('./datasets_file'):
    if file.endswith('.csv'):
        cur_key = file.replace('.csv', '')
        #print(cur_key)
        final_path = 'file://' + os.path.abspath(
            '.') + '/datasets_file/' + file
        print(final_path)
        dataset = spark.read.format('csv').options(
            header='true',
            inferschema='false',
            multiLine='true',
            delimiter=',',
            encoding="ISO-8859-1").load(final_path)
        dataset = dataset.select(
            f.col(dataset.columns[0]).alias('column_value'))
        dataset = dataset.withColumn('soundex_phon_cur',
                                     f.soundex(f.col(dataset.columns[0])))
        dataset = dataset.withColumn('column_name',
                                     f.lit(cur_key).cast(StringType()))
        list_of_all_datasets.append(dataset)


from functools import reduce
from pyspark.sql import DataFrame


def unionAll(*dfs):
    # Pairwise-union the per-file frames into one DataFrame; note that
    # DataFrame.unionAll matches columns by position, not by name.
    return reduce(DataFrame.unionAll, dfs)


merged_df = unionAll(*list_of_all_datasets)

start_time = time.time()
# file_names = ['dtmw-avzj.BOROUGH.txt.gz']
for file in file_names:
    cur_dataset = spark.read.format('csv').options(
# In[40]:

import pyspark.sql.functions as f

# In[58]:

df.select(f.collect_set(df['state'])).collect()

# In[62]:

df.select(f.countDistinct('state').alias('states')).show()

# In[70]:

df.select(f.md5('street').alias('hash')).collect()

# In[72]:

df.select(f.reverse(df.state).alias('state-reverse')).collect()

# In[75]:

df.select(f.soundex(df.name).alias('soundex')).collect()

# In[76]:

spark.stop()
