def ColorStats(rdd):
    """Count how many values in the RDD look like color names, matched via
    soundex against the Wikipedia color-name list."""
    analyse_color_df = spark.createDataFrame(rdd, StringType())
    soundex_udf = udf(lambda x: soundex_lookup(x), StringType())
    # Lowercase each value, soundex-encode it, then flag soundex codes that
    # match a known color name.
    analyse_color_df = analyse_color_df \
        .withColumn('color_lower', lower_udf('value')) \
        .withColumn('color_soundex', soundex('color_lower'))
    analyse_color_df = analyse_color_df.withColumn(
        'isColor', soundex_udf('color_soundex'))
    # Column 3 is 'isColor'; count the rows flagged as colors.
    color_count = analyse_color_df.rdd.filter(lambda x: x[3] == 'true').count()
    return {'semantic_type': 'Color', 'count': color_count}
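ColorStats depends on a soundex_lookup helper that is not shown in this section. A minimal sketch, assuming it simply checks a code against the w_soundex_vals list built in the preprocessing step below; the exact implementation may differ:

def soundex_lookup(code):
    # Hypothetical helper: returns the string 'true' when the soundex code
    # appears among the color-name soundex codes collected from the
    # Wikipedia color list (w_soundex_vals), matching the 'true' string
    # that ColorStats filters on.
    return 'true' if code in w_soundex_vals else 'false'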
def colsNameSimilarity(self, df, category=None, df2=None):
    """
    :param df: a Spark DataFrame
    :param category: a string keyword to match
    :param df2: a second DataFrame whose column names are matched against df's
    :return result_df: a DataFrame with Column_1, Column_2, path similarity,
        Levenshtein distance, and soundex equality
    """
    # Clean up column names so that we can prevent future errors.
    for colName, dtype in df.dtypes:
        if '.' in colName or '`' in colName or colName.strip() != colName:
            df = df.withColumnRenamed(
                colName, colName.strip().replace(".", "_").replace("`", ""))
    if df2 is None:
        result_df = pd.DataFrame(columns=['Column_1', 'Path Similarity'])
        category_sys = wn.synsets(category)
        if category_sys != []:
            cnt = 0
            # Score each column-name token against the category synset.
            for colName, dtype in df.dtypes:
                colName_ = colName.split("_")
                score = []
                for i in range(len(colName_)):
                    colName_sys = wn.synsets(colName_[i])
                    if colName_sys != []:
                        score.append(
                            colName_sys[0].path_similarity(category_sys[0]))
                if score != []:
                    score = max(score)
                else:
                    score = 0
                result_df.loc[cnt] = [colName, score]
                cnt += 1
        else:
            print("Similarity cannot be calculated")
    else:
        for colName, dtype in df2.dtypes:
            if '.' in colName or '`' in colName or colName.strip() != colName:
                df2 = df2.withColumnRenamed(
                    colName,
                    colName.strip().replace(".", "_").replace("`", ""))
        result_df = pd.DataFrame(
            columns=['Column_1', 'Column_2', 'Path Similarity'])
        cnt = 0
        # Compare every column-name token in df against every token in df2.
        for colName1, dtype in df.dtypes:
            colName_1 = colName1.split("_")
            for colName2, dtype2 in df2.dtypes:
                colName_2 = colName2.split("_")
                score = []
                for i in range(len(colName_1)):
                    colName_sys_1 = wn.synsets(colName_1[i])
                    for j in range(len(colName_2)):
                        colName_sys_2 = wn.synsets(colName_2[j])
                        if colName_sys_1 != [] and colName_sys_2 != []:
                            score.append(colName_sys_1[0].path_similarity(
                                colName_sys_2[0]))
                score = [i for i in score if i is not None]
                if score != []:
                    score = max(score)
                else:
                    score = 0
                result_df.loc[cnt] = [colName1, colName2, score]
                cnt += 1
    # Keep only reasonably similar pairs, then add string-distance measures.
    result_df = result_df[result_df['Path Similarity'] > 0.5]
    if not result_df.empty:
        result_df = self.spark.createDataFrame(result_df)
        if category is None:
            result_df = result_df.withColumn(
                "levenshtein distance",
                f.levenshtein(result_df["Column_1"], result_df["Column_2"]))
            result_df = result_df.withColumn(
                "soundex_equality",
                f.soundex(result_df["Column_1"]) ==
                f.soundex(result_df["Column_2"]))
        else:
            result_df = result_df.withColumn(
                "levenshtein distance",
                f.levenshtein(result_df["Column_1"], f.lit(category)))
            result_df = result_df.withColumn(
                "soundex_equality",
                f.soundex(result_df["Column_1"]) == f.soundex(f.lit(category)))
    else:
        schema = StructType([
            StructField("Column_1", StringType(), True),
            StructField("Path Similarity", DoubleType(), True),
            StructField("levenshtein distance", DoubleType(), True),
            StructField("soundex_equality", DoubleType(), True),
        ])
        result_df = self.spark.createDataFrame(self.sc.emptyRDD(),
                                               schema=schema)
    return result_df
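A quick usage sketch, assuming the owning class holds spark and sc handles and that NLTK's WordNet corpus is downloaded; the profiler object and DataFrame here are illustrative, not part of the source:

# Hypothetical call: match df's column names against the keyword 'address'.
# Columns whose best WordNet path similarity exceeds 0.5 come back with a
# Levenshtein distance and soundex equality against the keyword.
matches = profiler.colsNameSimilarity(df, category='address')
matches.show()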
# all_filenames = ['jz4z-kudi.Violation_Location__City_.txt.gz', '2sps-j9st.PERSON_FIRST_NAME.txt.gz', 'w7w3-xahh.Address_City.txt.gz', '7crd-d9xh.website.txt.gz', 'erm2-nwe9.City.txt.gz']

# City preprocessing: restrict the US cities reference list to New York state.
us_cities_df = spark.read.load("/user/ts3813/uscities.csv", format="csv",
                               delimiter=",", inferSchema="true", header="true")
nyc_cities_df = us_cities_df.filter(us_cities_df.state_id == 'NY').select('city')

lower_udf = udf(lambda x: x.lower() if x else '', StringType())
udf_1 = udf(lambda x: 1, IntegerType())

nyc_cities_df = nyc_cities_df.withColumn('city', lower_udf('city'))
nyc_cities_df = nyc_cities_df.withColumn('city_bool', udf_1('city'))

# Color preprocessing: soundex-encode the Wikipedia color names for lookup.
wiki_colors_df = spark.read.load("/user/ts3813/wikipedia_color_names.csv",
                                 format="csv", delimiter=",",
                                 inferSchema="true", header="true")
w_name = wiki_colors_df.select("Name")
w_name_lower = w_name.withColumn('name_lower', lower_udf('Name'))
w_name_lower = w_name_lower.withColumn('w_name_soundex', soundex('name_lower'))
w_soundex_vals = [w_soundex[0]
                  for w_soundex in w_name_lower.select('w_name_soundex').collect()]

# Phone numbers: 123-456-7890, (123) 456-7890, and 456-7890 style formats.
phoneNumRegex = re.compile(
    r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')

# School-level and borough reference lists, flagged with a constant 1.
school_levels = ['Preschool', 'pre-kindergarten', 'elementary school',
                 'middle school', 'senior school', 'junior high school',
                 'high school', 'K-8', 'K-1', 'K-2', 'K-3', 'K-4', 'K-5',
                 'K-6', 'K-7']
school_levels_df = spark.createDataFrame(school_levels, StringType())
school_levels_df = school_levels_df.withColumn('levels', lower_udf('value'))
school_levels_df = school_levels_df.withColumn('school_level_bool',
                                               udf_1('levels'))

boroughs = ['the bronx', 'staten island', 'manhattan', 'brooklyn', 'queens',
            'bronx', 'K', 'B', 'R', 'Q', 'M']
boroughs_df = spark.createDataFrame(boroughs, StringType())
boroughs_df = boroughs_df.withColumn('levels', lower_udf('value'))
boroughs_df = boroughs_df.withColumn('boroughs_bool', udf_1('levels'))
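phoneNumRegex is only compiled in this section, not applied. A minimal sketch of how it might classify a column, following the same UDF pattern used for the other reference lists; the UDF name and the full-match choice are assumptions, not part of the source:

# Hypothetical UDF: flag values that fully match one of the phone formats.
is_phone_udf = udf(
    lambda x: 1 if x and phoneNumRegex.fullmatch(x.strip()) else 0,
    IntegerType())

# Example use against a single-column DataFrame with a 'value' column:
# sample_df = sample_df.withColumn('phone_bool', is_phone_udf('value'))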
    ['firstname', 'lastname', 'state', 'dob']
)
df.show(10, False)

# Step 1: Resolve any known name aliases, states, etc. (i.e. dan, daniel, danny)
# For this POC code, I chose not to include this step, since it's
# straightforward to add a dictionary for matching and resolving known aliases.

# Step 2: Clean & process other fields (i.e. convert dates)
df = df.withColumn('dob_formatted',
                   from_unixtime(unix_timestamp('dob', 'MM/dd/yyyy'),
                                 format='yyyy_MMMMMMMM_dd'))

# Step 3: Concat all relevant matching fields
df = df.withColumn('record_uid',
                   concat(df.state, df.dob_formatted, df.firstname, df.lastname))

# Step 4: Soundex encoding (score record_uid for similarities)
df.withColumn('score_soundex', soundex(df.record_uid)).show(10, False)

#####################################################################################################################
#
# Fuzzy Matching Join using Levenshtein
#
#####################################################################################################################

# Generate DataFrame for testing
df = spark.createDataFrame(
    [
        (['dan', 'ocean', 'nc', '05/25/1983']),
        (['daniel', 'ocean', 'nc', '05/25/1983']),
        (['danny', 'ocean', 'nc', '05/26/1983']),
        (['danny', 'ocen', 'nc', '05/26/1983']),
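The snippet cuts off before the join the banner announces. A minimal sketch of the Levenshtein fuzzy self-join, assuming a record_uid column built as in Step 3; the threshold of 2 and the column aliases are illustrative choices:

import pyspark.sql.functions as F

# Hypothetical self-join: pair up records whose concatenated UIDs are within
# an edit distance of 2, excluding trivial exact self-matches.
left = df.select(F.col('record_uid').alias('uid_l'))
right = df.select(F.col('record_uid').alias('uid_r'))
fuzzy_pairs = (left.crossJoin(right)
               .withColumn('lev', F.levenshtein('uid_l', 'uid_r'))
               .filter((F.col('lev') <= 2) & (F.col('uid_l') != F.col('uid_r'))))
fuzzy_pairs.show(10, False)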
    if file.endswith('.csv'):
        cur_key = file.replace('.csv', '')
        # print(cur_key)
        final_path = ('file://' + os.path.abspath('.')
                      + '/datasets_file/' + file)
        print(final_path)
        dataset = spark.read.format('csv').options(
            header='true', inferschema='false', multiLine='true',
            delimiter=',', encoding="ISO-8859-1").load(final_path)
        # Keep only the first column, soundex-encode it, and tag each row
        # with the file it came from.
        dataset = dataset.select(
            f.col(dataset.columns[0]).alias('column_value'))
        dataset = dataset.withColumn('soundex_phon_cur',
                                     f.soundex(f.col(dataset.columns[0])))
        dataset = dataset.withColumn('column_name',
                                     f.lit(cur_key).cast(StringType()))
        list_of_all_datasets.append(dataset)

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

merged_df = unionAll(*list_of_all_datasets)

start_time = time.time()

# file_names = ['dtmw-avzj.BOROUGH.txt.gz']
for file in file_names:
    cur_dataset = spark.read.format('csv').options(
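A sketch of how the merged reference frame might be queried once built, using the soundex_phon_cur and column_name columns it carries; the candidate value and join shape here are illustrative, not part of the source:

# Hypothetical lookup: which reference columns contain values that sound
# like 'brooklin'? Join on the precomputed soundex codes in merged_df.
candidate = spark.createDataFrame([('brooklin',)], ['value']) \
    .withColumn('soundex_phon_cur', f.soundex(f.col('value')))
hits = merged_df.join(candidate, on='soundex_phon_cur', how='inner')
hits.select('column_name', 'column_value').distinct().show()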
# In[40]:

import pyspark.sql.functions as f

# In[58]:

df.select(f.collect_set(df['state'])).collect()

# In[62]:

df.select(f.countDistinct('state').alias('states')).show()

# In[70]:

df.select(f.md5('street').alias('hash')).collect()

# In[72]:

df.select(f.reverse(df.state).alias('state-reverse')).collect()

# In[75]:

df.select(f.soundex(df.name).alias('soundex')).collect()

# In[76]:

spark.stop()

# In[ ]: