def test():
    """Collect basic column metadata for each dataframe via the timing harness.

    A per-dataset record is built inside ``_run`` and stored in a local
    mapping keyed by dataset name. ``_run`` is handed to ``timing.timed``,
    which is expected to invoke it as ``_run(df, i)`` (confirm against the
    timing module).
    """
    from timing import timed

    # Every dataset record ends up here, keyed by name (meant for JSON output).
    master_dct = {}

    def _run(df, i):
        # One record per dataset; the metadata helper fills in OUTPUT_KEY.
        dataset_record = {'dataset_name': df.ds_name, OUTPUT_KEY: {}}
        get_basic_metadata(map_cols(df), dataset_record)  # main driver

        ### test num_cells (counts) -- currently disabled ###
        # rows = reduce_cols(df_cols, 'number_cells', lambda c: sum(lit(1)))
        # append_output(dataset_record, 'number_cells', rows)
        # for col in dataset_record[OUTPUT_KEY].keys():
        #     if dataset_record[OUTPUT_KEY][col]['number_cells'] != dataset_record[OUTPUT_KEY][col]['number_non_empty_cells'] + dataset_record[OUTPUT_KEY][col]['number_empty_cells']:
        #         raise ValueError('Error in dataset: {}. Column: {}'.format(
        #             dataset_record.keys()[0], str(dataset_record[OUTPUT_KEY][col])))

        # Register the finished record under its dataset name.
        master_dct[dataset_record['dataset_name']] = dataset_record
        return dataset_record

    timed(_run)
def test():
    """Optionally print every dataframe's schema through the timing harness.

    The schema is printed only when the command-line token at
    ``sys.argv[4]`` is exactly ``'print'``; a missing token behaves like an
    empty flag.
    """
    from timing import timed

    # Fall back to an empty flag when fewer than five argv entries exist.
    print_schema = sys.argv[4] if len(sys.argv) > 4 else ''

    def _run(df, i):
        if print_schema != 'print':
            return
        df.printSchema()

    timed(_run)
def test(n_iter=100):
    """Time ``binary_search`` on random inputs and require each run to be fast.

    Each of the ``n_iter`` rounds draws a random key, a random size and that
    many random integers, then asserts that the value returned by ``timed``
    is below 3.
    """
    from random import randint
    from timing import timed

    for _round in range(n_iter):
        key = randint(0, 10**5)
        size = randint(1000, 10**5)
        # NOTE(review): the values are not sorted before the search -- confirm
        # that this is intended for a pure timing check.
        values = []
        for _ in range(size):
            values.append(randint(0, 10**9))
        elapsed = timed(binary_search, key, values, size)
        assert elapsed < 3
def run():
    """Profile each dataset, accumulate itemsets and write the results as JSON.

    ``_run`` is handed to ``timed``, which is expected to call it as
    ``_run(df, i)`` for every dataset (confirm against the timing module).
    For each dataframe it:

    * collects basic metadata and the dataset profile,
    * merges the dataset's itemsets into the running ``master_itemset``,
    * rewrites ``results_times/master_dct.json`` and ``master_itemset.json``.

    After ``timed`` returns, the most frequent itemsets are computed from
    ``master_itemset`` and printed.
    """
    import os

    spark = SparkSession.builder.getOrCreate()

    # The JSON dump below targets this directory; create it up front so the
    # first open() cannot fail with FileNotFoundError on a fresh checkout.
    os.makedirs("results_times", exist_ok=True)

    # master_dct contains all output to be used in json
    master_dct = {}
    # master_itemset contains all itemsets to be used in json
    master_itemset = {}

    def _run(df, i):
        # BASIC METADATA
        # ds_dct belongs to a dataset
        ds_dct = {
            'dataset_name': df.ds_name,
            OUTPUT_KEY: {}
        }
        df_cols = map_cols(df)
        get_basic_metadata(df_cols, ds_dct)  # main driver

        ### BASIC PROFILE ###
        dct_output, dct_itemset = get_dataset_profile(spark, df_cols)
        append_data_profile(ds_dct, dct_output)
        ds_dct['itemset'] = dct_itemset

        # add the counts for df_itemset to master
        combine_itemsets(dct_itemset, master_itemset)

        ### DONE: update master_dct with dataset ###
        master_dct.update({ds_dct['dataset_name']: ds_dct})

        # Save the output for the df to json for each run
        with open("results_times/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)
            json_file.write("\n")

        print("Master Itemset: ", master_itemset)
        with open("master_itemset.json", "w") as json_file:
            json.dump(master_itemset, json_file, indent=4)

        return ds_dct

    timed(_run)

    result_set = most_frequent_itemsets(master_itemset)
    print("Most Frequent Itemsets: ", result_set)
def test():
    """Check one fixed case, then time ``fractional_knapsack`` on random input.

    100 random instances are generated (1..1000 items, values in
    0..2*10**6, weights in 1..2*10**6); the value returned by ``timed`` must
    stay below 5 for each of them.
    """
    assert fractional_knapsack(0, [(60, 20)]) == 0.0

    upper = 2 * 10**6
    for _ in range(100):
        item_count = randint(1, 1000)
        capacity = randint(0, upper)
        # Value is drawn before weight for every item.
        val_and_weights = [
            (randint(0, upper), randint(1, upper))
            for _ in range(item_count)
        ]
        elapsed = timed(fractional_knapsack, capacity, val_and_weights)
        assert elapsed < 5
def test():
    """Time ``fractional_knapsack`` on 100 random instances and print timings.

    Each instance has 0..1000 items with values in 0..2*10**6 and weights in
    1..2*10**6. The value returned by ``timed`` must be below 5 and is
    printed with ten decimal places.
    """
    from random import randint
    from timing import timed

    upper = 2 * 10**6
    for _ in range(100):
        item_count = randint(0, 1000)
        capacity = randint(0, upper)
        # Value is drawn before weight for every item.
        items = [
            (randint(0, upper), randint(1, upper))
            for _ in range(item_count)
        ]
        elapsed = timed(fractional_knapsack, capacity, items)
        assert elapsed < 5
        print("{:.10f}".format(elapsed))
def test():
    """Unit-check ``fractional_knapsack`` and then time it on random input.

    The fixed cases pin known answers. The stress loop then builds 100
    random instances of ``n`` items each (1 <= n <= 1000, values in
    0..2*10**6, weights in 1..2*10**6) and requires the value returned by
    ``timed`` to be below 5.
    """
    assert fractional_knapsack(0, [(60, 20)]) == 0.0
    assert fractional_knapsack(25, [(60, 20)]) == 60.0
    assert fractional_knapsack(25, [(60, 20), (0, 100)]) == 60.0
    assert fractional_knapsack(25, [(60, 20), (50, 50)]) == 60.0 + 5.0
    assert fractional_knapsack(50, [(60, 20), (100, 50), (120, 30)]) == 180.0

    from random import randint
    from timing import timed

    for attempt in range(100):
        n = randint(1, 1000)
        capacity = randint(0, 2 * 10**6)
        # Build all n items; previously n was drawn but only a single item
        # was ever generated, so the stress test never saw a large input.
        values_and_weights = [
            (randint(0, 2 * 10**6), randint(1, 2 * 10**6))
            for _ in range(n)
        ]
        t = timed(fractional_knapsack, capacity, values_and_weights)
        assert t < 5
def test():
    """Unit-check ``fractional_knapsack``, time it on random input, print OK.

    The fixed cases are evaluated first. Then 100 random instances (1..1000
    items, values in 0..2*10**6, weights in 1..2*10**6) are timed and each
    timing must be below 1.
    """
    # ((capacity, items), expected optimum)
    fixed_cases = (
        ((0, [(60, 20)]), 0),
        ((25, [(60, 20)]), 60.0),
        ((25, [(60, 20), (0, 100)]), 60.0),
        ((25, [(60, 20), (50, 50)]), 65.0),
        ((50, [(60, 20), (100, 50), (120, 30)]), 180.0),
    )
    for (cap, items), expected in fixed_cases:
        assert fractional_knapsack(cap, items) == expected

    from random import randint
    from timing import timed

    upper = 2 * 10**6
    for _ in range(100):
        item_count = randint(1, 1000)
        capacity = randint(0, upper)
        values_and_weights = [
            (randint(0, upper), randint(1, upper))
            for _ in range(item_count)
        ]
        # NOTE(review): the arguments are passed to timed() as one tuple here;
        # presumably this timing module unpacks it -- confirm.
        elapsed = timed(fractional_knapsack, (capacity, values_and_weights))
        assert elapsed < 1

    print("OK")
def t2_get_n_frequents(
        gz_paths_cols: List[Tuple[str, str, Any]],
        top_n: int = 10
) -> Tuple[Union[List[str], None], List[Dict[str, Any]], List[str]]:
    """Collect the ``top_n`` most frequent values of one column per dataset.

    Args:
        gz_paths_cols: 3-tuples whose first element is the dataset path in
            hdfs and whose second element is the column name of interest
            (from ta); the third element is ignored here. The list is
            shuffled **in place** when ``get_rand_arg()`` is truthy.
        top_n: number of frequent values passed to ``get_n_freq_str``.

    Returns:
        A 3-tuple ``(columns, records, missing)``:

        * ``columns`` -- column names of the output df (for rebuilding a
          df), or ``None`` if no dataset produced output;
        * ``records`` -- the first output row of each processed dataset,
          converted to a dict;
        * ``missing`` -- string-formatted ``{'ds_name', 'col_name_ta'}``
          dicts for columns that could not be selected from their dataset.

    Raises (inside the ``_run`` callback, as seen by ``timed``):
        ValueError: the column could not be selected from the dataframe.
        IndexError: the output dataframe for a dataset has no rows.

    A dataframe is deliberately not returned: each iteration's output is
    converted to Python objects, because concatenating dataframes with union
    is lazily evaluated, eventually loads all the columns into memory and
    fails for lack of memory.
    """
    if get_rand_arg():
        random.shuffle(gz_paths_cols)

    # unzip the (path, column, _) triples into parallel lists
    gz_paths: List[str] = [path for path, _, _ in gz_paths_cols]
    cols: List[str] = [name for _, name, _ in gz_paths_cols]

    records = []
    missing = set()
    columns = None

    def _run(df, i):
        # Only ``columns`` is rebound; records/missing are mutated in place.
        nonlocal columns

        ta_col = cols[i]
        print("col_name:", ta_col)
        ds_name = os.path.basename(gz_paths[i])

        # Match the ta column name against the dataframe's columns (spaces in
        # column names were replaced when they were saved into file names).
        col = ta_col
        result = match_preprocess(ta_col, {'foo': df.columns}, match_jacc_min)
        if result is not None:
            col = result[COL]
            print('found:', col)

        try:
            df = df.select(spark_col(col))  # remove all but col of interest
        except Exception as err:
            missing.add(str({'ds_name': ds_name, 'col_name_ta': ta_col}))
            # Chain the cause so the underlying Spark error stays visible.
            raise ValueError('missing:', (ds_name, ta_col),
                             'cols:', df.columns) from err

        df_cols = map_cols(df)
        df_counts = get_counts(df_cols)
        df_output = get_n_freq_str(df_counts, top_n)
        df_output = df_output.select(
            lit(ds_name).alias('ds_path'),
            lit(ta_col).alias('col_name_ta'),
            '*')

        if columns is None:
            columns = df_output.columns

        # Keep only the first collected row as a plain dict.
        records.append(df_output.collect()[0].asDict())
        return df_output

    timed(_run, gz_paths)

    return columns, records, list(missing)
def run(gz_paths_cols: List[Tuple[str, str, str]], ref_set_cols, ref_set_vals):
    """Label every value of the selected columns with a semantic type.

    Args:
        gz_paths_cols: 3-tuples unpacked below as (dataset path, column name,
            ta path). Shuffled in place when ``get_rand_arg()`` is truthy and
            re-ordered by dataset size when ``get_sort_arg()`` is truthy.
        ref_set_cols: reference mapping passed to ``match_preprocess`` to map
            a column name to a semantic type.
        ref_set_vals: mapping indexed by semantic type; its entries are the
            reference values a column value is compared against.

    Side effects:
        Creates ``results_similarities`` if it does not exist and rewrites
        ``results_similarities/master_dct_0.json`` after every dataset with
        the accumulated per-column semantic-type counts.
    """
    spark = SparkSession.builder.getOrCreate()  # not referenced again in this function

    if not os.path.isdir('results_similarities'):
        os.makedirs('results_similarities')

    rand = get_rand_arg()
    if rand:
        random.shuffle(gz_paths_cols)

    # unzip
    gz_paths: List[str] = [gz_paths for gz_paths, _, _ in gz_paths_cols]

    to_sort = get_sort_arg()
    if to_sort:
        files_n_sizes = get_ds_file_sizes(gz_paths)
        sizes = [size for f, size in files_n_sizes]
        # Pair each entry with a size positionally, sort ascending by size and
        # keep only the entries.
        # NOTE(review): assumes get_ds_file_sizes returns sizes in the same
        # order as gz_paths -- confirm.
        gz_paths_cols = list(
            zip(*sorted(list(zip(gz_paths_cols, sizes)), key=lambda x: x[1])))[
                0]  # sort on sizes
        gz_paths: List[str] = [gz_paths for gz_paths, _, _ in gz_paths_cols]  # unzip

    cols: List[str] = [cols for _, cols, _ in gz_paths_cols]
    ta_path: List[str] = [ta_path for _, _, ta_path in gz_paths_cols]

    cache_col_name = {
    }  # {col_name: semantic_type} | may help if cols are repeated across dfs
    cache_col_val = {
    }  # there are many values repeated in a column so this helps

    # def _match_semantic_cols(col_name):
    #     if not col_name in cache_col_name:
    #         cache_col_name[col_name] = str(match_preprocess(col_name, ref_set_cols)[S_TYPE])
    #     return cache_col_name[col_name]

    def _match_semantic_vals(col_val, s_type_col, is_spark=True):
        """Return ``(semantic type, add)`` for one column value.

        ``add`` is always ``False`` in the current code; the logic that would
        set it (stage 5) is commented out.

        stage 1:
            run the value matcher ('match_preprocess') on only the matched
            s_type_col.
        if the cutoff is not passed (avg distance from the column is too high):
            stage 2:
                use heuristics (from manually examining frequent data for each
                col (ref_set)) to limit the amount of s_type_vals in
                ref_set_vals to compare to.
                I.e. null is automatically assigned the matched s_type_col.
                I.e. check for substrings, like if 'com' is in the val, then
                check 'website' s_type_vals for similarity. 'co' is implicitly
                in 'com' so check business_name as well, etc. This is to
                minimize misclassifications. Place them in 'check' to later
                build another s_type_vals using only those s_types.
            stage 3:
                run 'match_preprocess' again on all s_types except the matched
                s_type_col, or only on the heuristic matches in stage 2 (if
                the heuristic check yielded results).
            stage 4:
                check whether the stage 3 result is significantly better than
                the stage 1 result -- by checking whether the avg_dist is some
                percentage ('IMPROVE_RATIO') better than what it was. If not,
                assign the val to the matched s_type_col, as would happen if
                the value was null.
        stage 5 (doesn't work in spark, currently disabled):
            if the min_dist is less than some similarity cutoff 'MIN_DIST'
            (meaning it is sufficiently small) and is larger than some
            similarity cutoff 'IDENTICAL_CUTOFF' (meaning it isn't nearly
            identical to something already in the ref_set), add it to the
            ref_set. If initial matches are correct, later matches should be
            more accurate. The ref_set tops out at some sufficient size as to
            prevent slow down and redundant matching.

        All {col_val: s_type} combinations are cached so that identical column
        values aren't recomputed, and so that spark can assign each to the
        dataframe by using a udf after they are computed outside of Spark.
        The cache is cleared after each dataset.
        """
        col_val = str(col_val)
        s_type_col = str(s_type_col)
        add = False
        # print(col_val, s_type_col, {s_type_col: [ref_set_vals[s_type_col]]})

        if not col_val in cache_col_val:
            AVG_CUTOFF = 0.9  # similarity measure worse than this triggers second more general run
            # MIN_CUTOFF = 0.65
            # IDENTICAL_CUTOFF = 0.10
            IMPROVE_RATIO = 0.2  # second run improved by some percent

            str_col_val = str(col_val).lower()
            # print(str_col_val)
            # Placeholder / empty values keep the column's own semantic type.
            # (col_val was converted with str() above, so the trailing
            # ``is None`` test cannot be true; a None input arrives as 'none'.)
            if str_col_val == 'null' or str_col_val == '-' or str_col_val == '_' or str_col_val == '0' or str_col_val == 'none' or str_col_val == '' or col_val is None:
                res_final = (s_type_col, col_val, 0.0, 0.0
                             )  # default to s_type_col
            else:
                res0 = match_preprocess(
                    col_val, {s_type_col: ref_set_vals[s_type_col]},
                    match_jacc_avg
                )  # compare to values of matched (based on col_name) semantic type
                # print('res0:', res0)
                # res0[MIN_DIST] != 0.0
                if res0 is None or AVG_CUTOFF < res0[
                        AVG_DIST]:  # was the cutoff passed, i.e. was the value present for this semantic type based on the col_name match?
                    # check only these semantic types based on the content of the col_val (more explicit rules after examining data)
                    check = []
                    remove = []  # never populated below, so the removal loop is a no-op
                    is_alpha = str_col_val.isalpha()
                    is_digit = str_col_val.isdigit()

                    if len(str_col_val) == 1 and is_alpha:
                        possibles = ['person_name (middle_initial)', 'borough']
                        for pos_s_type in possibles:
                            if s_type_col == pos_s_type:  # which of these is the s_type of the col?
                                check.extend([pos_s_type])
                                break
                    if len(str_col_val) == 2 and is_alpha:
                        check.extend(['color'])
                    if len(str_col_val) == 5 and is_digit:
                        check.extend(['zip_code'])
                    if len(str_col_val) >= 3 and is_digit:
                        check.extend([
                            'city_agency', 'street_number', 'phone_number',
                            'building_classification'
                        ])
                    if len(str_col_val) >= 1 and is_digit:
                        check.extend(['street_number'])
                    if 'ps ' in str_col_val or 'is ' in str_col_val or 'js ' in str_col_val or 'hs ' in str_col_val:
                        check.extend(['school_name'])
                    if len(str_col_val
                           ) >= 3:  # can have numbers and other chars
                        if 'llc' in str_col_val or 'inc' in str_col_val or 'co' in str_col_val:
                            check.extend(['business_name'])
                        if 'http' in str_col_val or 'www' in str_col_val or 'org' in str_col_val or 'com' in str_col_val:
                            check.extend(['website'])
                        if 'ave' in str_col_val or 'str' in str_col_val:
                            if str_col_val[0].isdigit():
                                check.extend(['address'])
                    # if len(check) > 0:
                    #     print('check:', check)

                    # de-duplicate the candidate lists
                    check = list(set(check))
                    remove = list(set(remove))

                    if len(check) == 0:  # compare to every semantic type but already checked
                        if is_spark:  # check for expensive unnecessary operation
                            ref_set_diff = ref_set_vals
                        else:
                            ref_set_diff = copy.deepcopy(ref_set_vals)  # clone
                    else:  # compare to only those in check
                        ref_set_diff = {}
                        for s_type in check:
                            if is_spark:
                                ref_set_diff[s_type] = ref_set_vals[s_type]
                            else:
                                ref_set_diff[s_type] = copy.deepcopy(
                                    ref_set_vals[s_type])

                    # for key, val in ref_set_cols.items():  # compare to column names as well (for ms_core)
                    #     if key in ref_set_diff:
                    #         ref_set_diff[key].extend(val)

                    # ref_set_diff[s_type_col] = []  # prevent key error and delete all values for already matched
                    # Make sure the column's own type is a candidate; its
                    # reference values are kept (the emptying variant above is
                    # disabled).
                    ref_set_diff[s_type_col] = ref_set_vals[
                        s_type_col]  # prevent key error and delete all values for already matched
                    for rm in remove:
                        if rm in ref_set_diff:
                            ref_set_diff[rm] = []

                    res1 = match_preprocess(
                        col_val, ref_set_diff, match_jacc_avg
                    )  # find similarity with other semantic value types
                    res_final = res1

                    if res0 is None and res1 is None:
                        res_final = (s_type_col, col_val, 0.0, 0.0)
                    elif res0 is None:
                        # print('res0:', res0, res1)
                        res_final = res1
                    elif res1 is None:
                        # print('res1:', res0, res1)
                        res_final = res0
                    else:  # neither are None
                        res_final = min([res0, res1], key=lambda x: x[AVG_DIST])
                        # if AVG_CUTOFF < res_final[AVG_DIST]:  # still greater than cutoff and therefore unknown
                        # Accept the better result only if its avg distance is
                        # at least IMPROVE_RATIO lower than the stage-1 one.
                        # NOTE(review): _default is not defined in this view;
                        # presumably it builds the default-to-column result --
                        # confirm.
                        if not (
                                res_final[AVG_DIST] <=
                            (res0[AVG_DIST] *
                             (1 - IMPROVE_RATIO))):  # dist has not improved
                            res_final = _default(
                                s_type_col, col_val)  # default to s_type_col
                        # ^ should the distance be non-0 to add to ref_set?
                else:
                    # print('FALSE')
                    res_final = res0  # cutoff passed, return initial result

            # # not an exact match and up to n different values stored
            # if res_final[MIN_DIST] <= MIN_CUTOFF and res_final[MIN_DIST] >= IDENTICAL_CUTOFF and len(ref_set_vals[res_final[S_TYPE]]) < 30:
            #     if is_spark:
            #         add = True
            #     else:
            #         ref_set_vals[res_final[S_TYPE]].append(col_val)  # append to ref_set

            cache_col_val[col_val] = str(res_final[S_TYPE])
            # # print('res_final:', res_final)

        return (cache_col_val[col_val], add)

    # match_semantic_cols = udf(_match_semantic_cols, StringType())
    # The udf returns a struct with the matched type and the 'add' flag.
    match_semantic_vals = udf(
        _match_semantic_vals,
        StructType([
            StructField('s_type_val', StringType(), False),
            StructField('add', BooleanType(), False)
        ]))

    # One entry per processed column; dumped to JSON after each dataset.
    master_lst = []

    def _run(df, i):
        """Label the i-th column of interest in ``df`` and record the counts."""
        print("col_name:", cols[i])

        col = None
        match_col = match_preprocess(
            cols[i],
            {'foo': df.columns})  # match the col from ta name to ds cols name
        if match_col is not None:
            col = match_col[COL]
        else:  # shouldn't exec
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if not col in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(
                col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        # print('s_type_col:', s_type_col)
        # print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn(
            's_type_col', lit(s_type_col))  # populate df with col s_type

        # s_types_distinct = [(s_type_col, df_cols.count())]

        # ### Python method: no spark to add to ref_set_vals
        # if i > -10:  # run on small datasets (before it gets slow)
        #     s_types_all = []
        #     for row in df_cols.select('value', 's_type_col').collect():
        #         s_type_i = _match_semantic_vals(row['value'], row['s_type_col'], False)
        #         s_types_all.append(s_type_i[0])
        #     # get (s_type, count)
        #     s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        # ###

        if i >= -10:
            ### Spark method
            df_cols = df_cols.withColumn(
                's_type_val_add', match_semantic_vals(
                    'value',
                    's_type_col'))  # match unknown col value to semantic type
            # add to ref set with Spark
            # Flatten the struct returned by the udf into two plain columns.
            df_cols = df_cols.select('*', 's_type_val_add.s_type_val')
            df_cols = df_cols.select('*', 's_type_val_add.add')
            # (semantic type, number of values) pairs for this column
            s_types_distinct = df_cols.select('s_type_val').rdd.map(
                lambda x: x['s_type_val']).countByValue().items()

            # for row in df_cols.filter('add == True').select('value', 's_type_val').distinct().collect():
            #     if len(ref_set_vals[row['s_type_val']]) < 30:
            #         # print('ADD')
            #         # print(row['s_type_val'], 'row:', ref_set_vals[row['s_type_val']][-5:], 'val:', row['value'])
            #         ref_set_vals[row['s_type_val']].append(row['value'])
            #     else:
            #         break

            # # DEBUG
            # df_test = df_cols.groupby('s_type_col', 'value', 's_type_val', 'add').count()
            # df_test = df_test.sort('count', ascending=False)
            # print()
            # print('25 top vals')
            # df_test.show(25)
            # print('s_type_val different than s_type_col')
            # df_test.filter('s_type_val != s_type_col').show(25)
            # ###

        ds_dict = {'column_name': ta_path[i], 'semantic_types': []}
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                # Types outside LABEL_LIST_TA are reported as 'other', with
                # the matched type kept as the label.
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        master_lst.append(ds_dict)

        print('ta_path[i]:', ds_dict)

        # Rewrite the whole accumulated list after every dataset.
        with open("results_similarities/master_dct_0.json", "w") as json_file:
            json.dump(master_lst, json_file, indent=4)

        # Reset both caches before the next dataset.
        cache_col_name.clear()
        cache_col_val.clear()

    timed(_run, gz_paths)