Example #1
    def _run(df, i):
        nonlocal records
        nonlocal missing
        nonlocal columns

        print("col_name:", cols[i])

        # match cols (spaces in column names were removed/replaced when the names were saved into the file name)
        col = cols[i]
        ds_name = os.path.basename(gz_paths[i])

        result = match_preprocess(cols[i], {'foo': df.columns}, match_jacc_min)
        if result is not None:
            c = result[COL]
            print('found:', c)
            col = c

        try:
            df = df.select(spark_col(col))  # remove all but col of interest
        except Exception:
            missing.add(str({'ds_name': ds_name, 'col_name_ta': cols[i]}))
            raise ValueError(
                f'missing: {(ds_name, cols[i])}, cols: {df.columns}')
        df_cols = map_cols(df)
        df_counts = get_counts(df_cols)
        df_output = get_n_freq_str(df_counts, top_n)
        df_output = df_output.select(
            lit(ds_name).alias('ds_path'),
            lit(cols[i]).alias('col_name_ta'), '*')

        if columns is None:
            columns = df_output.columns

        # keep the single profiled row for this column so it can be concatenated later
        records.append(df_output.collect()[0].asDict())

        return df_output
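
The helpers these snippets lean on (match_preprocess, match_jacc_min / match_jacc_avg, and the S_TYPE / COL / MIN_DIST / AVG_DIST tuple indices) are defined elsewhere in the project and are not shown on this page. As a rough, hypothetical sketch of the contract Example #1 assumes, a character-trigram Jaccard matcher returning (s_type, closest_ref_value, min_dist, avg_dist) could look like this; the metric, the index order, and the names here are assumptions, not the project's actual implementation:

# Hypothetical sketch only: the real match_preprocess / match_jacc_avg are not
# shown on this page, so the distance metric and the tuple layout are assumptions.
S_TYPE, COL, MIN_DIST, AVG_DIST = 0, 1, 2, 3


def _jaccard_dist(a, b):
    """1 - Jaccard similarity over character trigrams."""
    grams_a = {a[i:i + 3] for i in range(max(1, len(a) - 2))}
    grams_b = {b[i:i + 3] for i in range(max(1, len(b) - 2))}
    union = grams_a | grams_b
    if not union:
        return 1.0
    return 1.0 - len(grams_a & grams_b) / len(union)


def match_jacc_avg(val, ref_vals):
    """Return (closest_ref_value, min_dist, avg_dist) of val against a list of reference values."""
    dists = [(_jaccard_dist(val, str(r).lower()), str(r)) for r in ref_vals]
    min_dist, closest = min(dists)
    avg_dist = sum(d for d, _ in dists) / len(dists)
    return closest, min_dist, avg_dist


def match_preprocess(val, ref_set, matcher=match_jacc_avg):
    """Score a lowercased value against every semantic type in ref_set and return
    (s_type, closest_ref_value, min_dist, avg_dist) for the type with the lowest
    average distance, or None if ref_set holds no values at all.
    (The real match_jacc_min variant presumably ranks by min_dist instead.)"""
    val = str(val).strip().lower()
    best = None
    for s_type, ref_vals in ref_set.items():
        if not ref_vals:
            continue
        closest, min_d, avg_d = matcher(val, ref_vals)
        if best is None or avg_d < best[AVG_DIST]:
            best = (s_type, closest, min_d, avg_d)
    return best


# e.g. match_preprocess('school name', {'foo': ['School Name', 'Borough']})[COL]
# -> 'School Name'

Under this assumed contract, result[COL] in Example #1 is the closest dataset column name, which is why it can be fed straight into df.select.
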
Example #2
    def _match_semantic_vals(col_val, s_type_col, is_spark=True):
        """
        stage 1:
        run value matcher ('match_preprocess') on only the matched s_type_col

        if the cutoff not passed (avg distance from column is too high):
            stage 2:
            use heuristics (from manually examining frequent data for each col (ref_set)) to limit the amount of s_type_vals in ref_set_vals to compare to.
            I.e. null is automatically assigned the matched s_type_col
            I.e. check for subtrings, like if 'com' is in the val, then check 'website' s_type_vals for similarity. 'co' is implicitly in 'com' so check business_name as well, etc.
            this is to minimize misclassifications
            place them in 'check' to later build another s_type_vals using only those s_types

            stage 3:
            run 'match_preprocess' again on all s_types except the match s_type_col, or only on the heuristic matches in stage 2 (if they exist (if the heuristic check yielded results))
        
            stage 4:
            check whether the stage 3 result is significantly better than the stage 1 result--by checking whether the avg_dist is some percentage ('IMPROVE_RATIO') better than what it was. If not, assign the val to the matched s_type_col as would happen if the value was null

            stage 5 (doesn't work in spark):
            if the min_dist is less than some similarity cutoff: 'MIN_DIST' (meaning it is sufficiently small) and is larger than some similarity cutoff: 'IDENTICAL_CUTOFF' (meaning it isn't nearly identical to something already in the ref_set) add it to the ref_set. if initial matches are correct, later matches should be more accurate. the ref_set tops out at some sufficient size as to prevent slow down and redundant matching

        all {col_val: s_type} combinations are cached so that identical column values aren't recomputed, and so that spark can assign each to the dataframe by using a udf after they are computed outside of Spark. the cache is cleared after each dataset
        """
        col_val = str(col_val)
        s_type_col = str(s_type_col)

        add = False

        # print(col_val, s_type_col, {s_type_col: [ref_set_vals[s_type_col]]})
        if col_val not in cache_col_val:
            AVG_CUTOFF = 0.9  # similarity measure worse than this triggers second more general run
            # MIN_CUTOFF = 0.65
            # IDENTICAL_CUTOFF = 0.10
            IMPROVE_RATIO = 0.2  # second run improved by some percent

            str_col_val = str(col_val).lower()
            # print(str_col_val)
            if col_val is None or str_col_val in ('null', '-', '_', '0', 'none', ''):
                res_final = (s_type_col, col_val, 0.0, 0.0)  # default to s_type_col
            else:
                res0 = match_preprocess(
                    col_val, {s_type_col: ref_set_vals[s_type_col]},
                    match_jacc_avg
                )  # compare to values of matched (based on col_name) semantic type
                # print('res0:', res0)
                # res0[MIN_DIST] != 0.0
                # was the cutoff passed, i.e. was the value present for this semantic type based on the col_name match?
                if res0 is None or AVG_CUTOFF < res0[AVG_DIST]:
                    # check only these semantic types based on the content of the col_val (more explicit rules after examining data)
                    check = []
                    remove = []
                    is_alpha = str_col_val.isalpha()
                    is_digit = str_col_val.isdigit()
                    if len(str_col_val) == 1 and is_alpha:
                        possibles = ['person_name (middle_initial)', 'borough']
                        for pos_s_type in possibles:
                            if s_type_col == pos_s_type:  # which of these is the s_type of the col?
                                check.extend([pos_s_type])
                                break
                    if len(str_col_val) == 2 and is_alpha:
                        check.extend(['color'])
                    if len(str_col_val) == 5 and is_digit:
                        check.extend(['zip_code'])
                    if len(str_col_val) >= 3 and is_digit:
                        check.extend([
                            'city_agency', 'street_number', 'phone_number',
                            'building_classification'
                        ])
                    if len(str_col_val) >= 1 and is_digit:
                        check.extend(['street_number'])
                    if any(tok in str_col_val for tok in ('ps ', 'is ', 'js ', 'hs ')):
                        check.extend(['school_name'])
                    if len(str_col_val) >= 3:  # can have numbers and other chars
                        if any(tok in str_col_val for tok in ('llc', 'inc', 'co')):
                            check.extend(['business_name'])
                        if any(tok in str_col_val for tok in ('http', 'www', 'org', 'com')):
                            check.extend(['website'])
                        if ('ave' in str_col_val or 'str' in str_col_val) and str_col_val[0].isdigit():
                            check.extend(['address'])

                    # if len(check) > 0:
                    #     print('check:', check)

                    check = list(set(check))
                    remove = list(set(remove))

                    if len(check) == 0:
                        # no heuristic hit: compare to every semantic type
                        if is_spark:  # skip the expensive, unnecessary deepcopy when running under Spark
                            ref_set_diff = ref_set_vals
                        else:
                            ref_set_diff = copy.deepcopy(ref_set_vals)  # clone
                    else:
                        # compare to only those in check
                        ref_set_diff = {}
                        for s_type in check:
                            if is_spark:
                                ref_set_diff[s_type] = ref_set_vals[s_type]
                            else:
                                ref_set_diff[s_type] = copy.deepcopy(
                                    ref_set_vals[s_type])
                    # for key, val in ref_set_cols.items():  # compare to column names as well (for ms_core)
                    #     if key in ref_set_diff:
                    #         ref_set_diff[key].extend(val)
                    # ref_set_diff[s_type_col] = []  # prevent key error and delete all values for already matched
                    # keep the already matched s_type_col (and its values) in the comparison set (also prevents a KeyError)
                    ref_set_diff[s_type_col] = ref_set_vals[s_type_col]
                    for rm in remove:
                        if rm in ref_set_diff:
                            ref_set_diff[rm] = []

                    res1 = match_preprocess(
                        col_val, ref_set_diff, match_jacc_avg
                    )  # find similarity with other semantic value types
                    res_final = res1

                    if res0 is None and res1 is None:
                        res_final = (s_type_col, col_val, 0.0, 0.0)
                    elif res0 is None:
                        # print('res0:', res0, res1)
                        res_final = res1
                    elif res1 is None:
                        # print('res1:', res0, res1)
                        res_final = res0
                    else:  # neither is None
                        res_final = min([res0, res1], key=lambda x: x[AVG_DIST])

                        # if AVG_CUTOFF < res_final[AVG_DIST]:  # still greater than cutoff and therefore unknown
                        if res_final[AVG_DIST] > res0[AVG_DIST] * (1 - IMPROVE_RATIO):
                            # dist has not improved enough: default to s_type_col
                            res_final = _default(s_type_col, col_val)
                            # ^ should the distance be non-0 to add to ref_set?
                else:
                    # print('FALSE')
                    res_final = res0  # cutoff passed, return initial result

            # # not an exact match and up to n different values stored
            # if res_final[MIN_DIST] <= MIN_CUTOFF and res_final[MIN_DIST] >= IDENTICAL_CUTOFF and len(ref_set_vals[res_final[S_TYPE]]) < 30:
            #     if is_spark:
            #         add = True
            #     else:
            #         ref_set_vals[res_final[S_TYPE]].append(col_val)  # append to ref_set
            cache_col_val[col_val] = str(res_final[S_TYPE])
            # # print('res_final:', res_final)

        return (cache_col_val[col_val], add)
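
The docstring above notes that the {col_val: s_type} results are computed outside of Spark, cached in cache_col_val, and only attached to the dataframe afterwards through a UDF (the second variant in Example #3 below follows exactly that pattern). A minimal sketch of that wiring, with a stand-in matcher in place of the staged logic above, might look like the following; the SparkSession setup and the simplistic default-to-s_type_col matcher are assumptions for illustration only:

# Sketch only: cache_col_val / match_semantic_vals mirror the names used in the
# examples, but the stand-in matcher and the Spark wiring are assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, udf
from pyspark.sql.types import StringType

spark = SparkSession.builder.getOrCreate()

cache_col_val = {}  # {column value -> matched semantic type}, cleared per dataset


def _match_semantic_vals(col_val, s_type_col):
    # stand-in for the staged matcher above: everything simply defaults to the
    # column-level semantic type and is memoised in cache_col_val
    col_val = str(col_val)
    if col_val not in cache_col_val:
        cache_col_val[col_val] = str(s_type_col)
    return cache_col_val[col_val]


df = spark.createDataFrame([('Brooklyn',), ('Queens',), ('Brooklyn',)], ['value'])
df = df.withColumn('s_type_col', lit('borough'))

# pass 1 (driver side): fill the cache outside of Spark
for row in df.select('value', 's_type_col').collect():
    _match_semantic_vals(row['value'], row['s_type_col'])

# pass 2: the UDF only looks results up in the (pickled snapshot of the) cache
match_semantic_vals = udf(lambda v, s: cache_col_val.get(str(v), str(s)), StringType())
df = df.withColumn('s_type_val', match_semantic_vals('value', 's_type_col'))
df.show()

cache_col_val.clear()  # cleared after each dataset, as the docstring notes
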
Example #3
    def _run(df, i):
        print("col_name:", cols[i])

        col = None

        match_col = match_preprocess(
            cols[i],
            {'foo': df.columns})  # match the TA column name to the dataset's column names
        if match_col is not None:
            col = match_col[COL]
        else:  # should never happen
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(
                col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        # print('s_type_col:', s_type_col)
        # print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn(
            's_type_col', lit(s_type_col))  # populate df with col s_type

        # s_types_distinct = [(s_type_col, df_cols.count())]

        # ### Python method: no spark to add to ref_set_vals
        # if i > -10: # run on small datasets (before it gets slow)
        #     s_types_all = []
        #     for row in df_cols.select('value', 's_type_col').collect():
        #         s_type_i = _match_semantic_vals(row['value'], row['s_type_col'], False)
        #         s_types_all.append(s_type_i[0])
        #     # get (s_type, count)
        #     s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        # ###

        if i >= -10:
            ### Spark method
            df_cols = df_cols.withColumn(
                's_type_val_add', match_semantic_vals(
                    'value',
                    's_type_col'))  # match unknown col value to semantic type

            # add to ref set with Spark
            df_cols = df_cols.select('*', 's_type_val_add.s_type_val')
            df_cols = df_cols.select('*', 's_type_val_add.add')
            s_types_distinct = df_cols.select('s_type_val').rdd.map(
                lambda x: x['s_type_val']).countByValue().items()
            # for row in df_cols.filter('add == True').select('value', 's_type_val').distinct().collect():
            #     if len(ref_set_vals[row['s_type_val']]) < 30:
            #         # print('ADD')
            #         # print(row['s_type_val'], 'row:', ref_set_vals[row['s_type_val']][-5:], 'val:', row['value'])
            #         ref_set_vals[row['s_type_val']].append(row['value'])
            #     else:
            #         break

        # # DEBUG
        # df_test = df_cols.groupby('s_type_col', 'value', 's_type_val', 'add').count()
        # df_test = df_test.sort('count', ascending=False)
        # print()
        # print('25 top vals')
        # df_test.show(25)
        # print('s_type_val different than s_type_col')
        # df_test.filter('s_type_val != s_type_col').show(25)
        # ###

        ds_dict = {'column_name': ta_path[i], 'semantic_types': []}
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        master_lst.append(ds_dict)

        print('ta_path[i]:', ds_dict)

        with open("results_similarities/master_dct_0.json", "w") as json_file:
            json.dump(master_lst, json_file, indent=4)

        cache_col_name.clear()
        cache_col_val.clear()
    def _run(df, i):
        print("col_name:", cols[i])

        col = None

        match_col = match_preprocess(cols[i], {'foo': df.columns})  # match the TA column name to the dataset's column names
        if match_col is not None:
            col = match_col[COL]
        else:  # should never happen
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        print('s_type_col:', s_type_col)
        print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn('s_type_col', lit(s_type_col))  # populate df with col s_type
        
        # if i < 35: # run on small datasets (before it gets slow)
        s_types_all = []
        ### Python method: no spark to add to ref_set_vals
        for row in df_cols.select('value', 's_type_col').collect():
            s_type_i = _match_semantic_vals(row['value'], row['s_type_col'])
            s_types_all.append(s_type_i)
        # get (s_type, count)
        s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        ###

        # the udf call below just pulls the already computed s_types out of the cache
        df_cols = df_cols.withColumn('s_type_val', match_semantic_vals('value', 's_type_col'))  # match unknown col value to semantic type
        df_test = df_cols.groupby('s_type_col', 'value', 's_type_val').count()
        df_test = df_test.sort('count', ascending=False)
        df_test.filter('s_type_val != s_type_col').show(25)
        df_test.show(25)
        # results = [str(list(row.asDict().values())) + '\n' for row in df_test.collect()]
        # print(results[:10])
        # with open('results_similarities/test.txt', '+a') as f:
        #     for s in results:
        #         f.write(s)

        ds_dict = {
            'column_name': col,
            'semantic_types': []
        }
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        if gz_paths[i] not in master_dct:
            master_dct[gz_paths[i]] = {}
        master_dct[gz_paths[i]].update({col: ds_dict})

        print('gz_paths[i]:', {gz_paths[i]: master_dct[gz_paths[i]]})

        with open("results_similarities/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)

        cache_col_name.clear()
        cache_col_val.clear()
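
For reference, the master_dct structure this last variant serialises to results_similarities/master_dct.json nests one ds_dict per dataset path and matched column. The snippet below only illustrates that shape; the paths, labels, and counts are made-up sample values, and the contents of LABEL_LIST_TA are not shown on this page:

# Illustrative shape only; paths, labels, and counts are invented sample values.
import json

master_dct = {
    'datasets/example.tsv.gz': {             # gz_paths[i]
        'School Name': {                     # matched dataset column
            'column_name': 'School Name',
            'semantic_types': [
                {'semantic_type': 'school_name', 'count': 1203},
                {'semantic_type': 'other', 'label': 'some_unlisted_type', 'count': 17},
            ],
        },
    },
}
print(json.dumps(master_dct, indent=4))
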