Example no. 1
    def _run(df, i):
        # BASIC METADATA
        # ds_dct holds the metadata collected for a single dataset
        ds_dct = {
            'dataset_name': df.ds_name,
            OUTPUT_KEY: {}
        }
        df_cols = map_cols(df)
        get_basic_metadata(df_cols, ds_dct)  # main driver

        ### BASIC PROFILE ###
        dct_output, dct_itemset = get_dataset_profile(spark, df_cols)
        append_data_profile(ds_dct, dct_output)
        # print(ds_dct)
        ds_dct['itemset'] = dct_itemset

        # add the counts for df_itemset to master
        combine_itemsets(dct_itemset, master_itemset)

        ### DONE: update master_dct with dataset ###
        master_dct.update({ds_dct['dataset_name']: ds_dct})

        # Save the output for the df to json for each run
        with open("results_times/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)
            json_file.write("\n")

        print("Master Itemset: ", master_itemset)
        with open("master_itemset.json", "w") as json_file:
            json.dump(master_itemset, json_file, indent=4)

        return ds_dct
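A minimal sketch of how combine_itemsets could merge the per-dataset itemset counts into master_itemset, assuming both arguments are plain {item: count} dictionaries; the real helper is not shown in the snippet and may differ:

    def combine_itemsets(dct_itemset, master_itemset):
        # Accumulate each dataset's item counts into the running master tally.
        for item, count in dct_itemset.items():
            master_itemset[item] = master_itemset.get(item, 0) + count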
Example no. 2
    def _run(df, i):
        nonlocal records
        nonlocal missing
        nonlocal columns

        print("col_name:", cols[i])

        # match cols (spaces in the column names were replaced when they were saved as the file name)
        col = cols[i]
        ds_name = os.path.basename(gz_paths[i])

        result = match_preprocess(cols[i], {'foo': df.columns}, match_jacc_min)
        if result is not None:
            c = result[COL]
            print('found:', c)
            col = c

        try:
            df = df.select(spark_col(col))  # remove all but col of interest
        except Exception:
            missing.add(str({'ds_name': ds_name, 'col_name_ta': cols[i]}))
            raise ValueError(
                f'missing: {(ds_name, cols[i])} cols: {df.columns}')
        df_cols = map_cols(df)
        df_counts = get_counts(df_cols)
        df_output = get_n_freq_str(df_counts, top_n)
        df_output = df_output.select(
            lit(ds_name).alias('ds_path'),
            lit(cols[i]).alias('col_name_ta'), '*')

        if columns is None:
            columns = df_output.columns

        # concat
        records.append([row.asDict() for row in df_output.collect()][0])

        return df_output
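Example no. 2 passes a match_jacc_min threshold to match_preprocess, which suggests fuzzy name matching. The sketch below is a hypothetical token-Jaccard matcher, not the snippet's implementation; the COL result key and the dict-of-candidate-lists argument shape are assumptions inferred from how the result is used above:

    COL = 'col'  # assumed key under which the matched column name is returned

    def match_preprocess(target, candidates, jacc_min=0.5):
        """Hypothetical Jaccard matcher over tokenized column names."""
        def tokens(s):
            return set(s.lower().replace('_', ' ').split())

        best, best_score = None, 0.0
        for col_list in candidates.values():
            for c in col_list:
                a, b = tokens(target), tokens(c)
                score = len(a & b) / len(a | b) if (a | b) else 0.0
                if score > best_score:
                    best, best_score = c, score
        # Callers above treat None as "no match" and fall back to the raw name.
        return {COL: best} if best is not None and best_score >= jacc_min else None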
Example no. 3
    def _run(df, i):
        print("col_name:", cols[i])

        col = None

        match_col = match_preprocess(
            cols[i],
            {'foo': df.columns})  # match the col from ta name to ds cols name
        if match_col is not None:
            col = match_col[COL]
        else:  # shouldn't exec
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(
                col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        # print('s_type_col:', s_type_col)
        # print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn(
            's_type_col', lit(s_type_col))  # populate df with col s_type

        # s_types_distinct = [(s_type_col, df_cols.count())]

        # ### Python method: no spark to add to ref_set_vals
        # if i > -10: # run on small datasets (before it gets slow)
        #     s_types_all = []
        #     for row in df_cols.select('value', 's_type_col').collect():
        #         s_type_i = _match_semantic_vals(row['value'], row['s_type_col'], False)
        #         s_types_all.append(s_type_i[0])
        #     # get (s_type, count)
        #     s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        # ###

        if i >= -10:
            ### Spark method
            df_cols = df_cols.withColumn(
                's_type_val_add', match_semantic_vals(
                    'value',
                    's_type_col'))  # match unknown col value to semantic type

            # add to ref set with Spark
            df_cols = df_cols.select('*', 's_type_val_add.s_type_val')
            df_cols = df_cols.select('*', 's_type_val_add.add')
            s_types_distinct = df_cols.select('s_type_val').rdd.map(
                lambda x: x['s_type_val']).countByValue().items()
            # for row in df_cols.filter('add == True').select('value', 's_type_val').distinct().collect():
            #     if len(ref_set_vals[row['s_type_val']]) < 30:
            #         # print('ADD')
            #         # print(row['s_type_val'], 'row:', ref_set_vals[row['s_type_val']][-5:], 'val:', row['value'])
            #         ref_set_vals[row['s_type_val']].append(row['value'])
            #     else:
            #         break

        # # DEBUG
        # df_test = df_cols.groupby('s_type_col', 'value', 's_type_val', 'add').count()
        # df_test = df_test.sort('count', ascending=False)
        # print()
        # print('25 top vals')
        # df_test.show(25)
        # print('s_type_val different than s_type_col')
        # df_test.filter('s_type_val != s_type_col').show(25)
        # ###

        ds_dict = {'column_name': ta_path[i], 'semantic_types': []}
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        master_lst.append(ds_dict)

        print('ta_path[i]:', ds_dict)

        with open("results_similarities/master_dct_0.json", "w") as json_file:
            json.dump(master_lst, json_file, indent=4)

        cache_col_name.clear()
        cache_col_val.clear()
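The Spark branch above selects 's_type_val_add.s_type_val' and 's_type_val_add.add', which implies match_semantic_vals is a UDF returning a struct with those two fields. Below is a sketch of that assumed shape only; the row-level matcher is stubbed out and is not the author's logic:

    from pyspark.sql.functions import udf
    from pyspark.sql.types import BooleanType, StringType, StructField, StructType

    def _match_semantic_vals(value, s_type_col):
        # Placeholder: return the guessed semantic type for `value` plus a flag
        # saying whether the value should be added to the reference set.
        return s_type_col, False

    _result_schema = StructType([
        StructField('s_type_val', StringType()),
        StructField('add', BooleanType()),
    ])

    @udf(returnType=_result_schema)
    def match_semantic_vals(value, s_type_col):
        # Wrap the row-level matcher so Spark gets back an (s_type_val, add) struct.
        return _match_semantic_vals(value, s_type_col)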
    def _run(df, i):
        print("col_name:", cols[i])

        col = None

        match_col = match_preprocess(cols[i], {'foo': df.columns})  # match the col from ta name to ds cols name
        if match_col is not None:
            col = match_col[COL]
        else:  # shouldn't exec
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        print('s_type_col:', s_type_col)
        print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn('s_type_col', lit(s_type_col))  # populate df with col s_type
        
        # if i < 35: # run on small datasets (before it gets slow)
        s_types_all = []
        ### Python method: no spark to add to ref_set_vals
        for row in df_cols.select('value', 's_type_col').collect():
            s_type_i = _match_semantic_vals(row['value'], row['s_type_col'])
            s_types_all.append(s_type_i)
        # get (s_type, count)
        s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        ###

        # the below udf call just pulls out the s_types from the cache
        df_cols = df_cols.withColumn('s_type_val', match_semantic_vals('value', 's_type_col'))  # match unknown col value to semantic type
        df_test = df_cols.groupby('s_type_col', 'value', 's_type_val').count()
        df_test = df_test.sort('count', ascending=False)
        df_test.filter('s_type_val != s_type_col').show(25)
        df_test.show(25)
        # results = [str(list(row.asDict().values())) + '\n' for row in df_test.collect()]
        # print(results[:10])
        # with open('results_similarities/test.txt', '+a') as f:
        #     for s in results:
        #         f.write(s)

        ds_dict = {
            'column_name': col,
            'semantic_types': []
        }
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        if gz_paths[i] not in master_dct:
            master_dct[gz_paths[i]] = {}
        master_dct[gz_paths[i]].update({col: ds_dict})

        print('gz_paths[i]:', {gz_paths[i]: master_dct[gz_paths[i]]})

        with open("results_similarities/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)

        cache_col_name.clear()
        cache_col_val.clear()
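In this variant the column values are already collected to the driver before being re-parallelized just for countByValue. A minimal sketch of building the same (s_type, count) pairs without the extra Spark round trip, assuming s_types_all is the plain Python list built above:

    from collections import Counter

    # Equivalent to sc.parallelize(s_types_all).countByValue().items()
    # once s_types_all already lives on the driver.
    s_types_distinct = Counter(s_types_all).items()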