def test():
    """
    test timing with the standardized interface in the timing module
    """
    from timing import timed

    # master_dct contains all output to be used in json
    master_dct = {}

    def _run(df, i):
        ### MAIN ###
        # dct belongs to a dataset
        dct = {'dataset_name': df.ds_name, OUTPUT_KEY: {}}
        df_cols = map_cols(df)
        get_basic_metadata(df_cols, dct)  # main driver

        ### test num_cells (counts) ###
        # rows = reduce_cols(df_cols, 'number_cells', lambda c: sum(lit(1)))
        # append_output(dct, 'number_cells', rows)

        # for col in dct[OUTPUT_KEY].keys():
        #     if dct[OUTPUT_KEY][col]['number_cells'] != dct[OUTPUT_KEY][col]['number_non_empty_cells'] + dct[OUTPUT_KEY][col]['number_empty_cells']:
        #         raise ValueError('Error in dataset: {}. Column: {}'.format(
        #             dct.keys()[0], str(dct[OUTPUT_KEY][col])))

        # update master_dct with dataset
        master_dct.update({dct['dataset_name']: dct})

        return dct

    timed(_run)
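The `timing` module these examples import isn't shown. In the Spark-based examples, `timed` is handed a `_run(df, i)` callback (and, in the later examples, a list of dataset paths), so it presumably loads each dataset, passes it to the callback together with its index, and records the elapsed time. A rough, purely hypothetical sketch under those assumptions; the path handling, reader options, and `ds_name` attribute are all guesses:

# Purely hypothetical sketch of a dataset-iterating timed(); the real timing module may differ.
import sys
import time

from pyspark.sql import SparkSession


def timed(run, paths=None):
    spark = SparkSession.builder.getOrCreate()
    paths = paths if paths is not None else sys.argv[1:]  # assumed source of dataset paths
    times = []
    for i, path in enumerate(paths):
        df = spark.read.csv(path, header=True, sep='\t')  # assumed reader settings
        df.ds_name = path  # the callbacks above read df.ds_name
        start = time.perf_counter()
        run(df, i)
        times.append(time.perf_counter() - start)
    return times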
Example No. 2
def test():
    import sys
    from timing import timed

    # optionally print the schema if 'print' is passed as the 4th CLI argument
    print_schema = ''
    try:
        print_schema = sys.argv[4]
    except IndexError:
        pass

    def _run(df, i):
        if print_schema == 'print':
            df.printSchema()

    timed(_run)
Example No. 3
def test(n_iter=100):
    from random import randint
    from timing import timed
    for _ in range(n_iter):
        k = randint(0, 10**5)
        n = randint(1000, 10**5)
        data = [randint(0, 10**9) for _ in range(n)]
        t = timed(binary_search, k, data, n)
        assert t < 3
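These algorithm tests call `timed(f, *args)` directly with the function's arguments and compare its return value against a time bound, unlike the Spark examples, where it takes a callback. A minimal sketch of that simpler interface, assuming it runs the function once and returns the elapsed wall-clock time in seconds:

# Minimal sketch of a timed(f, *args) helper; not the actual timing module.
import time


def timed(f, *args):
    # Run f(*args) once and return the wall-clock time it took, in seconds.
    start = time.perf_counter()
    f(*args)
    return time.perf_counter() - start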
Example No. 4
def run():
    spark = SparkSession.builder.getOrCreate()

    # master_dct contains all output to be used in json
    master_dct = {}
    # master_itemset contains all itemsets to be used in json
    master_itemset = {}
    def _run(df, i):
        # BASIC METADATA
        # ds_dct belongs to a dataset
        ds_dct = {
            'dataset_name': df.ds_name,
            OUTPUT_KEY: {}
        }
        df_cols = map_cols(df)
        get_basic_metadata(df_cols, ds_dct)  # main driver

        ### BASIC PROFILE ###
        dct_output, dct_itemset = get_dataset_profile(spark, df_cols)
        append_data_profile(ds_dct, dct_output)
        # print(ds_dct)
        ds_dct['itemset'] = dct_itemset

        # add the counts for df_itemset to master
        combine_itemsets(dct_itemset, master_itemset)

        ### DONE: update master_dct with dataset ###
        master_dct.update({ds_dct['dataset_name']: ds_dct})

        # Save the output for the df to json for each run
        with open("results_times/master_dct.json", "w") as json_file:
            json.dump(master_dct, json_file, indent=4)
            json_file.write("\n")

        print("Master Itemset: ", master_itemset)
        with open("master_itemset.json", "w") as json_file:
            json.dump(master_itemset, json_file, indent=4)

        return ds_dct

    timed(_run)

    result_set = most_frequent_itemsets(master_itemset)
    print("Most Frequent Itemsets: ", result_set)
Example No. 5
def test():
    from random import randint
    from timing import timed

    assert fractional_knapsack(0, [(60, 20)]) == 0.0
    for attempt in range(100):
        n = randint(1, 1000)
        capacity = randint(0, 2 * 10**6)
        val_and_weights = []
        for i in range(n):
            val_and_weights.append((randint(0,
                                            2 * 10**6), randint(1, 2 * 10**6)))
        t = timed(fractional_knapsack, capacity, val_and_weights)
        assert t < 5
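These knapsack tests assume a `fractional_knapsack(capacity, values_and_weights)` defined in the module under test; it isn't shown here. A minimal greedy sketch that is consistent with the assertions in these examples (take items in decreasing value-to-weight ratio, splitting the last item taken), offered only as an illustration:

# Greedy sketch consistent with the assertions; not necessarily the implementation being tested.
def fractional_knapsack(capacity, values_and_weights):
    # Take items in decreasing value/weight ratio, splitting the last item taken.
    order = sorted(values_and_weights, key=lambda vw: vw[0] / vw[1], reverse=True)
    total = 0.0
    for value, weight in order:
        if capacity <= 0:
            break
        take = min(weight, capacity)
        total += value * (take / weight)
        capacity -= take
    return total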
Example No. 6
def test():
    from random import randint
    from timing import timed

    for attempt in range(100):
        n = randint(0, 1000)
        capacity = randint(0, 2 * 10**6)
        values_and_weights = []
        for i in range(n):
            values_and_weights.append(
                (randint(0, 2 * 10**6), randint(1, 2 * 10**6)))
        t = timed(fractional_knapsack, capacity, values_and_weights)
        assert t < 5
        print("{:.10f}".format(t))
Example No. 7
def test():
    assert fractional_knapsack(0, [(60, 20)]) == 0.0
    assert fractional_knapsack(25, [(60, 20)]) == 60.0
    assert fractional_knapsack(25, [(60, 20), (0, 100)]) == 60.0
    assert fractional_knapsack(25, [(60, 20), (50, 50)]) == 60.0 + 5.0

    assert fractional_knapsack(50, [(60, 20), (100, 50), (120, 30)]) == 180.0

    from random import randint
    from timing import timed
    for attempt in range(100):
        n = randint(1, 1000)
        capacity = randint(0, 2 * 10**6)
        values_and_weights = []
        for _ in range(n):
            values_and_weights.append(
                (randint(0, 2 * 10**6), randint(1, 2 * 10**6)))
        t = timed(fractional_knapsack, capacity, values_and_weights)
        assert t < 5
Example No. 8
def test():
    assert fractional_knapsack(0, [(60, 20)]) == 0
    assert fractional_knapsack(25, [(60, 20)]) == 60.0
    assert fractional_knapsack(25, [(60, 20), (0, 100)]) == 60.0
    assert fractional_knapsack(25, [(60, 20), (50, 50)]) == 65.0
    assert fractional_knapsack(50, [(60, 20), (100, 50), (120, 30)]) == 180.0

    from random import randint
    from timing import timed
    for attempt in range(100):
        n = randint(1, 1000)
        m = 2 * 10**6
        capacity = randint(0, m)
        values_and_weights = []
        for _ in range(n):
            v = randint(0, m)
            w = randint(1, m)
            values_and_weights.append((v, w))
        t = timed(fractional_knapsack, (capacity, values_and_weights))
        assert t < 1
    print("OK")
Example No. 9
def t2_get_n_frequents(
    gz_paths_cols: List[Tuple[str, str, str]],
    top_n: int = 10
) -> Tuple[List[str], List[Dict[str, Any]], List[str]]:
    """
    Take a list of tuples whose first two elements are (ds_name in hdfs, column name of interest from ta) and get the top n frequent values from each column.
    Returns the column names of the output df (for reconstructing it into a df), the output df as a list of dictionaries (converted from the rows of df.collect()),
    and the missing columns (those that weren't matched in the ds and so aren't present in the output). A df isn't returned directly because the per-iteration output dfs are converted into Python objects:
    concatenating dfs with Union is a bad idea, since lazy evaluation means all the columns will eventually be loaded into memory somewhere and an out-of-memory error is thrown.

    This function outputs good quality representative values for the column.
    """
    rand = get_rand_arg()
    if rand:
        random.shuffle(gz_paths_cols)

    # unzip basically
    gz_paths: List[str] = [gz_paths for gz_paths, _, _ in gz_paths_cols]
    cols: List[str] = [cols for _, cols, _ in gz_paths_cols]

    records = []
    missing = set()

    columns = None

    def _run(df, i):
        nonlocal records
        nonlocal missing
        nonlocal columns

        print("col_name:", cols[i])

        # match cols (spaces in column names were removed/replaced when they were saved into the file name)
        col = cols[i]
        ds_name = os.path.basename(gz_paths[i])

        result = match_preprocess(cols[i], {'foo': df.columns}, match_jacc_min)
        if result is not None:
            c = result[COL]
            print('found:', c)
            col = c

        try:
            df = df.select(spark_col(col))  # remove all but col of interest
        except Exception:
            missing.add(str({'ds_name': ds_name, 'col_name_ta': cols[i]}))
            raise ValueError('missing:', (ds_name, cols[i]), 'cols:',
                             df.columns)
        df_cols = map_cols(df)
        df_counts = get_counts(df_cols)
        df_output = get_n_freq_str(df_counts, top_n)
        df_output = df_output.select(
            lit(ds_name).alias('ds_path'),
            lit(cols[i]).alias('col_name_ta'), '*')

        if columns is None:
            columns = df_output.columns

        # accumulate across datasets as plain dicts (one record, the first row, per dataset)
        records.append([row.asDict() for row in df_output.collect()][0])

        return df_output

    timed(_run, gz_paths)

    return columns, records, list(missing)
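The records are returned as plain Python dicts precisely so that one small DataFrame can be rebuilt at the end instead of unioning per-iteration DataFrames. A hypothetical usage sketch, assuming an active SparkSession named `spark` and a `gz_paths_cols` list prepared as described above:

# Hypothetical usage sketch: rebuild one small DataFrame from the returned Python objects.
from pyspark.sql import Row

columns, records, missing = t2_get_n_frequents(gz_paths_cols, top_n=10)

df_freq = spark.createDataFrame([Row(**r) for r in records]).select(columns)  # restore column order
df_freq.show()
print('columns that could not be matched:', missing)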
Example No. 10
def run(gz_paths_cols: List[Tuple[str, str]], ref_set_cols, ref_set_vals):
    spark = SparkSession.builder.getOrCreate()

    if not os.path.isdir('results_similarities'):
        os.makedirs('results_similarities')

    rand = get_rand_arg()
    if rand:
        random.shuffle(gz_paths_cols)

    # unzip
    gz_paths: List[str] = [gz_paths for gz_paths, _, _ in gz_paths_cols]

    to_sort = get_sort_arg()
    if to_sort:
        files_n_sizes = get_ds_file_sizes(gz_paths)
        sizes = [size for f, size in files_n_sizes]
        gz_paths_cols = list(
            zip(*sorted(list(zip(gz_paths_cols, sizes)), key=lambda x: x[1])))[
                0]  # sort on sizes
        gz_paths: List[str] = [gz_paths for gz_paths, _, _ in gz_paths_cols]

    # unzip
    cols: List[str] = [cols for _, cols, _ in gz_paths_cols]
    ta_path: List[str] = [ta_path for _, _, ta_path in gz_paths_cols]

    cache_col_name = {}  # {col_name: semantic_type} | may help if cols are repeated across dfs
    cache_col_val = {}  # many values repeat within a column, so caching helps

    # def _match_semantic_cols(col_name):
    #     if not col_name in cache_col_name:
    #         cache_col_name[col_name] = str(match_preprocess(col_name, ref_set_cols)[S_TYPE])
    #     return cache_col_name[col_name]

    def _match_semantic_vals(col_val, s_type_col, is_spark=True):
        """
        stage 1:
        run value matcher ('match_preprocess') on only the matched s_type_col

        if the cutoff not passed (avg distance from column is too high):
            stage 2:
            use heuristics (from manually examining frequent data for each col (ref_set)) to limit the amount of s_type_vals in ref_set_vals to compare to.
            I.e. null is automatically assigned the matched s_type_col
            I.e. check for subtrings, like if 'com' is in the val, then check 'website' s_type_vals for similarity. 'co' is implicitly in 'com' so check business_name as well, etc.
            this is to minimize misclassifications
            place them in 'check' to later build another s_type_vals using only those s_types

            stage 3:
            run 'match_preprocess' again on all s_types except the match s_type_col, or only on the heuristic matches in stage 2 (if they exist (if the heuristic check yielded results))
        
            stage 4:
            check whether the stage 3 result is significantly better than the stage 1 result--by checking whether the avg_dist is some percentage ('IMPROVE_RATIO') better than what it was. If not, assign the val to the matched s_type_col as would happen if the value was null

            stage 5 (doesn't work in spark):
            if the min_dist is less than some similarity cutoff: 'MIN_DIST' (meaning it is sufficiently small) and is larger than some similarity cutoff: 'IDENTICAL_CUTOFF' (meaning it isn't nearly identical to something already in the ref_set) add it to the ref_set. if initial matches are correct, later matches should be more accurate. the ref_set tops out at some sufficient size as to prevent slow down and redundant matching

        all {col_val: s_type} combinations are cached so that identical column values aren't recomputed, and so that spark can assign each to the dataframe by using a udf after they are computed outside of Spark. the cache is cleared after each dataset
        """
        col_val = str(col_val)
        s_type_col = str(s_type_col)

        add = False

        # print(col_val, s_type_col, {s_type_col: [ref_set_vals[s_type_col]]})
        if col_val not in cache_col_val:
            AVG_CUTOFF = 0.9  # similarity measure worse than this triggers second more general run
            # MIN_CUTOFF = 0.65
            # IDENTICAL_CUTOFF = 0.10
            IMPROVE_RATIO = 0.2  # second run improved by some percent

            str_col_val = str(col_val).lower()
            # print(str_col_val)
            if col_val is None or str_col_val in ('null', '-', '_', '0', 'none', ''):
                res_final = (s_type_col, col_val, 0.0, 0.0)  # default to s_type_col
            else:
                res0 = match_preprocess(
                    col_val, {s_type_col: ref_set_vals[s_type_col]},
                    match_jacc_avg
                )  # compare to values of matched (based on col_name) semantic type
                # print('res0:', res0)
                # res0[MIN_DIST] != 0.0
                # was the cutoff passed, i.e. was the value present for this semantic type based on the col_name match?
                if res0 is None or AVG_CUTOFF < res0[AVG_DIST]:
                    # check only these semantic types based on the content of the col_val (more explicit rules after examining data)
                    check = []
                    remove = []
                    is_alpha = str_col_val.isalpha()
                    is_digit = str_col_val.isdigit()
                    if len(str_col_val) == 1 and is_alpha:
                        possibles = ['person_name (middle_initial)', 'borough']
                        for pos_s_type in possibles:
                            if s_type_col == pos_s_type:  # which of these is the s_type of the col?
                                check.extend([pos_s_type])
                                break
                    if len(str_col_val) == 2 and is_alpha:
                        check.extend(['color'])
                    if len(str_col_val) == 5 and is_digit:
                        check.extend(['zip_code'])
                    if len(str_col_val) >= 3 and is_digit:
                        check.extend([
                            'city_agency', 'street_number', 'phone_number',
                            'building_classification'
                        ])
                    if len(str_col_val) >= 1 and is_digit:
                        check.extend(['street_number'])
                    if 'ps ' in str_col_val or 'is ' in str_col_val or 'js ' in str_col_val or 'hs ' in str_col_val:
                        check.extend(['school_name'])
                    if len(str_col_val) >= 3:  # can have numbers and other chars
                        if 'llc' in str_col_val or 'inc' in str_col_val or 'co' in str_col_val:
                            check.extend(['business_name'])
                        if 'http' in str_col_val or 'www' in str_col_val or 'org' in str_col_val or 'com' in str_col_val:
                            check.extend(['website'])
                        if 'ave' in str_col_val or 'str' in str_col_val:
                            if str_col_val[0].isdigit():
                                check.extend(['address'])

                    # if len(check) > 0:
                    #     print('check:', check)

                    check = list(set(check))
                    remove = list(set(remove))

                    if len(check) == 0:
                        # compare to every semantic type but already checked
                        if is_spark:  # check for expensive unnecessary operation
                            ref_set_diff = ref_set_vals
                        else:
                            ref_set_diff = copy.deepcopy(ref_set_vals)  # clone
                    else:
                        # compare to only those in check
                        ref_set_diff = {}
                        for s_type in check:
                            if is_spark:
                                ref_set_diff[s_type] = ref_set_vals[s_type]
                            else:
                                ref_set_diff[s_type] = copy.deepcopy(
                                    ref_set_vals[s_type])
                    # for key, val in ref_set_cols.items():  # compare to column names as well (for ms_core)
                    #     if key in ref_set_diff:
                    #         ref_set_diff[key].extend(val)
                    # ref_set_diff[s_type_col] = []  # prevent key error and delete all values for already matched
                    # keep the already matched s_type_col in the comparison set (also prevents a KeyError)
                    ref_set_diff[s_type_col] = ref_set_vals[s_type_col]
                    for rm in remove:
                        if rm in ref_set_diff:
                            ref_set_diff[rm] = []

                    res1 = match_preprocess(
                        col_val, ref_set_diff, match_jacc_avg
                    )  # find similarity with other semantic value types
                    res_final = res1

                    if res0 is None and res1 is None:
                        res_final = (s_type_col, col_val, 0.0, 0.0)
                    elif res0 is None:
                        # print('res0:', res0, res1)
                        res_final = res1
                    elif res1 is None:
                        # print('res1:', res0, res1)
                        res_final = res0
                    else:  # neither are None
                        res_final = min([res0, res1],
                                        key=lambda x: x[AVG_DIST])

                        # if AVG_CUTOFF < res_final[AVG_DIST]:  # still greater than cutoff and therefore unknown
                        if not (
                                res_final[AVG_DIST] <=
                            (res0[AVG_DIST] *
                             (1 - IMPROVE_RATIO))):  # dist has not improved
                            res_final = _default(
                                s_type_col, col_val)  # default to s_type_col
                            # ^ should the distance be non-0 to add to ref_set?
                else:
                    # print('FALSE')
                    res_final = res0  # cutoff passed, return initial result

            # # not an exact match and up to n different values stored
            # if res_final[MIN_DIST] <= MIN_CUTOFF and res_final[MIN_DIST] >= IDENTICAL_CUTOFF and len(ref_set_vals[res_final[S_TYPE]]) < 30:
            #     if is_spark:
            #         add = True
            #     else:
            #         ref_set_vals[res_final[S_TYPE]].append(col_val)  # append to ref_set
            cache_col_val[col_val] = str(res_final[S_TYPE])
            # # print('res_final:', res_final)

        return (cache_col_val[col_val], add)

    # match_semantic_cols = udf(_match_semantic_cols, StringType())
    match_semantic_vals = udf(
        _match_semantic_vals,
        StructType([
            StructField('s_type_val', StringType(), False),
            StructField('add', BooleanType(), False)
        ]))

    master_lst = []

    def _run(df, i):
        print("col_name:", cols[i])

        col = None

        match_col = match_preprocess(
            cols[i],
            {'foo': df.columns})  # match the ta col name to the ds's column names
        if match_col is not None:
            col = match_col[COL]
        else:  # shouldn't exec
            raise Exception(f'{cols[i]} not matched in {str(df.columns)}')

        df_cols = map_cols(df.select(col))  # filter single col
        # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST

        if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
            cache_col_name[col] = match_preprocess(
                col, ref_set_cols)[S_TYPE]  # match col to s_type
        s_type_col = cache_col_name[col]

        # print('s_type_col:', s_type_col)
        # print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])

        df_cols = df_cols.withColumn(
            's_type_col', lit(s_type_col))  # populate df with col s_type

        # s_types_distinct = [(s_type_col, df_cols.count())]

        # ### Python method: no spark to add to ref_set_vals
        # if i > -10: # run on small datasets (before it gets slow)
        #     s_types_all = []
        #     for row in df_cols.select('value', 's_type_col').collect():
        #         s_type_i = _match_semantic_vals(row['value'], row['s_type_col'], False)
        #         s_types_all.append(s_type_i[0])
        #     # get (s_type, count)
        #     s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
        # ###

        if i >= -10:
            ### Spark method
            df_cols = df_cols.withColumn(
                's_type_val_add', match_semantic_vals(
                    'value',
                    's_type_col'))  # match unknown col value to semantic type

            # add to ref set with Spark
            df_cols = df_cols.select('*', 's_type_val_add.s_type_val')
            df_cols = df_cols.select('*', 's_type_val_add.add')
            s_types_distinct = df_cols.select('s_type_val').rdd.map(
                lambda x: x['s_type_val']).countByValue().items()
            # for row in df_cols.filter('add == True').select('value', 's_type_val').distinct().collect():
            #     if len(ref_set_vals[row['s_type_val']]) < 30:
            #         # print('ADD')
            #         # print(row['s_type_val'], 'row:', ref_set_vals[row['s_type_val']][-5:], 'val:', row['value'])
            #         ref_set_vals[row['s_type_val']].append(row['value'])
            #     else:
            #         break

        # # DEBUG
        # df_test = df_cols.groupby('s_type_col', 'value', 's_type_val', 'add').count()
        # df_test = df_test.sort('count', ascending=False)
        # print()
        # print('25 top vals')
        # df_test.show(25)
        # print('s_type_val different than s_type_col')
        # df_test.filter('s_type_val != s_type_col').show(25)
        # ###

        ds_dict = {'column_name': ta_path[i], 'semantic_types': []}
        for s_type, count in s_types_distinct:
            if s_type in LABEL_LIST_TA:
                ds_dict['semantic_types'].append({
                    'semantic_type': s_type,
                    'count': count
                })
            else:
                ds_dict['semantic_types'].append({
                    'semantic_type': 'other',
                    'label': s_type,
                    'count': count
                })
        master_lst.append(ds_dict)

        print('ds_dict:', ds_dict)

        with open("results_similarities/master_dct_0.json", "w") as json_file:
            json.dump(master_lst, json_file, indent=4)

        cache_col_name.clear()
        cache_col_val.clear()

    timed(_run, gz_paths)