Example #1
import os

# make_df, split_learning_verification, df_to_list, counter_pandas,
# get_top_results, and precision_recall are autofunc helpers; import them
# from wherever your autofunc installation exposes them.

def test1():
    """ Test to find F1 score by manually selecting product id(s) from original data to test """

    script_dir = os.path.dirname(__file__)
    file_to_learn = os.path.join(script_dir,
                                 '../autofunc/assets/consumer_systems.csv')

    train_data = make_df(file_to_learn)

    # Use a threshold to keep the top 70% of frequency values
    threshold = 0.7

    # Choose ID(s) from the learning file to separate into the testing set
    test_ids = [691, 169]

    test_df, train_df = split_learning_verification(train_data, test_ids)

    test_list = df_to_list(test_df)

    comb_sort = counter_pandas(train_df)
    thresh_results = get_top_results(comb_sort, threshold)

    # Find the F1 score
    learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
        thresh_results, test_list)

    assert len(learned_dict) != 0
    assert f1 > 0
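
The metrics returned above follow the standard definitions. A minimal standalone sketch, assuming matched, overmatched, and unmatched count true positives, false positives, and false negatives (a hypothetical helper, not part of autofunc):

def f1_from_counts(matched, overmatched, unmatched):
    """Compute precision, recall, and F1 from match counts (assumed to be
    TP, FP, and FN respectively)."""
    precision = matched / (matched + overmatched) if matched + overmatched else 0.0
    recall = matched / (matched + unmatched) if matched + unmatched else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# e.g. f1_from_counts(8, 2, 2) -> (0.8, 0.8, 0.8)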
Example #2
import os

import pandas as pd

# counter_pandas, get_top_results, df_to_list, precision_recall, and
# write_results_from_dict are autofunc helpers; import them from your
# autofunc installation.

# Dataset used for data mining
script_dir = os.path.dirname(__file__)
file_to_learn = os.path.join(script_dir, '../autofunc/assets/consumer_systems.csv')

train_data = pd.read_csv(file_to_learn)
combos_sorted = counter_pandas(train_data)

# Use a threshold to keep the top 50% of confidence values
threshold = 0.5
thresh_results = get_top_results(combos_sorted, threshold)

# Use a known product for verification
test_file = os.path.join(script_dir, '../autofunc/assets/jigsawQuery_headers.csv')
test_data = pd.read_csv(test_file)
test_list = df_to_list(test_data)

learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
    thresh_results, test_list)

# Optional write to file - uncomment and rename to write the file
# write_results_from_dict(learned_dict, 'test1.csv')

print('Recall = {0:.5f}'.format(recall))
print('Precision = {0:.5f}'.format(precision))
print('F1 = {0:.5f}'.format(f1))
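
The thresholding step keeps only each component's highest-frequency results. A minimal sketch of that idea, assuming combos_sorted maps each component to (result, frequency) pairs sorted by descending frequency (a hypothetical helper, not autofunc's actual get_top_results):

def keep_top_fraction(sorted_results, threshold):
    """Keep each component's results until their cumulative share of the
    total frequency reaches the threshold."""
    kept = {}
    for comp, pairs in sorted_results.items():
        total = sum(freq for _, freq in pairs)
        running = 0.0
        kept[comp] = []
        for result, freq in pairs:
            if total and running / total >= threshold:
                break
            kept[comp].append((result, freq))
            running += freq
    return kept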

Example #3
import random
from math import floor

# Assumes iters, k, test_ids, df, threshold, bd, and bd_df are defined
# earlier in the script; counter_pandas, counter_pandas_with_counts,
# get_top_results, split_learning_verification, df_to_list, and
# precision_recall are autofunc helpers.

for i in range(iters):
    random.shuffle(test_ids)
    # Split the shuffled ids into folds of size n using a list
    # comprehension; any remainder ids form one extra, shorter fold
    n = floor(len(test_ids) / k)
    folds = [
        test_ids[i * n:(i + 1) * n]
        for i in range((len(test_ids) + n - 1) // n)
    ]

    for verification_ids in folds:

        ver_df, learn_df = split_learning_verification(df, verification_ids)

        ver_list = df_to_list(ver_df)

        if not bd:
            comb_sort, counts, combos = counter_pandas_with_counts(learn_df)
            thresh_results = get_top_results(comb_sort, threshold)

            # Find the F1 score of the verification test by comparing the
            # learned results with the known function/flows
            learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
                thresh_results, ver_list)
        else:
            bd_comb_sort = counter_pandas(bd_df)
            bd_thresh_results = get_top_results(bd_comb_sort, threshold)
            learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
                bd_thresh_results, ver_list)
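
The fold construction above is plain list slicing, so it can be checked in isolation. A quick standalone run of the same comprehension on a toy list:

from math import floor

test_ids = [101, 102, 103, 104, 105, 106, 107]
k = 3
n = floor(len(test_ids) / k)  # fold size: 2
folds = [
    test_ids[i * n:(i + 1) * n]
    for i in range((len(test_ids) + n - 1) // n)
]
print(folds)  # [[101, 102], [103, 104], [105, 106], [107]]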
Example #4
import time

# Assumes train_df_whole, all_train_ids, and id_comp_ratios are defined in
# the elided section 1; split_learning_verification and df_to_list are
# autofunc helpers.

## End 1

## 2. Reading in dataframe as computed above
# similarity_df = pd.read_csv('consumer_similarity.csv')
# reading = True
## End 2

all_comps = list(train_df_whole.comp.unique())
num_all_comps = len(all_comps)

## Main loop - comment out if not reading in all_data
for test_id in all_train_ids:
    # print(test_id)
    test_df, train_df = split_learning_verification(train_df_whole, [test_id])
    test_list = df_to_list(test_df)
    train_ids = list(map(int, train_df.id.unique()))

    # Fraction of all component types covered by the held-out product
    id_comp_ratios.append(
        (test_id, len(list(test_df.comp.unique())) / num_all_comps))

    # Outer loop through percent similar
    for i in range(0, 100, 10):

        ps_start = time.time()

        f1_plot = []
        thresh_plot = []
        ps_plot = []

        keep_ids = []