def test1():
    """Check the F1 score when specific product ids are held out for testing.

    Products 691 and 169 are split out of the consumer-systems learning data
    to act as the test set; the remainder is mined, thresholded to the top
    70% of frequency values, and scored. The learned dictionary must be
    non-empty and the F1 score positive.
    """
    here = os.path.dirname(__file__)
    learn_path = os.path.join(here, '../autofunc/assets/consumer_systems.csv')
    train_data = make_df(learn_path)

    # Keep only the top 70% of frequency values
    threshold = 0.7

    # Product id(s) pulled from the learning file to form the testing set
    holdout_ids = [691, 169]
    test_df, train_df = split_learning_verification(train_data, holdout_ids)
    test_list = df_to_list(test_df)

    comb_sort = counter_pandas(train_df)
    thresh_results = get_top_results(comb_sort, threshold)

    # Score the mined results against the known function/flows of the holdout
    learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
        thresh_results, test_list)

    assert len(learned_dict) != 0
    assert f1 > 0
def test_1():
    """The top-confidence result for 'screw' must be 'couple solid'.

    A screw almost exclusively couples solids, so after mining the
    consumer-systems data it should be the highest-ranked function/flow.
    """
    here = os.path.dirname(__file__)
    data_path = os.path.join(here, '../autofunc/assets/consumer_systems.csv')
    systems_df = pd.read_csv(data_path)

    sorted_combos = counter_pandas(systems_df)

    # First entry of the sorted list is the highest-confidence pair
    assert sorted_combos['screw'][0][0] == 'couple solid'
def test_1():
    """Automated functional representation from frequency values.

    Mines the consumer-systems data, keeps the top 50% of confidence
    values, then maps the components listed in InputExample.csv to
    functions/flows. 'screw' must resolve to 'couple solid' first, and
    the unknown component 'cheese' must land in the unmatched list.
    """
    # Dataset used for data mining
    here = os.path.dirname(__file__)
    data_path = os.path.join(here, '../autofunc/assets/consumer_systems.csv')
    systems_df = pd.read_csv(data_path)

    sorted_combos = counter_pandas(systems_df)

    # Keep the top 50% of confidence values
    threshold = 0.5
    thresh_results = get_top_results(sorted_combos, threshold)

    # Known product used for verification
    input_file = os.path.join(here, '../autofunc/assets/InputExample.csv')

    # Functions and flows for each input component, per the mined results
    results, unmatched = get_func_rep(thresh_results, input_file, True)

    assert results['screw'][0][0] == 'couple solid'
    assert 'cheese' in unmatched
import os.path

import pandas as pd

from autofunc.counter_pandas import counter_pandas
from autofunc.df_to_list import df_to_list
from autofunc.get_precision_recall import precision_recall
from autofunc.get_top_results import get_top_results

""" Example showing how to find F1 score using separate file of input components """

# Dataset used for data mining
script_dir = os.path.dirname(__file__)
file_to_learn = os.path.join(script_dir, '../autofunc/assets/consumer_systems.csv')
train_data = pd.read_csv(file_to_learn)

combos_sorted = counter_pandas(train_data)

# Keep only the top 50% of confidence values
threshold = 0.5
thresh_results = get_top_results(combos_sorted, threshold)

# Known product used for verification
test_file = os.path.join(script_dir, '../autofunc/assets/jigsawQuery_headers.csv')
test_data = pd.read_csv(test_file)
test_list = df_to_list(test_data)

# Compare mined results against the known function/flows to get the F1 score
learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
    thresh_results, test_list)

# Optional write to file - uncomment and rename to write file
# NOTE(review): fragment of a larger loop — `e`, `df`, `bd`, `bd_df`, `threshold`,
# `precisions`, `recalls`, `f1s`, `keep`, and `plots` are defined in enclosing
# scope not visible here. Presumably `e` is one test/verification id per
# iteration — TODO confirm against the enclosing loop.
verification_ids = e
# Split the data into a verification set (ids in verification_ids) and a learning set
ver_df, learn_df = split_learning_verification(df, verification_ids)
ver_list = df_to_list(ver_df)
if not bd:
    # Mine the learning set and keep the top `threshold` fraction of results
    comb_sort, counts, combos = counter_pandas_with_counts(learn_df)
    thresh_results = get_top_results(comb_sort, threshold)
    # Find the F1 score of the verification test by comparing the learned results with the known function/flows
    learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
        thresh_results, ver_list)
if bd:
    # Alternate path: mine `bd_df` instead of the learning split — presumably a
    # different/baseline dataset; verify against caller.
    bd_comb_sort = counter_pandas(bd_df)
    bd_thresh_results = get_top_results(bd_comb_sort, threshold)
    learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
        bd_thresh_results, ver_list)
# Accumulate per-iteration metrics for later averaging/plotting
precisions.append(precision)
recalls.append(recall)
print(e)
f1s += f1
keep.append([e, f1])
plots.append(f1)
# Running average of F1 over all iterations recorded so far
avg_f1 = f1s / len(keep)
# NOTE(review): fragment of a larger loop — `i`, `reading`, `similarity_df`,
# `test_id`, `train_df`, `num_all_comps`, `comp_ratios`, `keepers`, and
# `scatter_keep` come from enclosing scope not visible here.
ps_thresh = i / 100  # product-similarity threshold, swept in percent steps
if reading:
    # When loaded from file, column labels are strings — index by str(test_id)
    keep_ids = similarity_df[
        similarity_df[str(test_id)] > ps_thresh].index.tolist()
else:
    keep_ids = similarity_df[
        similarity_df[test_id] > ps_thresh].index.tolist()
# A product is always maximally similar to itself; exclude it
keep_ids.remove(test_id)
# Only keep rows from data frame that have an id that is in the keep_ids list
keep_df = train_df[train_df['id'].isin(keep_ids)]
comb_sort = counter_pandas(keep_df)
# Component counting and fractions
train_comps = list(keep_df.comp.unique())
if train_comps:
    # Fraction of all known components covered by the filtered training set
    comp_ratio = len(train_comps) / num_all_comps
    comp_ratios.append((len(keep_ids), i, comp_ratio))
    # Keep small-but-representative id sets: >70% component coverage from <40 products
    if comp_ratio > 0.7 and len(keep_ids) < 40:
        keepers.append(keep_ids)
        scatter_keep.append((comp_ratio, len(keep_ids)))
# Sweep confidence thresholds 0.10..0.95 in 0.05 steps
# (loop body continues beyond this fragment)
for t in range(10, 100, 5):
    threshold = t / 100
# NOTE(review): fragment of a larger loop — near-duplicate of the variant above;
# `i`, `reading`, `similarity_df`, `test_id`, `train_df`, `num_all_comps`, and
# `comp_ratios` come from enclosing scope not visible here.
ps_thresh = i / 100  # product-similarity threshold, swept in percent steps
if reading:
    # When loaded from file, column labels are strings — index by str(test_id)
    keep_ids = similarity_df[
        similarity_df[str(test_id)] > ps_thresh].index.tolist()
else:
    keep_ids = similarity_df[
        similarity_df[test_id] > ps_thresh].index.tolist()
# A product is always maximally similar to itself; exclude it
keep_ids.remove(test_id)
# Only keep rows from data frame that have an id that is in the keep_ids list
keep_df = train_df[train_df['id'].isin(keep_ids)]
comb_sort = counter_pandas(keep_df)
# Component counting and fractions
train_comps = list(keep_df.comp.unique())
if train_comps:
    # Record (set size, similarity step, coverage fraction) for this threshold
    comp_ratios.append(
        (len(keep_ids), i, len(train_comps) / num_all_comps))
# Sweep confidence thresholds 0.10..0.95 in 0.05 steps
for t in range(10, 100, 5):
    threshold = t / 100
    print(test_id, ' ', ps_thresh, ' ', threshold)
    thresh_results = get_top_results(comb_sort, threshold)
    # (body continues beyond this fragment)
    if not keep_ids: