def test_1():
    """Check that the match factor computed for a known training set and
    test product stays close to the previously recorded value."""

    here = os.path.dirname(__file__)

    # Learn association rules from the combined blade dataset.
    learn_file = os.path.join(here, '../assets/bladeCombined.csv')
    frame, rows = get_data(learn_file)
    conf_results, results = find_associations(
        frame, rows, support=0.0003, confidence=0.01, lift=0.1)

    # Keep only the top 70% of results by confidence.
    top = get_top_results(conf_results, 0.7)

    # Verify against a known product (jigsaw query set).
    query_file = os.path.join(here, '../assets/jigsawQuery.csv')
    _, query_rows = get_data(query_file)

    learned_dict, matched, overmatched, unmatched, match_factor = match(
        top, query_rows)

    assert np.allclose(0.82051, match_factor)
def test1():
    """Compute an F1 score by holding out two product ids from the
    training data and verifying the learned rules against them."""

    here = os.path.dirname(__file__)
    learn_file = os.path.join(here, '../autofunc/assets/consumer_systems.csv')
    frame = make_df(learn_file)

    # Keep the top 70% of frequency values.
    cutoff = 0.7

    # Product ids pulled out of the training data to act as the test set.
    holdout_ids = [691, 169]
    holdout_df, learn_df = split_learning_verification(frame, holdout_ids)
    holdout_list = df_to_list(holdout_df)

    sorted_combos = counter_pandas(learn_df)
    top = get_top_results(sorted_combos, cutoff)

    # Compare learned function/flows with the known ones to get the F1 score.
    learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
        top, holdout_list)

    assert len(learned_dict) != 0
    assert f1 > 0
def test_1():
    """The highest-confidence function-flow pairing learned for the
    component 'screw' should be 'couple solid'."""

    here = os.path.dirname(__file__)
    data_file = os.path.join(here, '../assets/bladeCombined.csv')

    frame, rows = get_data(data_file)
    conf_results, results = find_associations(frame, rows)

    # Keep the top 70% of confidence values.
    top = get_top_results(conf_results, 0.7)

    assert top['screw'][0][0] == 'couple solid'
def test_2():
    """Keeping the top 70% of function-flow combinations for the
    component 'screw' should leave exactly one result."""

    here = os.path.dirname(__file__)
    data_file = os.path.join(here, '../assets/bladeCombined.csv')

    frame, rows = get_data(data_file)
    conf_results, results = find_associations(frame, rows)

    # Threshold at 70% of confidence values.
    top = get_top_results(conf_results, 0.7)

    assert len(top['screw']) == 1
def test_get_top_results():
    """After thresholding at 70%, the component 'screw' should retain
    exactly one function-flow combination."""

    here = os.path.dirname(__file__)
    data_file = os.path.join(here, '../assets/bladeCombined.csv')

    sorted_combos = count_stuff(data_file)

    cutoff = 0.7
    top = get_top_results(sorted_combos, cutoff)

    assert len(top['screw']) == 1
def test_1():
    """End-to-end example: mine probability-ranked function/flows and
    automatically assign them to the components of a known product."""

    here = os.path.dirname(__file__)

    # Dataset used for data mining.
    learn_file = os.path.join(here, '../assets/bladeCombined.csv')
    sorted_combos = count_stuff(learn_file)

    # Keep the top 50% of confidence values.
    cutoff = 0.5
    top = get_top_results(sorted_combos, cutoff)

    # Known product used for verification.
    input_file = os.path.join(here, '../assets/InputExample.csv')

    # Functions/flows per component based on the mined results; the flag
    # also requests the list of components that found no match.
    results, unmatched = get_func_rep(top, input_file, True)

    assert results['screw'][0][0] == 'couple solid'
    assert 'cheese' in unmatched
def test_1():
    """End-to-end example: mine frequency-ranked function/flows with
    pandas and assign them to the components of a known product."""

    here = os.path.dirname(__file__)

    # Dataset used for data mining.
    learn_file = os.path.join(here, '../autofunc/assets/consumer_systems.csv')
    frame = pd.read_csv(learn_file)
    sorted_combos = counter_pandas(frame)

    # Keep the top 50% of confidence values.
    cutoff = 0.5
    top = get_top_results(sorted_combos, cutoff)

    # Known product used for verification.
    input_file = os.path.join(here, '../autofunc/assets/InputExample.csv')

    # Functions/flows per component based on the mined results; the flag
    # also requests the list of components that found no match.
    results, unmatched = get_func_rep(top, input_file, True)

    assert results['screw'][0][0] == 'couple solid'
    assert 'cheese' in unmatched
# NOTE(review): the original script called counter_pandas, get_top_results,
# df_to_list and precision_recall without importing them, which raises
# NameError at runtime. The imports below follow the `from autofunc.X import Y`
# pattern used by the sibling example scripts — confirm the exact module paths.
from autofunc.counter_pandas import counter_pandas
from autofunc.df_to_list import df_to_list
from autofunc.get_precision_recall import precision_recall
from autofunc.get_top_results import get_top_results
import os.path

import pandas as pd

""" Example showing how to find F1 score using separate file of input components """

# Dataset used for data mining
script_dir = os.path.dirname(__file__)
file_to_learn = os.path.join(script_dir, '../autofunc/assets/consumer_systems.csv')
train_data = pd.read_csv(file_to_learn)

# Sort each component's function/flow combinations by frequency.
combos_sorted = counter_pandas(train_data)

# Use a threshold to get the top XX% of confidence values
threshold = 0.5
thresh_results = get_top_results(combos_sorted, threshold)

# Use a known product for verification
test_file = os.path.join(script_dir, '../autofunc/assets/jigsawQuery_headers.csv')
test_data = pd.read_csv(test_file)
test_list = df_to_list(test_data)

# Compare learned function/flows against the known ones to get the F1 score.
learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
    thresh_results, test_list)

# Optional write to file - uncomment and rename to write file
# write_results_from_dict(learned_dict, 'test1.csv')
# Making folds using list comprehension folds = [ test_ids[i * n:(i + 1) * n] for i in range((len(test_ids) + n - 1) // n) ] for e in folds: verification_ids = e ver_df, learn_df = split_learning_verification(df, verification_ids) ver_list = df_to_list(ver_df) if not bd: comb_sort, counts, combos = counter_pandas_with_counts(learn_df) thresh_results = get_top_results(comb_sort, threshold) # Find the F1 score of the verification test by comparing the learned results with the known function/flows learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall( thresh_results, ver_list) if bd: bd_comb_sort = counter_pandas(bd_df) bd_thresh_results = get_top_results(bd_comb_sort, threshold) learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall( bd_thresh_results, ver_list) precisions.append(precision) recalls.append(recall) print(e)
# NOTE(review): this fragment arrived with its whitespace collapsed; the
# nesting below is reconstructed from the statement order — confirm against
# the original file. It relies on names defined earlier in the file
# (keep_df, keep_ids, i, num_all_comps, comp_ratios, keepers, scatter_keep,
# test_id, ps_thresh, comb_sort, test_list, save_data, points).

# Unique component names present in the retained training rows.
train_comps = list(keep_df.comp.unique())
if train_comps:
    # Fraction of all known components covered by the training set.
    comp_ratio = len(train_comps) / num_all_comps
    comp_ratios.append((len(keep_ids), i, comp_ratio))
    # Keep parameter combinations with good coverage from few products;
    # 0.7 and 40 look like hand-tuned cutoffs — TODO confirm.
    if comp_ratio > 0.7 and len(keep_ids) < 40:
        keepers.append(keep_ids)
        scatter_keep.append((comp_ratio, len(keep_ids)))

# Sweep the results threshold from 0.10 to 0.95 in steps of 0.05.
for t in range(10, 100, 5):
    threshold = t / 100
    print(test_id, ' ', ps_thresh, ' ', threshold)
    thresh_results = get_top_results(comb_sort, threshold)
    if not keep_ids:
        # Nothing retained to learn from: score this combination as zero.
        f1 = 0
        num_train_comps = 0
    else:
        # Find the F1 score of the verification test by comparing the learned results with the known function/flows
        learned_dict, matched, overmatched, unmatched, recall, precision, f1 = precision_recall(
            thresh_results, test_list)
        num_train_comps = len(train_comps)
    # Record (id, pruning threshold, results threshold, set size, score, coverage).
    save_data.append((test_id, ps_thresh, threshold, len(keep_ids), f1,
                      num_train_comps / num_all_comps))
    points.append((ps_thresh, threshold, f1))
from autofunc.get_match_factor import match
from autofunc.get_top_results import get_top_results
from autofunc.find_associations import find_associations
from autofunc.get_data import get_data
import os.path

""" Example showing how to find the match factor using association rules """

here = os.path.dirname(__file__)

# Dataset used for data mining.
learn_file = os.path.join(here, '../assets/bladeCombined.csv')

# Load the dataset as a data frame plus a list of records.
frame, rows = get_data(learn_file)

# Rank each component's function/flows by association-rule confidence.
conf_results, results = find_associations(frame, rows)

# Keep the top 70% of confidence values.
top = get_top_results(conf_results, 0.7)

# Known product (jigsaw) used for verification.
query_file = os.path.join(here, '../assets/jigsawQuery.csv')
query_frame, query_rows = get_data(query_file)

# Compare the learned results against the known function/flows.
learned_dict, matched, overmatched, unmatched, match_factor = match(
    top, query_rows)

print('Match factor = {0:.5f}'.format(match_factor))