import argparse
import pickle
import random
from collections import defaultdict

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# Project-specific helpers (AP, save_baseline_rank, save_clf_rank, print_scores,
# print_top_10, score_GloVe, extract_rules, edges_per_rules, create_graph,
# componnents_size, save_ranked_rules, good_rules, logger) are assumed to be
# defined or imported elsewhere in the repository.


def baseline(train, test, score_func):
    """
    Train by the baseline algorithm - find the threshold yielding the best
    accuracy score (sorting according to the given score_func).
    Parameters:
        train: train data
        test: test data
        score_func: the scoring function for a given rule
    Return:
        None (prints the scores and saves the ranked rules to disk)
    """
    rules_train, X_train, y_train = zip(*train)
    rules_test, X_test, y_test = zip(*test)
    train_split = list(zip(X_train, y_train, rules_train))
    test_split = list(zip(X_test, y_test, rules_test))

    # rank the train set by the baseline score and report its average precision
    s_train = sorted(train_split, key=lambda x: score_func(x), reverse=True)
    s_X, s_y, s_rules = zip(*s_train)
    s_AP = AP(s_y)
    print('train AP: {}'.format(s_AP))
    save_baseline_rank(s_train, 'ranks/baseline_train_{}.txt'.format(s_AP))

    # rank the test set the same way
    s_test = sorted(test_split, key=lambda x: score_func(x), reverse=True)
    s_X, s_y, s_rules = zip(*s_test)
    s_AP = AP(s_y)
    print('test AP: {}'.format(s_AP))
    save_baseline_rank(s_test, 'ranks/baseline_test_{}.txt'.format(s_AP))

    train_chirps = np.array([score_func(t) for t in train_split])
    train_y = np.array([t[1] for t in train_split])
    test_chirps = np.array([score_func(t) for t in test_split])
    test_y = np.array([t[1] for t in test_split])

    # try every train score as a threshold and record precision/recall/f1/accuracy
    results = []
    for i, ts in enumerate(train_chirps):
        pred = np.where(train_chirps > ts, 1, 0)
        accuracy = accuracy_score(train_y, pred)
        prf = precision_recall_fscore_support(train_y, pred, labels=[1, 0])
        ts_scores = prf[0][0], prf[1][0], prf[2][0], accuracy
        results.append((i, ts, ts_scores))

    # max accuracy result
    max_i, ts, train_scores = max(results, key=lambda r: r[2][3])
    print(train_scores)
    print('threshold: {}'.format(ts))
    print('train_precision: {}'.format(train_scores[0]))
    print('train_recall: {}'.format(train_scores[1]))
    print('train_f1: {}'.format(train_scores[2]))
    print('train_accuracy: {}'.format(train_scores[3]))

    # evaluate the selected threshold on the test set
    test_pred = np.where(test_chirps > ts, 1, 0)
    prf = precision_recall_fscore_support(test_y, test_pred, labels=[1, 0])
    test_scores = prf[0][0], prf[1][0], prf[2][0]
    print('test_precision: {}'.format(test_scores[0]))
    print('test_recall: {}'.format(test_scores[1]))
    print('test_f1: {}'.format(test_scores[2]))
    print('test_accuracy: {}'.format(accuracy_score(test_y, test_pred)))
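# Usage sketch (variable names are hypothetical, not from the original code):
# the functions below read the CHIRPS score from feature index 4, so a matching
# score_func for the parameterised baseline above could be:
#
#     baseline(train_data, test_data, score_func=lambda t: t[0][4])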
def baseline(train, test):
    """
    Baseline variant that sorts and thresholds by the CHIRPS score stored at
    feature index 4, selecting the threshold with the best train f1.
    """
    rules_train, X_train, y_train = zip(*train)
    rules_test, X_test, y_test = zip(*test)
    train_split = list(zip(X_train, y_train, rules_train))
    test_split = list(zip(X_test, y_test, rules_test))

    s_train = sorted(train_split, key=lambda x: x[0][4], reverse=True)
    s_X, s_y, s_rules = zip(*s_train)
    s_AP = AP(s_y)
    print('train AP: {}'.format(s_AP))
    save_baseline_rank(s_train, 'ranks/baseline_train_{}.txt'.format(s_AP))

    s_test = sorted(test_split, key=lambda x: x[0][4], reverse=True)
    s_X, s_y, s_rules = zip(*s_test)
    s_AP = AP(s_y)
    print('test AP: {}'.format(s_AP))
    save_baseline_rank(s_test, 'ranks/baseline_test_{}.txt'.format(s_AP))

    train_chirps = np.array([t[0][4] for t in train_split])
    train_y = np.array([t[1] for t in train_split])
    test_chirps = np.array([t[0][4] for t in test_split])
    test_y = np.array([t[1] for t in test_split])

    results = []
    for i, ts in enumerate(train_chirps):
        pred = np.where(train_chirps > ts, 1, 0)
        accuracy = accuracy_score(train_y, pred)
        prf = precision_recall_fscore_support(train_y, pred, labels=[1, 0])
        ts_scores = prf[0][0], prf[1][0], prf[2][0], accuracy
        results.append((i, ts, ts_scores))

    # max f1 result
    max_i, ts, train_scores = max(results, key=lambda r: r[2][2])
    print(train_scores)
    print('threshold: {}'.format(ts))
    print('train_precision: {}'.format(train_scores[0]))
    print('train_recall: {}'.format(train_scores[1]))
    print('train_f1: {}'.format(train_scores[2]))
    print('train_accuracy: {}'.format(train_scores[3]))

    test_pred = np.where(test_chirps > ts, 1, 0)
    prf = precision_recall_fscore_support(test_y, test_pred, labels=[1, 0])
    test_scores = prf[0][0], prf[1][0], prf[2][0]
    print('test_precision: {}'.format(test_scores[0]))
    print('test_recall: {}'.format(test_scores[1]))
    print('test_f1: {}'.format(test_scores[2]))
    print('test_accuracy: {}'.format(accuracy_score(test_y, test_pred)))
def calculate_AP(data, clf, save_to):
    rules, X, y = zip(*data)
    proba = clf.predict_proba(X)[:, np.where(clf.classes_ == 1)[0][0]]
    test_rank = sorted(zip(proba, y, rules), key=lambda r: r[0], reverse=True)
    label_rank = list(zip(*test_rank))[1]
    AP_score = AP(label_rank)
    save_clf_rank(test_rank, save_to.format(AP_score))
    return AP_score
def main():
    # load the annotated evaluation rules and sample 500 of them
    with open('datasets/Kian/eval_an', 'rb') as f:
        eval_rules = pickle.load(f)
    eval_rules = random.sample(eval_rules, 500)
    rules, x, y = zip(*eval_rules)

    # load the trained random forest classifier
    with open('best_rf', 'rb') as f_clf:
        clf = pickle.load(f_clf)
    pred = clf.predict(x)
    pred_proba = clf.predict_proba(x)
    proba = list(zip(x, y, rules, pred_proba, pred))

    # sort by the classifier's predicted probability (column 1 of predict_proba)
    sort_proba = sorted(proba, key=lambda a: a[3][1], reverse=True)
    _, y, rules_model, _, pred = zip(*sort_proba)
    inx = list(map(int, np.linspace(0, len(y), 5)))
    bins = list(zip(inx[:-1], inx[1:]))
    print('sort by classifier')
    for bin_s, bin_e in bins:
        print("number of positive rules between {} and {}".format(bin_s, bin_e))
        print(sum(y[bin_s:bin_e]))
    print('classifier AP: {}'.format(AP(y)))
    print_top_10(y, rules_model)

    # sort by the CHIRPS score (feature index 4)
    sort_chirps = sorted(proba, key=lambda a: a[0][4], reverse=True)
    _, y, rules_chirps, _, pred = zip(*sort_chirps)
    print('sort by chirps rank')
    for bin_s, bin_e in bins:
        print("number of positive rules between {} and {}".format(bin_s, bin_e))
        print(sum(y[bin_s:bin_e]))
    print('chirps AP: {}'.format(AP(y)))
    print_top_10(y, rules_chirps)

    # sort by the GloVe score
    sort_GloVe = sorted(proba, key=lambda a: score_GloVe(a), reverse=True)
    _, y, rules_GloVe, _, pred = zip(*sort_GloVe)
    print('sort by GloVe rank')
    for bin_s, bin_e in bins:
        print("number of positive rules between {} and {}".format(bin_s, bin_e))
        print(sum(y[bin_s:bin_e]))
    print('GloVe AP: {}'.format(AP(y)))
    print_top_10(y, rules_GloVe)
def calculate_AP_s(X, y, clf):
    """
    Calculate the AP from the classifier's predicted probabilities.
    Parameters:
        X: list of feature vectors
        y: list of labels
        clf: classifier
    Return:
        the AP score
    """
    proba = clf.predict_proba(X)[:, np.where(clf.classes_ == 1)[0][0]]
    test_rank = sorted(zip(proba, y), key=lambda r: r[0], reverse=True)
    label_rank = list(zip(*test_rank))[1]
    AP_score = AP(label_rank)
    return AP_score
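# The AP helper used throughout this module is defined elsewhere in the
# repository. A minimal sketch of what it is assumed to compute - average
# precision over a list of binary labels already sorted by decreasing score -
# is shown below. The name AP_sketch is hypothetical and the real
# implementation may differ.
def AP_sketch(label_rank):
    """Average precision of a ranked list of 0/1 labels (assumed behaviour of AP)."""
    hits = 0
    precisions = []
    for i, label in enumerate(label_rank, start=1):
        if label == 1:
            hits += 1
            precisions.append(hits / i)  # precision at the rank of each positive
    return sum(precisions) / hits if hits else 0.0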
def calculate_AP(data, clf, save_to):
    """
    Calculate the AP from the classifier's predicted probabilities and save the
    ranked rules, with their labels and scores, to the save_to path.
    Parameters:
        data: list of data tuples (rule name, X, y)
        clf: classifier
        save_to: the path where to save the ranks
    Return:
        the AP score
    """
    rules, X, y = zip(*data)
    proba = clf.predict_proba(X)[:, np.where(clf.classes_ == 1)[0][0]]
    test_rank = sorted(zip(proba, y, rules), key=lambda r: r[0], reverse=True)
    label_rank = list(zip(*test_rank))[1]
    AP_score = AP(label_rank)
    save_clf_rank(test_rank, save_to.format(AP_score))
    return AP_score
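# Usage sketch (variable names are hypothetical): rank a held-out set with a
# fitted classifier and save the ranking; the AP score is substituted into the
# file name via save_to.format(AP_score).
#
#     test_AP = calculate_AP(test_data, clf, 'ranks/clf_test_{}.txt')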
def main(): """ Create graphs for all good and bad rules. Calculate a avarage component size. Save ranked rules list to save_to file """ parser = argparse.ArgumentParser() parser.add_argument("--data_dir", default=None, type=str, required=True, help="The data pickle files directory") parser.add_argument("--data_dir2", default=None, type=str, required=False, help="The 2nd data pickle files directory (optional)") args = parser.parse_args() rules_paths = [args.data_dir] if args.data_dir2: rules_paths.append(args.data_dir2) rules = [] for path in rules_paths: rules += extract_rules(good_rules_path) rule_edges = edges_per_rules(rules) rule_avg_comp_size = [] logger.info('start calculating componnents size') for rule, edges in rule_edges.items(): rule_G = create_graph(edges) avg_size = componnents_size(rule_G) rule_type = 1 if rule in good_rules else 0 rule_avg_comp_size.append(['\t'.join(rule), avg_size, rule_type]) rule_avg_comp_size = sorted(rule_avg_comp_size, key=lambda x: x[1], reverse=True) rule_rank = [r[2] for r in rule_avg_comp_size] graph_AP = AP(rule_rank) print(graph_AP) save_ranked_rules(rule_avg_comp_size)
def baseline_cv(train, kf):
    """
    Train by the baseline algorithm - find the threshold yielding the best f1
    score (sorting according to the CHIRPS score).
    Parameters:
        train: train data
        kf: cross-validation splitter
    Return:
        split scores
    """
    scores = defaultdict(list)
    rules, X, y = zip(*train)
    rules = np.array(rules)
    X = np.array(X)
    y = np.array(y)
    for train_inx, test_inx in tqdm(kf.split(X)):
        rules_train, X_train, y_train = rules[train_inx], X[train_inx], y[train_inx]
        rules_test, X_test, y_test = rules[test_inx], X[test_inx], y[test_inx]
        train_split = list(zip(X_train, y_train, rules_train))
        test_split = list(zip(X_test, y_test, rules_test))

        # rank the split's train and test sets by the CHIRPS score (feature index 4)
        s_train = sorted(train_split, key=lambda x: x[0][4], reverse=True)
        s_X, s_y, s_rules = zip(*s_train)
        s_AP = AP(s_y)
        scores['train_APs'].append(s_AP)
        save_baseline_rank(s_train, 'ranks/baseline_train_{}.txt'.format(s_AP))

        s_test = sorted(test_split, key=lambda x: x[0][4], reverse=True)
        s_X, s_y, s_rules = zip(*s_test)
        s_AP = AP(s_y)
        scores['test_APs'].append(s_AP)
        save_baseline_rank(s_test, 'ranks/baseline_test_{}.txt'.format(s_AP))

        train_chirps = np.array([t[0][4] for t in train_split])
        train_y = np.array([t[1] for t in train_split])
        test_chirps = np.array([t[0][4] for t in test_split])
        test_y = np.array([t[1] for t in test_split])

        results = []
        for i, ts in enumerate(train_chirps):
            pred = np.where(train_chirps > ts, 1, 0)
            accuracy = accuracy_score(train_y, pred)
            prf = precision_recall_fscore_support(train_y, pred, labels=[1, 0])
            ts_scores = prf[0][0], prf[1][0], prf[2][0], accuracy
            results.append((i, ts, ts_scores))

        # max f1 result
        max_i, ts, train_scores = max(results, key=lambda r: r[2][2])
        print(train_scores)
        scores['thresholds'].append(ts)
        scores['train_precision'].append(train_scores[0])
        scores['train_recall'].append(train_scores[1])
        scores['train_f1'].append(train_scores[2])
        scores['train_accuracy'].append(train_scores[3])

        test_pred = np.where(test_chirps > ts, 1, 0)
        prf = precision_recall_fscore_support(test_y, test_pred, labels=[1, 0])
        test_scores = prf[0][0], prf[1][0], prf[2][0]
        scores['test_precision'].append(test_scores[0])
        scores['test_recall'].append(test_scores[1])
        scores['test_f1'].append(test_scores[2])
        scores['test_accuracy'].append(accuracy_score(test_y, test_pred))

    print(scores)
    for score_type, l in scores.items():
        scores[score_type] = np.array(l)
        print_scores(scores, score_type)
    return scores
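# Usage sketch (variable names are hypothetical): kf can be any splitter that
# exposes a split(X) method, e.g. scikit-learn's KFold.
#
#     from sklearn.model_selection import KFold
#
#     cv_scores = baseline_cv(train_data, KFold(n_splits=5, shuffle=True, random_state=0))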
def sort_and_AP(X, inx, y):
    """Sort the examples by feature inx (descending) and return the AP of the labels."""
    sorted_x = sorted(zip(X, y), key=lambda k: k[0][inx], reverse=True)
    ys = list(zip(*sorted_x))[1]
    return AP(ys)
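# Example (hypothetical data names): rank an evaluation set by the CHIRPS score
# stored at feature index 4 and report the resulting average precision.
#
#     print('chirps AP: {}'.format(sort_and_AP(X_eval, 4, y_eval)))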
def calculate_AP_s(X, y, clf):
    proba = clf.predict_proba(X)[:, np.where(clf.classes_ == 1)[0][0]]
    test_rank = sorted(zip(proba, y), key=lambda r: r[0], reverse=True)
    label_rank = list(zip(*test_rank))[1]
    AP_score = AP(label_rank)
    return AP_score