def baseline(train, test, score_func):
    """
    train by baseline algorithm - finding the threshold which yelding the best accuracy scores (sorting according given score_func).
    Parameters:
        train: train data
        test: test data
        score_func: the scoring function for a given rule
    Retutn:
    """
    rules_train, X_train, y_train = zip(*train)
    rules_test, X_test, y_test = zip(*test)

    train_split = list(zip(X_train, y_train, rules_train))
    test_split = list(zip(X_test, y_test, rules_test))

    s_train = sorted(train_split, key=lambda x: score_func(x), reverse=True)
    s_X, s_y, s_rules = zip(*s_train)
    s_AP = AP(s_y)
    print('train AP: {}'.format(s_AP))
    save_baseline_rank(s_train, 'ranks/baseline_train_{}.txt'.format(s_AP))
    s_test = sorted(test_split, key=lambda x: score_func(x), reverse=True)
    s_X, s_y, s_rules = zip(*s_test)
    s_AP = AP(s_y)
    print('test AP: {}'.format(s_AP))

    save_baseline_rank(s_test, 'ranks/baseline_test_{}.txt'.format(s_AP))

    train_chirps = np.array([score_func(t) for t in train_split])
    train_y = np.array([t[1] for t in train_split])

    test_chirps = np.array([score_func(t) for t in test_split])
    test_y = np.array([t[1] for t in test_split])

    results = []
    for i, ts in enumerate(train_chirps):
        pred = np.where(train_chirps > ts, 1, 0)
        accuracy = accuracy_score(train_y, pred)
        prf = precision_recall_fscore_support(train_y, pred, labels=[1, 0])
        ts_scores = prf[0][0], prf[1][0], prf[2][0], accuracy
        results.append((i, ts, ts_scores))

    # max accuracy result
    max_i, ts, train_scores = max(results, key=lambda r: r[2][3])
    print(train_scores)
    print('threshold: {}'.format(ts))

    print('train_precision: {}'.format(train_scores[0]))
    print('train_recall: {}'.format(train_scores[1]))
    print('train_f1: {}'.format(train_scores[2]))
    print('train_accuracy: {}'.format(train_scores[3]))

    test_pred = np.where(test_chirps > ts, 1, 0)
    prf = precision_recall_fscore_support(test_y, test_pred, labels=[1, 0])
    test_scores = prf[0][0], prf[1][0], prf[2][0]
    print('test_precision: {}'.format(test_scores[0]))
    print('test_recall: {}'.format(test_scores[1]))
    print('test_f1: {}'.format(test_scores[2]))
    print('test_accuracy: {}'.format(accuracy_score(test_y, test_pred)))
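# The AP helper used throughout these snippets is not defined here; a minimal
# sketch, assuming it computes standard average precision over a ranked list
# of binary labels (1 = positive, best-ranked first), could look like this:
def AP(ranked_labels):
    """Average precision of a ranked binary label sequence (sketch only)."""
    labels = np.asarray(ranked_labels)
    positive_positions = np.where(labels == 1)[0]
    if len(positive_positions) == 0:
        return 0.0
    # precision@k at every rank that holds a positive label, then averaged
    precisions = [(i + 1) / (pos + 1)
                  for i, pos in enumerate(positive_positions)]
    return float(np.mean(precisions))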
def baseline(train, test):
    rules_train, X_train, y_train = zip(*train)
    rules_test, X_test, y_test = zip(*test)

    train_split = list(zip(X_train, y_train, rules_train))
    test_split = list(zip(X_test, y_test, rules_test))

    s_train = sorted(train_split, key=lambda x: x[0][4], reverse=True)
    s_X, s_y, s_rules = zip(*s_train)
    s_AP = AP(s_y)
    print('train AP: {}'.format(s_AP))
    save_baseline_rank(s_train, 'ranks/baseline_train_{}.txt'.format(s_AP))
    s_test = sorted(test_split, key=lambda x: x[0][4], reverse=True)
    s_X, s_y, s_rules = zip(*s_test)
    s_AP = AP(s_y)
    print('test AP: {}'.format(s_AP))

    save_baseline_rank(s_test, 'ranks/baseline_test_{}.txt'.format(s_AP))

    train_chirps = np.array([t[0][4] for t in train_split])
    train_y = np.array([t[1] for t in train_split])

    test_chirps = np.array([t[0][4] for t in test_split])
    test_y = np.array([t[1] for t in test_split])

    results = []
    for i, ts in enumerate(train_chirps):
        pred = np.where(train_chirps > ts, 1, 0)
        accuracy = accuracy_score(train_y, pred)
        prf = precision_recall_fscore_support(train_y, pred, labels=[1, 0])
        ts_scores = prf[0][0], prf[1][0], prf[2][0], accuracy
        results.append((i, ts, ts_scores))

    # max f1 result
    max_i, ts, train_scores = max(results, key=lambda r: r[2][2])
    print(train_scores)
    print('threshold: {}'.format(ts))

    print('train_precision: {}'.format(train_scores[0]))
    print('train_recall: {}'.format(train_scores[1]))
    print('train_f1: {}'.format(train_scores[2]))
    print('train_accuracy: {}'.format(train_scores[3]))

    test_pred = np.where(test_chirps > ts, 1, 0)
    prf = precision_recall_fscore_support(test_y, test_pred, labels=[1, 0])
    test_scores = prf[0][0], prf[1][0], prf[2][0]
    print('test_precision: {}'.format(test_scores[0]))
    print('test_recall: {}'.format(test_scores[1]))
    print('test_f1: {}'.format(test_scores[2]))
    print('test_accuracy: {}'.format(accuracy_score(test_y, test_pred)))
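# This second baseline variant hard-codes the chirps score at index 4 of each
# feature vector and picks the threshold by best train F1, whereas the first
# variant picks it by best train accuracy. Assuming index 4 does hold the
# chirps score, the ranking step matches the score_func-based variant called
# as follows (hypothetical call):
def chirps_score(example):
    # example is an (X, y, rule) tuple; index 4 of X is assumed to be the chirps score
    return example[0][4]

# baseline(train, test, chirps_score)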
def main():
    with open('datasets/Kian/eval_an', 'rb') as f:
        eval_rules = pickle.load(f)
    
    eval_rules = random.sample(eval_rules, 500)
    rules, x, y = zip(*eval_rules)
    with open('best_rf', 'rb') as f_clf:
        clf = pickle.load(f_clf)

    pred = clf.predict(x)
    pred_proba = clf.predict_proba(x)
    proba = list(zip(x, y, rules, pred_proba, pred))
    sort_proba = sorted(proba, key=lambda a: a[3][1], reverse=True)
    _, y, rules_model, _, pred = zip(*sort_proba)
    inx = list(map(int, np.linspace(0, len(y), 5)))
    bins = list(zip(inx[:-1], inx[1:]))
    print('sort by classifier')
    for bin_s, bin_e in bins:
        print("number of positive rules between {} to {}".format(bin_s, bin_e))
        print(sum(y[bin_s:bin_e]))
    print('classifier AP: {}'.format(AP(y)))
    print_top_10(y, rules_model)
    
    
    sort_chirps = sorted(proba, key=lambda a: a[0][4], reverse=True)
    _, y, rules_chirps, _, pred = zip(*sort_chirps)
    print('sort by chirps rank')
    for bin_s, bin_e in bins:
        print("number of positive rules between {} to {}".format(bin_s, bin_e))
        print(sum(y[bin_s:bin_e]))
    print('chirps AP: {}'.format(AP(y)))
    print_top_10(y, rules_chirps)
    
    sort_GloVe = sorted(proba, key=lambda a: score_GloVe(a), reverse=True)
    _, y, rules_GloVe, _, pred = zip(*sort_GloVe)
    print('sort by GloVe rank')
    for bin_s, bin_e in bins:
        print("number of positive rules between {} to {}".format(bin_s, bin_e))
        print(sum(y[bin_s:bin_e]))
    print('GloVe AP: {}'.format(AP(y)))
    print_top_10(y, rules_GloVe)
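# The bin loop above counts how many positive rules fall into each quarter of
# the ranked list. The same computation on toy labels (hypothetical data,
# assuming numpy is imported as np):
toy_y = (1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0)
toy_inx = list(map(int, np.linspace(0, len(toy_y), 5)))  # [0, 3, 6, 9, 12]
toy_bins = list(zip(toy_inx[:-1], toy_inx[1:]))          # four equal-sized bins
print([sum(toy_y[s:e]) for s, e in toy_bins])            # -> [2, 1, 1, 0]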
Example #5
def calculate_AP_s(X, y, clf):
    """
    calculate the AP by the classifier proba
    Parameters:
        X: list of feature vectors
        y: list of labels
        clf: clssifier
    Return:
        the AP score
    """
    proba = clf.predict_proba(X)[:, np.where(clf.classes_ == 1)[0][0]]
    test_rank = sorted(zip(proba, y), key=lambda r: r[0], reverse=True)
    label_rank = list(zip(*test_rank))[1]
    AP_score = AP(label_rank)
    return AP_score
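# A hedged usage sketch for calculate_AP_s with a toy classifier
# (hypothetical data and model, not taken from the original project):
from sklearn.ensemble import RandomForestClassifier

X_toy = np.random.RandomState(0).rand(20, 6)
y_toy = np.array([0, 1] * 10)
clf_toy = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_toy, y_toy)
print(calculate_AP_s(X_toy, y_toy, clf_toy))  # AP of ranking the samples by P(y=1)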
Example #6
def calculate_AP(data, clf, save_to):
    """
    calculate the AP by the classifier proba and saves the ranked rules with the label and the score to save_to path
    Parameters:
        data: list of data tupels (rule name, X, y)
        clf: clssifier
        save_to: the path where to save the ranks
    Return:
        the AP score
    """
    rules, X, y = zip(*data)
    proba = clf.predict_proba(X)[:, np.where(clf.classes_ == 1)[0][0]]
    test_rank = sorted(zip(proba, y, rules), key=lambda r: r[0], reverse=True)
    label_rank = list(zip(*test_rank))[1]
    AP_score = AP(label_rank)
    save_clf_rank(test_rank, save_to.format(AP_score))
    return AP_score
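# save_to is a format string with a single '{}' placeholder that receives the
# AP score, so the rank file name records the score. Hypothetical call:
#
#     test_AP = calculate_AP(test_data, clf, 'ranks/clf_test_{}.txt')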
Example #7
def main():
    """
    Create graphs for all good and bad rules.
    Calculate a avarage component size.
    Save ranked rules list to save_to file
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The data pickle files directory")
    parser.add_argument("--data_dir2",
                        default=None,
                        type=str,
                        required=False,
                        help="The 2nd data pickle files directory (optional)")

    args = parser.parse_args()
    rules_paths = [args.data_dir]
    if args.data_dir2:
        rules_paths.append(args.data_dir2)
    rules = []
    for path in rules_paths:
        rules += extract_rules(path)
    rule_edges = edges_per_rules(rules)
    rule_avg_comp_size = []
    logger.info('start calculating component sizes')
    for rule, edges in rule_edges.items():
        rule_G = create_graph(edges)
        avg_size = componnents_size(rule_G)
        rule_type = 1 if rule in good_rules else 0
        rule_avg_comp_size.append(['\t'.join(rule), avg_size, rule_type])
    rule_avg_comp_size = sorted(rule_avg_comp_size,
                                key=lambda x: x[1],
                                reverse=True)
    rule_rank = [r[2] for r in rule_avg_comp_size]
    graph_AP = AP(rule_rank)
    print(graph_AP)
    save_ranked_rules(rule_avg_comp_size)
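# create_graph and componnents_size are not defined in this snippet; a
# minimal sketch, assuming they build an undirected networkx graph from an
# edge list and return the mean connected-component size, could be:
import networkx as nx

def create_graph(edges):
    """Build an undirected graph from an iterable of (u, v) edges (sketch)."""
    G = nx.Graph()
    G.add_edges_from(edges)
    return G

def componnents_size(G):
    """Mean connected-component size of G (sketch of the assumed helper)."""
    sizes = [len(c) for c in nx.connected_components(G)]
    return float(np.mean(sizes)) if sizes else 0.0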
def baseline_cv(train, kf):
    """
    train by baseline algorithm - finding the threshold which yelding the best f1 scores (sorting according to chirps score).
    Parameters:
        train: train data
        kf: cross-validation splitter
    Retutn:
        split scores
    """
    scores = defaultdict(list)
    rules, X, y = zip(*train)
    rules = np.array(rules)
    X = np.array(X)
    y = np.array(y)
    for train_inx, test_inx in tqdm(kf.split(X)):
        rules_train, X_train, y_train = rules[train_inx], X[train_inx], y[
            train_inx]
        rules_test, X_test, y_test = rules[test_inx], X[test_inx], y[test_inx]

        train_split = list(zip(X_train, y_train, rules_train))
        test_split = list(zip(X_test, y_test, rules_test))

        s_train = sorted(train_split, key=lambda x: x[0][4], reverse=True)
        s_X, s_y, s_rules = zip(*s_train)
        s_AP = AP(s_y)
        scores['train_APs'].append(s_AP)

        save_baseline_rank(s_train, 'ranks/baseline_train_{}.txt'.format(s_AP))
        s_test = sorted(test_split, key=lambda x: x[0][4], reverse=True)
        s_X, s_y, s_rules = zip(*s_test)
        s_AP = AP(s_y)
        scores['test_APs'].append(s_AP)

        save_baseline_rank(s_test, 'ranks/baseline_test_{}.txt'.format(s_AP))
        train_chirps = np.array([t[0][4] for t in train_split])
        train_y = np.array([t[1] for t in train_split])

        test_chirps = np.array([t[0][4] for t in test_split])
        test_y = np.array([t[1] for t in test_split])

        results = []
        for i, ts in enumerate(train_chirps):
            pred = np.where(train_chirps > ts, 1, 0)
            accuracy = accuracy_score(train_y, pred)
            prf = precision_recall_fscore_support(train_y, pred, labels=[1, 0])
            ts_scores = prf[0][0], prf[1][0], prf[2][0], accuracy
            results.append((i, ts, ts_scores))

        # max f1 result
        max_i, ts, train_scores = max(results, key=lambda r: r[2][2])
        print(train_scores)
        scores['thresholds'].append(ts)

        scores['train_precision'].append(train_scores[0])
        scores['train_recall'].append(train_scores[1])
        scores['train_f1'].append(train_scores[2])
        scores['train_accuracy'].append(train_scores[3])

        test_pred = np.where(test_chirps > ts, 1, 0)
        prf = precision_recall_fscore_support(test_y, test_pred, labels=[1, 0])
        test_scores = prf[0][0], prf[1][0], prf[2][0]
        scores['test_precision'].append(test_scores[0])
        scores['test_recall'].append(test_scores[1])
        scores['test_f1'].append(test_scores[2])
        scores['test_accuracy'].append(accuracy_score(test_y, test_pred))

    print(scores)
    for score_type, l in scores.items():
        scores[score_type] = np.array(l)
        print_scores(scores, score_type)

    return scores
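# A hedged usage sketch, assuming train_data is a list of (rule, X, y) tuples
# as in the other functions (hypothetical variable names):
#
#     from sklearn.model_selection import KFold
#     kf = KFold(n_splits=5, shuffle=True, random_state=0)
#     cv_scores = baseline_cv(train_data, kf)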
def sort_and_AP(X, inx, y):
    sorted_x = sorted(zip(X, y), key=lambda k: k[0][inx], reverse=True)
    ys = list(zip(*sorted_x))[1]
    return AP(ys)
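# For example, sort_and_AP(X, 4, y) gives the AP obtained when ranking the
# examples by feature index 4, the chirps score used by the baselines above.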