# Python 2 listing. Third-party imports are added below; the project-local
# modules (dataset, learn, infer_topology) and the constants DATA_FOLDER,
# FILES, CLASSIFICATIONS and label_list are assumed to come from the
# surrounding repository.
import pickle

import numpy as np
import joblib  # older scikit-learn versions ship this as sklearn.externals.joblib
from sklearn.metrics import (zero_one_loss, hamming_loss, precision_score,
                             recall_score, f1_score)
from tqdm import tqdm


def compare_manual_vs_model():

    # Pickle files should be opened in binary mode.
    with open(DATA_FOLDER + "labels_int.p", "rb") as f:
        y_dict = pickle.load(f)

    print "Loading test data"
    X_test, y_test, filenames_test = dataset.load_test()
    y_pred = joblib.load("../models/pred_ml_improved.pkl")

    # Keep only the test documents that also received a manual classification.
    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append((pred, correct, filename, CLASSIFICATIONS[filename]))

    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)

    # Apply the same topology-derived label rules to the manual classification.
    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(rules, manual_predictions)

    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]

    for name, pred in zip(prediction_names, predictions):

        print "\n{}\n--".format(name)
        print "Zero-one classification loss", zero_one_loss(correct, pred)
        print "Hamming loss", hamming_loss(correct, pred)
        print "Precision:", precision_score(correct, pred, average="weighted", labels=label_list)
        print "Recall   :", recall_score(correct, pred, average="weighted", labels=label_list)
        print "F1 score :", f1_score(correct, pred, average="weighted", labels=label_list)
def improve_predictions(
        probability_predictions_file='../models/pred_ml_proba.pkl',
        out_file='../models/pred_ml_improved.pkl',
        use_infer_topology=True):

    print "> IMPROVING PREDICTIONS\n--- Forcing at least one label (most likely)"
    print "Loading probability predictions"
    y_pred_proba = joblib.load(probability_predictions_file)

    # Because we use a one-versus-rest classifier, there may be documents
    # without any labels. We deal with this by adding the single most likely
    # label to each of them.

    y_pred_improved = np.zeros(y_pred_proba.shape, dtype=np.int_)
    print "Converting to binary predictions"
    y_pred = np.where(y_pred_proba >= 0.5, 1, 0)

    # Copy every binary prediction; when a document received no label at all,
    # also switch on its single most probable label.
    for i, (prediction, prediction_proba) in enumerate(tqdm(zip(y_pred, y_pred_proba))):
        if sum(prediction) == 0:
            most_likely_label_index = np.argmax(prediction_proba)
            y_pred_improved[i, most_likely_label_index] = 1
        y_pred_improved[i] += prediction

    print np.sum(np.subtract(y_pred_improved, y_pred)), "labels added"

    if use_infer_topology:
        print "> IMPROVING PREDICTIONS\n--- Topology rules"
        print "Loading train set y-values"
        y_train, filenames_train = dataset.load_train_y()

        rules = infer_topology.infer_topology_rules(y_train)
        y_pred_improved = infer_topology.apply_topology_rules(
            rules, y_pred_improved)

    print "Saving to file"
    joblib.dump(y_pred_improved, out_file)
    print "Done!\n---"