Example #1
def vectorized_clean_pu(ratio=1.0):
    P_raw, U_raw, X_test_raw, y_test = clean_corpus_pu(ratio)

    print("\nPU TRAINING", "(on", 100 * ratio, "% of available data)",
          "\tP: HOC POS + CIVIC", "(", num_rows(P_raw), ")",
          "\tN: HOC NEG + ABSTRACTS (", num_rows(U_raw), ")",
          "\tTEST SET (HOC POS + CIVIC + HOC NEG):", num_rows(X_test_raw)
          )

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P_raw, U_raw)))

    P = vec.transform(P_raw)
    U = vec.transform(U_raw)

    print("Features before selection:", np.shape(P)[1])

    # sel = IdentitySelector()
    sel = transformers.percentile_selector()
    # sel = basic_pipeline.factorization('LatentDirichletAllocation')

    sel.fit(vstack((P, U)),
            helpers.concatenate((np.ones(num_rows(P)), np.zeros(num_rows(U)))))
    P = sel.transform(P)
    U = sel.transform(U)
    X_test = sel.transform(vec.transform(X_test_raw))

    print("Features after selection:", np.shape(P)[1])

    return P, U, X_test, y_test, vec, sel
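
Note: transformers.vectorizer() and transformers.percentile_selector() are project helpers not shown on this page. The fit-vectorizer-on-P-and-U, then select-features-on-the-stacked-labelled-matrix pattern can be sketched with plain scikit-learn. A minimal sketch, assuming TF-IDF features and chi-squared percentile selection as stand-ins for the project's actual configuration:

import numpy as np
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, chi2

P_raw = ["BRAF V600E confers sensitivity to vemurafenib.", "EGFR mutations predict response."]
U_raw = ["The weather was sampled daily.", "Cells were cultured overnight."]

vec = TfidfVectorizer()
vec.fit(P_raw + U_raw)                      # vocabulary learned from both sets
P, U = vec.transform(P_raw), vec.transform(U_raw)

# supervised selection: label P as 1 and U as 0, keep the top 20 % of features by chi2
sel = SelectPercentile(chi2, percentile=20)
sel.fit(vstack((P, U)), np.concatenate((np.ones(P.shape[0]), np.zeros(U.shape[0]))))
P, U = sel.transform(P), sel.transform(U)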
Example #2
def vectorize_preselection(P, U, ratio=1.0):
    """generate and select features for ratio of sentence sets"""

    print("Preprocessing corpora for PU learning")

    if ratio < 1.0:
        print("Training on", 100 * ratio, "% of data")
        P, _ = train_test_split(P, train_size=ratio, random_state=RANDOM_SEED)
        U, _ = train_test_split(U, train_size=ratio, random_state=RANDOM_SEED)

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P, U)))

    P_ = vec.transform(P)
    U_ = vec.transform(U)

    print("Features before selection:", np.shape(U_)[1])

    sel = transformers.percentile_selector()
    sel.fit(vstack((P_, U_)),
            helpers.concatenate((np.ones(num_rows(P_)), -np.ones(num_rows(U_)))))

    P_ = sel.transform(P_)
    U_ = sel.transform(U_)

    return P_, U_, vec, sel
Example #3
def vectorized_clean_pnu(ratio=1.0):
    P_raw, N_raw, U_raw = clean_corpus_pnu(ratio)

    print("\nSEMI-SUPERVISED TRAINING", "(on", 100 * ratio, "% of available data)",
          "\tP: HOC POS + CIVIC (", num_rows(P_raw), ")",
          "\tN: HOC NEG (", num_rows(N_raw), ")",
          "\tU: ABSTRACTS (", num_rows(U_raw), ")"
          )

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P_raw, N_raw, U_raw)))

    P = vec.transform(P_raw)
    N = vec.transform(N_raw)
    U = vec.transform(U_raw)

    print("Features before selection:", np.shape(P)[1])

    sel = transformers.percentile_selector()
    sel.fit(vstack((P, N, U)),
            helpers.concatenate((np.ones(num_rows(P)), -np.ones(num_rows(N)), np.zeros(num_rows(U)))))

    P = sel.transform(P)
    N = sel.transform(N)
    U = sel.transform(U)

    print("Features after selection:", np.shape(P)[1])

    return P, N, U, vec, sel
Example #4
def run_EM_with_RN(P,
                   U,
                   RN,
                   max_pos_ratio=1.0,
                   tolerance=0.05,
                   max_imbalance_P_RN=10.0,
                   clf_selection=True,
                   verbose=False):
    """second step PU method: train NB with P and RN to get probabilistic labels for U, then iterate EM"""

    if num_rows(P) > max_imbalance_P_RN * num_rows(RN):
        P_init = np.array(
            random.sample(list(P), int(max_imbalance_P_RN * num_rows(RN))))
    else:
        P_init = P

    # this message applies to both branches; it was previously indented into the
    # else-branch and therefore skipped whenever P was subsampled
    if verbose:
        print(
            "\nBuilding classifier from Positive and Reliable Negative set"
        )
    initial_model = build_proba_MNB(concatenate((P_init, RN)),
                                    concatenate((np.ones(num_rows(P_init)),
                                                 np.zeros(num_rows(RN)))),
                                    verbose=verbose)

    if num_rows(U) == 0:
        print("Warning: EM: All of U was classified as negative.")
        return initial_model

    y_P = np.array([1] * num_rows(P))

    if verbose:
        print(
            "\nCalculating initial probabilistic labels for Reliable Negative and Unlabelled set"
        )
    ypU = initial_model.predict_proba(U)[:, 1]
    ypN = initial_model.predict_proba(RN)[:, 1]

    if verbose:
        print("\nIterating EM algorithm on P, RN and U\n")
    model = iterate_EM(P,
                       concatenate((RN, U)),
                       y_P,
                       concatenate((ypN, ypU)),
                       tolerance=tolerance,
                       max_pos_ratio=max_pos_ratio,
                       clf_selection=clf_selection,
                       verbose=verbose)

    return model
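
build_proba_MNB is defined elsewhere in the codebase. One common way to train a multinomial NB on probabilistic labels, as the EM step requires, is to duplicate each row with both labels and weight the copies by p and 1 - p. A minimal sketch of that idea; fit_soft_label_mnb is a hypothetical name and the weighting scheme is an assumption, not necessarily what build_proba_MNB does:

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.naive_bayes import MultinomialNB

def fit_soft_label_mnb(X, p_pos):
    """Fit a MultinomialNB on soft labels by duplicating rows, weighted by p and 1 - p."""
    X = csr_matrix(X)
    X2 = vstack((X, X))
    y2 = np.concatenate((np.ones(X.shape[0]), np.zeros(X.shape[0])))
    w2 = np.concatenate((p_pos, 1.0 - p_pos))
    return MultinomialNB().fit(X2, y2, sample_weight=w2)

# tiny usage example with two documents as count vectors
X = csr_matrix(np.array([[2, 0, 1], [0, 3, 1]]))
model = fit_soft_label_mnb(X, np.array([0.9, 0.2]))
print(model.predict_proba(X)[:, 1])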
Example #5
def rocchio(P, N, alpha=16, beta=4, binary=False):
    """fits mean training vector and predicts whether cosine similarity is above threshold (default: 0.0)

    predict_proba returns similarity scores.
    if X_thresh is true, uses the training vectors' similarity scores to compute a threshold.
    """

    clf = BinaryRocchio(alpha=alpha, beta=beta)

    X = concatenate((P, N))
    y = concatenate((ones(num_rows(P)), zeros(num_rows(N))))

    model = clf.fit(X, y)

    return model
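
BinaryRocchio is defined elsewhere. The classical Rocchio rule builds one prototype per class, weighting the class centroid by alpha and the opposite centroid by beta, and assigns a document to the class with the more similar prototype. A minimal sketch of that rule under hypothetical names, assuming dense, l2-normalised rows (which makes the dot product a cosine similarity):

import numpy as np

def rocchio_prototypes(P, N, alpha=16, beta=4):
    """Return (positive, negative) Rocchio prototypes for dense, l2-normalised rows."""
    mu_p, mu_n = P.mean(axis=0), N.mean(axis=0)
    return alpha * mu_p - beta * mu_n, alpha * mu_n - beta * mu_p

def rocchio_predict(X, proto_pos, proto_neg):
    # on l2-normalised rows the dot product is the cosine similarity
    return (X @ proto_pos > X @ proto_neg).astype(int)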
Example #6
def get_RN_Spy_Docs(P,
                    U,
                    spy_ratio=0.1,
                    max_pos_ratio=0.5,
                    tolerance=0.2,
                    noise_lvl=0.05,
                    verbose=False):
    """First step technique: Compute reliable negative docs from P using Spy Documents and I-EM"""

    P_minus_spies, spies = spy_partition(P, spy_ratio)
    U_plus_spies = concatenate((U, spies))

    model = iterate_EM(P_minus_spies,
                       U_plus_spies,
                       tolerance=tolerance,
                       max_pos_ratio=max_pos_ratio,
                       clf_selection=False,
                       verbose=verbose)

    y_spies = model.predict_proba(spies)[:, 1]
    y_U = model.predict_proba(U)[:, 1]

    U_minus_RN, RN = select_PN_below_score(y_spies,
                                           U,
                                           y_U,
                                           noise_lvl=noise_lvl)

    return U_minus_RN, RN
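
select_PN_below_score is defined elsewhere. The usual S-EM rule sets the threshold at the noise_lvl-quantile of the spies' scores, so that roughly (1 - noise_lvl) of the spies stay above it, and treats everything in U scoring below that threshold as reliably negative. A minimal sketch of that thresholding step (an assumption about the helper, based on the standard S-EM recipe; assumes U is an array supporting boolean row indexing):

import numpy as np

def split_reliable_negatives(y_spies, U, y_U, noise_lvl=0.05):
    """Split U into (rest, reliable negatives) at the noise_lvl-quantile of the spy scores."""
    threshold = np.percentile(y_spies, 100 * noise_lvl)  # noise_lvl of spies may fall below
    is_rn = np.asarray(y_U) < threshold
    return U[~is_rn], U[is_rn]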
Example #7
def biased_SVM_grid_search(P,
                           U,
                           Cs=None,
                           kernel='linear',
                           n_estimators=9,
                           verbose=False):
    if Cs is None:
        Cs = [10**x for x in range(-12, 12, 2)]

    if verbose:
        print(
            "Running Biased-SVM with balanced class weights and grid search over",
            len(Cs), "C values")

    model = BaggingClassifier(LinearSVC())

    grid_search = GridSearchCV(
        model,
        param_grid={
            'base_estimator__C': Cs,
            'base_estimator__class_weight': ['balanced'],
            ### not applicable for LinearSVC
            # 'base_estimator__kernel'      : [kernel],
            # 'base_estimator__cache_size'  : [8000],
            # 'base_estimator__probability' : [True],
            ### fit parameters for Bagging wrapper
            'bootstrap': [True],
            'n_estimators': [n_estimators],
            ### parallelization incompatible with multiprocessing
            # 'n_jobs'                      : [n_estimators]
        },
        scoring=pu_scorer,
        verbose=0)

    if verbose:
        print("Grid searching parameters for biased-SVM")
    X = concatenate((P, U))
    y = concatenate((ones(num_rows(P)), zeros(num_rows(U))))

    grid_search.fit(X, y)

    if verbose:
        train_report(grid_search.best_estimator_, P, U)
    print("Biased-SVM parameters:", grid_search.best_params_, "\tPU score:",
          grid_search.best_score_)

    return grid_search.best_estimator_
Example #8
def model_pu_score_record(P_train, U_train, P_test, U_test, m):
    model = m['model'](P_train, U_train)
    name = m['name']

    y_pred = model.predict(helpers.concatenate((P_test, U_test)))
    y_P = y_pred[:num_rows(P_test)]
    y_U = y_pred[num_rows(P_test):]

    score = pu_score(y_P, y_U)

    return {'name': name, 'model': m['model'], 'pu_score': score, 'ratio_in_U': np.sum(y_U) / num_rows(y_U)}
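
pu_score is imported from elsewhere. The standard PU evaluation criterion from Lee and Liu (2003) is r² / Pr(ŷ = 1), where r is recall on the positive test set and Pr(ŷ = 1) is the fraction of all test examples labelled positive. A minimal sketch of that formula, assuming this is indeed the criterion pu_score implements (lee_liu_pu_score is a hypothetical name):

import numpy as np

def lee_liu_pu_score(y_P, y_U):
    """r**2 / Pr(y_hat = 1): rewards high recall on P while keeping the positive slice of U small."""
    recall = np.mean(y_P)                           # y_P holds predictions for known positives
    prob_pos = np.mean(np.concatenate((y_P, y_U)))  # overall fraction predicted positive
    return recall ** 2 / prob_pos if prob_pos > 0 else 0.0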
Example #9
def best_model_cross_val(P, N, U, fold=10):
    """determine best model, cross validate and return pipeline trained on all data"""

    print("\nFinding best model")

    best = get_best_model(P, N, U)['best']

    print("\nCross-validation\n")

    kf = KFold(n_splits=fold, shuffle=True)
    splits = zip(list(kf.split(P)), list(kf.split(N)))

    # TODO doesn't work in parallel
    # if PARALLEL:
    #     with multi.Pool(min(fold, multi.cpu_count())) as p:
    #         stats = list(p.map(partial(eval_fold, best, P, N, U), enumerate(splits), chunksize=1))
    # else:
    #     stats = list(map(partial(eval_fold, best, P, N, U), enumerate(splits)))
    stats = list(map(partial(eval_fold, best, P, N, U), enumerate(splits)))

    mean_stats = np.mean(stats, 0)
    print("Cross-validation average: p {}, r {}, f1 {}, acc {}".format(
            mean_stats[0], mean_stats[1], mean_stats[2], mean_stats[3]))

    print("Retraining model on full data")

    vec, sel = best['vectorizer'], best['selector']
    vec.fit(concatenate((P, N, U)))
    P_, N_, U_ = [vec.transform(x) for x in [P, N, U]]

    y_pp = concatenate((np.ones(num_rows(P)), -np.ones(num_rows(N)), np.zeros(num_rows(U))))
    sel.fit(concatenate((P_, N_, U_)), y_pp)
    P_, N_, U_ = [sel.transform(x) for x in [P_, N_, U_]]

    model = best['untrained_model'](P_, N_, U_)

    print("Ratio of U classified as positive:", np.sum(model.predict(U_)) / num_rows(U_))
    print("Returning final model")

    return Pipeline([('vectorizer', vec), ('selector', sel), ('clf', model)])
Example #10
def clean_corpus_pu(ratio=1.0):
    # remove worst percentage
    # print("\nRemoving CIViC-like sentences from HoC[neg]\n")
    # hocneg_ = cleanup_sources.remove_least_similar_percent(noisy=hocneg, guide=civic, ratio=ratio, percentile=15)
    # print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
    # hocpos_ = cleanup_sources.remove_least_similar_percent(noisy=hocpos, guide=hocneg_, ratio=ratio, percentile=10)
    # print("\nRemoving CIViC-unlike sentences from HoC[pos]\n")
    # hocpos_ = cleanup_sources.remove_least_similar_percent(noisy=hocpos_, guide=civic, ratio=ratio, percentile=10,
    #                                                        inverse=True)

    # remove what is ambiguous according to PU training
    print("\nRemoving CIViC-like sentences from HoC[neg]\n")
    hocneg_ = remove_P_from_U(U=hocneg, P=civic, ratio=ratio)

    print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
    hocpos_ = remove_P_from_U(U=hocpos, P=hocneg_, ratio=ratio)

    # print("\nRemoving CIViC-unlike sentences from HoC[pos]\n")
    # hocpos_ = cleanup_sources.remove_P_from_U(noisy=hocpos, guide=civic, ratio=ratio, inverse=True)

    hocpos_train, hocpos_test = train_test_split(hocpos_, test_size=0.2, random_state=RANDOM_SEED)
    civic_train, civic_test = train_test_split(civic, test_size=0.2, random_state=RANDOM_SEED)

    hocneg_train, X_test_neg = train_test_split(hocneg_, test_size=0.2, random_state=RANDOM_SEED)

    P_raw = helpers.concatenate((hocpos_train, civic_train))
    U_raw = helpers.concatenate((abstracts, hocneg_train))

    X_test_pos = helpers.concatenate((hocpos_test, civic_test))

    if ratio < 1.0:
        P_raw, _ = train_test_split(P_raw, train_size=ratio, random_state=RANDOM_SEED)
        U_raw, _ = train_test_split(U_raw, train_size=ratio, random_state=RANDOM_SEED)
        X_test_pos, _ = train_test_split(X_test_pos, train_size=ratio, random_state=RANDOM_SEED)
        X_test_neg, _ = train_test_split(X_test_neg, train_size=ratio, random_state=RANDOM_SEED)

    X_test_raw = helpers.concatenate((X_test_pos, X_test_neg))
    y_test = helpers.concatenate((np.ones(num_rows(X_test_pos)), np.zeros(num_rows(X_test_neg))))

    return P_raw, U_raw, X_test_raw, y_test
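
remove_P_from_U is defined elsewhere; per the comment above, it drops from U whatever PU training considers P-like (or, presumably, keeps only the P-like part when inverse=True). A minimal sketch of that idea, using plain logistic regression as a stand-in PU classifier; the helper's actual model and name are assumptions:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def remove_P_from_U_sketch(P, U, inverse=False):
    """Drop from U the sentences a P-vs-U classifier labels P-like (keep only those if inverse)."""
    vec = TfidfVectorizer().fit(P + U)
    clf = LogisticRegression(max_iter=1000).fit(
        vec.transform(P + U),
        np.concatenate((np.ones(len(P)), np.zeros(len(U)))))
    y_U = clf.predict(vec.transform(U))
    keep = (y_U == 1) if inverse else (y_U == 0)
    return [u for u, k in zip(U, keep) if k]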
Example #11
def eval_fold(model_record, P, N, U, i_splits):
    """helper function for running cross validation in parallel"""

    i, (p_split, n_split) = i_splits
    P_train, P_test = P[p_split[0]], P[p_split[1]]
    N_train, N_test = N[n_split[0]], N[n_split[1]]

    y_train_pp = concatenate((np.ones(num_rows(P_train)), -np.ones(num_rows(N_train)), np.zeros(num_rows(U))))
    pp = clone(Pipeline([('vectorizer', model_record['vectorizer']), ('selector', model_record['selector'])]))
    pp.fit(concatenate((P_train, N_train, U)), y_train_pp)

    P_, N_, U_, P_test_, N_test_ = [pp.transform(x) for x in [P_train, N_train, U, P_test, N_test]]
    model = model_record['untrained_model'](P_, N_, U_)

    y_pred = model.predict(concatenate((P_test_, N_test_)))
    y_test = concatenate((np.ones(num_rows(P_test_)), np.zeros(num_rows(N_test_))))

    pr, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)

    print("Fold no.", i, "acc", acc, "classification report:\n", classification_report(y_test, y_pred))
    return [pr, r, f1, acc]
Example #12
def iterate_EM(P,
               U,
               y_P=None,
               ypU=None,
               tolerance=0.05,
               max_pos_ratio=1.0,
               clf_selection=False,
               verbose=False):
    """EM algorithm for positive set P and unlabelled set U

        iterate NB classifier with updated labels for unlabelled set (with optional initial labels) until convergence"""

    if y_P is None:
        y_P = ([1.] * num_rows(P))
    if ypU is None:
        ypU = ([0.] * num_rows(U))

    ypU_old = [-999]

    iterations = 0
    old_model = None
    new_model = None

    while not almost_equal(ypU_old, ypU, tolerance):

        iterations += 1

        if verbose:
            print("Iteration #", iterations,
                  "\tBuilding new model using probabilistic labels")

        if clf_selection:
            old_model = new_model

        new_model = build_proba_MNB(concatenate((P, U)),
                                    concatenate((y_P, ypU)),
                                    verbose=verbose)

        if verbose:
            print("Predicting probabilities for U")

        ypU_old = ypU
        ypU = new_model.predict_proba(U)[:, 1]

        predU = [round(p) for p in ypU]
        pos_ratio = sum(predU) / num_rows(U)

        if verbose:
            print("Unlabelled instances classified as positive:", sum(predU),
                  "/", num_rows(U), "(", pos_ratio * 100, "%)\n")

        if clf_selection and old_model is not None:
            if em_getting_worse(old_model, new_model, P, U):
                if verbose:
                    print(
                        "Approximated error has grown since last iteration.\n"
                        "Aborting and returning classifier #", iterations - 1)
                return old_model

        if pos_ratio >= max_pos_ratio:
            if verbose:
                print(
                    "Acceptable ratio of positively labelled sentences in U is reached."
                )
            break

    print("Returning final NB after", iterations, "iterations")
    return new_model
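
almost_equal is defined elsewhere; the loop above converges once the probabilistic labels for U stop moving by more than tolerance. A plausible sketch of such a check (a guess at the helper, not its actual implementation):

import numpy as np

def almost_equal_sketch(a, b, tolerance=0.05):
    """True when two label vectors match element-wise within tolerance (a guess at the helper)."""
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    if a.shape != b.shape:
        return False  # first iteration: the [-999] sentinel never matches a full vector
    return bool(np.all(np.abs(a - b) <= tolerance))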
Example #13
def biased_SVM_weight_selection(P,
                                U,
                                Cs_neg=None,
                                Cs_pos_factors=None,
                                Cs=None,
                                kernel='linear',
                                test_size=0.2,
                                verbose=False):
    """run biased SVMs with combinations of class weight values, choose the one with the best pu_measure"""

    # default values
    if Cs is None:
        Cs = [10**x for x in range(-12, 12, 2)]
    if Cs_neg is None:
        Cs_neg = [1]  # arange(0.01, 0.63, 0.04)
    if Cs_pos_factors is None:
        Cs_pos_factors = range(1, 1100, 200)

    Cs = [(C, C_neg * j, C_neg) for C in Cs for C_neg in Cs_neg
          for j in Cs_pos_factors]

    if verbose:
        print(
            "Running Biased-SVM with range of C and positive class weight factors.",
            num_rows(Cs), "parameter combinations.")

    P_train, P_test = train_test_split(P, test_size=test_size)
    U_train, U_test = train_test_split(U, test_size=test_size)
    X = concatenate((P_train, U_train))
    y = concatenate((ones(num_rows(P_train)), zeros(num_rows(U_train))))

    # with Pool(processes=min(cpu_count() - 1, num_rows(Cs))) as p:
    # materialise the map: max() below would otherwise exhaust the lazy iterator,
    # leaving nothing for the print loop afterwards
    score_weights = list(map(
        partial(eval_params,
                X_train=X,
                y_train=y,
                P_test=P_test,
                U_test=U_test,
                kernel=kernel), Cs))

    best_score_params = max(score_weights, key=lambda tup: tup[0])

    for s in score_weights:
        print(s)
    if verbose:
        print("\nBest model has parameters", best_score_params[1],
              "and PU-score", best_score_params[0])
        print("Building final classifier")

    model = build_biased_SVM(concatenate((P, U)),
                             concatenate(
                                 (ones(num_rows(P)), zeros(num_rows(U)))),
                             C_pos=best_score_params[1]['C_pos'],
                             C_neg=best_score_params[1]['C_neg'],
                             C=best_score_params[1]['C'],
                             probability=True,
                             kernel=kernel)

    if verbose:
        train_report(model, P, U)
    print("Returning Biased-SVM with parameters", best_score_params[1],
          "and PU-score", best_score_params[0])
    return model
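
build_biased_SVM is defined elsewhere. The biased-SVM formulation (Liu et al., 2003) penalises errors on P and U asymmetrically via per-class C values; in scikit-learn this maps onto the class_weight dict, which scales C per class. A minimal sketch under that assumption (build_biased_svm_sketch is a hypothetical name):

import numpy as np
from sklearn import svm

def build_biased_svm_sketch(X, y, C=1.0, C_pos=10.0, C_neg=1.0, kernel='linear'):
    """SVC whose effective penalties are C * C_pos on class 1 and C * C_neg on class 0."""
    clf = svm.SVC(C=C, kernel=kernel, class_weight={1: C_pos, 0: C_neg}, probability=True)
    return clf.fit(X, y)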
Example #14
def clean_corpus_pnu(mode="tolerant", percentiles=(10, 25, 10), ratio=1.0):
    """clean up HoC corpus using PU learning. Modes: "strict", "percentile", default

    default: remove CIViC-like from HoC[neg], HoC[neg]-like from CIViC
    strict: remove CIViC-like from HoC[neg], keep only CIViC-like in HoC[pos]
    percentile: remove percentiles (CIViC-like from HoC[neg], HoC[neg]-like from HoC[pos], CIViC-unlike from HoC[pos)
    """

    if mode == "percentile":
        # Remove given best/worst percentile of sentences from each set

        print("\nRemoving CIViC-like sentences from HoC[neg] (", percentiles[0], "%)\n")
        hocneg_ = remove_most_similar_percent(U=hocneg, P=civic, ratio=ratio, percentile=percentiles[0])

        print("\nRemoving HoC[neg]-like sentences from HoC[pos] (", percentiles[1], "%)\n")
        hocpos_ = remove_most_similar_percent(U=hocpos, P=hocneg_, ratio=ratio, percentile=percentiles[1])

        print("\nRemoving CIViC-unlike sentences from HoC[pos] (", percentiles[2], "%)\n")
        hocpos_ = remove_most_similar_percent(U=hocpos_, P=civic, ratio=ratio, percentile=percentiles[2],
                                              inverse=True)


    elif mode == "strict":
        # Remove "good" sentences from HoC[neg], keep only "good" sentences in HoC[pos]

        print("\nKeeping only CIViC-like sentences in HoC[pos]\n")
        hocpos_ = remove_P_from_U(P=civic, U=hocpos, ratio=ratio, inverse=True)

        print("\nRemoving CIViC-like sentences from HoC[neg]\n")
        hocneg_ = remove_P_from_U(P=civic, U=hocneg, ratio=ratio)


    elif mode == "mixed":
        # Remove "good" sentences from HoC[neg], CIViC-unlike and HoC[neg]-like sentences from HoC[pos]

        print("\nRemoving CIViC-like sentences from HoC[neg]\n")
        hocneg_ = remove_P_from_U(P=civic, U=hocneg, ratio=ratio)

        print("\nRemoving CIViC-unlike sentences from HoC[pos] (", 75, "%)\n")
        hocpos_ = remove_most_similar_percent(U=hocpos, P=civic, ratio=ratio, percentile=75,
                                              inverse=True)

        print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
        hocpos_ = remove_P_from_U(P=hocneg_, U=hocpos, ratio=ratio)


    else:  # mode == "tolerant"
        # Remove "good" sentences from HoC[neg], remove "bad" sentences in HoC[pos]

        print("\nRemoving CIViC-like sentences from HoC[neg]\n")
        hocneg_ = remove_P_from_U(P=civic, U=hocneg, ratio=ratio)

        print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
        hocpos_ = remove_P_from_U(P=hocneg_, U=hocpos, ratio=ratio)

    P_raw = helpers.concatenate((hocpos_, civic))
    U_raw = abstracts
    N_raw = hocneg_

    if ratio < 1.0:
        P_raw, _ = train_test_split(P_raw, train_size=ratio, random_state=RANDOM_SEED)
        N_raw, _ = train_test_split(N_raw, train_size=ratio, random_state=RANDOM_SEED)
        U_raw, _ = train_test_split(U_raw, train_size=ratio, random_state=RANDOM_SEED)

    return P_raw, N_raw, U_raw
Example #15
def iterate_SVM(P,
                U,
                RN,
                max_neg_ratio=0.2,
                clf_selection=True,
                kernel=None,
                C=0.1,
                n_estimators=9,
                verbose=False):
    """runs an SVM classifier trained on P and RN iteratively, augmenting RN

    after each iteration, the documents in U classified as negative are moved to RN until there are none left.
    max_neg_ratio is the maximum accepted ratio of P to be classified as negative by final classifier.
    if clf_selection is true and the final classifier regards more than max_neg_ratio of P as negative,
    return the initial one."""

    y_P = np.ones(num_rows(P))
    y_RN = np.zeros(num_rows(RN))

    if kernel is not None:
        if verbose:
            print("Building initial Bagging SVC (", n_estimators, "clfs)",
                  "with Positive and Reliable Negative docs")
        clf = (BaggingClassifier(
            svm.SVC(class_weight='balanced', kernel=kernel, C=C),
            bootstrap=True,
            n_estimators=n_estimators,
            n_jobs=min(n_estimators, cpu_count()),
            max_samples=(1.0 if n_estimators < 4 else 1.0 /
                         (n_estimators - 2))))
    else:
        if verbose:
            print(
                "Building initial linearSVM classifier with Positive and Reliable Negative docs"
            )
        clf = svm.LinearSVC(class_weight='balanced', C=C)

    initial_model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))

    if num_rows(U) == 0:
        print("Warning: SVM: All of U was classified as negative.")
        return initial_model

    if verbose:
        print(
            "Predicting U with initial SVM, adding negatively classified docs to RN for iteration"
        )

    y_U = initial_model.predict(U)
    Q, W = partition_pos_neg(U, y_U)
    iteration = 0
    model = None

    if num_rows(Q) == 0 or num_rows(W) == 0:
        print(
            "Warning: Returning initial SVM because all of U was assigned label",
            y_U[0])
        return initial_model

    if clf_selection:
        y_P_initial = initial_model.predict(P)
        initial_neg_ratio = 1 - np.average(y_P_initial)

        if initial_neg_ratio > max_neg_ratio:
            print("Returning initial SVM ({}% of P classified as negative)".
                  format(100 * initial_neg_ratio))
            return initial_model

    # iterate SVM, each turn augmenting RN by the documents in Q classified negative
    while np.size(W) and np.size(Q):
        iteration += 1

        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))

        if verbose:
            print("\nIteration #", iteration, "\tReliable negative examples:",
                  num_rows(RN))

        if kernel is not None:
            clf = (BaggingClassifier(
                svm.SVC(class_weight='balanced', kernel=kernel, C=C),
                bootstrap=True,
                n_estimators=n_estimators,
                n_jobs=min(n_estimators, cpu_count()),
                max_samples=(1.0 if n_estimators < 4 else 1.0 /
                             (n_estimators - 2))))
        else:
            clf = svm.LinearSVC(class_weight='balanced', C=C)

        model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))
        y_Q = model.predict(Q)
        Q, W = partition_pos_neg(Q, y_Q)

    if np.size(W):
        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))  # resize labels to match the augmented RN
        model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))

    if verbose:
        print("Iterative SVM converged. Reliable negative examples:",
              num_rows(RN))

    if clf_selection:
        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by initial SVM:",
                initial_neg_ratio)
        if model is None:
            return initial_model

        y_P_final = model.predict(P)
        final_neg_ratio = 1 - np.average(y_P_final)

        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by final SVM:",
                final_neg_ratio)

        if final_neg_ratio > max_neg_ratio and final_neg_ratio > initial_neg_ratio:
            print(
                iteration,
                "iterations - final SVM discards too many positive examples.",
                "Returning initial SVM instead")
            return initial_model

    print("Returning final SVM after", iteration, "iterations")
    return model
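
partition_pos_neg is imported from elsewhere; given rows and their predicted labels, it evidently splits them into (predicted positive, predicted negative) halves. A minimal sketch under that assumption, for arrays that support boolean row indexing:

import numpy as np

def partition_pos_neg_sketch(X, y):
    """Split rows of X into (predicted positive, predicted negative) by label."""
    mask = np.asarray(y) == 1
    return X[mask], X[~mask]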
Example #16
def getBestModel(P_train, U_train, X_test, y_test):
    """Evaluate parameter combinations, save results and return pipeline with best model"""

    print("\nEvaluating parameter ranges for preprocessor and classifiers")

    X_train = concatenate((P_train, U_train))
    y_train_pp = concatenate(
        (np.ones(num_rows(P_train)), np.zeros(num_rows(U_train))))

    results = {'best': {'f1': -1, 'acc': -1}, 'all': []}

    preproc_params = {
        'df_min': [0.002],
        'df_max': [1.0],
        'rules': [True],
        'wordgram_range': [(1, 4)],  # [None, (1, 2), (1, 3), (1, 4)],
        'chargram_range': [(2, 6)],  # [None, (2, 4), (2, 5), (2, 6)],
        'feature_select': [
            partial(transformers.percentile_selector, 'chi2'),
            # partial(transformers.factorization, 'PCA', 10),
            # partial(transformers.factorization, 'PCA', 100),
            # partial(transformers.factorization, 'PCA', 1000),
        ]
    }

    for wordgram, chargram in product(preproc_params['wordgram_range'],
                                      preproc_params['chargram_range']):
        for r in preproc_params['rules']:
            for df_min, df_max in product(preproc_params['df_min'],
                                          preproc_params['df_max']):
                for fs in preproc_params['feature_select']:

                    if wordgram is None and chargram is None:
                        break

                    print(
                        "\n----------------------------------------------------------------",
                        "\nwords:", wordgram, "chars:", chargram,
                        "feature selection:", fs,
                        "\n----------------------------------------------------------------\n"
                    )

                    start_time = time.time()

                    X_train_, X_dev_, vectorizer, selector = prepareTrainTest(
                        trainData=X_train,
                        testData=X_test,
                        trainLabels=y_train_pp,
                        rules=r,
                        wordgram_range=wordgram,
                        feature_select=fs,
                        chargram_range=chargram,
                        min_df_char=df_min,
                        min_df_word=df_min,
                        max_df=df_max)
                    if selector:
                        P_train_ = selector.transform(
                            vectorizer.transform(P_train))
                        U_train_ = selector.transform(
                            vectorizer.transform(U_train))
                    else:
                        P_train_ = vectorizer.transform(P_train)
                        U_train_ = vectorizer.transform(U_train)

                    pp = {'word': wordgram, 'char': chargram}

                    # fit models
                    iteration = [
                        # {'name': 'i-em', 'model': partial(two_step.i_EM, P_train_, U_train_)},
                        # {'name' : 's-em spy=0.1',
                        #  'model': partial(two_step.s_EM, P_train_, U_train_, spy_ratio=0.1, noise_lvl=0.1)},
                        # {'name' : 's-em spy=0.2',
                        #  'model': partial(two_step.s_EM, P_train_, U_train_, spy_ratio=0.2, noise_lvl=0.2)},
                        {
                            'name': 'roc-svm',
                            'model': partial(two_step.roc_SVM, P_train_,
                                             U_train_)
                        },
                        {
                            'name':
                            'cr_svm noise=0.1',
                            'model':
                            partial(two_step.cr_SVM,
                                    P_train_,
                                    U_train_,
                                    noise_lvl=0.1)
                        },
                        {
                            'name':
                            'cr_svm noise=0.2',
                            'model':
                            partial(two_step.cr_SVM,
                                    P_train_,
                                    U_train_,
                                    noise_lvl=0.2)
                        },
                        {
                            'name':
                            'cr_svm noise=0.3',
                            'model':
                            partial(two_step.cr_SVM,
                                    P_train_,
                                    U_train_,
                                    noise_lvl=0.3)
                        },
                        # {'name': 'roc_em', 'model': partial(two_step.roc_EM, P_train_, U_train_)},
                        # {'name' : 'spy_svm spy=0.1',
                        #  'model': partial(two_step.spy_SVM, P_train_, U_train_, spy_ratio=0.1, noise_lvl=0.1)},
                        # {'name' : 'spy_svm spy=0.2',
                        #  'model': partial(two_step.spy_SVM, P_train_, U_train_, spy_ratio=0.2, noise_lvl=0.2)},
                        # {'name' : 'biased-svm',
                        #  'model': partial(biased_svm.biased_SVM_weight_selection, P_train_, U_train_)},
                        ## {'name' : 'bagging-svm',
                        ##  'model': partial(biased_svm.biased_SVM_grid_search, P_train_, U_train_)}
                    ]

                    # eval models
                    if PARALLEL:
                        with multi.Pool(min(multi.cpu_count(),
                                            len(iteration))) as p:
                            iter_stats = list(
                                p.map(partial(model_eval_record, X_dev_,
                                              y_test, U_train_),
                                      iteration,
                                      chunksize=1))
                    else:
                        iter_stats = list(
                            map(
                                partial(model_eval_record, X_dev_, y_test,
                                        U_train_), iteration))

                    # finalize records: remove model, add n-gram stats, update best
                    for m in iter_stats:
                        m['n-grams'] = pp
                        m['fs'] = fs()
                        if m['acc'] > results['best']['acc']:
                            results['best'] = deepcopy(m)
                            results['best']['vectorizer'] = vectorizer
                            results['best']['selector'] = selector
                        m.pop('model', None)

                    results['all'].append(iter_stats)

                    print("Evaluated words:", wordgram, "chars:", chargram,
                          "in %s seconds\n" % (time.time() - start_time))

                    print_reports(iter_stats)

    print_results(results)

    # save results to disk

    with open(
            file_path("./pickles/model_eval{}.pickle".format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))),
            "wb") as f:
        print('saving model stats to disk\n')
        pickle.dump(results, f)

    # ----------------------------------------------------------------
    # check how much of U (abstracts) is supposed to be positive
    # ----------------------------------------------------------------

    best_model = results['best']['model']
    selector = results['best']['selector']
    vectorizer = results['best']['vectorizer']

    print("\nAmount of unlabelled training set classified as positive:")
    if selector:
        transformedU = selector.transform(vectorizer.transform(U_train))
    else:
        transformedU = vectorizer.transform(U_train)
    y_predicted_U = best_model.predict(transformedU)
    print(np.sum(y_predicted_U), "/", num_rows(y_predicted_U), "(",
          np.sum(y_predicted_U) / num_rows(y_predicted_U), ")")

    return results['best']
Example #17
def get_best_model(P_train, N_train, U_train, X_test=None, y_test=None):
    """Evaluate parameter combinations, save results and return object with stats of all models"""

    print("Evaluating parameter ranges for preprocessor and classifiers")

    if X_test is None or y_test is None:
        P_train, X_test_pos = train_test_split(P_train, test_size=0.2, random_state=RANDOM_SEED)
        N_train, X_test_neg = train_test_split(N_train, test_size=0.2, random_state=RANDOM_SEED)
        X_test = concatenate((X_test_pos, X_test_neg))
        y_test = concatenate((np.ones(num_rows(X_test_pos)), np.zeros(num_rows(X_test_neg))))

    X_train = concatenate((P_train, N_train, U_train))
    y_train_pp = concatenate((np.ones(num_rows(P_train)), -np.ones(num_rows(N_train)), np.zeros(num_rows(U_train))))

    results = {'best': {'f1': -1, 'acc': -1}, 'all': []}

    preproc_params = preproc_param_dict()
    estimators = estimator_list()

    for wordgram, chargram in product(preproc_params['wordgram_range'], preproc_params['chargram_range']):
        for r in preproc_params['rules']:
            for df_min, df_max in product(preproc_params['df_min'], preproc_params['df_max']):
                for fs in preproc_params['feature_select']:

                    if wordgram is None and chargram is None:
                        break

                    print("\n----------------------------------------------------------------",
                          "\nwords:", wordgram, "chars:", chargram, "feature selection:", fs,
                          "df_min, df_max:", df_min, df_max, "rules", r,
                          "\n----------------------------------------------------------------\n")

                    start_time = time.time()

                    X_train_, X_test_, vectorizer, selector = prepare_train_test(trainData=X_train, testData=X_test,
                                                                                 trainLabels=y_train_pp, rules=r,
                                                                                 wordgram_range=wordgram,
                                                                                 feature_select=fs,
                                                                                 chargram_range=chargram,
                                                                                 min_df_char=df_min, min_df_word=df_min,
                                                                                 max_df=df_max)
                    if selector:
                        P_train_, N_train_, U_train_ = [(selector.transform(vectorizer.transform(x)))
                                                        for x in [P_train, N_train, U_train]]
                    else:
                        P_train_, N_train_, U_train_ = [(vectorizer.transform(x))
                                                        for x in [P_train, N_train, U_train]]

                    # fit models
                    if PARALLEL:
                        with multi.Pool(min(multi.cpu_count(), len(estimators))) as p:
                            iter_stats = list(p.map(partial(model_eval_record,
                                                            P_train_, N_train_, U_train_, X_test_, y_test),
                                                    estimators, chunksize=1))
                    else:
                        iter_stats = list(map(partial(model_eval_record,
                                                      P_train_, N_train_, U_train_, X_test_, y_test),
                                              estimators))

                    # finalize records: remove model, add n-gram stats, update best
                    for m in iter_stats:
                        m['n-grams'] = {'word': wordgram, 'char': chargram}
                        m['rules'] = r
                        m['df_min, df_max'] = (df_min, df_max)
                        m['fs'] = fs()
                        if m['acc'] > results['best']['acc']:
                            results['best'] = deepcopy(m)
                            results['best']['vectorizer'] = vectorizer
                            results['best']['selector'] = selector
                        m.pop('model', None)

                    results['all'].append(iter_stats)

                    print("Evaluated words:", wordgram, "chars:", chargram,
                          "rules:", r,
                          "feature selection:", fs, "min_df:", df_min,
                          "in %s seconds\n" % (time.time() - start_time))

                    print_reports(iter_stats)

    print_results(results)

    return results