コード例 #1
0
def vectorize_preselection(P, U, ratio=1.0):
    """Vectorize P and U and reduce both to the selected features.

    When ``ratio`` is below 1.0, only that share of each set is kept (split
    seeded with RANDOM_SEED) before anything is fitted. The vectorizer is
    fitted on P and U together; the selector is fitted with P labelled +1 and
    U labelled -1.

    Returns the transformed P, the transformed U, the fitted vectorizer and
    the fitted selector.
    """
    print("Preprocessing corpora for PU learning")

    if ratio < 1.0:
        print("Training on", 100 * ratio, "% of data")
        P = train_test_split(P, train_size=ratio, random_state=RANDOM_SEED)[0]
        U = train_test_split(U, train_size=ratio, random_state=RANDOM_SEED)[0]

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P, U)))
    P_vec, U_vec = vec.transform(P), vec.transform(U)

    print("Features before selection:", np.shape(U_vec)[1])

    sel = transformers.percentile_selector()
    stacked = vstack((P_vec, U_vec))
    labels = helpers.concatenate(
        (np.ones(num_rows(P_vec)), -np.ones(num_rows(U_vec))))
    sel.fit(stacked, labels)

    return sel.transform(P_vec), sel.transform(U_vec), vec, sel
コード例 #2
0
def em_getting_worse(old_model, new_model, P, U, verbose=False):
    """Estimate whether an EM iteration increased the probability of error.

    Should be used in S-EM, but not in I-EM, according to
    "Partially Supervised Classification of Text Documents".

    Args:
        old_model: classifier of the previous iteration (needs ``predict``).
        new_model: classifier of the current iteration (needs ``predict``).
        P: positive examples.
        U: unlabelled examples.
        verbose: if true, print the estimated change.

    Returns:
        True if the approximated change in error probability is positive,
        i.e. the new model is estimated to be worse than the old one.
    """
    import numpy as np  # local import so the block does not rely on file-level names

    def _rate(y, label):
        # fraction of predictions equal to `label`; 0.0 for an empty set
        y = np.asarray(y)
        return float(np.mean(y == label)) if y.size else 0.0

    # probability of error:
    # Pr[f(X) =/= Y] = Pr[f(X) = 1] - Pr[Y = 1] + 2 * Pr[f(X) = 0 | Y = 1] * Pr[Y = 1]
    # ground truth is unavailable, so the change has to be approximated:
    # Delta_i = Pr_U[f_i(X) = 1] - Pr_U[f_{i-1}(X) = 1]
    #           + 2 * (Pr_P[f_i(X) = 0] - Pr_P[f_{i-1}(X) = 0]) * Pr[Y = 1]
    # where Pr[Y = 1] is estimated by the old model's positive rate on U.

    # predict P and U with both models to compare predicted class distributions
    y_P_old = old_model.predict(P)
    y_U_old = old_model.predict(U)

    y_P_new = new_model.predict(P)
    y_U_new = new_model.predict(U)

    # The terms must be probabilities, not raw counts: with counts the second
    # term is scaled by |P| relative to the first and can flip the sign.
    Pr_U_pos_old = _rate(y_U_old, 1)
    Pr_P_neg_old = _rate(y_P_old, 0)

    Pr_U_pos_new = _rate(y_U_new, 1)
    Pr_P_neg_new = _rate(y_P_new, 0)

    Delta_i = (Pr_U_pos_new - Pr_U_pos_old
               + 2 * (Pr_P_neg_new - Pr_P_neg_old) * Pr_U_pos_old)

    if verbose:
        print("Delta_i:", Delta_i)

    return bool(Delta_i > 0)
コード例 #3
0
def vectorized_clean_pu(ratio=1.0):
    """Build the cleaned PU corpus and return it as selected feature vectors.

    The raw sets come from ``clean_corpus_pu(ratio)``. The vectorizer is
    fitted on P and U together; the selector is fitted with P labelled 1 and
    U labelled 0, then applied to P, U and the test sentences.

    Returns (P, U, X_test, y_test, vectorizer, selector).
    """
    P_raw, U_raw, X_test_raw, y_test = clean_corpus_pu(ratio)

    print(
        "\nPU TRAINING", "(on", 100 * ratio, "% of available data)",
        "\tP: HOC POS + CIVIC", "(", num_rows(P_raw), ")",
        "\tN: HOC NEG + ABSTRACTS (", num_rows(U_raw), ")",
        "\tTEST SET (HOC POS + CIVIC + HOC NEG):", num_rows(X_test_raw),
    )

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P_raw, U_raw)))
    P, U = vec.transform(P_raw), vec.transform(U_raw)

    print("Features before selection:", np.shape(P)[1])

    sel = transformers.percentile_selector()
    stacked = vstack((P, U))
    pu_labels = helpers.concatenate(
        (np.ones(num_rows(P)), np.zeros(num_rows(U))))
    sel.fit(stacked, pu_labels)

    P, U = sel.transform(P), sel.transform(U)
    X_test = sel.transform(vec.transform(X_test_raw))

    print("Features after selection:", np.shape(P)[1])

    return P, U, X_test, y_test, vec, sel
コード例 #4
0
def vectorized_clean_pnu(ratio=1.0):
    """Build the cleaned P/N/U corpus and return it as selected feature vectors.

    The raw sets come from ``clean_corpus_pnu(ratio)``. The vectorizer is
    fitted on all three sets; the selector is fitted with labels +1 (P),
    -1 (N) and 0 (U).

    Returns (P, N, U, vectorizer, selector).
    """
    P_raw, N_raw, U_raw = clean_corpus_pnu(ratio)

    print(
        "\nSEMI-SUPERVISED TRAINING", "(on", 100 * ratio, "% of available data)",
        "\tP: HOC POS + CIVIC (", num_rows(P_raw), ")",
        "\tN: HOC NEG (", num_rows(N_raw), ")",
        "\tU: ABSTRACTS (", num_rows(U_raw), ")",
    )

    vec = transformers.vectorizer()
    vec.fit(helpers.concatenate((P_raw, N_raw, U_raw)))
    P, N, U = [vec.transform(raw) for raw in (P_raw, N_raw, U_raw)]

    print("Features before selection:", np.shape(P)[1])

    sel = transformers.percentile_selector()
    stacked = vstack((P, N, U))
    pnu_labels = helpers.concatenate(
        (np.ones(num_rows(P)), -np.ones(num_rows(N)), np.zeros(num_rows(U))))
    sel.fit(stacked, pnu_labels)

    P, N, U = [sel.transform(part) for part in (P, N, U)]

    print("Features after selection:", np.shape(P)[1])

    return P, N, U, vec, sel
コード例 #5
0
def train_pipeline(from_scratch=False,
                   write=True,
                   outpath=None,
                   mode=None,
                   ratio=1.0):
    """Return a classification pipeline, loading a saved one when possible.

    Args:
        from_scratch: if false, first try ``load_pipeline()`` and return its
            result; on failure fall through to training.
        write: pickle the trained pipeline (to ``outpath`` or the default path).
        outpath: target file for the pickle; also triggers writing when set.
        mode: passed through to ``cleanup_corpora.clean_corpus_pnu``.
        ratio: passed through to ``cleanup_corpora.clean_corpus_pnu``.

    Returns:
        The loaded pipeline, or the best pipeline found by
        ``ss_model_selection.best_model_cross_val``.
    """
    if not from_scratch:
        try:
            return load_pipeline()
        except Exception as err:
            # Loading is best effort, so any ordinary failure means "train
            # anew". Unlike a bare except this lets KeyboardInterrupt and
            # SystemExit propagate, and the reason is reported instead of
            # being silently dropped.
            print("Could not load existing pipeline:", repr(err))

    print("Building new classifier")

    P, N, U = cleanup_corpora.clean_corpus_pnu(ratio=ratio,
                                               mode=mode,
                                               percentiles=[10, 25, 10])

    print("P (HoC labelled + CIViC)", num_rows(P), "\tN (HoC unlabelled)",
          num_rows(N), "\tU (CIViC source abstracts)", num_rows(U))

    best_pipeline = ss_model_selection.best_model_cross_val(P, N, U, fold=10)

    if write or outpath:
        outpath = outpath or file_path(
            "./semisuper/pickles/semi_pipeline.pickle")
        print("Pickling pipeline to", outpath)
        with open(outpath, "wb") as f:
            pickle.dump(best_pipeline, f)

    return best_pipeline
コード例 #6
0
def model_pu_score_record(P_train, U_train, P_test, U_test, m):
    """Train one PU model and summarize how it scores on held-out P and U.

    ``m`` is a record with a model-building callable under 'model' and a
    label under 'name'. The returned dict holds the name, the same callable,
    the PU score of the predictions, and the share of U_test predicted
    positive (sum of predictions divided by the number of U_test rows).
    """
    build_model = m['model']
    model = build_model(P_train, U_train)
    name = m['name']

    y_pred = model.predict(helpers.concatenate((P_test, U_test)))

    # predictions come back in the order P_test, U_test
    n_pos = num_rows(P_test)
    y_P, y_U = y_pred[:n_pos], y_pred[n_pos:]

    record = {
        'name': name,
        'model': build_model,
        'pu_score': pu_score(y_P, y_U),
        'ratio_in_U': np.sum(y_U) / num_rows(y_U),
    }
    return record
コード例 #7
0
def get_RN_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """Split U into Reliable Negatives and the rest using a Rocchio model.

    A model built by ``rocchio(P, U, ...)`` predicts U, and
    ``partition_pos_neg`` splits U by those predictions.

    Returns (U_minus_RN, RN).
    """
    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")

    y_U = rocchio(P, U, alpha=alpha, beta=beta).predict(U)
    U_minus_RN, RN = partition_pos_neg(U, y_U)

    if verbose:
        print("Reliable Negative examples in U:", num_rows(RN), "(",
              100 * num_rows(RN) / num_rows(U), "%)")

    return U_minus_RN, RN
コード例 #8
0
def rocchio(P, N, alpha=16, beta=4, binary=False):
    """Fit a BinaryRocchio classifier on P (label 1) versus N (label 0).

    ``alpha`` and ``beta`` are forwarded to the BinaryRocchio constructor.
    ``binary`` is accepted but not used by this function.

    Returns whatever ``BinaryRocchio.fit`` returns (the fitted model).
    """
    classifier = BinaryRocchio(alpha=alpha, beta=beta)

    examples = concatenate((P, N))
    labels = concatenate((ones(num_rows(P)), zeros(num_rows(N))))

    return classifier.fit(examples, labels)
コード例 #9
0
def remove_P_from_U(P, U, ratio=1.0, inverse=False, verbose=True):
    """Remove sentences from U that a PU model trained on P vs. U labels positive.

    The model comes from ``best_pu`` on the vectorized (and, for ratio < 1.0,
    subsampled) sets; it is then applied to all of U.

    Args:
        P: guide set of sentences.
        U: noisy set of sentences to filter.
        ratio: share of P and U used for training the PU model.
        inverse: if true, keep the sentences predicted positive rather than
            discarding them.
        verbose: if true, print up to 10 random kept and discarded sentences.

    Returns:
        Object-dtype numpy array of the sentences that are kept.
    """

    guide_, noisy_, vectorizer, selector = vectorize_preselection(P, U, ratio=ratio)

    model = best_pu(guide_, noisy_)

    y_noisy = model.predict(selector.transform((vectorizer.transform(U))))

    if inverse:
        action = "Keeping"
        criterion = 1
    else:
        action = "Discarding"
        criterion = 0

    print(action, (100 * np.sum(y_noisy) / num_rows(y_noisy)), "% of noisy data (", np.sum(y_noisy), "sentences )",
          "as per result of PU learning")

    keeping = np.array([x for (x, y) in zip(U, y_noisy) if y == criterion], dtype=object)

    if verbose:
        kept = keeping.tolist()
        discarding = [x for (x, y) in zip(U, y_noisy) if y != criterion]
        # random.sample raises ValueError when asked for more items than
        # exist, so cap the example count at the size of each partition.
        print("Keeping e.g.\n", random.sample(kept, min(10, len(kept))))
        print("Discarding e.g.\n", random.sample(discarding, min(10, len(discarding))))

    return keeping
コード例 #10
0
def remove_most_similar_percent(P, U, ratio=1.0, percentile=10, inverse=False, verbose=True):
    """Remove the percentile of U most similar to P according to a PU model.

    The model comes from ``best_pu`` on the vectorized (and, for ratio < 1.0,
    subsampled) sets. Every sentence of U is scored with the model's
    ``decision_function`` or, failing that, ``predict_proba``; the split is
    done by ``select_PN_below_score``.

    Args:
        P: guide set of sentences.
        U: noisy set of sentences to filter.
        ratio: share of P and U used for training the PU model.
        percentile: percentage of U to remove.
        inverse: if true, remove the least rather than the most similar.
        verbose: if true, print example sentences of both partitions.

    Returns:
        The part of U that is kept (first value of ``select_PN_below_score``).

    Raises:
        TypeError: if the model offers neither ``decision_function`` nor
            ``predict_proba``.
    """

    guide_, noisy_, vectorizer, selector = vectorize_preselection(P, U, ratio=ratio)

    model = best_pu(guide_, noisy_)

    # For ratio < 1.0 noisy_ only covers the subsampled part of U, but the
    # scores below are paired with every sentence of U, so score the full set.
    if ratio < 1.0:
        U_vec = selector.transform(vectorizer.transform(U))
    else:
        U_vec = noisy_

    if hasattr(model, 'decision_function'):
        y_pred = model.decision_function(U_vec)
    elif hasattr(model, 'predict_proba'):
        y_pred = np.abs(model.predict_proba(U_vec)[:, 1])
    else:
        raise TypeError("PU model provides neither decision_function nor predict_proba")

    if inverse:
        predicate = "least"
    else:
        predicate = "most"
        # negate so that the most similar sentences get the lowest scores
        y_pred = -y_pred

    print("Removing", percentile, "% of noisy data", predicate, "similar to guide set"
          , "(", (percentile * num_rows(U) / 100), "sentences )")

    U = np.array(U, dtype=object)
    U_minus_PN, PN = select_PN_below_score(y_pred, U, y_pred, noise_lvl=percentile / 100.0)

    if verbose:
        for caption, part in (("Keeping e.g.\n", U_minus_PN), ("Discarding e.g.\n", PN)):
            # train_test_split cannot draw 10 training rows from 10 or fewer
            # samples; show small partitions in full instead of crashing.
            if num_rows(part) > 10:
                part = train_test_split(part, train_size=10)[0]
            print(caption, part)

    return U_minus_PN
コード例 #11
0
ファイル: pu_biased_svm.py プロジェクト: nachne/semisuper
def biased_SVM_grid_search(P,
                           U,
                           Cs=None,
                           kernel='linear',
                           n_estimators=9,
                           verbose=False):
    """Grid-search C for a bagged, class-balanced LinearSVC trained on P vs. U.

    P rows are labelled 1 and U rows 0. Candidates are scored with
    ``pu_scorer``. ``Cs`` defaults to 10**-12 ... 10**10 in steps of two
    orders of magnitude. ``kernel`` is accepted but not used here (LinearSVC
    has no kernel parameter).

    Returns the best estimator found by the grid search.
    """
    Cs = [10**x for x in range(-12, 12, 2)] if Cs is None else Cs

    if verbose:
        print(
            "Running Biased-SVM with balanced class weights and grid search over",
            len(Cs), "C values")

    # 'base_estimator__*' entries configure the wrapped LinearSVC; the others
    # are parameters of the Bagging wrapper itself. n_jobs is deliberately not
    # set: parallelization there is incompatible with multiprocessing.
    param_grid = {
        'base_estimator__C': Cs,
        'base_estimator__class_weight': ['balanced'],
        'bootstrap': [True],
        'n_estimators': [n_estimators],
    }
    grid_search = GridSearchCV(BaggingClassifier(LinearSVC()),
                               param_grid=param_grid,
                               scoring=pu_scorer,
                               verbose=0)

    if verbose:
        print("Grid searching parameters for biased-SVM")

    examples = concatenate((P, U))
    labels = concatenate((ones(num_rows(P)), zeros(num_rows(U))))
    grid_search.fit(examples, labels)

    best = grid_search.best_estimator_
    if verbose:
        train_report(best, P, U)
    print("Biased-SVM parameters:", grid_search.best_params_, "\tPU score:",
          grid_search.best_score_)

    return best
コード例 #12
0
def standalone_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """1-step Rocchio method: build a model via ``rocchio(P, U, ...)`` and return it.

    The model's predictions on U are partitioned with ``partition_pos_neg``;
    the partition is only used for the verbose report.
    """
    print("Running Rocchio")

    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")

    model = rocchio(P, U, alpha=alpha, beta=beta)
    _, RN = partition_pos_neg(U, model.predict(U))

    if verbose:
        print("Reliable Negative examples in U:", num_rows(RN), "(",
              100 * num_rows(RN) / num_rows(U), "%)")
        train_report(model, P, U)

    return model
コード例 #13
0
def ranking_cos_sim(X, threshold=0.1, compute_thresh=False):
    """Fit a SimRanker on X, labelling every row as positive.

    ``threshold`` (default 0.1) and ``compute_thresh`` are passed to the
    SimRanker constructor. NOTE(review): per the previous description, the
    ranker fits the mean training vector, predicts whether cosine similarity
    exceeds the threshold, returns similarity scores from ``predict_proba``,
    and derives the threshold from the training vectors' scores when
    ``compute_thresh`` is true -- confirm against SimRanker.

    Returns whatever ``SimRanker.fit`` returns (the fitted model).
    """
    ranker = SimRanker(threshold, compute_thresh)
    return ranker.fit(X, ones(num_rows(X)))
コード例 #14
0
def clean_corpus_pu(ratio=1.0):
    """Assemble raw P, U and test sets for PU learning from the module corpora.

    HoC[neg] and HoC[pos] are first filtered with ``remove_P_from_U`` (what is
    ambiguous according to PU training is removed). Each of HoC[pos], CIViC
    and HoC[neg] is then split 80/20 with RANDOM_SEED. P is the HoC[pos] and
    CIViC training parts, U is the abstracts plus the HoC[neg] training part,
    and the test set is the held-out positives followed by the held-out
    HoC[neg]. For ratio < 1.0 every resulting set is subsampled to that share.

    Returns (P_raw, U_raw, X_test_raw, y_test), with y_test being 1 for the
    positive test sentences and 0 for the negative ones.
    """
    def holdout(corpus):
        return train_test_split(corpus, test_size=0.2, random_state=RANDOM_SEED)

    def subsample(corpus):
        return train_test_split(corpus, train_size=ratio, random_state=RANDOM_SEED)[0]

    print("\nRemoving CIViC-like sentences from HoC[neg]\n")
    hocneg_ = remove_P_from_U(U=hocneg, P=civic, ratio=ratio)

    print("\nRemoving HoC[neg]-like sentences from HoC[pos]\n")
    hocpos_ = remove_P_from_U(U=hocpos, P=hocneg_, ratio=ratio)

    hocpos_train, hocpos_test = holdout(hocpos_)
    civic_train, civic_test = holdout(civic)
    hocneg_train, X_test_neg = holdout(hocneg_)

    P_raw = helpers.concatenate((hocpos_train, civic_train))
    U_raw = helpers.concatenate((abstracts, hocneg_train))
    X_test_pos = helpers.concatenate((hocpos_test, civic_test))

    if ratio < 1.0:
        P_raw, U_raw, X_test_pos, X_test_neg = [
            subsample(part) for part in (P_raw, U_raw, X_test_pos, X_test_neg)
        ]

    X_test_raw = helpers.concatenate((X_test_pos, X_test_neg))
    y_test = helpers.concatenate(
        (np.ones(num_rows(X_test_pos)), np.zeros(num_rows(X_test_neg))))

    return P_raw, U_raw, X_test_raw, y_test
コード例 #15
0
def get_RN_cosine_rocchio(P,
                          U,
                          noise_lvl=0.20,
                          alpha=16,
                          beta=4,
                          verbose=False):
    """Extract Reliable Negatives from U via cosine ranking plus Rocchio.

    Step 1: a ranker from ``ranking_cos_sim(P)`` scores P and U, and
    ``select_PN_below_score`` picks Potential Negatives from U using those
    scores and ``noise_lvl``.
    Step 2: a Rocchio model built from P and the Potential Negatives predicts
    U, and ``partition_pos_neg`` splits U by those predictions.

    Returns (U_minus_RN, RN).
    """
    def log(message):
        if verbose:
            print(message)

    log("Computing ranking (cosine similarity to mean positive example)")
    ranker = ranking_cos_sim(P)
    sims_P = ranker.predict_proba(P)
    sims_U = ranker.predict_proba(U)

    log("Choosing Potential Negative examples with ranking threshold")
    PN = select_PN_below_score(sims_P,
                               U,
                               sims_U,
                               noise_lvl=noise_lvl,
                               verbose=verbose)[1]

    log("Building Rocchio model to determine Reliable Negative examples")
    y_U = rocchio(P, PN, alpha=alpha, beta=beta).predict(U)

    U_minus_RN, RN = partition_pos_neg(U, y_U)
    if verbose:
        print("Reliable Negative examples in U:", num_rows(RN), "(",
              100 * num_rows(RN) / num_rows(U), "%)")

    return U_minus_RN, RN
コード例 #16
0
def best_model_cross_val(P, N, U, fold=10):
    """Determine the best model, cross-validate it, and retrain it on all data.

    Args:
        P: positive examples.
        N: negative examples.
        U: unlabelled examples.
        fold: number of cross-validation folds.

    Returns:
        Pipeline of vectorizer, selector and classifier, trained on all of
        P, N and U.
    """

    print("\nFinding best model")

    best = get_best_model(P, N, U)['best']

    print("\nCross-validation\n")

    kf = KFold(n_splits=fold, shuffle=True)
    # P and N are split independently; fold i pairs the i-th split of each
    splits = zip(list(kf.split(P)), list(kf.split(N)))

    # TODO doesn't work in parallel (multiprocessing pool over the folds)
    stats = [eval_fold(best, P, N, U, i_split) for i_split in enumerate(splits)]

    mean_stats = np.mean(stats, 0)
    print("Cross-validation average: p {}, r {}, f1 {}, acc {}".format(
            mean_stats[0], mean_stats[1], mean_stats[2], mean_stats[3]))

    print("Retraining model on full data")

    vec, sel = best['vectorizer'], best['selector']
    vec.fit(concatenate((P, N, U)))
    P_, N_, U_ = [vec.transform(x) for x in [P, N, U]]

    # get_best_model allows a falsy selector (no feature selection); skip the
    # selection step in that case instead of failing on sel.fit.
    if sel:
        y_pp = concatenate((np.ones(num_rows(P)), -np.ones(num_rows(N)), np.zeros(num_rows(U))))
        sel.fit(concatenate((P_, N_, U_)), y_pp)
        P_, N_, U_ = [sel.transform(x) for x in [P_, N_, U_]]

    model = best['untrained_model'](P_, N_, U_)

    print("Ratio of U classified as positive:", np.sum(model.predict(U_)) / num_rows(U_))
    print("Returning final model")

    return Pipeline([('vectorizer', vec), ('selector', sel), ('clf', model)])
コード例 #17
0
def eval_fold(model_record, P, N, U, i_splits):
    """Train and score one cross-validation fold.

    ``i_splits`` is ``(fold number, (P split, N split))`` where each split is
    a (train indices, test indices) pair. A fresh clone of the record's
    vectorizer and selector is fitted on the training parts plus U, with
    labels +1 (P), -1 (N) and 0 (U); the record's untrained model is trained
    on the transformed data and evaluated on the held-out P (1) and N (0).

    Returns [precision, recall, f1, accuracy], the first three weighted.
    """
    fold_no, (p_split, n_split) = i_splits
    p_train_idx, p_test_idx = p_split[0], p_split[1]
    n_train_idx, n_test_idx = n_split[0], n_split[1]

    P_train, P_test = P[p_train_idx], P[p_test_idx]
    N_train, N_test = N[n_train_idx], N[n_test_idx]

    train_labels = concatenate((np.ones(num_rows(P_train)),
                                -np.ones(num_rows(N_train)),
                                np.zeros(num_rows(U))))
    preprocessor = clone(Pipeline([
        ('vectorizer', model_record['vectorizer']),
        ('selector', model_record['selector']),
    ]))
    preprocessor.fit(concatenate((P_train, N_train, U)), train_labels)

    P_, N_, U_, P_test_, N_test_ = [
        preprocessor.transform(part)
        for part in (P_train, N_train, U, P_test, N_test)
    ]
    model = model_record['untrained_model'](P_, N_, U_)

    y_pred = model.predict(concatenate((P_test_, N_test_)))
    y_test = concatenate((np.ones(num_rows(P_test_)), np.zeros(num_rows(N_test_))))

    pr, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    acc = accuracy_score(y_test, y_pred)

    print("Fold no.", fold_no, "acc", acc, "classification report:\n", classification_report(y_test, y_pred))
    return [pr, r, f1, acc]
コード例 #18
0
def spy_partition(P, spy_ratio=0.1):
    """Randomly split P into the remaining rows and the Spy Documents.

    ``int(spy_ratio * number of rows)`` row indices are drawn with
    ``random.sample``; those rows become the spies, in draw order.

    Returns (P_minus_spies, spies).
    """
    total = num_rows(P)
    spy_count = int(spy_ratio * total)

    spy_idx = random.sample(range(total), spy_count)

    # boolean mask that is True for every row not drawn as a spy
    keep = np.isin(np.arange(total), spy_idx, invert=True)

    return P[keep], P[spy_idx]
コード例 #19
0
def run_EM_with_RN(P,
                   U,
                   RN,
                   max_pos_ratio=1.0,
                   tolerance=0.05,
                   max_imbalance_P_RN=10.0,
                   clf_selection=True,
                   verbose=False):
    """Second-step PU method: train NB on P and RN to label U, then iterate EM.

    Args:
        P: positive examples.
        U: unlabelled examples (without the reliable negatives).
        RN: reliable negative examples.
        max_pos_ratio: passed through to ``iterate_EM``.
        tolerance: passed through to ``iterate_EM``.
        max_imbalance_P_RN: if P has more than this many times the rows of
            RN, the initial classifier is trained on a random sample of P of
            that capped size.
        clf_selection: passed through to ``iterate_EM``.
        verbose: print progress messages.

    Returns:
        The model returned by ``iterate_EM``, or the initial classifier if U
        is empty.
    """

    max_P_init = max_imbalance_P_RN * num_rows(RN)
    if num_rows(P) > max_P_init:
        # Sample row indices instead of rows: this draws the same rows as
        # sampling list(P) for dense input, and also works when P is a sparse
        # matrix, whose rows cannot be re-stacked with np.array.
        idx = random.sample(range(num_rows(P)), int(max_P_init))
        try:
            P_init = P[idx]
        except TypeError:
            # plain sequences do not support index lists
            P_init = np.array([P[i] for i in idx])
    else:
        P_init = P

    # announce the step regardless of whether P was subsampled
    if verbose:
        print(
            "\nBuilding classifier from Positive and Reliable Negative set"
        )
    initial_model = build_proba_MNB(concatenate((P_init, RN)),
                                    concatenate((np.ones(num_rows(P_init)),
                                                 np.zeros(num_rows(RN)))),
                                    verbose=verbose)

    if num_rows(U) == 0:
        print("Warning: EM: All of U was classified as negative.")
        return initial_model

    y_P = np.array([1] * num_rows(P))

    if verbose:
        print(
            "\nCalculating initial probabilistic labels for Reliable Negative and Unlabelled set"
        )
    # column 1 of predict_proba is used as the probabilistic label
    ypU = initial_model.predict_proba(U)[:, 1]
    ypN = initial_model.predict_proba(RN)[:, 1]

    if verbose:
        print("\nIterating EM algorithm on P, RN and U\n")
    model = iterate_EM(P,
                       concatenate((RN, U)),
                       y_P,
                       concatenate((ypN, ypU)),
                       tolerance=tolerance,
                       max_pos_ratio=max_pos_ratio,
                       clf_selection=clf_selection,
                       verbose=verbose)

    return model
コード例 #20
0
def model_eval_record(P, N, U, X, y, m):
    """Train the model described by ``m`` on P, N, U and score it on X, y.

    Helper for finding the best model in parallel. ``m`` holds a
    model-building callable under 'model' and a label under 'name'.

    Returns a dict with the name, weighted precision/recall/F1, accuracy, the
    classification report, the trained model, the untrained callable, and the
    share of U predicted positive (sum of predictions over rows of U).
    """
    untrained_model = m['model']
    model = untrained_model(P, N, U)
    name = m['name']

    y_pred = model.predict(X)

    p, r, f1, _ = precision_recall_fscore_support(y, y_pred, average='weighted')
    record = {'name': name, 'p': p, 'r': r, 'f1': f1}
    record['acc'] = accuracy_score(y, y_pred)
    record['clsr'] = classification_report(y, y_pred)
    record['model'] = model
    record['untrained_model'] = untrained_model
    record['U_ratio'] = np.sum(model.predict(U)) / num_rows(U)

    print("\n")
    return record
コード例 #21
0
def save_silver_standard(pipeline, write=True, outpath=None):
    """Classify all abstract sentences and return them as a labelled DataFrame.

    Rows come from ``abstracts2pmid_pos_sentence_title()`` with columns
    pmid, sentence position, text and title (in that order). Each sentence
    gets the pipeline's predicted label and a score: ``decision_function`` if
    available, else the absolute value of column 1 of ``predict_proba``, else
    0. The table is written as TSV when ``write`` or ``outpath`` is set.
    """
    print("Building new silver standard")

    float_format = '%.4g'

    # column positions within the abstracts array
    PMID_COL, POS_COL, TEXT_COL, TITLE_COL = 0, 1, 2, 3

    abstracts = np.array(abstracts2pmid_pos_sentence_title())
    sentences = abstracts[:, TEXT_COL]

    if hasattr(pipeline, 'decision_function'):
        dec_fn = pipeline.decision_function(sentences)
    elif hasattr(pipeline, 'predict_proba'):
        dec_fn = np.abs(pipeline.predict_proba(sentences)[:, 1])
    else:
        dec_fn = [0] * num_rows(abstracts)

    y = pipeline.predict(sentences).astype(int)

    table = {
        "label": y,
        "decision_function": [float_format % score for score in dec_fn],
        "pmid": abstracts[:, PMID_COL],
        "sentence_pos": [float_format % float(p) for p in abstracts[:, POS_COL]],
        "text": sentences,
        "title": abstracts[:, TITLE_COL],
    }
    column_order = ["label", "decision_function", "pmid",
                    "sentence_pos", "text", "title"]
    abs_classified = pd.DataFrame(data=table, columns=column_order)

    if write or outpath:
        outpath = outpath or file_path(
            "./semisuper/silver_standard/silver_standard.tsv")
        print("Writing silver standard corpus to", outpath)
        abs_classified.to_csv(outpath, sep="\t", float_format=float_format)

    return abs_classified
コード例 #22
0
def model_eval_record(X, y, U, m):
    """Build the model described by ``m`` and score it on X, y.

    ``m`` holds a zero-argument model-building callable under 'model' and a
    label under 'name'.

    Returns a dict with the name, macro-averaged precision/recall/F1,
    accuracy, the classification report, the built model, and the share of U
    predicted positive (sum of predictions over rows of U).
    """
    model = m['model']()
    name = m['name']

    y_pred = model.predict(X)

    p, r, f1, _ = precision_recall_fscore_support(y, y_pred, average='macro')
    record = {'name': name, 'p': p, 'r': r, 'f1': f1}
    record['acc'] = accuracy_score(y, y_pred)
    record['clsr'] = classification_report(y, y_pred)
    record['model'] = model
    record['U_ratio'] = np.sum(model.predict(U)) / num_rows(U)

    return record
コード例 #23
0
ファイル: pu_biased_svm.py プロジェクト: nachne/semisuper
def biased_SVM_weight_selection(P,
                                U,
                                Cs_neg=None,
                                Cs_pos_factors=None,
                                Cs=None,
                                kernel='linear',
                                test_size=0.2,
                                verbose=False):
    """Run biased SVMs over class-weight combinations and keep the best PU score.

    Args:
        P: positive examples.
        U: unlabelled examples (treated as negative, label 0).
        Cs_neg: negative class weights to try (default: [1]).
        Cs_pos_factors: factors by which the positive weight exceeds the
            negative one (default: range(1, 1100, 200)).
        Cs: values of C to try (default: 10**-12 ... 10**10, every second
            power).
        kernel: passed to ``eval_params`` and ``build_biased_SVM``.
        test_size: share of P and U held out for scoring the combinations.
        verbose: print the individual scores, the best combination and a
            training report.

    Returns:
        Biased SVM built with the best parameters on all of P and U.
    """

    # default values
    if Cs is None:
        Cs = [10**x for x in range(-12, 12, 2)]
    if Cs_neg is None:
        Cs_neg = [1]  # arange(0.01, 0.63, 0.04)
    if Cs_pos_factors is None:
        Cs_pos_factors = range(1, 1100, 200)

    # every combination as a (C, C_pos, C_neg) triple
    Cs = [(C, C_neg * j, C_neg) for C in Cs for C_neg in Cs_neg
          for j in Cs_pos_factors]

    if verbose:
        print(
            "Running Biased-SVM with range of C and positive class weight factors.",
            num_rows(Cs), "parameter combinations.")

    P_train, P_test = train_test_split(P, test_size=test_size)
    U_train, U_test = train_test_split(U, test_size=test_size)
    X = concatenate((P_train, U_train))
    y = concatenate((ones(num_rows(P_train)), zeros(num_rows(U_train))))

    # Materialize the results: a lazy map() would be exhausted by max(),
    # leaving nothing to list afterwards.
    score_weights = [
        eval_params(params,
                    X_train=X,
                    y_train=y,
                    P_test=P_test,
                    U_test=U_test,
                    kernel=kernel) for params in Cs
    ]

    # each result starts with its PU score, followed by the parameter dict
    best_score_params = max(score_weights, key=lambda tup: tup[0])

    if verbose:
        for s in score_weights:
            print(s)
        print("\nBest model has parameters", best_score_params[1],
              "and PU-score", best_score_params[0])
        print("Building final classifier")

    model = build_biased_SVM(concatenate((P, U)),
                             concatenate(
                                 (ones(num_rows(P)), zeros(num_rows(U)))),
                             C_pos=best_score_params[1]['C_pos'],
                             C_neg=best_score_params[1]['C_neg'],
                             C=best_score_params[1]['C'],
                             probability=True,
                             kernel=kernel)

    if verbose:
        train_report(model, P, U)
    print("Returning Biased-SVM with parameters", best_score_params[1],
          "and PU-score", best_score_params[0])
    return model
コード例 #24
0
def get_best_model(P_train, N_train, U_train, X_test=None, y_test=None):
    """Evaluate preprocessing/classifier combinations and collect their stats.

    If no test set is given, 20% of P and N (split with RANDOM_SEED) are held
    out as test data with labels 1 and 0. For each preprocessing combination
    from ``preproc_param_dict()`` every estimator from ``estimator_list()`` is
    trained and evaluated through ``model_eval_record``.

    Returns a dict with 'best' (the record with the highest accuracy, plus
    its vectorizer and selector) and 'all' (per combination, the list of
    records with the trained model removed).
    """
    print("Evaluating parameter ranges for preprocessor and classifiers")

    if X_test is None or y_test is None:
        P_train, X_test_pos = train_test_split(P_train, test_size=0.2, random_state=RANDOM_SEED)
        N_train, X_test_neg = train_test_split(N_train, test_size=0.2, random_state=RANDOM_SEED)
        X_test = concatenate((X_test_pos, X_test_neg))
        y_test = concatenate((np.ones(num_rows(X_test_pos)), np.zeros(num_rows(X_test_neg))))

    X_train = concatenate((P_train, N_train, U_train))
    y_train_pp = concatenate((np.ones(num_rows(P_train)), -np.ones(num_rows(N_train)), np.zeros(num_rows(U_train))))

    results = {'best': {'f1': -1, 'acc': -1}, 'all': []}

    preproc_params = preproc_param_dict()
    estimators = estimator_list()

    # one flat grid; the rightmost parameter varies fastest
    grid = product(preproc_params['wordgram_range'],
                   preproc_params['chargram_range'],
                   preproc_params['rules'],
                   preproc_params['df_min'],
                   preproc_params['df_max'],
                   preproc_params['feature_select'])

    for wordgram, chargram, r, df_min, df_max, fs in grid:
        # at least one of word or character n-grams is needed
        if wordgram is None and chargram is None:
            continue

        print("\n----------------------------------------------------------------",
              "\nwords:", wordgram, "chars:", chargram, "feature selection:", fs,
              "df_min, df_max:", df_min, df_max, "rules", r,
              "\n----------------------------------------------------------------\n")

        start_time = time.time()

        X_train_, X_test_, vectorizer, selector = prepare_train_test(
            trainData=X_train, testData=X_test,
            trainLabels=y_train_pp, rules=r,
            wordgram_range=wordgram,
            feature_select=fs,
            chargram_range=chargram,
            min_df_char=df_min, min_df_word=df_min,
            max_df=df_max)

        # vectorize the three training sets separately, selecting features
        # only when a selector exists
        vectorized = [vectorizer.transform(x) for x in (P_train, N_train, U_train)]
        if selector:
            vectorized = [selector.transform(x) for x in vectorized]
        P_train_, N_train_, U_train_ = vectorized

        # fit models
        evaluate = partial(model_eval_record,
                           P_train_, N_train_, U_train_, X_test_, y_test)
        if PARALLEL:
            with multi.Pool(min(multi.cpu_count(), len(estimators))) as p:
                iter_stats = list(p.map(evaluate, estimators, chunksize=1))
        else:
            iter_stats = [evaluate(estimator) for estimator in estimators]

        # finalize records: add preprocessing stats, update best, remove model
        for m in iter_stats:
            # NOTE(review): 'n-grams' and 'rules' are stored as 1-tuples
            # (looks like accidental trailing commas) -- kept as is so the
            # record shape stays unchanged.
            m['n-grams'] = ({'word': wordgram, 'char': chargram},)
            m['rules'] = (r,)
            m['df_min, df_max'] = (df_min, df_max)
            m['fs'] = fs()
            if m['acc'] > results['best']['acc']:
                best = deepcopy(m)
                best['vectorizer'] = vectorizer
                best['selector'] = selector
                results['best'] = best
            m.pop('model', None)

        results['all'].append(iter_stats)

        print("Evaluated words:", wordgram, "chars:", chargram,
              "rules:", r,
              "feature selection:", fs, "min_df:", df_min,
              "in %s seconds\n" % (time.time() - start_time))

        print_reports(iter_stats)

    print_results(results)

    return results
コード例 #25
0
def getBestModel(P_train, U_train, X_test, y_test):
    """Evaluate preprocessing/classifier combinations and return the best record.

    For every combination in the (currently single-valued) preprocessing grid,
    the training data is vectorized via ``prepareTrainTest``, a list of named
    two-step PU model factories is evaluated with ``model_eval_record``, and
    the record with the highest ``'acc'`` is remembered together with the
    vectorizer and selector that produced its features.

    Side effects: prints progress and reports, pickles the full ``results``
    dict to a timestamped file under ``./pickles/`` and prints how much of
    ``U_train`` the best model labels as positive.

    Args:
        P_train: positive training examples (raw input for the vectorizer).
        U_train: unlabelled training examples (raw input for the vectorizer).
        X_test: test examples handed to ``prepareTrainTest`` as ``testData``.
        y_test: test labels handed to ``model_eval_record``.

    Returns:
        ``results['best']``: a deep copy of the best evaluation record with
        the extra keys ``'vectorizer'`` and ``'selector'``. The record is
        expected to contain ``'acc'`` and ``'model'`` (both are read here);
        its remaining keys come from ``model_eval_record``, which is not
        visible in this block.
    """

    print("\nEvaluating parameter ranges for preprocessor and classifiers")

    # P gets label 1 and U label 0; these labels are only passed to
    # prepareTrainTest (as trainLabels), not to the PU models below
    X_train = concatenate((P_train, U_train))
    y_train_pp = concatenate(
        (np.ones(num_rows(P_train)), np.zeros(num_rows(U_train))))

    # 'best' starts with sentinel scores so that any real record replaces it
    results = {'best': {'f1': -1, 'acc': -1}, 'all': []}

    # preprocessing grid; the trailing comments list previously tried ranges
    preproc_params = {
        'df_min': [0.002],
        'df_max': [1.0],
        'rules': [True],
        'wordgram_range': [(1, 4)],  # [None, (1, 2), (1, 3), (1, 4)],
        'chargram_range': [(2, 6)],  # [None, (2, 4), (2, 5), (2, 6)],
        'feature_select': [
            partial(transformers.percentile_selector, 'chi2'),
            # partial(transformers.factorization, 'PCA', 10),
            # partial(transformers.factorization, 'PCA', 100),
            # partial(transformers.factorization, 'PCA', 1000),
        ]
    }

    for wordgram, chargram in product(preproc_params['wordgram_range'],
                                      preproc_params['chargram_range']):
        for r in preproc_params['rules']:
            for df_min, df_max in product(preproc_params['df_min'],
                                          preproc_params['df_max']):
                for fs in preproc_params['feature_select']:

                    # no n-gram range at all: nothing to vectorize; the break
                    # only leaves this innermost (feature_select) loop
                    if wordgram is None and chargram is None:
                        break

                    print(
                        "\n----------------------------------------------------------------",
                        "\nwords:", wordgram, "chars:", chargram,
                        "feature selection:", fs,
                        "\n----------------------------------------------------------------\n"
                    )

                    start_time = time.time()

                    # X_train_ is returned but not used below; P and U are
                    # transformed separately with the fitted vectorizer/selector
                    X_train_, X_dev_, vectorizer, selector = prepareTrainTest(
                        trainData=X_train,
                        testData=X_test,
                        trainLabels=y_train_pp,
                        rules=r,
                        wordgram_range=wordgram,
                        feature_select=fs,
                        chargram_range=chargram,
                        min_df_char=df_min,
                        min_df_word=df_min,
                        max_df=df_max)
                    # selector may be falsy, in which case only the vectorizer is applied
                    if selector:
                        P_train_ = selector.transform(
                            vectorizer.transform(P_train))
                        U_train_ = selector.transform(
                            vectorizer.transform(U_train))
                    else:
                        P_train_ = vectorizer.transform(P_train)
                        U_train_ = vectorizer.transform(U_train)

                    pp = {'word': wordgram, 'char': chargram}

                    # fit models
                    # each entry pairs a display name with a partial that
                    # already binds P_train_ and U_train_
                    iteration = [
                        # {'name': 'i-em', 'model': partial(two_step.i_EM, P_train_, U_train_)},
                        # {'name' : 's-em spy=0.1',
                        #  'model': partial(two_step.s_EM, P_train_, U_train_, spy_ratio=0.1, noise_lvl=0.1)},
                        # {'name' : 's-em spy=0.2',
                        #  'model': partial(two_step.s_EM, P_train_, U_train_, spy_ratio=0.2, noise_lvl=0.2)},
                        {
                            'name': 'roc-svm',
                            'model': partial(two_step.roc_SVM, P_train_,
                                             U_train_)
                        },
                        {
                            'name':
                            'cr_svm noise=0.1',
                            'model':
                            partial(two_step.cr_SVM,
                                    P_train_,
                                    U_train_,
                                    noise_lvl=0.1)
                        },
                        {
                            'name':
                            'cr_svm noise=0.2',
                            'model':
                            partial(two_step.cr_SVM,
                                    P_train_,
                                    U_train_,
                                    noise_lvl=0.2)
                        },
                        {
                            'name':
                            'cr_svm noise=0.3',
                            'model':
                            partial(two_step.cr_SVM,
                                    P_train_,
                                    U_train_,
                                    noise_lvl=0.3)
                        },
                        # {'name': 'roc_em', 'model': partial(two_step.roc_EM, P_train_, U_train_)},
                        # {'name' : 'spy_svm spy=0.1',
                        #  'model': partial(two_step.spy_SVM, P_train_, U_train_, spy_ratio=0.1, noise_lvl=0.1)},
                        # {'name' : 'spy_svm spy=0.2',
                        #  'model': partial(two_step.spy_SVM, P_train_, U_train_, spy_ratio=0.2, noise_lvl=0.2)},
                        # {'name' : 'biased-svm',
                        #  'model': partial(biased_svm.biased_SVM_weight_selection, P_train_, U_train_)},
                        ## {'name' : 'bagging-svm',
                        ##  'model': partial(biased_svm.biased_SVM_grid_search, P_train_, U_train_)}
                    ]

                    # eval models
                    # one worker per model entry at most; chunksize=1 hands
                    # each entry to the pool as its own task
                    if PARALLEL:
                        with multi.Pool(min(multi.cpu_count(),
                                            len(iteration))) as p:
                            iter_stats = list(
                                p.map(partial(model_eval_record, X_dev_,
                                              y_test, U_train_),
                                      iteration,
                                      chunksize=1))
                    else:
                        iter_stats = list(
                            map(
                                partial(model_eval_record, X_dev_, y_test,
                                        U_train_), iteration))

                    # finalize records: remove model, add n-gram stats, update best
                    for m in iter_stats:
                        m['n-grams'] = pp
                        # stores the object returned by calling the
                        # feature-selection factory, not the factory itself
                        m['fs'] = fs()
                        if m['acc'] > results['best']['acc']:
                            # deep copy first so the best record keeps its
                            # 'model' after it is popped from m below
                            results['best'] = deepcopy(m)
                            results['best']['vectorizer'] = vectorizer
                            results['best']['selector'] = selector
                        m.pop('model', None)

                    results['all'].append(iter_stats)

                    print("Evaluated words:", wordgram, "chars:", chargram,
                          "in %s seconds\n" % (time.time() - start_time))

                    print_reports(iter_stats)

    print_results(results)

    # save results to disk
    # NOTE(review): the timestamp puts a space and colons into the file name;
    # confirm this is acceptable on every target file system

    with open(
            file_path("./pickles/model_eval{}.pickle".format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))),
            "wb") as f:
        print('saving model stats to disk\n')
        pickle.dump(results, f)

    # ----------------------------------------------------------------
    # check how much of U (abstracts) is supposed to be positive
    # ----------------------------------------------------------------

    # NOTE(review): if no combination was evaluated, results['best'] is still
    # the sentinel dict and the lookups below raise KeyError
    best_model = results['best']['model']
    selector = results['best']['selector']
    vectorizer = results['best']['vectorizer']

    print("\nAmount of unlabelled training set classified as positive:")
    if selector:
        transformedU = (selector.transform(vectorizer.transform(U_train)))
    else:
        transformedU = (vectorizer.transform(U_train))
    y_predicted_U = best_model.predict(transformedU)
    # summing the predictions counts the positives, assuming labels are 0/1
    print(np.sum(y_predicted_U), "/", num_rows(y_predicted_U), "(",
          np.sum(y_predicted_U) / num_rows(y_predicted_U), ")")

    return results['best']
コード例 #26
0
def build_classifier(outpath="./semisuper/pickles/super_pipeline.pickle"):
    """Train the supervised pipeline, optionally pickle it and print sanity checks.

    The model is chosen by ``super_model_selection.best_model_cross_val``
    (10 folds) on the silver-standard corpus. Afterwards the supervised model
    and the inductive semi-supervised pipeline are both applied to PubMed
    abstracts, and prediction counts, example sentences, timings and the
    agreement between the two models are printed.

    Args:
        outpath: file the trained model is pickled to; a falsy value skips
            saving.

    Returns:
        The trained supervised model.
    """

    # column layout of the abstract rows (column 0 holds the PMID and is unused here)
    pos, text, title = 1, 2, 3

    def supervised_features(abstracts):
        """Stack text, position (cast to float) and title columns, one row per sentence."""
        return np.vstack((abstracts[:, text],
                          abstracts[:, pos].astype(float),
                          abstracts[:, title])).T

    def print_prediction_summary(labels):
        """Print the number and ratio of positive predictions."""
        print("Prediction for new PubMed abstracts:", np.sum(labels), "/", num_rows(labels),
              "(", np.sum(labels) / num_rows(labels), ")")

    def print_examples(rows, k=10):
        """Print up to k randomly chosen rows.

        random.sample raises ValueError when asked for more items than are
        available, so the sample size is capped at the number of rows.
        """
        candidates = rows.tolist()
        for row in random.sample(candidates, min(k, len(candidates))):
            print(row)

    corpus_csv = load_silver_standard()
    X_train, y_train = X_y_from_csv(corpus_csv)

    model = super_model_selection.best_model_cross_val(X_train, y_train, fold=10)

    if outpath:
        with open(outpath, "wb") as f:
            print("saving supervised pipeline to", outpath)
            pickle.dump(model, f)

    new_abstracts = np.array(loaders.abstract_pmid_pos_sentences_query(anew=False, max_ids=1000))

    print("\nSupervised model\n")

    X = supervised_features(new_abstracts)
    y = model.predict(X)

    print_prediction_summary(y)
    print("Some positive sentences:")
    print_examples(X[np.where(y == 1.0)])
    print("Some negative sentences:")
    print_examples(X[np.where(y == 0.0)])

    # ----------------------------------------------------------------
    # TODO remove tests below this line

    print("\nInductive Semi-Supervised model\n")

    semi_pipeline = build_corpus_and_ss_classifier.train_pipeline(from_scratch=False, ratio=1.0)

    # the semi-supervised pipeline only receives the sentence text
    X_ss = new_abstracts[:, text]
    y_ss = semi_pipeline.predict(X_ss)

    print_prediction_summary(y_ss)
    print("Some positive sentences:")
    print_examples(X_ss[np.where(y_ss == 1.0)])
    print("Some negative sentences:")
    print_examples(X_ss[np.where(y_ss == 0.0)])

    print("\nTest abstracts\n")
    start_time = time.time()
    max_abs = 1000
    # test_abstracts = np.array(loaders.abstract_pmid_pos_sentences_idlist([str(x) for x in range(20000000, 20001000)]))
    test_abstracts = np.array(loaders.abstract_pmid_pos_sentences_query(max_ids=max_abs, anew=True, term="cancer"))
    print("fetching", max_abs, "abstracts took %s seconds\n" % (time.time() - start_time))
    print(test_abstracts)

    start_time = time.time()
    y_sup = model.predict(supervised_features(test_abstracts))
    print("\nsupervised classification of", num_rows(test_abstracts),
          "sentences took %s seconds\n" % (time.time() - start_time))

    start_time = time.time()
    y_semi = semi_pipeline.predict(test_abstracts[:, text])
    print("\ninductive classification of", num_rows(test_abstracts),
          "sentences took %s seconds\n" % (time.time() - start_time))

    print("Supervised:", np.sum(y_sup), "inductive:", np.sum(y_semi),
          "agreement:", np.size(np.where(y_sup == y_semi)) / num_rows(test_abstracts))
    # combined code per sentence: y_sup + 2 * y_semi, which for 0/1 labels
    # gives 0 = both negative, 1 = only supervised positive,
    # 2 = only inductive positive, 3 = both positive
    print(y_sup + 2 * y_semi)

    # ----------------------------------------------------------------

    return model
コード例 #27
0
def iterate_EM(P,
               U,
               y_P=None,
               ypU=None,
               tolerance=0.05,
               max_pos_ratio=1.0,
               clf_selection=False,
               verbose=False):
    """Run EM with a probabilistic NB classifier on positive set P and unlabelled set U.

    Each round builds a classifier with ``build_proba_MNB`` from P and U using
    the current labels, then replaces the labels of U by the predicted
    probability of the positive class. The loop ends when ``almost_equal``
    accepts the previous and current labels of U under ``tolerance``, or when
    the ratio of U rounded to positive reaches ``max_pos_ratio``.

    Args:
        P: positive examples.
        U: unlabelled examples.
        y_P: labels for P; defaults to 1.0 for every row.
        ypU: initial labels for U; defaults to 0.0 for every row.
        tolerance: passed to ``almost_equal`` for the convergence check.
        max_pos_ratio: stop once at least this share of U is rounded to positive.
        clf_selection: if true, compare consecutive models with
            ``em_getting_worse`` and return the previous one when it reports
            a deterioration.
        verbose: print progress information.

    Returns:
        The last model built, or the previous one if classifier selection
        aborted the iteration.
    """

    y_P = [1.] * num_rows(P) if y_P is None else y_P
    ypU = [0.] * num_rows(U) if ypU is None else ypU

    # placeholder for the labels of the previous round
    previous_ypU = [-999]

    n_rounds = 0
    previous_model = None
    current_model = None

    while True:
        if almost_equal(previous_ypU, ypU, tolerance):
            break

        n_rounds += 1

        if verbose:
            print("Iteration #", n_rounds,
                  "\tBuilding new model using probabilistic labels")

        # the previous model is only tracked when it may have to be returned
        if clf_selection:
            previous_model = current_model

        training_docs = concatenate((P, U))
        training_labels = concatenate((y_P, ypU))
        current_model = build_proba_MNB(training_docs,
                                        training_labels,
                                        verbose=verbose)

        if verbose:
            print("Predicting probabilities for U")

        # keep the old labels for the next convergence check and relabel U
        # with the predicted probability of the positive class
        previous_ypU, ypU = ypU, current_model.predict_proba(U)[:, 1]

        n_positive = sum([round(p) for p in ypU])
        pos_ratio = n_positive / num_rows(U)

        if verbose:
            print("Unlabelled instances classified as positive:", n_positive,
                  "/", num_rows(U), "(", pos_ratio * 100, "%)\n")

        if (clf_selection and previous_model is not None
                and em_getting_worse(previous_model, current_model, P, U)):
            if verbose:
                print(
                    "Approximated error has grown since last iteration.\n"
                    "Aborting and returning classifier #", n_rounds - 1)
            return previous_model

        if pos_ratio >= max_pos_ratio:
            if verbose:
                print(
                    "Acceptable ratio of positively labelled sentences in U is reached."
                )
            break

    print("Returning final NB after", n_rounds, "iterations")
    return current_model
コード例 #28
0
def iterate_SVM(P,
                U,
                RN,
                max_neg_ratio=0.2,
                clf_selection=True,
                kernel=None,
                C=0.1,
                n_estimators=9,
                verbose=False):
    """Iteratively train an SVM on P and RN, moving negatively classified docs of U into RN.

    An initial classifier is fitted on P (label 1) and RN (label 0) and used to
    split U into documents classified positive (Q) and negative (W). While both
    parts are non-empty, W is appended to RN, a fresh classifier is fitted and
    Q is split again. Negatives left over when the loop ends are added to RN
    for one last refit.

    Args:
        P: positive examples.
        U: unlabelled examples.
        RN: reliable negative examples.
        max_neg_ratio: accepted ratio of P classified as negative.
        clf_selection: if true, the initial model is returned right away when
            it already classifies more than ``max_neg_ratio`` of P as negative,
            and it is returned instead of the final model when the final model
            exceeds both ``max_neg_ratio`` and the initial model's ratio.
        kernel: SVC kernel; ``None`` selects ``LinearSVC``, any other value a
            bagging ensemble of ``SVC`` estimators with that kernel.
        C: regularization parameter passed to the SVM.
        n_estimators: number of estimators of the bagging ensemble.
        verbose: print progress information.

    Returns:
        The fitted final classifier, or the initial classifier in the cases
        described above, when U is empty, when the initial prediction puts all
        of U into one class, or when no further model was trained.
    """

    y_P = np.ones(num_rows(P))
    y_RN = np.zeros(num_rows(RN))

    if verbose:
        if kernel is not None:
            print("Building initial Bagging SVC (", n_estimators, "clfs)",
                  "with Positive and Reliable Negative docs")
        else:
            print(
                "Building initial linearSVM classifier with Positive and Reliable Negative docs"
            )

    initial_model = _new_svm_classifier(kernel, C, n_estimators).fit(
        concatenate((P, RN)), concatenate((y_P, y_RN)))

    if num_rows(U) == 0:
        print("Warning: SVM: All of U was classified as negative.")
        return initial_model

    if verbose:
        print(
            "Predicting U with initial SVM, adding negatively classified docs to RN for iteration"
        )

    y_U = initial_model.predict(U)
    Q, W = partition_pos_neg(U, y_U)
    iteration = 0
    model = None

    if num_rows(Q) == 0 or num_rows(W) == 0:
        print(
            "Warning: Returning initial SVM because all of U was assigned label",
            y_U[0])
        return initial_model

    if clf_selection:
        # share of P the initial model gets wrong (labels are 0/1, so the mean
        # of the predictions is the share classified positive)
        y_P_initial = initial_model.predict(P)
        initial_neg_ratio = 1 - np.average(y_P_initial)

        if initial_neg_ratio > max_neg_ratio:
            print("Returning initial SVM ({}% of P classified as negative)".
                  format(100 * initial_neg_ratio))
            return initial_model

    # iterate SVM, each turn augmenting RN by the documents in Q classified negative
    while np.size(W) and np.size(Q):
        iteration += 1

        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))

        if verbose:
            print("\nIteration #", iteration, "\tReliable negative examples:",
                  num_rows(RN))

        model = _new_svm_classifier(kernel, C, n_estimators).fit(
            concatenate((P, RN)), concatenate((y_P, y_RN)))
        y_Q = model.predict(Q)
        Q, W = partition_pos_neg(Q, y_Q)

    if np.size(W):
        # negatives left over from the last split: add them and refit. The
        # labels must be rebuilt for the enlarged RN, otherwise X and y differ
        # in length; a fresh classifier keeps initial_model from being refitted.
        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))
        model = _new_svm_classifier(kernel, C, n_estimators).fit(
            concatenate((P, RN)), concatenate((y_P, y_RN)))

    if verbose:
        print("Iterative SVM converged. Reliable negative examples:",
              num_rows(RN))

    if clf_selection and verbose:
        print(
            "Ratio of positive examples misclassified as negative by initial SVM:",
            initial_neg_ratio)

    if model is None:
        # no further model was trained; never hand None back to the caller
        return initial_model

    if clf_selection:
        y_P_final = model.predict(P)
        final_neg_ratio = 1 - np.average(y_P_final)

        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by final SVM:",
                final_neg_ratio)

        if final_neg_ratio > max_neg_ratio and final_neg_ratio > initial_neg_ratio:
            print(
                iteration,
                "iterations - final SVM discards too many positive examples.",
                "Returning initial SVM instead")
            return initial_model

    print("Returning final SVM after", iteration, "iterations")
    return model


def _new_svm_classifier(kernel, C, n_estimators):
    """Return an unfitted LinearSVC (kernel is None) or a bagging ensemble of kernel SVCs."""
    if kernel is None:
        return svm.LinearSVC(class_weight='balanced', C=C)

    # with 4 or more estimators each one is trained on a 1 / (n_estimators - 2)
    # share of the samples; smaller ensembles use all samples
    return BaggingClassifier(
        svm.SVC(class_weight='balanced', kernel=kernel, C=C),
        bootstrap=True,
        n_estimators=n_estimators,
        n_jobs=min(n_estimators, cpu_count()),
        max_samples=(1.0 if n_estimators < 4 else 1.0 /
                     (n_estimators - 2)))