Exemple #1
0
def get_RN_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """Extract Reliable Negative documents from U with the binary Rocchio algorithm.

    A Rocchio model is built from P and U via ``rocchio``; U is then predicted
    with that model and partitioned by predicted label.

    Args:
        P: set of positive documents.
        U: set of unlabelled documents.
        alpha: forwarded unchanged to ``rocchio``.
        beta: forwarded unchanged to ``rocchio``.
        verbose: if true, print progress and the share of U that ended up in RN.

    Returns:
        The pair ``(U_minus_RN, RN)`` exactly as returned by
        ``partition_pos_neg(U, predictions)``.
    """
    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")

    predicted_labels = rocchio(P, U, alpha=alpha, beta=beta).predict(U)
    remaining, reliable_neg = partition_pos_neg(U, predicted_labels)

    if verbose:
        n_reliable_neg = num_rows(reliable_neg)
        print("Reliable Negative examples in U:", n_reliable_neg, "(",
              100 * n_reliable_neg / num_rows(U), "%)")

    return remaining, reliable_neg
Exemple #2
0
    def fit(self, X, y):
        """Learn one prototype vector for the positive and one for the negative docs.

        Labels are first mapped through ``label2num``; X is then split by label
        with ``partition_pos_neg``. Each prototype is the normalized difference
        between ``alpha`` times the own-class normalized mean and ``beta``
        times the other-class normalized mean.

        Sets ``self.proto_p`` and ``self.proto_n`` and returns ``self``.
        """
        numeric_labels = list(map(label2num, y))
        pos_docs, neg_docs = partition_pos_neg(X, numeric_labels)

        # normalized class centroids (column-wise mean over each partition)
        centroid_p = normalize(pos_docs.mean(axis=0))
        centroid_n = normalize(neg_docs.mean(axis=0))

        raw_proto_p = self.alpha * centroid_p - self.beta * centroid_n
        self.proto_p = normalize(raw_proto_p)

        raw_proto_n = self.alpha * centroid_n - self.beta * centroid_p
        self.proto_n = normalize(raw_proto_n)

        return self
Exemple #3
0
def standalone_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """1-step Rocchio method: build a Rocchio model from P and U and return it.

    Args:
        P: set of positive documents.
        U: set of unlabelled documents.
        alpha: forwarded unchanged to ``rocchio``.
        beta: forwarded unchanged to ``rocchio``.
        verbose: if true, additionally predict U, print the share of U
            classified as Reliable Negative and call ``train_report``.

    Returns:
        The model returned by ``rocchio(P, U, alpha=alpha, beta=beta)``.
    """
    print("Running Rocchio")

    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")
    model = rocchio(P, U, alpha=alpha, beta=beta)

    if verbose:
        # The prediction over U only feeds the report below, so it is skipped
        # entirely when no report is requested.
        y_U = model.predict(U)
        _, RN = partition_pos_neg(U, y_U)
        print("Reliable Negative examples in U:", num_rows(RN), "(",
              100 * num_rows(RN) / num_rows(U), "%)")
        train_report(model, P, U)

    return model
Exemple #4
0
def get_RN_cosine_rocchio(P,
                          U,
                          noise_lvl=0.20,
                          alpha=16,
                          beta=4,
                          verbose=False):
    """Extract Reliable Negative documents via cosine ranking followed by binary Rocchio.

    Step 1: a ranker built by ``ranking_cos_sim(P)`` scores P and U (the score
    is the cosine similarity to the mean positive sample). Potential Negative
    docs are those in U scoring below the worst ``noise_lvl`` share of P, as
    selected by ``select_PN_below_score``.
    Step 2: a Rocchio model is built from P and the Potential Negatives, and
    all of U is predicted and partitioned with it.

    Reference noted by the original author: "negative harmful".

    Args:
        P: set of positive documents.
        U: set of unlabelled documents.
        noise_lvl: forwarded to ``select_PN_below_score``.
        alpha: forwarded unchanged to ``rocchio``.
        beta: forwarded unchanged to ``rocchio``.
        verbose: if true, print progress for each step.

    Returns:
        The pair ``(U_minus_RN, RN)`` exactly as returned by
        ``partition_pos_neg(U, predictions)``.
    """
    if verbose:
        print("Computing ranking (cosine similarity to mean positive example)")
    ranker = ranking_cos_sim(P)
    scores_P = ranker.predict_proba(P)
    scores_U = ranker.predict_proba(U)

    if verbose:
        print("Choosing Potential Negative examples with ranking threshold")
    selection = select_PN_below_score(scores_P,
                                      U,
                                      scores_U,
                                      noise_lvl=noise_lvl,
                                      verbose=verbose)
    _, potential_neg = selection

    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")
    predicted_labels = rocchio(P, potential_neg, alpha=alpha,
                               beta=beta).predict(U)

    remaining, reliable_neg = partition_pos_neg(U, predicted_labels)
    if verbose:
        n_reliable_neg = num_rows(reliable_neg)
        print("Reliable Negative examples in U:", n_reliable_neg, "(",
              100 * n_reliable_neg / num_rows(U), "%)")

    return remaining, reliable_neg
Exemple #5
0
def _build_svm_classifier(kernel, C, n_estimators):
    """Return an unfitted classifier: bagged kernel SVC if kernel is given, else LinearSVC."""
    if kernel is None:
        return svm.LinearSVC(class_weight='balanced', C=C)

    return BaggingClassifier(
        svm.SVC(class_weight='balanced', kernel=kernel, C=C),
        bootstrap=True,
        n_estimators=n_estimators,
        n_jobs=min(n_estimators, cpu_count()),
        max_samples=(1.0 if n_estimators < 4 else 1.0 /
                     (n_estimators - 2)))


def iterate_SVM(P,
                U,
                RN,
                max_neg_ratio=0.2,
                clf_selection=True,
                kernel=None,
                C=0.1,
                n_estimators=9,
                verbose=False):
    """Run an SVM classifier trained on P and RN iteratively, augmenting RN.

    After each iteration, the documents of U still classified as negative are
    moved to RN, until there are none left (or nothing remains to classify).

    Args:
        P: set of positive documents (labelled 1 for training).
        U: set of unlabelled documents.
        RN: set of reliable negative documents (labelled 0 for training).
        max_neg_ratio: maximum accepted ratio of P classified as negative.
        clf_selection: if true, the initial classifier is returned instead of
            the final one when it already exceeds ``max_neg_ratio`` on P, or
            when the final classifier exceeds ``max_neg_ratio`` and is worse
            than the initial one on P.
        kernel: if not None, a BaggingClassifier of ``svm.SVC`` with this
            kernel is used; otherwise ``svm.LinearSVC``.
        C: regularization parameter passed to the SVM.
        n_estimators: number of bagged SVCs (only used with ``kernel``).
        verbose: if true, print progress information.

    Returns:
        The fitted initial or final classifier, as described above.
    """

    y_P = np.ones(num_rows(P))
    y_RN = np.zeros(num_rows(RN))

    if verbose:
        if kernel is not None:
            print("Building initial Bagging SVC (", n_estimators, "clfs)",
                  "with Positive and Reliable Negative docs")
        else:
            print(
                "Building initial linearSVM classifier with Positive and Reliable Negative docs"
            )
    clf = _build_svm_classifier(kernel, C, n_estimators)

    initial_model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))

    if num_rows(U) == 0:
        print("Warning: SVM: All of U was classified as negative.")
        return initial_model

    if verbose:
        print(
            "Predicting U with initial SVM, adding negatively classified docs to RN for iteration"
        )

    y_U = initial_model.predict(U)
    Q, W = partition_pos_neg(U, y_U)
    iteration = 0
    model = None

    if num_rows(Q) == 0 or num_rows(W) == 0:
        print(
            "Warning: Returning initial SVM because all of U was assigned label",
            y_U[0])
        return initial_model

    if clf_selection:
        y_P_initial = initial_model.predict(P)
        initial_neg_ratio = 1 - np.average(y_P_initial)

        if initial_neg_ratio > max_neg_ratio:
            print("Returning initial SVM ({}% of P classified as negative)".
                  format(100 * initial_neg_ratio))
            return initial_model

    # Iterate SVM, each turn augmenting RN by the documents in Q classified
    # negative. Row counts are used (as in the guard above) rather than
    # np.size, which on scipy sparse input reports stored non-zeros, not rows.
    while num_rows(W) and num_rows(Q):
        iteration += 1

        RN = concatenate((RN, W))
        y_RN = np.zeros(num_rows(RN))

        if verbose:
            print("\nIteration #", iteration, "\tReliable negative examples:",
                  num_rows(RN))

        # a fresh classifier each round keeps initial_model untouched
        clf = _build_svm_classifier(kernel, C, n_estimators)

        model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))
        y_Q = model.predict(Q)
        Q, W = partition_pos_neg(Q, y_Q)

    # The loop stopped because Q ran empty while W still holds negatives:
    # fold them into RN and refit once more.
    if num_rows(W):
        RN = concatenate((RN, W))
        # labels must be rebuilt to match the enlarged RN, otherwise fit()
        # receives X and y with different numbers of samples
        y_RN = np.zeros(num_rows(RN))
        clf = _build_svm_classifier(kernel, C, n_estimators)
        model = clf.fit(concatenate((P, RN)), concatenate((y_P, y_RN)))

    if verbose:
        print("Iterative SVM converged. Reliable negative examples:",
              num_rows(RN))

    if clf_selection:
        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by initial SVM:",
                initial_neg_ratio)
        if model is None:
            return initial_model

        y_P_final = model.predict(P)
        final_neg_ratio = 1 - np.average(y_P_final)

        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by final SVM:",
                final_neg_ratio)

        if final_neg_ratio > max_neg_ratio and final_neg_ratio > initial_neg_ratio:
            print(
                iteration,
                "iterations - final SVM discards too many positive examples.",
                "Returning initial SVM instead")
            return initial_model

    print("Returning final SVM after", iteration, "iterations")
    return model
Exemple #6
0
def pu_scorer(estimator, X, y):
    """Score ``estimator`` on X with ``pu_score``.

    The estimator's predictions for X are split by ``partition_pos_neg`` using
    ``y`` as the labels, and the two resulting parts are passed to ``pu_score``.
    """
    predictions = estimator.predict(X)
    parts = partition_pos_neg(predictions, y)
    pred_for_P, pred_for_U = parts
    return pu_score(pred_for_P, pred_for_U)