def get_RN_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """Extract Reliable Negative documents from U with a binary Rocchio model.

    A model is built by ``rocchio(P, U, alpha=..., beta=...)``, U is labelled
    with that model, and U is split by ``partition_pos_neg`` according to the
    predicted labels.

    Args:
        P: positive documents.
        U: unlabelled documents.
        alpha: weight forwarded to ``rocchio``.
        beta: weight forwarded to ``rocchio``.
        verbose: when true, print progress and the share of U that ended up
            in RN.

    Returns:
        ``(U_minus_RN, RN)``: the two parts returned by ``partition_pos_neg``
        (presumably the rows predicted positive, then those predicted
        negative -- confirm against that helper).
    """
    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")

    rocchio_model = rocchio(P, U, alpha=alpha, beta=beta)
    remaining, reliable_neg = partition_pos_neg(U, rocchio_model.predict(U))

    if verbose:
        n_reliable = num_rows(reliable_neg)
        print("Reliable Negative examples in U:", n_reliable, "(",
              100 * n_reliable / num_rows(U), "%)")

    return remaining, reliable_neg
def fit(self, X, y):
    """Learn the prototype vectors for positive and negative documents.

    Labels are converted with ``label2num`` and X is split into a positive
    and a negative part by ``partition_pos_neg``. Each part's mean row
    (``mean(axis=0)``) is normalized, and the prototypes are the normalized
    ``alpha``/``beta`` weighted differences of those two centroids.

    Args:
        X: document matrix; must support ``.mean(axis=0)`` after partitioning.
        y: iterable of labels understood by ``label2num``.

    Returns:
        ``self``, with ``proto_p`` and ``proto_n`` set.
    """
    numeric_labels = list(map(label2num, y))
    pos_docs, neg_docs = partition_pos_neg(X, numeric_labels)

    centroid_p = normalize(pos_docs.mean(axis=0))
    centroid_n = normalize(neg_docs.mean(axis=0))

    # Each prototype is pulled towards its own centroid (alpha) and pushed
    # away from the opposite one (beta).
    self.proto_p = normalize(self.alpha * centroid_p - self.beta * centroid_n)
    self.proto_n = normalize(self.alpha * centroid_n - self.beta * centroid_p)
    return self
def standalone_rocchio(P, U, alpha=16, beta=4, verbose=False):
    """Run the 1-step Rocchio method and return the resulting model.

    A model is built by ``rocchio(P, U, alpha=..., beta=...)`` and used to
    label U. The resulting partition of U is only used for the verbose
    statistics. ``train_report(model, P, U)`` is called before returning.

    Args:
        P: positive documents.
        U: unlabelled documents.
        alpha: weight forwarded to ``rocchio``.
        beta: weight forwarded to ``rocchio``.
        verbose: when true, print progress and the share of U labelled as
            Reliable Negative.

    Returns:
        The model returned by ``rocchio``.
    """
    print("Running Rocchio")
    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")

    rocchio_model = rocchio(P, U, alpha=alpha, beta=beta)
    # The partition is computed unconditionally, exactly as before, although
    # only the verbose output reads it.
    _, reliable_neg = partition_pos_neg(U, rocchio_model.predict(U))

    if verbose:
        n_reliable = num_rows(reliable_neg)
        print("Reliable Negative examples in U:", n_reliable, "(",
              100 * n_reliable / num_rows(U), "%)")

    train_report(rocchio_model, P, U)
    return rocchio_model
def get_RN_cosine_rocchio(P, U, noise_lvl=0.20, alpha=16, beta=4, verbose=False):
    """Extract Reliable Negative documents via cosine similarity and Rocchio.

    Steps:
      1. Score P and U with the ranker returned by ``ranking_cos_sim(P)``
         (cosine similarity to the mean positive sample).
      2. Select Potential Negative (PN) documents from U with
         ``select_PN_below_score``: docs whose similarity is lower than the
         worst l% in P (``noise_lvl`` is presumably that l -- confirm against
         the helper).
      3. Build a Rocchio model from P and PN, label all of U with it and
         split U by the predicted labels.

    Source (as noted by the original author): "negative harmful".

    Args:
        P: positive documents.
        U: unlabelled documents.
        noise_lvl: forwarded to ``select_PN_below_score``.
        alpha: weight forwarded to ``rocchio``.
        beta: weight forwarded to ``rocchio``.
        verbose: when true, print progress and the share of U that ended up
            in RN; also forwarded to ``select_PN_below_score``.

    Returns:
        ``(U_minus_RN, RN)``: the two parts returned by ``partition_pos_neg``.
    """
    if verbose:
        print("Computing ranking (cosine similarity to mean positive example)")
    ranker = ranking_cos_sim(P)
    scores_P = ranker.predict_proba(P)
    scores_U = ranker.predict_proba(U)

    if verbose:
        print("Choosing Potential Negative examples with ranking threshold")
    _, potential_neg = select_PN_below_score(
        scores_P, U, scores_U, noise_lvl=noise_lvl, verbose=verbose)

    if verbose:
        print("Building Rocchio model to determine Reliable Negative examples")
    rocchio_model = rocchio(P, potential_neg, alpha=alpha, beta=beta)
    remaining, reliable_neg = partition_pos_neg(U, rocchio_model.predict(U))

    if verbose:
        n_reliable = num_rows(reliable_neg)
        print("Reliable Negative examples in U:", n_reliable, "(",
              100 * n_reliable / num_rows(U), "%)")

    return remaining, reliable_neg
def iterate_SVM(P, U, RN, max_neg_ratio=0.2, clf_selection=True, kernel=None,
                C=0.1, n_estimators=9, verbose=False):
    """Iteratively train an SVM on P and RN, growing RN after each round.

    An initial classifier is trained on P (label 1) and RN (label 0). The
    documents of U it classifies as negative are moved to RN and a new
    classifier is trained; this repeats on the remaining documents until
    none of them is classified as negative or none is left.

    Args:
        P: positive documents.
        U: unlabelled documents (without the initial RN).
        RN: initial Reliable Negative documents.
        max_neg_ratio: maximum accepted ratio of P classified as negative.
        clf_selection: if true, the initial classifier is returned when it
            already rejects more than ``max_neg_ratio`` of P, or when the
            final classifier rejects more than ``max_neg_ratio`` of P and
            more than the initial one did.
        kernel: if ``None`` a ``LinearSVC`` is used, otherwise a bagging
            ensemble of ``SVC`` estimators with this kernel.
        C: regularisation parameter passed to the SVM.
        n_estimators: number of estimators in the bagging ensemble.
        verbose: print progress information.

    Returns:
        The fitted classifier selected by the rules above.
    """
    y_P = np.ones(num_rows(P))

    def new_classifier():
        # Always build a fresh, unfitted estimator so that an earlier model
        # (in particular the initial one) is never refitted in place.
        if kernel is None:
            return svm.LinearSVC(class_weight='balanced', C=C)
        return BaggingClassifier(
            svm.SVC(class_weight='balanced', kernel=kernel, C=C),
            bootstrap=True,
            n_estimators=n_estimators,
            n_jobs=min(n_estimators, cpu_count()),
            max_samples=(1.0 if n_estimators < 4
                         else 1.0 / (n_estimators - 2)))

    def fit_on(negatives):
        # Labels are rebuilt from the current size of the negative set, so
        # they can never get out of sync with the training rows.
        labels = concatenate((y_P, np.zeros(num_rows(negatives))))
        return new_classifier().fit(concatenate((P, negatives)), labels)

    if verbose:
        if kernel is not None:
            print("Building initial Bagging SVC (", n_estimators, "clfs)",
                  "with Positive and Reliable Negative docs")
        else:
            print(
                "Building initial linearSVM classifier with Positive and Reliable Negative docs"
            )
    initial_model = fit_on(RN)

    if num_rows(U) == 0:
        print("Warning: SVM: All of U was classified as negative.")
        return initial_model

    if verbose:
        print(
            "Predicting U with initial SVM, adding negatively classified docs to RN for iteration"
        )
    y_U = initial_model.predict(U)
    Q, W = partition_pos_neg(U, y_U)
    iteration = 0
    model = None

    if num_rows(Q) == 0 or num_rows(W) == 0:
        print(
            "Warning: Returning initial SVM because all of U was assigned label",
            y_U[0])
        return initial_model

    if clf_selection:
        y_P_initial = initial_model.predict(P)
        # P is labelled 1, so the mean prediction is the accepted share.
        initial_neg_ratio = 1 - np.average(y_P_initial)
        if initial_neg_ratio > max_neg_ratio:
            print("Returning initial SVM ({}% of P classified as negative)".
                  format(100 * initial_neg_ratio))
            return initial_model

    # iterate SVM, each turn augmenting RN by the documents in Q classified
    # negative
    while np.size(W) and np.size(Q):
        iteration += 1
        RN = concatenate((RN, W))
        if verbose:
            print("\nIteration #", iteration, "\tReliable negative examples:",
                  num_rows(RN))
        model = fit_on(RN)
        Q, W = partition_pos_neg(Q, model.predict(Q))

    # The loop can end with Q exhausted while the last round still produced
    # negatives; fold them into RN and fit once more.  The labels must match
    # the enlarged RN (the original reused the stale label vector here, so
    # X and y had different lengths).
    if np.size(W):
        RN = concatenate((RN, W))
        model = fit_on(RN)

    if verbose:
        print("Iterative SVM converged. Reliable negative examples:",
              num_rows(RN))
        if clf_selection:
            print(
                "Ratio of positive examples misclassified as negative by initial SVM:",
                initial_neg_ratio)

    # No iterative model was fitted: fall back to the initial one instead of
    # returning None (previously only guarded when clf_selection was set).
    if model is None:
        return initial_model

    if clf_selection:
        y_P_final = model.predict(P)
        final_neg_ratio = 1 - np.average(y_P_final)
        if verbose:
            print(
                "Ratio of positive examples misclassified as negative by final SVM:",
                final_neg_ratio)
        if (final_neg_ratio > max_neg_ratio
                and final_neg_ratio > initial_neg_ratio):
            print(
                iteration,
                "iterations - final SVM discards too many positive examples.",
                "Returning initial SVM instead")
            return initial_model

    print("Returning final SVM after", iteration, "iterations")
    return model
def pu_scorer(estimator, X, y):
    """Score ``estimator`` on ``X`` with ``pu_score``.

    The estimator's predictions for ``X`` are split by ``partition_pos_neg``
    using the given labels ``y`` (presumably into predictions for the
    positive and for the unlabelled examples -- confirm against that helper),
    and both parts are handed to ``pu_score``.

    Args:
        estimator: fitted object exposing ``predict``.
        X: samples to predict.
        y: labels used to partition the predictions.

    Returns:
        Whatever ``pu_score`` returns for the two groups of predictions.
    """
    predictions = estimator.predict(X)
    pred_for_P, pred_for_U = partition_pos_neg(predictions, y)
    return pu_score(pred_for_P, pred_for_U)