print("count_one : ", count_one) print("count_zero : ", count_zero) df_new.to_csv('sample_data_4.csv') return df_new def show_topK(self, classifier, vectorizer, categories, K=10): feature_names = np.asarray(vectorizer.get_feature_names()) for i, category in enumerate(categories): topK = np.argsort(classifier.coef_[0])[-K:] print("%s: %s" % (category, " ".join(feature_names[topK]))) if __name__ == '__main__': mf = main_file() cl = classification() df = pd.read_csv(mf.input_file_path, sep=',') X, y = mf.get_input_text_and_label(df) #mf.create_new_dataset(df) #sys.exit() ''' X = data_preprocessing_1().process_data(X) X = data_preprocessing_2().process_data(X) X = data_preprocessing_3().preprocess_text(X) X = mf.remove_nan(X) df = mf.update_dataframe(df, X) df.to_csv(mf.output_file_path)
def basic_model(self, data, X, X_vec, labels, labelled_set, unlabelled_set, n_gram, clf_name):
    """Cross-validate a semi-supervised classifier on deadline + TF-IDF features.

    The labelled rows are split with a 10-fold ``StratifiedKFold``. For each
    fold the rows are re-ordered as ``[train | test | unlabelled]``, vectorised
    with a ``FeatureUnion`` pipeline, and the classifier selected by
    ``clf_name`` is trained on the train + unlabelled rows and scored on the
    test rows. Per-fold confusion matrices are summed.

    Args:
        data: DataFrame holding the columns read by the feature selectors
            (``'deadline_weight'`` and ``'Text'`` are the keys passed to
            them). NOTE: a ``'feature_response_labels'`` column set to -1 is
            written into this frame in place.
        X: array indexed by ``labelled_set``; only used for shape logging.
        X_vec: array/matrix indexed by ``labelled_set``; only used to drive
            the fold split.
        labels: label array covering all rows. The confusion matrix is built
            with ``labels=[0, 1]``, so labelled rows are assumed to be 0/1
            (presumably -1 marks unlabelled rows -- verify against caller).
        labelled_set: positional row indices of the labelled rows.
        unlabelled_set: positional row indices of the unlabelled rows.
        n_gram: 2-item sequence ``(min_n, max_n)`` for the TF-IDF vectoriser.
        clf_name: ``'EM'``, ``'LS'`` or ``'LP'``.

    Returns:
        Tuple ``(confusion_matrix, u_precision, u_recall, u_f1_score,
        non_u_precision, non_u_recall, non_u_f1_score, accuracy)`` where the
        ``u_*`` metrics refer to class 1 and the ``non_u_*`` metrics to
        class 0, all computed from the summed confusion matrix.

    Raises:
        ValueError: if ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ('EM', 'LS', 'LP'):
        # Previously this surfaced later as an unbound ``clf``.
        raise ValueError("Unsupported clf_name: %r (expected 'EM', 'LS' or 'LP')" % (clf_name,))

    skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
    resp_label = np.copy(labels)
    final_confusion_matrix = [[0, 0], [0, 0]]
    X_labelled = X[labelled_set]
    # Fix: the original read an undefined name ``y2`` here.
    y_labelled = labels[labelled_set]
    data['feature_response_labels'] = -1

    for train_index, test_index in skf.split(X_vec[labelled_set], y_labelled):
        X_train, X_test = X_labelled[train_index], X_labelled[test_index]
        y_train, y_test = y_labelled[train_index], y_labelled[test_index]
        # Only used for the shape log below (np.delete returns a new array).
        y = np.delete(labels, test_index)
        cl = classification()
        print("y shape after ", y.shape)
        print("X train ", X_train.shape)
        print("X test ", X_test.shape)

        # Fold indices are relative to the labelled subset; map them back to
        # positions in the full data set.
        train_index_orig = labelled_set[train_index]
        test_index_orig = labelled_set[test_index]
        print("train_index_orig shape ", train_index_orig.shape)
        print("test_index_orig shape ", test_index_orig.shape)

        pipeline = Pipeline([
            # FeatureUnion combines the deadline feature with the TF-IDF text features.
            ('union', FeatureUnion(
                transformer_list=[
                    ('deadline_ppl', Pipeline([
                        ('selector', Custom_features_2(key='deadline_weight')),
                    ])),
                    ('text_ppl', Pipeline([
                        ('selector', Custom_features_3(key='Text')),
                        ('tfidf', TfidfVectorizer(ngram_range=(n_gram[0], n_gram[1]),
                                                  use_idf=True, smooth_idf=True, norm='l2')),
                    ])),
                ],
                # weight components in FeatureUnion
                transformer_weights={
                    'deadline_ppl': 1.0,
                    'text_ppl': 1.0,
                },
            )),
        ])

        # Fix: order the rows as [train | test | unlabelled] before
        # vectorising so the positional slices below line up with
        # y_train / y_test. The original transformed ``data`` in its native
        # row order, which misaligned features and labels.
        combined_index_orig = np.concatenate(
            (train_index_orig, test_index_orig, unlabelled_set), axis=0)
        fold_df = data.iloc[combined_index_orig, :]
        X_ = pipeline.fit_transform(fold_df)
        print(X_.shape)

        n_train = train_index_orig.shape[0]
        n_test = test_index_orig.shape[0]
        X_train_vec = X_[0:n_train]
        X_test = X_[n_train:n_train + n_test]
        X_unlabelled_vec = X_[-unlabelled_set.shape[0]:]
        y_ = np.concatenate((y_train, resp_label[unlabelled_set]), axis=0)
        print(X_train_vec.shape, X_unlabelled_vec.shape, X_test.shape)

        if clf_name == 'EM':
            _, clf = cl.expectation_maximization(X_train_vec, y_train, X_unlabelled_vec)
        elif clf_name == 'LS':
            _, clf = cl.label_spreading(X_train_vec, y_, X_unlabelled_vec)
        else:  # 'LP'
            _, clf = cl.label_propagation(X_train_vec, y_, X_unlabelled_vec)

        pred_labels = clf.predict(X_test)
        print("pred_labels :", pred_labels, "\tReal labels: ", y_test)
        confusion_mat = confusion_matrix(y_test, pred_labels, labels=[0, 1])
        print(confusion_mat)
        tn, fp, fn, tp = confusion_mat.ravel()
        print(tn, fp, fn, tp)
        final_confusion_matrix[0][0] += tn
        final_confusion_matrix[0][1] += fp
        final_confusion_matrix[1][0] += fn
        final_confusion_matrix[1][1] += tp
        print("Final confiusion matrix ", final_confusion_matrix)

    # Aggregate metrics over all folds.
    tn, fp, fn, tp = np.array(final_confusion_matrix).ravel()
    u_precision = tp / (tp + fp)
    u_recall = tp / (tp + fn)
    u_f1_score = 2 * u_precision * u_recall / (u_precision + u_recall)
    non_u_precision = tn / (tn + fn)
    non_u_recall = tn / (tn + fp)
    non_u_f1_score = 2 * non_u_precision * non_u_recall / (
        non_u_precision + non_u_recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return (np.array(final_confusion_matrix), u_precision, u_recall,
            u_f1_score, non_u_precision, non_u_recall, non_u_f1_score,
            accuracy)
def feature_model(self, data, X, X_vec, y1, y2, labelled_set, unlabelled_set, n_gram, clf_name):
    """Cross-validate a semi-supervised classifier with an extra one-class-SVM feature.

    For each of the 10 stratified folds over the labelled rows:

    1. A ``OneClassSVM`` pipeline is fitted on the fold's training rows and
       applied to the unlabelled rows. Rows it predicts as 1, plus every
       labelled row, get ``feature_response_labels = 1``; all others get 0.
    2. The rows are re-ordered as ``[train | test | unlabelled]`` and
       vectorised (deadline weight + response-label feature + TF-IDF text).
    3. The classifier chosen by ``clf_name`` is trained on train + unlabelled
       rows and scored on the test rows.

    Args:
        data: DataFrame with the columns read by the feature selectors
            (keys ``'deadline_weight'`` and ``'Text'``). NOTE: a
            ``'feature_response_labels'`` column set to -1 is written into
            the caller's frame in place; per-fold values live on local copies.
        X: array indexed by ``labelled_set`` / ``unlabelled_set``; its rows
            are fed to the one-class-SVM pipeline.
        X_vec: array/matrix indexed by ``labelled_set``; only drives the split.
        y1: label array covering all rows; its ``unlabelled_set`` entries are
            appended to the training labels for ``'LS'`` / ``'LP'``
            (presumably the response labels, with -1 for unlabelled --
            verify against caller).
        y2: label array covering all rows; its ``labelled_set`` entries are
            the evaluation target (assumed 0/1, as the confusion matrix uses
            ``labels=[0, 1]``).
        labelled_set: positional row indices of the labelled rows.
        unlabelled_set: positional row indices of the unlabelled rows.
        n_gram: 2-item sequence ``(min_n, max_n)`` for the TF-IDF vectorisers.
        clf_name: ``'EM'``, ``'LS'`` or ``'LP'``.

    Returns:
        Tuple ``(confusion_matrix, u_precision, u_recall, u_f1_score,
        non_u_precision, non_u_recall, non_u_f1_score, accuracy)``; ``u_*``
        metrics refer to class 1, ``non_u_*`` to class 0, computed from the
        confusion matrix summed over all folds.

    Raises:
        ValueError: if ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ('EM', 'LS', 'LP'):
        # Previously this surfaced later as a misleading error on ``clf``.
        raise ValueError("Unsupported clf_name: %r (expected 'EM', 'LS' or 'LP')" % (clf_name,))

    skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
    # Pristine copies: per-fold work must never shrink the caller's arrays
    # or the arrays used by later folds.
    resp_label = np.copy(y1)
    urg_labels = np.copy(y2)
    final_confusion_matrix = [[0, 0], [0, 0]]
    X_labelled = X[labelled_set]
    y_labelled = y2[labelled_set]
    X_unlabelled = X[unlabelled_set]
    data['feature_response_labels'] = -1

    for train_index, test_index in skf.split(X_vec[labelled_set], y_labelled):
        X_train, X_test = X_labelled[train_index], X_labelled[test_index]
        y_train, y_test = y_labelled[train_index], y_labelled[test_index]
        # Fix: delete from the pristine copies. The original re-assigned
        # y1/y2 themselves, so they shrank cumulatively on every fold and
        # could run out of bounds on later folds.
        y1_fold = np.delete(resp_label, test_index)
        y2_fold = np.delete(urg_labels, test_index)
        cl = classification()
        print("y shape after ", y1_fold.shape, y2_fold.shape)
        print("X train ", X_train.shape)
        print("X test ", X_test.shape)

        # Stage 1: one-class SVM over the fold's training rows.
        outlier_clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
        ppl = Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('Custom_features_ppl', Pipeline([
                        ('selector', Custom_features()),
                    ])),
                    # Pipeline for standard bag-of-words model
                    ('text_ppl', Pipeline([
                        ('tfidf', TfidfVectorizer(ngram_range=(n_gram[0], n_gram[1]),
                                                  use_idf=True, smooth_idf=True, norm='l2')),
                    ])),
                ],
                # weight components in FeatureUnion
                transformer_weights={
                    'Custom_features_ppl': 1.0,
                    'text_ppl': 1.0,
                },
            )),
            ('clf', outlier_clf),
        ])
        ppl.fit(X_train)
        y_pred = ppl.predict(X_unlabelled)

        # Unlabelled rows predicted as 1, expressed as original row positions.
        filtered_index_orig_one = unlabelled_set[np.where(y_pred == 1)[0]]
        print(filtered_index_orig_one.shape)
        y_response_label = np.concatenate(
            (filtered_index_orig_one, labelled_set), axis=0)
        print("Shapes ", y1_fold.shape[0] + test_index.shape[0])

        # 1 for every row position listed in y_response_label, else 0
        # (vectorised replacement for a per-row ``in`` scan).
        response_labels = np.isin(
            np.arange(data.shape[0]), y_response_label).astype(int)

        p = data['feature_response_labels']
        unique, count = np.unique(p, return_counts=True)
        print(dict(zip(unique, count)))
        print(response_labels.shape)
        # ``assign`` returns a new frame; the caller's DataFrame keeps -1.
        data = data.assign(feature_response_labels=response_labels)

        # Stage 2: vectorise rows ordered as [train | test | unlabelled].
        train_index_orig = labelled_set[train_index]
        test_index_orig = labelled_set[test_index]
        print("train_index_orig shape ", train_index_orig.shape)
        print("test_index_orig shape ", test_index_orig.shape)
        combined_train_index_orig = np.concatenate(
            (train_index_orig, test_index_orig, unlabelled_set), axis=0)
        print("combined_train_index_orig shape ",
              combined_train_index_orig.shape)
        train_df = data.iloc[combined_train_index_orig, :]
        print(train_df.shape)

        pipeline = Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('deadline_ppl', Pipeline([
                        ('selector', Custom_features_2(key='deadline_weight')),
                    ])),
                    ('response_label_ppl', Pipeline([
                        ('selector', Custom_features_2(
                            key='feature_response_labels')),
                    ])),
                    # Pipeline for standard bag-of-words model
                    ('text_ppl', Pipeline([
                        ('selector', Custom_features_3(key='Text')),
                        ('tfidf', TfidfVectorizer(ngram_range=(n_gram[0], n_gram[1]),
                                                  use_idf=True, smooth_idf=True, norm='l2')),
                    ])),
                ],
                # weight components in FeatureUnion
                transformer_weights={
                    'deadline_ppl': 1.0,
                    'response_label_ppl': 1.0,
                    'text_ppl': 1.0,
                },
            )),
        ])
        X_ = pipeline.fit_transform(train_df)
        print(X_.shape)

        n_train = train_index_orig.shape[0]
        n_test = test_index_orig.shape[0]
        X_train_vec = X_[0:n_train]
        X_test = X_[n_train:n_train + n_test]
        X_unlabelled_vec = X_[-unlabelled_set.shape[0]:]
        print(X_train_vec.shape, X_unlabelled_vec.shape, X_test.shape)
        y_ = np.concatenate((y_train, resp_label[unlabelled_set]), axis=0)

        # Stage 3: semi-supervised classifier.
        if clf_name == 'EM':
            _, clf = cl.expectation_maximization(X_train_vec, y_train, X_unlabelled_vec)
        elif clf_name == 'LS':
            _, clf = cl.label_spreading(X_train_vec, y_, X_unlabelled_vec)
        else:  # 'LP'
            _, clf = cl.label_propagation(X_train_vec, y_, X_unlabelled_vec)

        pred_labels = clf.predict(X_test)
        print("pred_labels :", pred_labels, "\tReal labels: ", y_test)
        confusion_mat = confusion_matrix(y_test, pred_labels, labels=[0, 1])
        print(confusion_mat)
        tn, fp, fn, tp = confusion_mat.ravel()
        print(tn, fp, fn, tp)
        final_confusion_matrix[0][0] += tn
        final_confusion_matrix[0][1] += fp
        final_confusion_matrix[1][0] += fn
        final_confusion_matrix[1][1] += tp
        print("Final confiusion matrix ", final_confusion_matrix)

    # Aggregate metrics over all folds.
    tn, fp, fn, tp = np.array(final_confusion_matrix).ravel()
    u_precision = tp / (tp + fp)
    u_recall = tp / (tp + fn)
    u_f1_score = 2 * u_precision * u_recall / (u_precision + u_recall)
    non_u_precision = tn / (tn + fn)
    non_u_recall = tn / (tn + fp)
    non_u_f1_score = 2 * non_u_precision * non_u_recall / (
        non_u_precision + non_u_recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return (np.array(final_confusion_matrix), u_precision, u_recall,
            u_f1_score, non_u_precision, non_u_recall, non_u_f1_score,
            accuracy)
def pseudo_labelling(self, final_y, X_train, y_train, X_test, labelled_set, unlabelled_set, sample_rate, clf=None):
    """Iteratively pseudo-label unlabelled rows until none are marked -1.

    Each round trains ``classification().linear_svc`` on the current training
    set, normalises the prediction confidences with ``self.normalization``,
    and lets ``self.compute_final_label`` choose which unlabelled rows to
    label. The chosen rows are moved from the unlabelled pool into the
    training set and the process repeats.

    Args:
        final_y: label array over all rows, with -1 marking unlabelled rows.
            Updated in place.
        X_train: feature rows of the currently labelled training set.
        y_train: labels matching ``X_train``.
        X_test: feature rows of the unlabelled pool, aligned with
            ``unlabelled_set``.
        labelled_set: original row indices of the labelled rows.
        unlabelled_set: original row indices of the unlabelled rows.
        sample_rate: fraction of the current training-set size to
            pseudo-label per round.
        clf: classifier returned unchanged when ``final_y`` has no -1 on
            entry.

    Returns:
        Tuple ``(final_y, clf)``: the updated labels and the classifier from
        the last training round.

    Raises:
        RuntimeError: if a row chosen for pseudo-labelling already carries a
            label, which means the pool and ``final_y`` are out of sync.
    """
    # Iterative form of the original tail recursion: same state transitions,
    # but no dependence on the interpreter's recursion limit.
    while -1 in final_y:
        # Fix: honour the ``sample_rate`` argument; the original read
        # ``self.sample_rate`` and ignored the value callers passed in.
        num_of_samples = math.ceil(len(X_train) * sample_rate)
        print("num_of_samples : ", num_of_samples)

        cl = classification()
        _, prediction_confidence, clf = cl.linear_svc(X_train, y_train, X_test)
        pred_conf = self.normalization(prediction_confidence)
        pseudo_labels, pseudo_labelled_indices = self.compute_final_label(
            pred_conf, num_of_samples)

        if len(pseudo_labelled_indices) == 0:
            # Nothing was selected, so another round cannot make progress.
            # Stop instead of failing in the concatenation below.
            break

        new_train_X = []
        new_train_y = []
        for ind in pseudo_labelled_indices:
            # ``ind`` indexes the unlabelled pool; map to the original row.
            delete_orig_index = unlabelled_set[ind]
            if final_y[delete_orig_index] != -1:
                # Raise instead of sys.exit(), which exited with status 0
                # and hid the failure from callers.
                raise RuntimeError(
                    "Value already been updated : %s %s"
                    % (delete_orig_index, final_y[delete_orig_index]))
            final_y[delete_orig_index] = pseudo_labels[ind]
            new_train_y.append(pseudo_labels[ind])
            new_train_X.append(X_test[ind])
            labelled_set = np.append(labelled_set, delete_orig_index)

        # Move the newly labelled rows from the pool into the training set.
        unlabelled_set = np.delete(unlabelled_set, pseudo_labelled_indices, axis=0)
        X_train = np.concatenate((X_train, np.array(new_train_X)), axis=0)
        y_train = np.concatenate((y_train, np.array(new_train_y)), axis=0)
        X_test = np.delete(X_test, pseudo_labelled_indices, axis=0)
        print()

    return final_y, clf
def __init__(self):
    """Create a ``classification`` instance on ``self.cl`` and set ``self.x`` to 0."""
    self.cl = classification()
    self.x = 0
def skfold_cv(self, X1, y1, X2, y2, response_labels, labelled_set, unlabelled_set, ppl, data, ngrams, semi_clf):
    """Two-stage 10-fold cross-validation with a predicted response-label feature.

    Per fold over the labelled rows of ``X2``:

    1. ``ppl`` is fitted on ``X1`` plus the fold's training rows (targets
       ``y1`` plus the matching ``response_labels``) and predicts a response
       label for every unlabelled row.
    2. The rows are re-ordered as ``[train | test | unlabelled]``; the known
       response labels (train, test) and the predicted ones (unlabelled) are
       attached as a ``response_required_label`` column and vectorised
       together with the deadline weight and TF-IDF text features.
    3. The semi-supervised classifier chosen by ``semi_clf`` is trained on
       train + unlabelled rows and scored on the test rows.

    Args:
        X1: extra training inputs for ``ppl``.
        y1: targets matching ``X1``.
        X2: inputs indexed by ``labelled_set`` / ``unlabelled_set``.
        y2: label array covering all rows; labelled entries are assumed to be
            0/1 (the confusion matrix is built with ``labels=[0, 1]``).
        response_labels: response labels indexed with fold-relative indices,
            so presumably aligned with ``labelled_set`` -- verify against
            caller.
        labelled_set: positional row indices of the labelled rows.
        unlabelled_set: positional row indices of the unlabelled rows.
        ppl: estimator/pipeline exposing ``fit(X, y)`` and ``predict(X)``.
        data: DataFrame with the columns read by the feature selectors (keys
            ``'deadline_weight'`` and ``'Text'``).
        ngrams: ``ngram_range`` passed to ``TfidfVectorizer``.
        semi_clf: ``'LS'``, ``'EM'`` or ``'LP'``.

    Returns:
        Tuple ``(confusion_matrix, u_precision, u_recall, u_f1_score,
        non_u_precision, non_u_recall, non_u_f1_score, accuracy)``; ``u_*``
        metrics refer to class 1, ``non_u_*`` to class 0, computed from the
        confusion matrix summed over all folds.

    Raises:
        ValueError: if ``semi_clf`` is not one of the supported names.
    """
    if semi_clf not in ('LS', 'EM', 'LP'):
        # Previously this surfaced later as an unbound ``predicted_labels``.
        raise ValueError("Unsupported semi_clf: %r (expected 'LS', 'EM' or 'LP')" % (semi_clf,))

    skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
    labels = np.copy(y2)
    final_confusion_matrix = [[0, 0], [0, 0]]
    X_labelled = X2[labelled_set]
    y_labelled = y2[labelled_set]
    X_unlabelled = X2[unlabelled_set]
    y_unlabelled = y2[unlabelled_set]

    for fold, (train_index, test_index) in enumerate(
            skf.split(X_labelled, y_labelled), start=1):
        print("Cross Validation iteration #", fold)
        print("Train_index_shape ", train_index.shape, "\t Test index shape ", test_index.shape)
        X_train, X_test = X_labelled[train_index], X_labelled[test_index]
        y_train, y_test = y_labelled[train_index], y_labelled[test_index]
        response_labels_train = response_labels[train_index]
        response_labels_test = response_labels[test_index]

        # Stage 1: response-label classifier.
        X_train_clf1 = np.concatenate((X1, X_train), axis=0)
        y_train_clf1 = np.concatenate((y1, response_labels_train), axis=0)
        y_train_clf1 = y_train_clf1.astype(int)

        # Shape logging only.
        print("y shape before ", labels.shape)
        fold_labels = np.delete(labels, test_index)
        print("y shape after ", fold_labels.shape)

        ppl.fit(X_train_clf1, y_train_clf1)
        y_unlabelled_pred = ppl.predict(X_unlabelled)
        print(y_unlabelled_pred, y_unlabelled_pred.shape)

        cl = classification()
        # Fold indices are relative to the labelled subset; map them back.
        train_index_orig = labelled_set[train_index]
        test_index_orig = labelled_set[test_index]

        # Combine known response labels with the predicted ones so they can
        # be passed as a feature for vectorisation.
        combined_train_index_orig = np.concatenate(
            (train_index_orig, test_index_orig, unlabelled_set), axis=0)
        response_label_pred = y_unlabelled_pred
        combined_response_labels = np.concatenate(
            (response_labels_train, response_labels_test, response_label_pred), axis=0)
        print(response_label_pred.shape, response_labels_train.shape)
        print(combined_train_index_orig.shape, combined_response_labels.shape)

        train_df_clf2 = data.iloc[combined_train_index_orig, :]
        combined_response_labels = pd.Series(combined_response_labels)
        print("Shape before ", train_df_clf2.shape, combined_response_labels.shape)
        train_df_clf2 = train_df_clf2.assign(
            response_required_label=combined_response_labels.values)

        # Stage 2: feature extraction.
        pipeline = Pipeline([
            ('union', FeatureUnion(
                transformer_list=[
                    ('deadline_ppl', Pipeline([
                        ('selector', Custom_features_2(key='deadline_weight')),
                    ])),
                    ('response_label_ppl', Pipeline([
                        ('selector', Custom_features_2(key='response_required_label')),
                    ])),
                    # Pipeline for standard bag-of-words model
                    ('text_ppl', Pipeline([
                        # NOTE(review): sibling methods use Custom_features_3
                        # here -- confirm Custom_features accepts ``key``.
                        ('selector', Custom_features(key='Text')),
                        ('tfidf', TfidfVectorizer(ngram_range=ngrams, use_idf=True,
                                                  smooth_idf=True, norm='l2')),
                    ])),
                ],
                # weight components in FeatureUnion
                transformer_weights={
                    'deadline_ppl': 1.0,
                    'response_label_ppl': 1.0,
                    'text_ppl': 1.0,
                },
            )),
        ])
        X_vec = pipeline.fit_transform(train_df_clf2)

        n_train = train_index.shape[0]
        n_test = test_index.shape[0]
        X_train_vec = X_vec[0:n_train]
        X_test_vec = X_vec[n_train:(n_train + n_test)]
        X_unlabelled_vec = X_vec[-X_unlabelled.shape[0]:]
        print(X_vec.shape, X_train_vec.shape, X_unlabelled_vec.shape, X_test_vec.shape)

        # Stage 3: semi-supervised classifier.
        y_ = np.concatenate((y_train, y_unlabelled), axis=0)
        if semi_clf == 'LS':
            predicted_labels, clf = cl.label_spreading(X_train_vec, y_, X_unlabelled_vec)
        elif semi_clf == 'EM':
            predicted_labels, clf = cl.expectation_maximization(
                X_train_vec, y_train, X_unlabelled_vec)
        else:  # 'LP', same call shape as the sibling model methods
            predicted_labels, clf = cl.label_propagation(X_train_vec, y_, X_unlabelled_vec)

        unique, counts = np.unique(predicted_labels, return_counts=True)
        print("Predicted label summary ", dict(zip(unique, counts)))
        y_pred = clf.predict(X_test_vec)
        print("pred_labels :", y_pred, "\tReal labels: ", y_test)

        # Fix: pin labels=[0, 1] so the matrix is always 2x2, even when a
        # fold's test split and predictions contain a single class;
        # otherwise the 4-way unpack below fails.
        fold_confusion = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
        print(fold_confusion)
        print("Type is ", type(fold_confusion))
        tn, fp, fn, tp = fold_confusion.ravel()
        final_confusion_matrix[0][0] += tn
        final_confusion_matrix[0][1] += fp
        final_confusion_matrix[1][0] += fn
        final_confusion_matrix[1][1] += tp

    # Aggregate metrics over all folds.
    tn, fp, fn, tp = np.array(final_confusion_matrix).ravel()
    u_precision = tp / (tp + fp)
    u_recall = tp / (tp + fn)
    u_f1_score = 2 * u_precision * u_recall / (u_precision + u_recall)
    non_u_precision = tn / (tn + fn)
    non_u_recall = tn / (tn + fp)
    non_u_f1_score = 2 * non_u_precision * non_u_recall / (non_u_precision + non_u_recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return (np.array(final_confusion_matrix), u_precision, u_recall, u_f1_score,
            non_u_precision, non_u_recall, non_u_f1_score, accuracy)