def run_script(tarfname, c=1000):
    """Read the sentiment corpus and fit two classifiers at strength *c*.

    One model is L1-regularised (liblinear solver), the other L2-regularised
    (lbfgs solver); both run with a 10000-iteration budget.

    Returns a tuple ``(l1_model, l2_model, corpus)``.
    """
    from . import classify

    corpus = read_files(tarfname)
    configs = [('l1', 'liblinear'), ('l2', 'lbfgs')]
    models = [
        classify.train_classifier(corpus.trainX, corpus.trainy, c, penalty, solver, 10000)
        for penalty, solver in configs
    ]
    return models[0], models[1], corpus
def training_and_evaluation(sentiment, iteration, confidence):
    """Self-training experiment: grow the training set with confidently
    predicted unlabeled examples in 10% increments and re-evaluate.

    sentiment  -- corpus object with trainX/trainy/devX/devy/dev_data.
    iteration  -- number of 10% steps (e.g. 10 means 10%..100% of the pool).
    confidence -- probability threshold for keeping a pseudo-label.
    Returns the classifier from the final round.
    """
    # Build the fractions [0.1, 0.2, ..., iteration * 0.1].
    l = list(range(iteration + 1))
    l = l[1:]
    l[:] = [x * 0.1 for x in l]
    # NOTE(review): `tarfname` is not a parameter of this function — it is
    # presumably a module-level global; confirm it is set before calling.
    unlabeled = read_unlabeled(tarfname, sentiment)
    unlabeled_size = unlabeled.X.shape[0]
    # training the classifier only on the training data
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    # increase the proportion of unlabeled data by 10%, 20%, ... 100%
    for i in l:
        print('\nUnlabeled Data: ' + str(i * 100) + '%')
        unlabeled_y = write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
        # find the instances of unlabeled data which have been predicted with more than confidence%
        class_probabilities = cls.predict_proba(
            unlabeled.X[0:int(i * unlabeled_size)])
        idx = np.where(class_probabilities > confidence)
        C = unlabeled.X[0:int(i * unlabeled_size)]
        D = C.tocsr()
        # Keep only the rows whose prediction cleared the threshold.
        D = D[idx[0], :]
        # build the new training set
        new_trainX = vstack((sentiment.trainX, D))
        new_trainy = np.concatenate((sentiment.trainy, unlabeled_y[idx[0]]), axis=0)
        print(new_trainX.shape)
        print(new_trainy.shape)
        # train the classifier on the expanded data
        cls = classify.train_classifier(new_trainX, new_trainy)
        print("Evaluating")
        yp_train = classify.evaluate(new_trainX, new_trainy, cls, 'train')
        yp_dev = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    # NOTE(review): original formatting was lost; the interpretation call and
    # the error-printing loop below are assumed to run once, after the loop.
    interpretation(cls, sentiment, yp_train, yp_dev)
    # Print the first 10 dev examples the final classifier got wrong.
    # NOTE(review): if there are fewer than 10 mismatches this loop walks
    # past the end of yp_dev and raises IndexError — verify against callers.
    i = 0
    j = 0
    while i < 10:
        if (yp_dev[j] != sentiment.devy[j]):
            print(sentiment.dev_data[j])
            i += 1
        j += 1
    return cls
def semi_supervised_learning(unlabeled, sentiment, f, iters):
    """Self-training over *iters* partitions of the unlabeled pool.

    Each round predicts one slice of ``unlabeled`` of size *f*, keeps the
    predictions whose decision-function magnitude exceeds 3.5, appends them
    (with their pseudo-labels) to the training set, re-vectorizes, retrains,
    and evaluates. Mutates ``sentiment`` and ``unlabeled`` in place.

    Returns the classifier from the final round.
    """
    import classify
    import numpy as np
    from sklearn.utils import shuffle
    import matplotlib.pyplot as plt
    cls = classify.train_classifier(
        sentiment.trainX, sentiment.trainy)  # initial train with 0 unlabelled predicted
    initial_preds = cls.predict(unlabeled.X)
    factor = f  # roughly about 10% of the corpus
    # print(type(sentiment.trainX))
    # print(type(sentiment.trainy))
    # Keep a stable snapshot of the raw text so slicing stays aligned with X.
    unlabeled.data_temp = unlabeled.data
    for i in range(iters):
        end_index = min(len(unlabeled.data), (i * factor) + factor)
        partition = unlabeled.data_temp[i * factor: end_index]  # create partition of data
        #partition_matrix = sentiment.tfidf_vect.transform(partition) # create tfidf features on corpus
        partition_matrix = unlabeled.X[i * factor:end_index]
        yp = cls.predict(
            partition_matrix
        )  # predict on this partition of unseen data to create labels
        decisions = cls.decision_function(partition_matrix)  # predict on unseen portion of data
        #for j in range(len(decisions)):
        #    print(decisions[j])
        #print(decisions)
        #print(decisions)
        # append this data to the train to create new train with labels
        for j in range(len(partition)):
            # check the confidence on each prediction before appending
            # (3.5 is a hand-tuned decision-margin threshold)
            if (abs(decisions[j]) > 3.5):
                #print("HI")
                # print(partition[j])
                # print(yp[j])
                sentiment.train_data.append(partition[j])
                sentiment.trainy = np.append(sentiment.trainy, yp[j])
        #print(len(sentiment.train_data))
        #print(sentiment.trainy.shape)
        sentiment.trainX = sentiment.tfidf_vect.transform(
            sentiment.train_data
        )  # transform new training data with partition addition
        cls = classify.train_classifier(
            sentiment.trainX, sentiment.trainy)  # train a new classifier
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')  # evaluate on dev portion
    return cls  # return this new classifier
def train_part1_model():
    """Fit the part-1 sentiment classifier and persist it with its vectorizer."""
    print("Reading data")
    archive = "data/sentiment.tar.gz"
    corpus = read_files(archive)

    print("\nTraining classifier")
    model = classify.train_classifier(corpus.trainX, corpus.trainy)

    artifacts = {'part1_vect.pk': corpus.tfidf_vect, 'part1_model.pk': model}
    save_model(artifacts)
def run(vector_size, window, iter, min_df, max_df):
    """Train and evaluate a classifier over doc2vec-style features, write the
    Kaggle prediction file, and print a summary of the hyper-parameters.

    (`iter` shadows the builtin but is part of the public signature.)
    Always returns 0.
    """
    print("Reading data")
    data_loc = "data/"
    corpus = read_files(data_loc, vector_size, window, iter, min_df, max_df)

    print("Training classifier")
    model = classify.train_classifier(corpus.train_doc_vec, corpus.trainy)

    print("Evaluating")
    train_acc = classify.evaluate(corpus.train_doc_vec, corpus.trainy, model)
    dev_acc = classify.evaluate(corpus.dev_doc_vec, corpus.devy, model)

    print("Writing Kaggle pred file")
    write_pred_kaggle_file(model, "data/speech-pred.csv", corpus)

    banner = "================================="
    print(banner)
    print("size: " + str(vector_size) + " window: " + str(window) + " iter: " + str(iter))
    print("min_df: " + str(min_df) + " max_df: " + str(max_df))
    print("train_acc: " + str(train_acc))
    print("dev_acc: " + str(dev_acc))
    print(banner)
    return 0
def create_label(test_doc):
    """Predict a label for each document path in *test_doc*.

    test_doc -- iterable of document paths, e.g. ['travel-nontravel/tr3.txt'].
    Returns a list with one prediction result per document.
    """
    # The classifier does not depend on the document being labelled, so train
    # it once up front — the original retrained it inside the loop, repeating
    # the same (expensive) fit for every document.
    model1 = train_classifier()
    list_of_label = []
    for doc_path in test_doc:
        doc_matrix = extract_features_for_single_doc(doc_path)
        result = model1.predict(doc_matrix)
        list_of_label.append(result)
    return list_of_label
def getMyModel():
    """Train the sentiment classifier and display its most telling features."""
    import classify

    print("Reading data")
    archive_path = "data/sentiment.tar.gz"
    corpus = read_files(archive_path)

    print("\nTraining classifier")
    model = classify.train_classifier(corpus.trainX, corpus.trainy)

    feature_names = np.array(corpus.tfidf_vect.get_feature_names())
    display_features(feature_names, model)
def expand(sentiment, unlabeled, percent):
    """One round of self-training: train on the labeled data, pseudo-label the
    unlabeled pool, keep every row whose class probability exceeds *percent*,
    append those rows to the training set, and retrain.

    sentiment -- corpus with trainX/trainy and a fitted count_vect.
    unlabeled -- object whose .data holds the raw unlabeled texts.
    percent   -- probability threshold in (0, 1) for keeping a pseudo-label.
    Returns the retrained classifier.
    """
    print("Expanding {} * 10 percent...".format(percent))
    import classify
    from scipy.sparse import vstack
    import numpy as np

    labeled_data = sentiment.trainX
    labeled_label = sentiment.trainy
    print("Iteration when prediction {}: ".format(percent))

    # Train on the labeled data only.
    print("Training...")
    cls = classify.train_classifier(labeled_data, labeled_label)

    # Predict labels and class probabilities for the unlabeled pool.
    print("Predicting...")
    to_predict = sentiment.count_vect.transform(unlabeled.data)
    unlabeled_prediction = cls.predict(to_predict)
    prediction_prob = cls.predict_proba(to_predict)

    # choose most confident prediction p > percent
    print("Choosing most confident predictions...")
    change_list = [
        i for i in range(len(prediction_prob))
        if prediction_prob[i][0] > percent or prediction_prob[i][1] > percent
    ]

    # Append all confident rows in a single stack: the original called
    # vstack/np.append once per row, rebuilding the whole matrix each time
    # (quadratic in the number of accepted rows).
    print("Expanding...")
    if change_list:
        labeled_data = vstack([labeled_data, to_predict[change_list, :]])
        labeled_label = np.append(labeled_label, unlabeled_prediction[change_list])

    # Retrain on the expanded set.
    print("Training again...")
    cls = classify.train_classifier(labeled_data, labeled_label)
    return cls
def train_part2_model():
    """Fit the part-2 classifier on tf-idf features and persist the artifacts."""
    train_columns = preprocess("train.csv")
    train_texts, y_train = generate_data(train_columns, 'Train')

    test_columns = preprocess("test.csv")
    test_label_columns = preprocess("test_labels.csv")
    # The labels file has no text column; carry it over from test.csv.
    test_label_columns["comment_text"] = test_columns["comment_text"]
    test_texts, y_test = generate_data(test_label_columns, 'Test')

    # Fit the vocabulary on the combined text, vectorize the training split.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_texts + test_texts)
    X_train = vectorizer.transform(train_texts)

    model = classify.train_classifier(X_train, y_train)
    save_model({'part2_vect.pk': vectorizer, 'part2_model.pk': model})
def expand_data(speech):
    """Self-training loop: repeatedly train, pseudo-label a fresh batch of
    shuffled unlabeled rows, fold the batch into the training set, and keep
    whichever classifier scored best on the dev split.

    Returns the best-scoring classifier (None if no round beat accuracy 0).
    """
    pool = sklearn.utils.shuffle(speech.unlabeledX)
    cur_X = speech.trainX
    cur_y = speech.trainy

    batch_size = 100
    n_rounds = int(pool.shape[0] / batch_size)
    dev_scores = dict()
    best_clf = None
    best_acc = 0
    best_round = 0

    print("Doing ", n_rounds, " iterations, with a sample size of ", batch_size)
    for round_idx in range(n_rounds):
        clf = classify.train_classifier(cur_X, cur_y)
        # Pseudo-label the next batch and absorb it into the training data.
        batch_X = pool[:batch_size]
        pool = pool[batch_size:]
        batch_y = clf.predict(batch_X)
        cur_X = scipy.sparse.vstack([cur_X, batch_X])
        cur_y = numpy.concatenate([cur_y, batch_y])

        acc = classify.evaluate(speech.devX, speech.devy, clf)
        dev_scores[(round_idx + 1) * batch_size] = acc
        if acc > best_acc:
            best_acc = acc
            best_clf = clf
            best_round = round_idx
        print("Iteration: ", round_idx, " Accuracy: ", acc)

    util.print_dict_tofile(dev_scores)
    print("Best accuracy: ", best_acc, " samples of unlabeled data used",
          (best_round + 1) * batch_size)
    return best_clf
def train(self):
    """Build the news-sentiment model: derive stop-word lists, vectorize the
    training data, fit a logistic-regression classifier (C=3.7), and cache
    the model intercept plus the ordered vectorizer vocabulary on *self*.

    Returns ``(sentiment, cls)``.
    """
    importlib.reload(sentimentinterface)
    print("Reading data")
    tarfname = "data/news.tar.gz"
    sentiment = sentimentinterface.read_data(tarfname)
    # Corpus-driven stop words; diff=0.4 tunes how aggressive the list is.
    sentiment.stop_words = sentimentinterface.generate_stop_words(
        sentiment, diff=0.4)
    from sklearn.feature_extraction.text import CountVectorizer
    # Terms dropped for appearing in fewer than 3 documents.
    sentiment.cv = CountVectorizer(min_df=3)
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.mindf_stop_words = sentiment.cv.stop_words_
    # Terms dropped for appearing in more than 20% of documents.
    sentiment.cv = CountVectorizer(max_df=0.2)
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.maxdf_stop_words = sentiment.cv.stop_words_
    # Full, unfiltered training vocabulary.
    sentiment.cv = CountVectorizer()
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.training_set_vocabulary = sentiment.cv.vocabulary_
    sentimentinterface.vectorize_data(sentiment,
                                      stop_words=sentiment.stop_words,
                                      max_df=0.2,
                                      min_df=3)
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, C=3.7)
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    # print("\nReading unlabeled data")
    # unlabeled = sentimentinterface.read_unlabeled(tarfname, sentiment)
    # print("Writing predictions to a file")
    # sentimentinterface.write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
    # Logistic regression intercept (copied so later refits don't alias it).
    self.intercept = copy.deepcopy(cls.intercept_)[0]
    # Vectorizer vocabulary as a list ordered by feature index.
    cv = sentiment.count_vect.vocabulary_
    cv = [(v, w) for w, v in cv.items()]
    cv.sort()
    cv = [x[1] for x in cv]
    self.cv = cv
    return sentiment, cls
def main():
    """Load-or-build the RCV1 dictionary and LSI model, then train and
    evaluate an SGD classifier on LSI features.
    """
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit and hide real bugs.
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        # No cached dictionary — build one from the training corpus.
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        # Streaming bag-of-words view over the training documents.
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        # No cached LSI model — train a 100-topic model and persist it.
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")
    evaluate_classifier(clf, rcv1_test, rcv1_test_target,
                        get_features=get_lsi_features)
def supervised():
    """Fully supervised pipeline: train, evaluate, predict the unlabeled
    pool to a Kaggle file, and report the most decisive features."""
    import classify
    from scipy.sparse import vstack

    print("Reading data")
    archive = "data/sentiment.tar.gz"
    corpus = read_files(archive)

    print("\nTraining classifier")
    model = classify.train_classifier(corpus.trainX, corpus.trainy)

    print("\nEvaluating")
    classify.evaluate(corpus.trainX, corpus.trainy, model, 'train')
    classify.evaluate(corpus.devX, corpus.devy, model, 'dev')

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(archive, corpus)
    # Sanity check: report the combined train+unlabeled matrix shape.
    stacked = vstack([corpus.trainX, unlabeled.X])
    print(stacked.shape)

    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, model, "data/sentiment-pred.csv", corpus)
    decisive_features(model, corpus)
# NOTE(review): this chunk starts mid-function — the enclosing def (a Kaggle
# file writer looping over input lines) is outside this view, so the leading
# statements' indentation is reconstructed approximately.
        (label, review) = line.strip().split("\t")
        i += 1
        # Every row is written with the constant POSITIVE label.
        f.write(str(i))
        f.write(",")
        f.write("POSITIVE")
        f.write("\n")
    f.close()


if __name__ == "__main__":
    # Train on the sentiment corpus, evaluate, and write Kaggle predictions.
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
    #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")
    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
# NOTE(review): chunk starts mid-function — the writer loop and its def are
# outside this view; indentation of the first statements is approximate.
        f.write(",")
        f.write("POSITIVE")
        f.write("\n")
    f.close()


if __name__ == "__main__":
    # Expect exactly one CLI argument selecting the mode.
    # NOTE(review): the message says "two arguments" but the check requires a
    # single argument besides the script name — wording looks stale.
    if(len(sys.argv) != 2):
        print("Please enter two arguments")
        sys.exit(1)
    if(sys.argv[1] == "run_model"):
        print("Reading data")
        tarfname = "data/sentiment.tar.gz"
        sentiment = read_files(tarfname)
        print("\nTraining classifier")
        import classify
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
        print("\nEvaluating")
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        print("\nReading unlabeled data")
        unlabeled = read_unlabeled(tarfname, sentiment)
        # NOTE(review): `lexicon_stuff` is not defined in this chunk —
        # presumably a module-level global; verify before running.
        print(lexicon_stuff)
        # Replace the supervised model with a semi-supervised one.
        cls = semi_supervised_learning(unlabeled, sentiment)
        print("Writing predictions to a file")
        write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
        #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")
        # You can't run this since you do not have the true labels
        # print "Writing gold file"
        # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
def semi_supervise(sentiment, unlabeled, iter, num_conf):
    """Iterative self-training: each round re-vectorizes, trains, samples
    *num_conf* unlabeled examples (probability-weighted), appends them with
    sampled labels to the training set, and removes them from the pool.

    Returns ``(unlabeled, cls, sentiment, best dev accuracy after round 0)``.
    NOTE(review): parameter `iter` shadows the builtin — kept, it is part of
    the public signature.
    """
    import classify
    best_dev = []
    # from scipy.sparse import vstack
    for i in range(iter):
        print("\nTraining classifier")
        sentiment = tfidfvectorizer_feat(sentiment)
        # reference: https://stackoverflow.com/questions/45232671/obtain-tf-idf-weights-of-words-with-sklearn
        # Invert the vocabulary: column index -> term.
        index_value = {
            i[1]: i[0] for i in sentiment.count_vect.vocabulary_.items()
        }
        # Map each term to its (last seen) weight, print the 10 heaviest.
        fully_indexed = {}
        for row in sentiment.trainX:
            for (column, value) in zip(row.indices, row.data):
                fully_indexed[index_value[column]] = value
        print(
            sorted(fully_indexed.items(), key=lambda x: x[1],
                   reverse=True)[:10])
        unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                        1000)
        acc = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        if i != 0:
            # Round 0 is the supervised baseline; exclude it from best_dev.
            best_dev.append(acc)
        # preds = cls.predict(unlabeled.X)
        # conf_score = np.max(cls.predict_proba(unlabeled.X), axis=1)
        # Stochastic confidence: sample one class probability per row,
        # weighted by the predicted distribution itself.
        conf_score = np.apply_along_axis(
            lambda x: np.random.choice(x, 1, p=x)[0], 1,
            cls.predict_proba(unlabeled.X))
        # NOTE(review): the comprehension variable `i` shadows the outer
        # loop index — harmless here but easy to misread.
        preds = np.array([int(i >= 0.5) for i in conf_score])
        # conf_score = np.absolute(cls.decision_function(unlabeled.X))
        # conf_idx = np.argsort(conf_score)
        '''
    reference: https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array
    '''
        # def find_nearest(array, value):
        #     array = np.asarray(array)
        #     idx = (np.abs(array - value)).argmin()
        #     return idx
        # Normalise the scores into a sampling distribution, draw num_conf
        # row indices without an explicit confidence threshold.
        sum_conf = np.sum(conf_score)
        conf_score = conf_score / sum_conf
        conf_idx = np.random.choice(list(range(len(conf_score))), num_conf,
                                    p=conf_score)
        # conf_idx = []
        # for i in conf_tmp:
        #     conf_idx.append(find_nearest(conf_score,i))
        # conf_idx = np.nonzero(conf_score > 0.99)[0]
        # print(len(conf_idx))
        # if len(conf_idx) < 1000:
        #     return unlabeled, cls, sentiment
        # new_labeled_X = np.array(unlabeled.data)[conf_idx[-num_conf:]]
        # new_labeled_y = preds[conf_idx[-num_conf:]]
        new_labeled_X = np.array(unlabeled.data)[conf_idx]
        new_labeled_y = preds[conf_idx]
        # Indices that stay in the unlabeled pool for the next round.
        tmp_idx = [i for i in range(len(conf_score)) if i not in conf_idx]
        sentiment.train_data = np.concatenate(
            (sentiment.train_data, new_labeled_X))
        sentiment.trainy = np.concatenate((sentiment.trainy, new_labeled_y))
        # unlabeled.data = np.array(unlabeled.data)[conf_idx[:-num_conf]]
        unlabeled.data = np.array(unlabeled.data)[tmp_idx]
    # NOTE(review): original formatting was lost; the return is assumed to
    # sit after the loop (returning mid-loop would make `iter` pointless).
    return unlabeled, cls, sentiment, max(best_dev)
# Hyper-parameter sweep over solver / C / tf-idf for the speech corpus.
tarfname = "data/speech.tar.gz"
speech = read_files(tarfname)
print("Training classifier")
import classify
# NOTE(review): C_range and solvers enumerate the full search space but the
# loops below iterate hard-coded single-element lists — the sweep is
# currently pinned to saga / C=10 / tf-idf on.
C_range = [1, 10, 100, 1000]
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
for solver in ["saga"]:
    print("Using " + solver)
    for c in [10]:
        print("Evaluating at C=" + str(c))
        for tfidf in [True]:
            print("With tfidf" if tfidf else "Without tfidf")
            cls = classify.train_classifier(
                speech.trainX_tfidf if tfidf else speech.trainX,
                speech.trainy,
                c=c,
                solver=solver)
            print("Acc on Training Data")
            classify.evaluate(
                speech.trainX_tfidf if tfidf else speech.trainX,
                speech.trainy, cls)
            print("Acc on Dev Data")
            classify.evaluate(speech.devX_tfidf if tfidf else speech.devX,
                              speech.devy, cls)
            print("\n")
print("Reading unlabeled data")
unlabeled = read_unlabeled(tarfname, speech)
# numBatches = 10
# labeledXBatches = np.split(speech.trainX_tfidf.toarray(), numBatches)
# NOTE(review): chunk starts mid-function — this f.close() belongs to a
# writer whose def is outside this view.
    f.close()


def read_instance(tar, ifname):
    """Return the stripped raw contents of member *ifname* from the open
    tarfile *tar*."""
    inst = tar.getmember(ifname)
    ifile = tar.extractfile(inst)
    content = ifile.read().strip()
    return content


if __name__ == "__main__":
    # Train on the speech corpus, evaluate, and write Kaggle predictions.
    print("Reading data")
    tarfname = "data/speech.tar.gz"
    speech = read_files(tarfname)
    print("Training classifier")
    import classify
    cls = classify.train_classifier(speech.trainX, speech.trainy)
    print("Evaluating")
    classify.evaluate(speech.trainX, speech.trainy, cls)
    classify.evaluate(speech.devX, speech.devy, cls)
    print("Reading unlabeled data")
    unlabeled = read_unlabeled(tarfname, speech)
    print("Writing pred file")
    write_pred_kaggle_file(unlabeled, cls, "data/speech-pred.csv", speech)
    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv")
    # write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv")
# NOTE(review): chunk starts mid-function — this f.close() belongs to a
# writer whose def is outside this view.
    f.close()


if __name__ == "__main__":
    # Hyper-parameter setup for the sentiment run.
    tarfname = "data/sentiment.tar.gz"
    maxdf = 1.0
    mindf = 1
    solve_name = 'sag'
    penalty = 'l2'
    print("Reading data")
    # NOTE(review): duplicate assignment — tarfname was already set above.
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname, min_df=mindf, max_df=maxdf)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    #probability =[0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.98]
    #for p in probability:
    # NOTE(review): unqualified train_classifier here (vs classify.* above) —
    # assumes a module-level import of that name; confirm it resolves.
    cls = train_classifier(sentiment.trainX, sentiment.trainy,
                           penalty=penalty, solver=solve_name)
# NOTE(review): chunk starts mid-loop — the for/if that pair with this elif
# (and the definitions of X, Y, x, y, data) are outside this view.
        elif y == 'FALSE':
            X.append(x)
            Y.append(0)

# The first 90% are train data, the last 10% are test data
n = int(len(X) * .9)
XY_train = list(zip(X, Y))[:n]
XY_test = list(zip(X, Y))[n:]
data_train, y_train = [x for x, y in XY_train], [y for x, y in XY_train]
data_test, y_test = [x for x, y in XY_test], [y for x, y in XY_test]
print("Train data has %d positive reviews" % y_train.count(1))
print("Train data has %d negative reviews" % y_train.count(0))
print("Test data has %d positive reviews" % y_test.count(1))
print("Test data has %d negative reviews" % y_test.count(0))
# Testing
# Compare raw-count features against tf-idf features on the same split.
# NOTE(review): `data` (the full text corpus the vectorizers are fit on) is
# not defined in this chunk — presumably built in the loop above; confirm.
print("Testing CountVectorizer...")
count_vect = CountVectorizer()
count_vect.fit(data)
X_train = count_vect.transform(data_train)
X_test = count_vect.transform(data_test)
cls = classify.train_classifier(X_train, y_train)
classify.evaluate(X_test, y_test, cls, 'test')
print("Testing TfidfVectorizer...")
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(data)
X_train = tfidf_vect.transform(data_train)
X_test = tfidf_vect.transform(data_test)
cls = classify.train_classifier(X_train, y_train)
classify.evaluate(X_test, y_test, cls, 'test')
# NOTE(review): chunk opens on the continuation of a print/.format call whose
# opening line (and the enclosing explanation function's def) is outside this
# view; indentation is reconstructed approximately.
          .format(prediction_result, prob_prediction, top_n, top_words,
                  top_coef))
    print('Let\'s see it in a bar chart!')
    # NOTE(review): plt.figure is referenced but not called — likely meant
    # plt.figure(); as written it is a no-op attribute access.
    plt.figure
    plt.barh(top_words, top_coef)
    # Scale the x-axis to the extreme coefficient on the relevant side.
    if do_we_reverse:
        plt.xlim(0, np.max(coefficients))
    else:
        plt.xlim(0, np.min(coefficients))
    plt.title('Weights assigned among different features')
    plt.ylabel('token')
    plt.xlabel('weight')


if __name__ == "__main__":
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                    C = 0.625)
    # make prediction on a new sample and then try to make explanations
    index = 1  # randomly select an index
    encoded_sentence = sentiment.trainX[index, :]
    # sentence = sentiment.train_data[index].split()
    classifier_explanation(cls, sentiment, encoded_sentence, top_n = 10)
#!/bin/python ''' Train a vanilla logistic regression. ''' import classify import matplotlib.pyplot as plt import seaborn as sns import numpy as np import sentiment as sent USE_BOG = True if __name__ == "__main__": print("Reading data") datafile = "data/bayzick_clean.csv" sentiment = sent.read_files(datafile, use_bow=USE_BOG) print("\nTraining supervised classifier") cls, cv_results, c_list = classify.train_classifier(sentiment.trainX, sentiment.trainy, rtn_cv_results=True) import pickle pickle.dump(cls, open("lin_reg_unsup.pkl", "wb")) pickle.dump(sentiment, open("sen.pkl", "wb")) print("\nEvaluating Supervised") classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train') classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
# NOTE(review): chunk starts mid-function — the writer loop and its def are
# outside this view; indentation of the first statements is approximate.
        f.write("POSITIVE")
        f.write("\n")
    f.close()


if __name__ == "__main__":
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    # Grid search over C and penalty; per-penalty accuracy curves are kept in
    # (test_acc, dev_acc) for l1 and (testacc, devacc) for l2, and the best
    # dev accuracy with its (C, penalty) pair is tracked.
    test_acc, dev_acc, max_dev_acc, best_c, best_p = [], [], 0.0, 0.0, 'l2'
    testacc, devacc = [], []
    for c in [0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0]:
        for p in ['l1', 'l2']:
            cls = classify.train_classifier(sentiment.trainX,
                                            sentiment.trainy, c, p)
            print("\nEvaluating at C = ", c, " , Penalty = ", p)
            t_acc = classify.evaluate(sentiment.trainX, sentiment.trainy,
                                      cls, 'train')
            d_acc = classify.evaluate(sentiment.devX, sentiment.devy, cls,
                                      'dev')
            if p == 'l1':
                test_acc.append(t_acc)
                dev_acc.append(d_acc)
            else:
                testacc.append(t_acc)
                devacc.append(d_acc)
            if d_acc > max_dev_acc:
                best_c = c
                best_p = p
                max_dev_acc = d_acc