def split_by_class(dir, pctTest=10):
    if not dir.endswith("\\"):
        dir = dir + "\\"

    data = GwData.GwData()
    pctTrain = (1.0 - (pctTest / 100.0))

    for code in data.sm_codes:
        s = data.sentences_for_code(code)
        not_s = data.sentences_not_for_code(code)

        train_s_cnt = int(len(s) * pctTrain)
        train_s = s[:train_s_cnt]
        test_s = s[train_s_cnt:]

        train_not_s_cnt = int(len(not_s) * pctTrain)
        train_not_s = not_s[:train_not_s_cnt]
        test_not_s = not_s[train_not_s_cnt:]

        friendly_code = code.replace(".", "_")
        write_to_file(dir, friendly_code, friendly_code + ".txt", train_s)
        write_to_file(dir, friendly_code, "test_" + friendly_code + ".txt", test_s)
        write_to_file(dir, friendly_code, "not_" + friendly_code + ".txt", train_not_s)
        write_to_file(dir, friendly_code, "test_not_" + friendly_code + ".txt", test_not_s)

    print "Done"
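# split_by_class above relies on a write_to_file(base_dir, sub_dir, file_name, lines)
# helper that is not part of this snippet. A minimal sketch under that assumption
# (the name, signature and one-sentence-per-line output format are guesses, not the
# original implementation):
import os


def write_to_file(base_dir, sub_dir, file_name, sentences):
    # Write one sentence per line to <base_dir>/<sub_dir>/<file_name>,
    # creating the sub-folder if it does not already exist.
    folder = os.path.join(base_dir, sub_dir)
    if not os.path.exists(folder):
        os.makedirs(folder)
    handle = open(os.path.join(folder, file_name), "w+")
    try:
        handle.write("\n".join(sentences))
    finally:
        handle.close()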
def main():
    # SETTINGS
    best_n_words = 10000
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    term_freq = TermFrequency.TermFrequency(tokenized_docs)

    # NLTK Decision Tree
    list_of_dicts = Converter.vector_space_to_dict_list(
        term_freq.matrix, term_freq.id2Word, Converter.to_binary)
    labels = data.causal_per_document
    causal_count = sum(labels)

    relative_word_frequency = DocumentFrequency.document_frequency_ratio(
        list_of_dicts, labels, lambda l: l == 1)
    condensed_data = extract_best_n_words(relative_word_frequency,
                                          best_n_words, list_of_dicts)

    labelled_data = zip(condensed_data, labels)

    td_size = int(0.75 * len(labelled_data))
    training_data = labelled_data[:td_size]
    validation_data = labelled_data[td_size:]

    dt = nltk.DecisionTreeClassifier.train(training_data)

    # RESULTS
    classifications = [dt.classify(rcd) for rcd, lbl in validation_data]
    results = ResultsHelper.rfp(labels[td_size:], classifications)
    results += "Num Words Used : " + str(best_n_words) + "\n"
    results += "\n"

    error = dt.error(labelled_data)
    results += "ERROR: : " + str(error * 100) + "%\n"
    results += "\n"

    results += "PSEUDOCODE:\n"
    results += dt.pseudocode(depth=1000) + "\n"
    print results

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_DT.txt"
    handle = open(fName, mode="w+")
    handle.write(results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
def test_learner_on_data():
    import GwData
    import WordTokenizer

    code = "50"

    data = GwData.GwData()
    xs = WordTokenizer.tokenize(data.documents, spelling_correct=False)
    ys = data.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        return precision(act_ys, predicted) * (recall(act_ys, predicted) ** 0.5)

    learner = RegExLearner(precision, f1_score, 2.5)
    learner.fit(xs, ys)
    pred = learner.predict(xs)

    # TD Performance
    print_positives(xs, ys)
    r, p, f1 = rpf1(ys, pred)
    print "TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1)

    print str(learner)
    pass
def test_learner_on_data():
    import GwData
    import WordTokenizer
    import numpy as np

    MINIMUM_COVERAGE_PCT = 2.0
    code = "53"

    print "Learning rules for code: " + code
    # '%%' is how you print a '%' in python given that it is a special char
    print "Minimum coverage: %d%%\n" % (MINIMUM_COVERAGE_PCT)

    data = GwData.GwData()
    xs = WordTokenizer.tokenize(data.documents,
                                stem=False,
                                spelling_correct=False,
                                remove_stop_words=False,
                                min_word_count=1)
    ys = data.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        r, p, f1 = rpf1(act_ys, predicted)
        return r * (p ** 0.5)

    shuffled_ixs = np.array(range(len(xs)))
    np.random.shuffle(shuffled_ixs)

    shuffled_xs = np.array(xs)[shuffled_ixs]
    shuffled_ys = np.array(ys)[shuffled_ixs]

    td_size = int(len(xs) * 0.9)
    td_xs, td_ys = shuffled_xs[0:td_size], shuffled_ys[0:td_size]
    vd_xs, vd_ys = shuffled_xs[td_size:], shuffled_ys[td_size:]

    assert len(td_xs) + len(vd_xs) == len(xs), "|TD| + |VD| == |D|"

    learner = RegExLearner(precision, f1_score, MINIMUM_COVERAGE_PCT)
    learner.fit(td_xs, td_ys)

    print_positives(xs, ys)
    print str(learner)

    # TD Performance
    td_pred = learner.predict(td_xs)
    r, p, f1 = rpf1(td_ys, td_pred)
    print "TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1)

    # VD performance
    vd_pred = learner.predict(vd_xs)
    r, p, f1 = rpf1(vd_ys, vd_pred)
    print "VD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1)
    pass
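# rpf1(actual, predicted) used above is assumed to return a (recall, precision, f1)
# triple for the positive class. A minimal sketch under that assumption (not the
# original helper):
def rpf1(actual, predicted):
    # Count true positives, false positives and false negatives for label 1.
    tp = sum(1 for a, p in zip(actual, predicted) if a == 1 and p == 1)
    fp = sum(1 for a, p in zip(actual, predicted) if a != 1 and p == 1)
    fn = sum(1 for a, p in zip(actual, predicted) if a == 1 and p != 1)
    r = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    p = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0
    return r, p, f1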
def test_on_data():
    import GwData
    import WordTokenizer
    import TfIdf
    import Converter
    import MatrixHelper

    data = GwData.GwData()
    tokenized = WordTokenizer.tokenize(data.documents)
    tfidf = TfIdf.TfIdf(data.documents)
def run_supervised():
    import GwData

    gwData = GwData.GwData()
    xs = get_data(gwData)

    def flip(i):
        if i == 0:
            return 1
        return 0

    ys = [[lbl, flip(lbl)] for lbl in gwData.labels_for("50")]

    xs = np.array(xs)
    ys = np.array(ys)

    td_size = 2500
    td_x = xs[0:td_size]
    vd_x = xs[td_size:]

    dbnetwork = DeepNet([td_x.shape[1], 600, 400],
                        ['sigmoid', 'sigmoid', 'sigmoid'])
    dbnetwork.train(td_x, [1000, 1000], [0.1, 0.1])
    out = dbnetwork.run_through_network(xs)

    top_layer = backprop.NeuralNet(
        layer_sizes=[out.shape[1], int(out.shape[1]), 2],
        layer_types=['sigmoid', 'sigmoid', 'sigmoid'])

    o_td_x = out[0:td_size]
    o_vd_x = out[td_size:]

    td_y = ys[0:td_size]
    vd_y = ys[td_size:]

    top_layers = top_layer.train(top_layer.network, o_td_x, td_y, o_vd_x, vd_y,
                                 10, 'classification', 'crossEntropy', 0, 25)

    #TODO We need to train a top layer neural network from the top DBNN layer to the output
    #TODO Then we create a final network composed of the two concatenated together

    mlp = to_feed_forward_network(dbnetwork, top_layers)
    trained = mlp.train(mlp.network, td_x, td_y, vd_x, vd_y,
                        max_iter=30,
                        validErrFunc='classification',
                        targetCost='crossEntropy')

    print out.shape
    np.save('output.npy', out)
def get_data():
    import GwData
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter

    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)
    ts = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0,
                                            Converter.to_binary)
    return ts
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    term_freq = TermFrequency.TermFrequency(tokenized_docs)
    #tfidf = TfIdf.TfIdf(tokenized_docs)

    # NLTK Maximum Entropy classifier
    list_of_dicts = Converter.vector_space_to_dict_list(
        term_freq.distance_matrix, term_freq.id2Word, Converter.to_binary)
    #list_of_dicts = Converter.vector_space_to_dict_list(tfidf.matrix, tfidf.id2Word)

    labels = data.causal_per_document
    labelled_data = zip(list_of_dicts, labels)

    td_size = int(0.75 * len(labelled_data))
    training_data = labelled_data[:td_size]
    validation_data = labelled_data[td_size:]

    me = nltk.MaxentClassifier.train(training_data, algorithm="GIS")

    # RESULTS
    classifications = [me.classify(rcd) for rcd, lbl in validation_data]
    results = ResultsHelper.rfp(labels[td_size:], classifications)
    print results

    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_MaxEnt.txt"
    handle = open(fName, mode="w+")
    handle.write(results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
def __init__(self):
    import GwData
    import WordTokenizer
    from py_word2vec import Word2Vec

    data = GwData.GwData()
    # Ensure we train on all words here (as a sequence model), stem matches
    # above setting, and we do NOT remove stop words (breaks sequencing).
    # spelling correct must match below
    tokenized_docs = WordTokenizer.tokenize(data.documents,
                                            min_word_count=1,
                                            stem=stem,
                                            remove_stop_words=False,
                                            spelling_correct=spelling_correct,
                                            number_fn=NumberStrategy.collapse_num)
    self.wd2vec = Word2Vec(tokenized_docs, topics, min_count=2)
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    term_freq = TermFrequency.TermFrequency(tokenized_docs)

    # NB
    list_of_dicts = Converter.vector_space_to_dict_list(
        term_freq.distance_matrix, term_freq.id2Word, Converter.to_binary)
    labels = data.causal_per_document
    labelled_data = zip(list_of_dicts, labels)

    td_size = int(0.75 * len(labelled_data))
    training_data = labelled_data[:td_size]
    validation_data = labelled_data[td_size:]

    nb = nltk.NaiveBayesClassifier.train(training_data)

    # RESULTS
    classifications = [nb.classify(rcd) for rcd, lbl in validation_data]
    results = ResultsHelper.rfp(labels[td_size:], classifications)

    results += "(100) MOST INFORMATIVE FEATURES:\n"
    features = nb.most_informative_features(100)
    for i, (f, val) in enumerate(features):
        results += "\t" + str(i + 1) + " : " + f + " -> " + str(val) + "\n"

    print results

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_NB.txt"
    handle = open(fName, mode="w+")
    handle.write(results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
def __init__(self):
    QMainWindow.__init__(self)
    Ui_MainWindow.__init__(self)
    self.setupUi(self)
    self.setWindowTitle('GAMEWIKI')

    # set up configuration
    self.config = GwConfig.GwConfig(self)
    self.config.Load()
    self.data = GwData.GwData(self)

    # connect buttons to functions
    self.actionOpen_Project.triggered.connect(self.data.OpenProjectDialog)

    # initialize tree
    self.data.SetProjectPath(self.config.LastProject)
    self.treeView_2.clicked.connect(self.data.OpenClickedFile)

    # set up a var for our last open path
    self.openFilePath = ''

    # set up text processing
    self.parser = GwParse.GwParse(self)
    self.text = GwText.GwText(self)
def __init__(self, lsa, k, code):
    self.lsa = lsa
    self.k = k
    self.code = code

    self.data = GwData.GwData(load_essays=False, load_source=True)
    tokenized_docs = self.data.documents
    self.labels = self.data.labels_for(code)

    if code != "bck":
        bck_codes = self.data.labels_for("bck")
        tokenized_docs = [
            d for i, d in enumerate(self.data.documents)
            if bck_codes[i] == 0
        ]
        self.labels = [
            lbl for i, lbl in enumerate(self.labels)
            if bck_codes[i] == 0
        ]

    tokenized_docs = WordTokenizer.tokenize(tokenized_docs)
    self.distance_matrix = self.lsa.project_matrix(tokenized_docs)
def get_data(self, settings): return GwData.GwData(directory=settings.data_directory + "\\" + GwData.FOLDER)
def train():
    # SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    # SVM with a linear kernel over the LSA projection
    xs = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix, initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5
    MIN_CODE_COUNT = 1

    codes = [
        c for c in data.sm_codes
        # Exclude pure vague codes
        if c != "v" and
        # Exclude doc codes. Need whole doc to classify them
        not c.startswith("s")
    ]

    for code in codes:
        code_count = data.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, data.labels_for(code))
        classifier = svm.LinearSVC(C=1)

        recall, precision, f1_score = cross_validation_score(xs, labels,
                                                             classifier,
                                                             cv_folds,
                                                             class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)
        all_results += results

        total_recall += recall
        total_precision += precision
        total_f1 += f1_score

        print results,

    #num_codes = len(data.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes,
        total_f1 / num_codes)
    all_results += result
    print result

    # DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(
        num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
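# cross_validation_score used above is not shown in this snippet. A rough sketch,
# assuming it runs k-fold cross-validation with the supplied classifier and returns
# the mean recall, precision and F1 for the given positive class_value (the
# signature and behaviour here are assumptions, not the original helper):
import numpy as np


def cross_validation_score(xs, labels, classifier, folds, class_value=1.0):
    xs, labels = np.asarray(xs), np.asarray(labels)
    fold_ixs = np.array_split(np.arange(len(xs)), folds)
    recalls, precisions, f1s = [], [], []
    for test_ix in fold_ixs:
        # Train on everything outside the held-out fold, test on the fold.
        train_ix = np.setdiff1d(np.arange(len(xs)), test_ix)
        classifier.fit(xs[train_ix], labels[train_ix])
        pred = classifier.predict(xs[test_ix])
        actual = labels[test_ix]
        tp = float(np.sum((actual == class_value) & (pred == class_value)))
        fp = float(np.sum((actual != class_value) & (pred == class_value)))
        fn = float(np.sum((actual == class_value) & (pred != class_value)))
        r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0
        recalls.append(r)
        precisions.append(p)
        f1s.append(f1)
    return np.mean(recalls), np.mean(precisions), np.mean(f1s)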
def get_data(self, settings):
    return GwData.GwData(directory=settings.data_directory + self.sub_dir())
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    # Linear SVM over the tf-idf matrix
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix, initial_value=0)
    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

    td_size = int(0.75 * len(np_matrix))
    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]
    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))
    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        # RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results
        all_results += results

    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
print "I/O error({0}): {1}".format(e.errno, e.strerror) except: print "Unexpected error:", sys.exc_info()[0] def tokenize(documents, min_word_count=5, stem=True, lemmatize=False, remove_stop_words=True, spelling_correct=True, number_fn=None): tokenizer = WordTokenizer(min_word_count=min_word_count, stem=stem, lemmatize=lemmatize, remove_stop_words=remove_stop_words, spelling_correct=spelling_correct, number_fn=number_fn) return tokenizer.tokenize(documents) if __name__ == "__main__": import GwData data = GwData.GwData() tokens = tokenize(data.documents, stem=True, remove_stop_words=True, spelling_correct=True) pass
def get_binary_data():
    import GwData
    ts = GwData.GwData().as_binary()
    return ts
def get_binary_data(self):
    return GwData.GwData(load_essays=True, load_source=False)
from SDA_Layers import *
import numpy as np

if __name__ == "__main__":
    import GwData

    # as_binary() is an instance method elsewhere in this code base, so
    # instantiate GwData before calling it.
    data = GwData.GwData().as_binary()
    fullData = GwData.GwData()
    y = np.asarray([[l] for l in fullData.labels_for("50")])

    autoencoder = StackedDA([300], alpha=0.1)
    autoencoder.pre_train(data, 50)
    autoencoder.finalLayer(y, 10, 1)
    autoencoder.fine_tune(data, y, 50)
    pass
        return self.distance_matrix[self.words[wd]].flatten().tolist()[0]

    def project(self, item):
        if type(item) == type(""):
            return self.project(item)

        l = []
        for w in item:
            if w in self.words:
                l.append(self.project(w))
        return l


if __name__ == "__main__":
    import GwData
    import TfIdf
    import WordTokenizer

    e = Embeddings()
    d = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(d.documents,
                                            min_word_count=1,
                                            stem=False,
                                            remove_stop_words=False)
    tf = TfIdf.TfIdf(tokenized_docs)

    ewds = set(e.words)
    dwds = set([w for w in tf.id2Word.values()])
    pass
def test_dA(learning_rate=0.5, training_epochs=100, batch_size=270):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset
    """
    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    def to_binary(val):
        if val <= 0:
            return float(0)
        return float(1)

    distance_matrix = MatrixHelper.gensim_to_numpy_array(
        tf.distance_matrix, None, 0, to_binary)[0:-94]

    cols = len(distance_matrix[0])
    rows = len(distance_matrix)

    train_set_x = theano.shared(numpy.asarray(distance_matrix,
                                              dtype=theano.config.floatX),
                                borrow=True)

    #distance_matrix = numpy.ndarray((rows,cols), distance_matrix)
    #distance_matrix = shared(distance_matrix)

    # compute number of minibatches for training, validation and testing
    n_train_batches = int(len(distance_matrix) / batch_size)

    hidden = 300
    corruption_level = 0.0

    # allocate symbolic variables for the data
    index = T.iscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=corruption_level,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))
        return len(self.rows)

    """ End Interface methods """


class PpmiLatentWordVectors(ProjectionABC.ProjectionABC):
    def __init__(self, tokenized_docs, num_topics=100):
        self.corpus = PpmiWordVectors(tokenized_docs)
        self.lsa = LsiModel(self.corpus, num_topics, self.corpus.id2word)
        distance_matrix = self.lsa[self.corpus]

        def gensim_to_vector(gensim_vect):
            return map(lambda (id, val): val, gensim_vect)

        self.rows = map(gensim_to_vector, distance_matrix)

    def project(self, item):
        ix = self.corpus.word2rowindex[item]
        return self.rows[ix]


if __name__ == "__main__":
    import GwData as d
    import WordTokenizer as t

    data = d.GwData()
    tokenized_docs = t.tokenize(data.documents, spelling_correct=False)
    model = PpmiLatentWordVectors(tokenized_docs)
    vector = model.project("essay")
    pass