def train(num_lsa_topics, k):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # TOKENIZE
    xs = SentenceFragmentData.SentenceFragmentData()
    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    # MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    # Filter to just the sm code rows
    sm_code_lsa_matrix = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)

    # CLUSTER
    clusterer = Clusterer.Clusterer(k)
    labels = clusterer.Run(sm_code_lsa_matrix)

    # OUTPUT - filter by SM Code only this time
    file_name_code_clusters = "LSA_SMCODES_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    sm_codes_per_doc = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, sm_codes_per_doc, "Chicago")

    file_name_category_clusters = "LSA_Categories_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    categories_per_doc = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, categories_per_doc, "Chicago")

    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
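# The training routines in this file lean on ListHelper.filter_list_by_index to pull out just
# the rows at a given set of indices. The helper itself is not shown here; the following is a
# minimal sketch of the assumed behaviour, not the project's actual implementation.
def filter_list_by_index(lst, indices):
    """ Return the items of lst at the positions given by indices, preserving order. """
    index_set = set(indices)
    return [item for i, item in enumerate(lst) if i in index_set]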
def train(num_lsa_topics, k, window_size):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # TOKENIZE
    xs = SentenceData.SentenceData()
    tokenizer = dbnetwork.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)
    windowed_docs, window_indices = split_documents_into_windows(tokenized_docs, window_size)

    # MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(windowed_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    # CLUSTER
    clusterer = Clusterer.Clusterer(k)
    window_labels = clusterer.Run(full_lsa_matrix)

    # Extract the labels for the original sentences using the indices built earlier
    labels = pivot_window_labels(window_labels, window_indices)

    # OUTPUT
    file_name_code_clusters = "Windowed_LSA_SMCODES_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, xs.codes_per_document, "Chicago")

    file_name_category_clusters = "Windowed_LSA_Categories_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, xs.categories_per_document, "Chicago")

    logging.info("Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k))
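# The windowed variant above relies on two helpers that are not defined in this file:
# split_documents_into_windows and pivot_window_labels. The sketches below show one plausible
# reading - fixed-size token windows per document, and a majority vote to map window-level
# cluster labels back to document level. The actual project code may differ.
from collections import Counter

def split_documents_into_windows(tokenized_docs, window_size):
    """ Break each tokenized document into consecutive windows of window_size tokens,
        remembering which document each window came from. """
    windowed_docs, window_indices = [], []
    for doc_ix, doc in enumerate(tokenized_docs):
        for start in range(0, max(len(doc), 1), window_size):
            windowed_docs.append(doc[start:start + window_size])
            window_indices.append(doc_ix)
    return windowed_docs, window_indices

def pivot_window_labels(window_labels, window_indices):
    """ Collapse window-level cluster labels to one label per original document
        by taking the most common label among that document's windows. """
    labels_by_doc = {}
    for label, doc_ix in zip(window_labels, window_indices):
        labels_by_doc.setdefault(doc_ix, []).append(label)
    return [Counter(labels_by_doc[doc_ix]).most_common(1)[0][0]
            for doc_ix in sorted(labels_by_doc.keys())]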
def __init__(self, lsa, code, templates):
    match = filter(lambda tpl: code in tpl[0], templates)
    self.lsa = lsa
    distance_matrix = [lsa.project(d) for c, d in match]
    self.templates = MatrixHelper.gensim_to_numpy_array(distance_matrix, lsa.num_topics)
def train(num_lsa_topics, k):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # TOKENIZE
    xs = SentenceData.SentenceData()
    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    # MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    # TODO Partition into Docs by LSA sim
    txt_codes = xs.text_codes
    clusters_per_text_code = int(round(k / float(len(txt_codes))))

    # Extract the sm code rows from LSA
    smCodeRows = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)
    smCodeClassifications = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    smCodeCategoryClassifications = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)

    # Dict of <code, list[list]> - LSA row vectors
    logging.info("Partitioning LSA distance_matrix by Source Document")
    txtMatrixByCode = PartitionByCode.partition(full_lsa_matrix, xs, xs.text_codes)
    closest_docs = [find_closest_document(txtMatrixByCode, row) for row in smCodeRows]

    matrix_by_doc = collections.defaultdict(list)
    for i, doc in enumerate(closest_docs):
        matrix_by_doc[doc].append(smCodeRows[i])

    # Stores all cluster labels
    logging.info("Clustering within a document")
    all_smcode_labels = []
    label_offset = 0
    for doc in xs.text_codes:
        distance_matrix = matrix_by_doc[doc]

        # CLUSTER
        clusterer = Clusterer.Clusterer(clusters_per_text_code)
        labels = clusterer.Run(distance_matrix)
        all_smcode_labels = all_smcode_labels + [int(l + label_offset) for l in labels]
        label_offset += clusters_per_text_code

    # OUTPUT
    file_name_code_clusters = "Partition_By_Doc_LSA_SMCODES_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeCategoryClassifications, "Chicago")

    # TODO - filter the category and the docs per docs to the sm codes and output
    #file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.txt".format(k, num_lsa_topics)
    #ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
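# find_closest_document is referenced above but not defined in this file. A minimal sketch of the
# assumed behaviour follows: pick the source document whose LSA row vectors have the highest
# cosine similarity to the query row (here, similarity to the document's centroid). The real
# implementation may use a different distance or aggregation.
import numpy as np

def find_closest_document(matrix_by_code, row):
    """ Return the key of the document whose mean LSA vector is most similar to row. """
    query = np.asarray(row, dtype=float)
    best_code, best_sim = None, -float("inf")
    for code, rows in matrix_by_code.items():
        centroid = np.mean(np.asarray(rows, dtype=float), axis=0)
        denom = np.linalg.norm(query) * np.linalg.norm(centroid)
        sim = np.dot(query, centroid) / denom if denom > 0 else 0.0
        if sim > best_sim:
            best_code, best_sim = code, sim
    return best_code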
def binary_matrix(self):
    """ Turns a regular tf distance_matrix into a binary distance_matrix """

    def get_binary_data(val):
        if val <= 0:
            return 0
        return 1

    full_matrix = MatrixHelper.gensim_to_python_mdarray(self.distance_matrix, self.num_unique_words)
    return [[get_binary_data(cell) for cell in row] for row in full_matrix]
def get_data(xs):
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter

    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)
    arr = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0, Converter.to_binary)
    return arr
def get_binary_data(xs):
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter

    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)
    arr = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0, Converter.to_binary)
    return arr
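# Both loaders above hand Converter.to_binary to gensim_to_numpy_array as a cell-level mapper,
# turning raw term frequencies into 0/1 presence indicators. The converter itself is not shown
# in this file; a minimal sketch of the assumed mapping (matching the inline to_binary used in
# test_dA below):
def to_binary(val):
    """ Map any positive count to 1.0 and everything else to 0.0. """
    return 1.0 if val > 0 else 0.0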
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    # NLTK Decision Tree
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix, initial_value=0)
    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

    td_size = int(0.75 * len(np_matrix))
    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]
    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))
    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        # RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results
        all_results += results

        #print "EXPLAIN:\n"
        #me.explain(condensed_data[0], 100)

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)
    # Test with CL1 labels

    raw_input("Press Enter to quit")
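# ResultsHelper.rfp is used above to report recall, precision and F1 on the validation split but
# is not defined in this file. A minimal stand-in for the positive class, assuming the same
# label convention (+1 positive, -1 negative), might look like this:
def rfp(expected, predicted, positive_class=1):
    """ Return a formatted recall / precision / F1 summary for the positive class. """
    tp = sum(1 for e, p in zip(expected, predicted) if e == positive_class and p == positive_class)
    fp = sum(1 for e, p in zip(expected, predicted) if e != positive_class and p == positive_class)
    fn = sum(1 for e, p in zip(expected, predicted) if e == positive_class and p != positive_class)
    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0.0
    return "Recall: {0:.4f} Precision: {1:.4f} F1: {2:.4f}\n".format(recall, precision, f1)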
docs = data.documents
tokenized_docs = WordTokenizer.tokenize(docs,
                                        min_word_count=5,
                                        stem=False,
                                        lemmatize=True,
                                        remove_stop_words=True,
                                        spelling_correct=True,
                                        number_fn=NumberStrategy.collapse_dates)

lsa_v = LatentWordVectors.LsaSpace(tokenized_docs, 100)

wds = sorted(lsa_v.word_to_index.keys())
u_vecs = [MatrixHelper.unit_vector(lsa_v.project(v)) for v in wds]

km = cluster.KMeans(n_clusters=50, init='k-means++', n_init=10, verbose=1, n_jobs=1)
predictions = km.fit_predict(u_vecs)

clusters = set(predictions)
word2cluster = dict(zip(wds, predictions))

km_clusters = extract_clusters_from_kmeans(km, wds)
km_clusters = sorted(km_clusters, key=lambda i: len(i))
for cl in km_clusters:
    # Loop body truncated in the original; printing each cluster is one plausible continuation.
    print cl
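# extract_clusters_from_kmeans is called above but not defined here. The sketch below groups the
# words by the cluster label k-means assigned them, which matches how the result is consumed
# (a list of word clusters sorted by size). Treat it as an assumption about the real helper.
def extract_clusters_from_kmeans(km, words):
    """ Group words into lists according to the fitted k-means labels. """
    clusters_by_label = {}
    for word, label in zip(words, km.labels_):
        clusters_by_label.setdefault(label, []).append(word)
    return list(clusters_by_label.values())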
def Run(self, results_file_name, cv_folds=10, min_word_count=5, stem=True, lemmatize=False,
        remove_stop_words=True, one_code=None, spelling_correct=True, one_fold=False):

    self.min_word_count = min_word_count

    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    results_dir = self.__get_results_folder__()
    self.__ensure_dir__(results_dir)

    print "Results filename: " + results_file_name
    results_file_path = results_dir + results_file_name
    vd_hits_and_misses_fname = results_file_path.replace(".txt", "_VD_hits_misses.txt")
    td_hits_and_misses_fname = results_file_path.replace(".txt", "_TD_hits_misses.txt")

    # TOKENIZE
    data = self.get_data(ExperimentBase.__settings__)
    tokenized_docs = WordTokenizer.tokenize(data.documents,
                                            min_word_count=min_word_count,
                                            stem=stem,
                                            lemmatize=lemmatize,
                                            remove_stop_words=remove_stop_words,
                                            spelling_correct=spelling_correct,
                                            number_fn=NumberStrategy.collapse_num)

    empty_ixs = set([i for i, doc in enumerate(tokenized_docs) if len(doc) < ExperimentBase.MIN_DOC_LENGTH])
    tokenized_docs = [t for i, t in enumerate(tokenized_docs) if i not in empty_ixs]

    # TRAINING DATA
    # TODO Make this one call from docs -> td
    (distance_matrix, id2word) = self.get_vector_space(tokenized_docs)
    xs = self.get_training_data(distance_matrix, id2word)

    matrix_mapper = self.matrix_value_mapper()
    if matrix_mapper:
        xs = MatrixHelper.map_matrix(matrix_mapper, xs)

    all_results = self.get_params() + "\n"
    print all_results,

    MIN_CODE_COUNT = 1

    vd_metrics, td_metrics = [], []
    label_mapper = self.label_mapper()

    # Stop logging now
    logging.disable(logging.INFO)

    # So we can test on one code only
    codes_to_process = self.__get_codes_to_process__(data, one_code)

    # Store the indices into the inputs that detail
    # the true and false positives and negatives
    vd_hits_misses_by_code = dict()
    td_hits_misses_by_code = dict()

    for code in codes_to_process:
        ys = self.__get_labels_for_code__(code, data, empty_ixs, label_mapper, xs)

        total_codes = len([item for item in ys if item == 1])
        if total_codes <= MIN_CODE_COUNT:
            continue

        # Yes, that is a lot I know
        vd_r, vd_p, vd_f1, vd_a, \
        td_r, td_p, td_f1, td_a, \
        vd_tp_ix, vd_fp_ix, vd_fn_ix, vd_tn_ix, \
        td_tp_ix, td_fp_ix, td_fn_ix, td_tn_ix \
            = cross_validation_score_generic(
                xs, ys,
                self.create_classifier(code),
                self.classify(),
                cv_folds,
                class_value=self.get_class_value(),
                one_fold=one_fold)

        vd_metric, td_metric = rpfa(vd_r, vd_p, vd_f1, vd_a, total_codes), rpfa(td_r, td_p, td_f1, td_a, total_codes)
        vd_metrics.append(vd_metric)
        td_metrics.append(td_metric)

        vd_hits_misses_by_code[code] = (vd_tp_ix, vd_fp_ix, vd_fn_ix, vd_tn_ix)
        td_hits_misses_by_code[code] = (td_tp_ix, td_fp_ix, td_fn_ix, td_tn_ix)

        results = "Code: {0} Count: {1} VD[ {2} ]\tTD[ {3} ]\n".format(code.ljust(7), str(total_codes).rjust(4),
                                                                       vd_metric.to_str(), td_metric.to_str())
        print results,
        all_results += results

        """ Dump results to file in case of crash """
        self.__dump_results_to_file__(all_results, results_file_path)
        dump_hits_and_misses(vd_hits_misses_by_code, xs, vd_hits_and_misses_fname)
        dump_hits_and_misses(td_hits_misses_by_code, xs, td_hits_and_misses_fname)

    """ Compute mean metrics """
    """ MEAN """
    mean_vd_metrics, mean_td_metrics = mean_rpfa(vd_metrics), mean_rpfa(td_metrics)

    """ WEIGHTED MEAN """
    wt_mean_vd_metrics, wt_mean_td_metrics = weighted_mean_rpfa(vd_metrics), weighted_mean_rpfa(td_metrics)

    str_aggregate_results = self.__build_aggregate_results_string__(mean_td_metrics,
                                                                    mean_vd_metrics,
                                                                    wt_mean_td_metrics,
                                                                    wt_mean_vd_metrics)
    print str_aggregate_results
    all_results += str_aggregate_results

    # DUMP TO FILE
    print "Writing results to: " + results_file_path
    print "TD Hits and Misses: " + td_hits_and_misses_fname
    print "VD Hits and Misses: " + vd_hits_and_misses_fname

    self.__dump_results_to_file__(all_results, results_file_path)
    return (mean_vd_metrics, wt_mean_vd_metrics)
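# rpfa, mean_rpfa and weighted_mean_rpfa are used throughout Run but live elsewhere in the
# project. The sketch below captures the assumed shape: a small record of recall, precision,
# F1 and accuracy plus the positive-example count, with the weighted mean weighting each code's
# metrics by that count. The real classes may differ in naming and formatting.
class rpfa(object):
    def __init__(self, recall, precision, f1_score, accuracy, num_codes):
        self.recall, self.precision, self.f1_score = recall, precision, f1_score
        self.accuracy, self.num_codes = accuracy, num_codes

    def to_str(self, verbose=False):
        return "Recall: {0:.4f} Precision: {1:.4f} F1: {2:.4f} Accuracy: {3:.4f}".format(
            self.recall, self.precision, self.f1_score, self.accuracy)

def mean_rpfa(metrics):
    """ Unweighted mean of each metric across codes. """
    n = float(len(metrics))
    return rpfa(sum(m.recall for m in metrics) / n,
                sum(m.precision for m in metrics) / n,
                sum(m.f1_score for m in metrics) / n,
                sum(m.accuracy for m in metrics) / n,
                sum(m.num_codes for m in metrics) / n)

def weighted_mean_rpfa(metrics):
    """ Mean of each metric, weighted by the number of positive examples per code. """
    total = float(sum(m.num_codes for m in metrics))
    return rpfa(sum(m.recall * m.num_codes for m in metrics) / total,
                sum(m.precision * m.num_codes for m in metrics) / total,
                sum(m.f1_score * m.num_codes for m in metrics) / total,
                sum(m.accuracy * m.num_codes for m in metrics) / total,
                total)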
def test_dA(learning_rate=0.5, training_epochs=100, batch_size=270):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset
    """

    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    def to_binary(val):
        if val <= 0:
            return float(0)
        return float(1)

    distance_matrix = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0, to_binary)[0:-94]

    cols = len(distance_matrix[0])
    rows = len(distance_matrix)

    train_set_x = theano.shared(numpy.asarray(distance_matrix, dtype=theano.config.floatX), borrow=True)

    #distance_matrix = numpy.ndarray((rows, cols), distance_matrix)
    #distance_matrix = shared(distance_matrix)

    # compute number of minibatches for training, validation and testing
    n_train_batches = int(len(distance_matrix) / batch_size)

    hidden = 300
    corruption_level = 0.0

    # allocate symbolic variables for the xs
    index = T.iscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the xs is presented as rasterized images

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x, n_visible=cols, n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=corruption_level, learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x, n_visible=cols, n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=0.3, learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))
def get_sparse_matrix_data(self, distance_matrix, id2word):
    return MatrixHelper.gensim_to_numpy_array(distance_matrix, initial_value=0)
def train():
    # SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    # NLTK SVM linear kernel
    # Keep the GwData object as `data`; xs is the LSA feature matrix used for classification
    xs = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix, initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5
    MIN_CODE_COUNT = 1

    codes = [c for c in data.sm_codes
             # Exclude pure vague codes
             if c != "v" and
             # Exclude doc codes. Need whole doc to classify them
             not c.startswith("s")]

    for code in codes:
        code_count = data.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, data.labels_for(code))
        classifier = svm.LinearSVC(C=1)

        recall, precision, f1_score = cross_validation_score(xs, labels, classifier, cv_folds, class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)

        all_results += results
        total_recall += recall
        total_precision += precision
        total_f1 += f1_score
        print results,

    #num_codes = len(data.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes, total_f1 / num_codes)
    all_results += result
    print result

    # DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
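# cross_validation_score is not defined in this file. The sketch below shows the assumed
# contract: k-fold cross validation over (xs, labels) that fits a fresh clone of the classifier
# per fold and returns the mean recall, precision and F1 for the positive class_value. The fold
# logic and helper names are illustrative, not the project's actual implementation.
import numpy as np
from sklearn.base import clone

def cross_validation_score(xs, labels, classifier, cv_folds, class_value=1.0):
    xs, ys = np.asarray(xs), np.asarray(labels)
    fold_sizes = [len(xs) // cv_folds + (1 if i < len(xs) % cv_folds else 0) for i in range(cv_folds)]
    recalls, precisions, f1s = [], [], []
    start = 0
    for size in fold_sizes:
        l, r = start, start + size
        start = r
        td_x, td_y = np.concatenate((xs[:l], xs[r:])), np.concatenate((ys[:l], ys[r:]))
        vd_x, vd_y = xs[l:r], ys[l:r]
        model = clone(classifier).fit(td_x, td_y)
        preds = model.predict(vd_x)
        tp = np.sum((vd_y == class_value) & (preds == class_value))
        fp = np.sum((vd_y != class_value) & (preds == class_value))
        fn = np.sum((vd_y == class_value) & (preds != class_value))
        recall = tp / float(tp + fn) if (tp + fn) else 0.0
        precision = tp / float(tp + fp) if (tp + fp) else 0.0
        f1 = 2 * recall * precision / (recall + precision) if (recall + precision) else 0.0
        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)
    return sum(recalls) / len(recalls), sum(precisions) / len(precisions), sum(f1s) / len(f1s)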
def RunStacked(self, results_file, cv_folds=10, min_word_count=5, stem=True, lemmatize=False,
               remove_stop_words=True, layers=2):

    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    print "Results filename: " + results_file
    settings = Settings.Settings()
    results_dir = settings.results_directory + self.sub_dir() + "\\"
    fName = results_dir + results_file

    # TOKENIZE
    data = self.get_data(settings)
    tokenized_docs = WordTokenizer.tokenize(data.documents,
                                            min_word_count=min_word_count,
                                            stem=stem,
                                            lemmatize=lemmatize,
                                            remove_stop_words=remove_stop_words,
                                            spelling_correct=True,
                                            number_fn=NumberStrategy.collapse_num)

    empty_ixs = set([i for i, doc in enumerate(tokenized_docs) if len(doc) < StackedExperimentRunner.__MIN_DOC_LENGTH__])
    tokenized_docs = [t for i, t in enumerate(tokenized_docs) if i not in empty_ixs]

    # TRAINING DATA
    # TODO Make this one call from docs -> td
    (distance_matrix, id2word) = self.get_vector_space(tokenized_docs)
    xs = self.get_training_data(distance_matrix, id2word)

    matrix_mapper = self.matrix_value_mapper()
    if matrix_mapper:
        xs = MatrixHelper.map_matrix(matrix_mapper, xs)

    all_results = self.get_params() + "\n"
    print all_results,

    MIN_CODE_COUNT = 3

    codes = set(self.get_codes(data.sm_codes))
    label_mapper = self.label_mapper()

    # Stop logging now
    logging.disable(logging.INFO)

    xs = ensure_np_array(xs)
    edges = cross_validation_edges(len(xs), cv_folds)

    ys_by_code = {}
    positive_count_by_code = {}
    for code in codes.copy():
        ys = self.get_ys(code, data, empty_ixs, label_mapper, xs)
        ys_by_code[code] = ys

        positive_count = len([item for item in ys if item == 1])
        positive_count_by_code[code] = positive_count
        if positive_count < MIN_CODE_COUNT:
            codes.remove(code)

    dct_td_predictions_by_fold = {}
    dct_vd_predictions_by_fold = {}
    dct_actual_by_fold = {}

    for layer in range(layers):
        print("Layer: {0}".format(layer))

        vd_metrics_for_layer, td_metrics_for_layer = [], []
        vd_metrics_by_code = defaultdict(lambda: [])
        td_metrics_by_code = defaultdict(lambda: [])

        for fold in range(cv_folds):
            l, r = edges[fold]

            # Note these are numpy obj's and cannot be treated as lists
            td_x = np.concatenate((xs[:l], xs[r:]))
            vd_x = xs[l:r]

            predictions_from_previous_layer = None
            if layer > 0:
                # Append the previous layer's per-code predictions as extra features
                lst_td_preds = self.__extract_predictions__(codes, dct_td_predictions_by_fold[fold], td_x)
                td_x = np.concatenate((td_x, np.array(lst_td_preds)), 1)

                lst_vd_preds = self.__extract_predictions__(codes, dct_vd_predictions_by_fold[fold], vd_x)
                vd_x = np.concatenate((vd_x, np.array(lst_vd_preds)), 1)

            dct_td_predictions_per_code = {}
            dct_vd_predictions_per_code = {}
            dct_actual_per_code = {}

            dct_td_predictions_by_fold[fold] = dct_td_predictions_per_code
            dct_vd_predictions_by_fold[fold] = dct_vd_predictions_per_code
            dct_actual_by_fold[fold] = dct_actual_per_code

            class_value = self.get_class_value()

            for code in codes:
                total_codes = positive_count_by_code[code]

                ys = ys_by_code[code]
                td_y = np.concatenate((ys[:l], ys[r:]))
                vd_y = ys[l:r]

                if min(td_y) == max(td_y):
                    val = td_y[0]
                    td_predictions = np.array([val for y in td_y])
                    vd_predictions = np.array([val for y in vd_y])
                else:
                    create_classifier_func = self.create_classifier(code)
                    classify_func = self.classify()

                    classifier = create_classifier_func(td_x, td_y)
                    td_predictions = classify_func(classifier, td_x)
                    vd_predictions = classify_func(classifier, vd_x)

                dct_td_predictions_per_code[code] = td_predictions
                dct_vd_predictions_per_code[code] = vd_predictions
                dct_actual_per_code[code] = td_y

                td_r, td_p, td_f1, td_a = Metrics.rpf1a(td_y, td_predictions, class_value=class_value)
                vd_r, vd_p, vd_f1, vd_a = Metrics.rpf1a(vd_y, vd_predictions, class_value=class_value)

                vd_metric, td_metric = self.rpfa(vd_r, vd_p, vd_f1, vd_a, total_codes), \
                                       self.rpfa(td_r, td_p, td_f1, td_a, total_codes)

                vd_metrics_for_layer.append(vd_metric)
                td_metrics_for_layer.append(td_metric)

                vd_metrics_by_code[code].append(vd_metric)
                td_metrics_by_code[code].append(td_metric)
                pass
            # End for code in codes
            pass
        # END for fold in folds

        for code in sorted(codes):
            positive_count = positive_count_by_code[code]
            vd_metric, td_metric = self.mean_rpfa(vd_metrics_by_code[code]), self.mean_rpfa(td_metrics_by_code[code])

            results = "Code: {0} Count: {1} VD[ {2} ]\tTD[ {3} ]\n".format(code.ljust(7), str(positive_count).rjust(4),
                                                                           vd_metric.to_str(), td_metric.to_str())
            print results,

        mean_vd_metrics, mean_td_metrics = self.mean_rpfa(vd_metrics_for_layer), self.mean_rpfa(td_metrics_for_layer)
        wt_mean_vd_metrics, wt_mean_td_metrics = self.weighted_mean_rpfa(vd_metrics_for_layer), self.weighted_mean_rpfa(td_metrics_for_layer)

        aggregate_results = "\n"
        aggregate_results += "VALIDATION DATA -\n"
        aggregate_results += "\tMEAN\n\t\t {0}\n".format(mean_vd_metrics.to_str(True))
        aggregate_results += "\tWEIGHTED MEAN\n\t\t {0}\n".format(wt_mean_vd_metrics.to_str(True))

        aggregate_results += "\n"
        aggregate_results += "TRAINING DATA -\n"
        aggregate_results += "\tMEAN\n\t\t {0}\n".format(mean_td_metrics.to_str(True))
        aggregate_results += "\tWEIGHTED MEAN\n\t\t {0}\n".format(wt_mean_td_metrics.to_str(True))

        print aggregate_results
        pass
    # End for layer in layers
    pass
    # End fold

    """ Dump results to file in case of crash """
    # DUMP TO FILE
    """
    print "Writing results to: " + fName
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
    """
    #return (mean_vd_metrics, wt_mean_vd_metrics)
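# ensure_np_array and cross_validation_edges are helpers RunStacked depends on that are not
# defined in this file. The sketch below shows the assumed contract: a numpy view of the training
# data, and a list of (left, right) boundary index pairs, one per fold, covering the whole data
# set. Treat both as illustrative assumptions.
import numpy as np

def ensure_np_array(xs):
    """ Return xs as a numpy array so it can be sliced and concatenated per fold. """
    return np.asarray(xs)

def cross_validation_edges(num_rows, cv_folds):
    """ Return (left, right) slice boundaries for each of cv_folds contiguous validation folds. """
    fold_size = num_rows / float(cv_folds)
    return [(int(round(i * fold_size)), int(round((i + 1) * fold_size))) for i in range(cv_folds)]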