# Python 2 script excerpts built on the TopicVec codebase (note the print
# statement syntax). The imports below reflect that codebase's module layout
# (topicvecDir.py and utils.py); getOptions() is assumed to be defined in
# each script and returns the config dict of command-line options.
import os
import csv

from topicvecDir import topicvecDir
from utils import extractSentenceWords, save_matrix_as_text, load_matrix_from_text

def main():
    config = getOptions()
    docwords = []

    # Each input file is one document: read it whole, then segment it into
    # sentences of words (min_length=2).
    for doc_filename in config['doc_filenames']:
        with open(doc_filename) as DOC:
            doc = "".join(DOC.readlines())
        wordsInSentences, wc = extractSentenceWords(doc, 2)
        print "%d words extracted from '%s'" % (wc, doc_filename)
        docwords.append(wordsInSentences)

    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, config['doc_filenames'])

    # Optionally evaluate k-means clustering instead of topic inference.
    if 'evalKmeans' in config and config['evalKmeans']:
        topicvec.kmeans()
        topicvec.printTopWordsInTopic(None, True)
        exit(0)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    # Output files are named after the log file's base name.
    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    # Save the best topic matrix found during EM, and the last one if present.
    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it, "topic", best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it, "topic", last_T)
def main():
    config = getOptions()
    docwords = []
    csvfiles_filecount = 0
    csvfiles_wc = 0
    csvfiles_rowcount = 0
    file_rownames = []

    # Each row of each CSV file is one document; the text is taken from the
    # first column.
    for csv_filename in config['csv_filenames']:
        csvfile_wc = 0
        csvfile_rowcount = 0
        with open(csv_filename) as DOC:
            docreader = csv.reader(DOC)
            for row in docreader:
                doc = row[0]
                wordsInSentences, wc = extractSentenceWords(doc, min_length=2)
                csvfile_wc += wc
                csvfile_rowcount += 1
                docwords.append(wordsInSentences)
                file_rownames.append("%s-row%d" % (csv_filename, csvfile_rowcount))

        csvfile_avgwc = csvfile_wc * 1.0 / csvfile_rowcount
        print "%d words extracted from %d rows in '%s'. Avg %.1f words each row" % (
            csvfile_wc, csvfile_rowcount, csv_filename, csvfile_avgwc)

        csvfiles_wc += csvfile_wc
        csvfiles_rowcount += csvfile_rowcount
        csvfiles_filecount += 1

    # Report an aggregate count when more than one CSV file was read.
    csvfiles_avgwc = csvfiles_wc * 1.0 / csvfiles_rowcount
    if csvfiles_filecount > 1:
        print "%d words extracted from %d rows in %d csv files. Avg %.1f words each row" % (
            csvfiles_wc, csvfiles_rowcount, csvfiles_filecount, csvfiles_avgwc)

    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, file_rownames)

    if 'evalKmeans' in config and config['evalKmeans']:
        topicvec.kmeans()
        topicvec.printTopWordsInTopic(None, True)
        exit(0)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it, "topic", best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it, "topic", last_T)
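# Illustration (assumed input, not from the original code): the CSV main()
# above reads the document text from each row's first column, so any extra
# columns are ignored:
#
#   "Text of the first document ...",optional,extra,columns
#   "Text of the second document ..."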
def main():
    config = getOptions()
    snip_filename = config['snip_filename']
    snips_words = []
    snips_name = []

    # Snippets are separated by blank lines; the non-blank lines of each
    # snippet are joined into one text before word extraction.
    with open(snip_filename) as DOC:
        snip_lines = []
        snipcount = 0
        snips_wc = 0
        for line in DOC:
            line = line.strip()
            if line:
                snip_lines.append(line)
            else:
                sniptext = " ".join(snip_lines)
                wordsInSentences, wc = extractSentenceWords(sniptext, remove_punc="iso-8859-1")
                snips_wc += wc
                snipcount += 1
                snips_words.append(wordsInSentences)
                snips_name.append("%s-row%d" % (snip_filename, snipcount))
                # Start a fresh buffer for the next snippet.
                snip_lines = []

    snipfile_avgwc = snips_wc * 1.0 / snipcount
    print "%d words extracted from %d snippets in '%s'. Avg %.1f words each row" % (
        snips_wc, snipcount, snip_filename, snipfile_avgwc)

    topicvec = topicvecDir(**config)
    topicvec.setDocs(snips_words, snips_name)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it, "topic", best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it, "topic", last_T)
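# Illustration (assumed input, not from the original code): the snippet
# main() above joins consecutive non-blank lines into one snippet and
# flushes it at each blank line, so the input file looks like the sketch
# below and should end with a blank line so the final snippet is flushed:
#
#   first snippet, possibly
#   spanning several lines
#
#   second snippet
#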
def run_topicvec(config, docwords, file_rownames):
    """Runs the TopicVec pipeline. Adapted from the original open source code."""
    topicvec = topicvecDir(**config)
    topicvec.setDocs(docwords, file_rownames)

    best_last_Ts, Em, docs_Em, Pi = topicvec.inference()

    basename = os.path.basename(config['logfilename'])
    basetrunk = os.path.splitext(basename)[0]

    best_it, best_T, best_loglike = best_last_Ts[0]
    save_matrix_as_text(basetrunk + "-em%d-best.topic.vec" % best_it, "topic", best_T)

    if best_last_Ts[1]:
        last_it, last_T, last_loglike = best_last_Ts[1]
        save_matrix_as_text(basetrunk + "-em%d-last.topic.vec" % last_it, "topic", last_T)

    return topicvec
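# Minimal usage sketch for run_topicvec(), assuming documents are tokenized
# with extractSentenceWords() as in the main() variants above; demo_main()
# itself is hypothetical. It shows how run_topicvec() factors out the
# setDocs/inference/save tail those variants duplicate.
def demo_main():
    config = getOptions()
    docwords = []
    for doc_filename in config['doc_filenames']:
        with open(doc_filename) as DOC:
            doc = "".join(DOC.readlines())
        wordsInSentences, wc = extractSentenceWords(doc, 2)
        docwords.append(wordsInSentences)
    # One call replaces the duplicated inference-and-save boilerplate.
    topicvec = run_topicvec(config, docwords, config['doc_filenames'])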
    # Fragment from inside the per-subset loop of the corpus-experiment
    # script (the loop header appears in the next excerpt). ORIG is the file
    # the original document text has just been written into.
            ORIG.write("\n")
    ORIG.close()
    print "%d original docs saved in '%s'" % (setDocNum, orig_filename)

    # When only dumping the original text, skip inference for this subset.
    if onlyGetOriginalText:
        continue

    # Initialize the model once, on the first subset.
    if si == 0:
        if onlyInferTopicProp:
            # Recover the saved topic matrix T; the file name encodes the
            # topic traits after its third dash-separated field.
            topicfile_trunk = topic_vec_file.split(".")[0]
            topicTraits = topicfile_trunk.split("-")[3:]
            topicTraitStr = "-".join(topicTraits)
            T = load_matrix_from_text(topic_vec_file, "topic")
            config['K'] = T.shape[0]
        topicvec = topicvecDir(**config)
        out = topicvec.genOutputter(0)

    docs_idx = topicvec.setDocs(orig_docs_words, orig_docs_name)
    docs_name = [orig_docs_name[i] for i in docs_idx]
    docs_cat = [orig_docs_cat[i] for i in docs_idx]
    readDocNum = len(docs_idx)
    out("%d docs left after filtering empty docs" % readDocNum)
    assert readDocNum == topicvec.D, \
        "Returned %d doc idx != %d docs in Topicvec" % (readDocNum, topicvec.D)

    # Build a compact id mapping over the word ids actually used.
    if onlyGetWidMapping:
        sorted_wids = sorted(topicvec.wid2freq.keys())
        uniq_wid_num = len(sorted_wids)
        for i, wid in enumerate(sorted_wids):
            wid2compactId[wid] = i
config['MAX_EM_ITERS'] = MAX_ITERS
loader = corpus2loader[corpusName]
wid2compactId = {}
compactId_words = []
hasIdMapping = False

# When only inferring topic proportions, load a previously trained topic
# matrix instead of learning one.
if onlyInferTopicProp:
    topicfile_trunk = topic_vec_file.split(".")[0]
    topicTraits = topicfile_trunk.split("-")[3:]
    topicTraitStr = "-".join(topicTraits)
    T = load_matrix_from_text(topic_vec_file, "topic")
    config['K'] = T.shape[0]
    config['logfilename'] = corpusName
    topicvec = topicvecDir(**config)
    out = topicvec.genOutputter(0)

for si, subsetName in enumerate(subsetNames):
    print "Process subset '%s':" % subsetName
    # The pseudo-subset 'all-mapping' processes 'all' but only builds the
    # word-id mapping.
    if subsetName == 'all-mapping':
        subsetName = 'all'
        onlyGetWidMapping = True
    else:
        onlyGetWidMapping = False

    subsetDocNum, orig_docs_words, orig_docs_name, orig_docs_cat, cats_docsWords, \
        cats_docNames, category_names = loader(subsetName)
    catNum = len(category_names)
    basename = "%s-%s-%d" % (corpusName, subsetName, subsetDocNum)
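# Sketch of the loader interface implied by corpus2loader, inferred from the
# tuple unpacking above; toy_loader and its data are hypothetical, not part
# of the original code. A loader maps a subset name to the documents, their
# names and categories, the same documents grouped by category, and the
# category names.
def toy_loader(subsetName):
    category_names = ["cat0", "cat1"]
    # One document = a list of sentences, each a list of words.
    orig_docs_words = [[["a", "toy", "document"]]]
    orig_docs_name = ["toy-%s-doc0" % subsetName]
    orig_docs_cat = [0]
    cats_docsWords = [orig_docs_words, []]   # docs grouped by category
    cats_docNames = [orig_docs_name, []]
    subsetDocNum = len(orig_docs_words)
    return (subsetDocNum, orig_docs_words, orig_docs_name, orig_docs_cat,
            cats_docsWords, cats_docNames, category_names)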