from topicmod.util import flags flags.define_string("input_base", "output/20_news/iter_100_PMI_", \ "Input file folder") flags.define_string("output_base", "output/20_news/iter_100_PMI", \ "Output file name") flags.define_string("PMI_file", "PMI_score", \ "Output file name") flags.define_int("round_num", "5", "Number of iteractive rounds") if __name__ == "__main__": flags.InitFlags() results = dict() rounds = flags.round_num + 1 for ii in range(0, rounds): filename = flags.input_base + str(ii) + "/" + flags.PMI_file inputfile = open(filename, 'r') for line in inputfile: line = line.strip() words = line.split('\t') if words[0].find('total') >= 0: word_key = -1 else: word_key = int(words[0]) if word_key not in results.keys(): results[word_key] = [] results[word_key].append(words[2]) outputfile = open(flags.output_base, 'w') for tt in results.keys():
# Script: read a flat corpus from --base with FlatCorpus and write it out
# through write_proto under the name "yn_toy".

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus
#from topicmod.corpora.flat import FlatEmailCorpus

# The help text used to be split with a backslash inside the string literal,
# which embedded the continuation (and the next line's indentation) in the
# message shown to the user.  It is now a single clean sentence.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/",
                    "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric",
                    "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    corpus = FlatCorpus(flags.base, flags.doc_limit)
    # NOTE(review): "*" presumably matches every file under the base
    # directory; all of it is registered as English -- confirm in FlatCorpus.
    corpus.add_language("*", ENGLISH)

    # Single-argument parenthesised print emits the same text under
    # Python 2 and also parses under Python 3.
    print(flags.output)
    corpus.write_proto(flags.output, "yn_toy")
import sys import os from topicmod.util import flags from syntop_parameters_pb2 import * flags.define_int("num_iterations", 1, "Number of iterations") flags.define_string("model_name", "output/model", "Where we find data") flags.define_string("corpus", None, "The source corpus") flags.define_bool("hadoop", False, "Do we use hadoop or local batch") flags.define_bool("doc_step", True, "Do we call the document-centric parts") flags.define_bool("merge_step", True, "Do we merge doc step results (and compute new topics)") flags.define_bool( "update_global", True, "Do we compute new transition and DP variational parameters") class Array: def __init__(self, name): self._rows = {} self._name = name def __getitem__(self, index): if not index in self._rows: self._rows[index] = defaultdict(float) return self._rows[index] def __iter__(self):
import re import os.path from proto.corpus_pb2 import * from proto.wordnet_file_pb2 import * from topicmod.util import flags from topicmod.util.sets import read_pickle, write_pickle flags.define_int("option", 0, \ "change the whole documents or just the topics of just the word") flags.define_string("ldawnoutput", "output/nsf", "ldawn output directory") flags.define_string("maps", "output/nsf", "mapping files directory") flags.define_string("wordnet", "wn/output.0", "contraint source") flags.define_string("assignment_path", None, "Where the assignments live") def checkSame(cons, old_cons): if len(cons) != len(old_cons): return False for key in cons: if key not in old_cons: return False return True def getMappingDicts_reGen(corpusdir, mapsdir, cons): # check the old constraint.dict exists or not cons_file = corpusdir + "/constraint.set" if (not os.path.exists(cons_file)): # Regenerate (word_wid_dic, wid_did_dic, did_doc_dic) = \ getNewMappingDicts(corpusdir, mapsdir) else:
print "" if __name__ == "__main__": from topicmod.util import flags mkdir("/tmp/%s" % USER) mkdir("/tmp/%s/qsub-scripts" % USER) flags.define_string("template", "", "Where we read the template file from") flags.define_dict("args", {}, "Substitute values for the template") flags.define_dict("defaults", {}, "Default args") flags.define_string("wall", "24:00:00", "The wall time") flags.define_string("name", "", "Name given to job on cluster") flags.define_string("mem", "4gb", "How much memory we give") flags.define_string("queue", "shallow", "Which queue do we submit to") flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster") flags.define_bool("delete_scripts", True, "Do we delete after we're done?") flags.define_bool("submit", True, "Do we submit") flags.InitFlags() template = open(flags.template).read() d = flags.defaults d["wall"] = flags.wall d["mem"] = flags.mem for ii in flags.args: d[ii] = flags.args[ii] if flags.name: d["name"] = flags.name
len(word_pairs)) + "\t" + str(pmi_score) + "\n" infile.write(tmp) total_pmi_score += pmi_score total_pmi_score /= len(topics.keys()) tmp = "total" + "\t" + str(len( topics.keys())) + "\t" + str(total_pmi_score) + "\n" infile.write(tmp) infile.close() flags.define_string("vocab", "", "Where we find the vocab") flags.define_string("model", "", "The model files folder of topic models") flags.define_string("stats", None, "Where we find the stat_file") flags.define_int("topics_cutoff", 30, "Number of topics") flags.define_int("window_size", 10, "Size of window for computing coocurrance") flags.define_string("output", "output/PMI_score", "PMI Output filename") if __name__ == "__main__": flags.InitFlags() print "Reading vocab" [vocab_word_index, vocab_index_word] = readVocab(flags.vocab) vocab_size = len(vocab_word_index) print "Reading topic words" [topics, topic_word_set] = readTopics(flags.model, flags.topics_cutoff) #print "Get statistics"
# Script: collect the command-line settings of a syntactic topic model run
# into a SyntopParameters message.

from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

# NOTE(review): "vocab" is a string flag although its help text says
# "Size of vocabulary" -- confirm whether a path or a count is expected.
flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Numer of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False,
                  "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5,
                 "Number of e-step rounds per-document")

# These hyperparameters are real-valued (their defaults are 1.0 and 0.1), so
# they are declared as float flags.  Declaring them with define_int meant a
# fractional value such as --vocab_sigma=0.1 could not be supplied as an
# integer flag value.
flags.define_float("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_float("alpha_trans", 1.0,
                   "DP parameter for transition distributions")
flags.define_float("alpha_top", 1.0,
                   "DP parameter for top-level stick-breaking distribution")
flags.define_float("vocab_sigma", 0.1, "Vocab hyperparametersx")

if __name__ == "__main__":
    flags.InitFlags()

    # Copy the boolean model switches straight from the flags.
    params = SyntopParameters()
    params.finite = flags.finite
    params.ignore_trans = flags.ignore_trans
    params.ignore_docs = flags.ignore_docs
    params.shortcut_gsl = flags.shortcut_gsl
path_assignments_out.close() docs_in.close() docs_out.close() return new_topics + 1 flags.define_string("corpus", None, "Where we find the input corpora") flags.define_string("mapping", None, "Filename of mapping") flags.define_string("cons_file", "", "Constraints filename") flags.define_glob("wordnet", "wn/output.0", "contraint source") flags.define_string("input_base", "output/nih", "Input filename") flags.define_string("output_base", "output/nih_ned", "Output filename") flags.define_string("resume_type", "clear", "resume type: clear or split") flags.define_string("update_strategy", "doc", "update strategy: term or doc") flags.define_int("doc_limit", -1, "Number of documents to process") flags.define_int("num_topics", 0, "Current number of topics") if __name__ == "__main__": flags.InitFlags() if re.search("doc", flags.update_strategy): update_strategy = 1 elif re.search("term", flags.update_strategy): update_strategy = 0 else: print "Wrong update strategy!" exit() # Build index if it doesn't already exist if os.path.exists(flags.mapping):
tmp = str(tt) + "\t" + str(len(word_pairs)) + "\t" + str(pmi_score) + "\n" infile.write(tmp) total_pmi_score += pmi_score total_pmi_score /= len(topics.keys()) tmp = "total" + "\t" + str(len(topics.keys())) + "\t" + str(total_pmi_score) + "\n" infile.write(tmp) infile.close() flags.define_string("vocab", "", "Where we find the vocab") flags.define_string("model", "", "The model files folder of topic models") flags.define_string("stats", None, "Where we find the stat_file") flags.define_int("topics_cutoff", 30, "Number of topics") flags.define_int("window_size", 10, "Size of window for computing coocurrance") flags.define_string("output", "output/PMI_score", "PMI Output filename") if __name__ == "__main__": flags.InitFlags() print "Reading vocab" [vocab_word_index, vocab_index_word] = readVocab(flags.vocab) vocab_size = len(vocab_word_index) print "Reading topic words" [topics, topic_word_set] = readTopics(flags.model, flags.topics_cutoff) #print "Get statistics"
count = 0 for (w1, w2) in must.keys(): if count < must_links_num: pmi = must[(w1, w2)] tmp = 'MERGE_\t' + w1 + '\t' + w2 + '\n' output_file.write(tmp) count += 1 output_file.close() flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab") flags.define_string("stats", "", "Where we find the stat_file") flags.define_string("model", "", "The model files folder of topic models") flags.define_string("output", "constraints/tmp", "Output filename") flags.define_int("topics_cutoff", 30, "Number of topic words") flags.define_int("cannot_links", 0, "Number of cannot links that we want") flags.define_int("must_links", 0, "Number of must links that we want") flags.define_int("num_topics", 20, "Number of topics") flags.define_bool("train_only", False, "Using only train data to \ generate the constraints") flags.define_int("window_size", 10, "Size of window for computing coocurrance") flags.define_float("tfidf_thresh", 0.0, "threshold for tfidf") if __name__ == "__main__": flags.InitFlags() # getting statistics: slower version, full statistics, memory cost #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
from topicmod.util import flags from topicmod.util.sets import count_line from topicmod.corpora.proto.corpus_pb2 import * from topicmod.corpora.proto.wordnet_file_pb2 import * from topicmod.corpora.ml_vocab import MultilingualVocab from topicmod.corpora.ml_vocab import Vocab from collections import defaultdict import codecs flags.define_string("output", "", "Where we write output") flags.define_glob("doc_roots", "", "The document vocab") flags.define_string("vocab", "", "The vocab file") flags.define_string("location", "", "Where the data live") flags.define_int("min_length", 50, "Minimum number of tokens") flags.define_int("num_docs", -1, "Number of documents we write") flags.define_string("language", "en", "What language this is") kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE} def lda_line(filename, full_vocab, filtered_vocab): d = defaultdict(int) doc = Document() doc.ParseFromString(open(filename, 'rb').read()) num_words = 0 for sent in doc.sentences: for word in sent.words:
word_senses_count[word] = 0 count_word += 1 tmp = word for pos in multipaths[word]: tmp += '\t' + pos for index in multipaths[word][pos]: word_senses_count[word] += 1 count_sense += 1 tmp += '\t' + str(index) if word_senses_count[word] > 1: im_words += word + " " outfile.write(tmp + '\n') outfile.write("\nThe total number of cons words: " + str(count_word) + "\n") outfile.write("\nThe total number of cons words senses: " + str(count_sense) + "\n") outfile.write("\nInteresting words: " + im_words + "\n") outfile.close() flags.define_string("vocab", None, "The input vocab") flags.define_string("output", None, "The output constraint file") flags.define_int("num_cons", 0, "The number of constraints we want") if __name__ == "__main__": flags.InitFlags() wordnet_path = "../../../data/wordnet/" eng_wn = load_wn("3.0", wordnet_path, "wn") vocab = readVocab(flags.vocab) generateCons(vocab, eng_wn, flags.output, flags.num_cons)
# Script: read the wackypedia dump with WackyCorpus and write it out through
# write_proto.

from topicmod.corpora.wacky import *
from topicmod.util import flags

flags.define_string("wackypedia_base", "../../data/wackypedia/compressed/",
                    "Where we find the wackypedia corpus")
flags.define_string("output", "/tmp/jbg/wackypedia/",
                    "Where we write output")
flags.define_int("doc_limit", 10, "Max number of docs")
flags.define_list("langs", ["en"], "Which languages")


def main():
    """Parse the flags, register every requested language and write the corpus."""
    flags.InitFlags()

    reader = WackyCorpus(flags.wackypedia_base, flags.doc_limit)

    # One filename pattern is registered per language code in --langs.
    for lang in flags.langs:
        pattern = "wackypedia_%s*.gz" % lang
        reader.add_language(pattern)

    # The output flag is used as a plain string prefix, so it must end with a
    # path separator for "numeric" to land inside that directory.
    destination = flags.output + "numeric"
    # NOTE(review): the meaning of the trailing 10000 is defined by
    # write_proto, which is not visible here -- confirm.
    reader.write_proto(destination, "wpdia", 10000)


if __name__ == "__main__":
    main()
import sys import os from topicmod.util import flags from syntop_parameters_pb2 import * flags.define_int("num_iterations", 1, "Number of iterations") flags.define_string("model_name", "output/model", "Where we find data") flags.define_string("corpus", None, "The source corpus") flags.define_bool("hadoop", False, "Do we use hadoop or local batch") flags.define_bool("doc_step", True, "Do we call the document-centric parts") flags.define_bool("merge_step", True, "Do we merge doc step results (and compute new topics)") flags.define_bool("update_global", True, "Do we compute new transition and DP variational parameters") class Array: def __init__(self, name): self._rows = {} self._name = name def __getitem__(self, index): if not index in self._rows: self._rows[index] = defaultdict(float) return self._rows[index] def __iter__(self): for ii in self._rows: yield self._rows[ii] def parse(self, key, val):
from topicmod.util import flags from topicmod.util.sets import count_line from topicmod.corpora.proto.corpus_pb2 import * from topicmod.corpora.proto.wordnet_file_pb2 import * from topicmod.corpora.ml_vocab import MultilingualVocab from topicmod.corpora.ml_vocab import Vocab from collections import defaultdict import codecs flags.define_string("output", "", "Where we write output") flags.define_glob("doc_roots", "", "The document vocab") flags.define_string("vocab", "", "The vocab file") flags.define_string("location", "", "Where the data live") flags.define_int("min_length", 50, "Minimum number of tokens") flags.define_int("num_docs", -1, "Number of documents we write") flags.define_string("language", "en", "What language this is") kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE} def lda_line(filename, full_vocab, filtered_vocab): d = defaultdict(int) doc = Document() doc.ParseFromString(open(filename, 'rb').read()) num_words = 0 for sent in doc.sentences: for word in sent.words: new_word = full_vocab.get_word(doc.language, word.token)
output_file.write(tmp) tmp = 'MERGE_' for word in merge2: tmp += '\t' + word tmp += '\n' output_file.write(tmp) output_file.close() flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab") flags.define_string("stats", None, "Where we find the stat_file") flags.define_string("model", "", "The model files folder of topic models") flags.define_string("output", "constraints/tmp", "Output filename") flags.define_int("topics_cutoff", 30, "Number of topic words") flags.define_float("tfidf_thresh", 0, "threshold for tfidf") if __name__ == "__main__": flags.InitFlags() # getting statistics: slower version, full statistics, memory cost #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \ # = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \ # flags.topics_cutoff, flags.window_size, flags.train_only) # getting statistics: faster version, partial statistics, memory efficient print "Reading vocab" [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None,
                    "The state file that create the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:
    """One whitespace-separated assignment line, parsed into typed fields.

    The line must hold exactly six fields, in this order: doc, an ignored
    field, index, term_id, term, assignment.  All kept fields except
    ``term`` are converted to ``int``; ``term`` stays a string.

    Raises ValueError when the line does not have six fields or when a
    numeric field cannot be parsed as an integer.
    """

    def __init__(self, line, debug=False):
        fields = line.split()

        if debug:
            # Dump "<position> <field>" for each field before parsing, so a
            # malformed line can be inspected even if unpacking fails below.
            for position, field in enumerate(fields):
                print("%s %s" % (position, field))

        # The second field is read only to keep the positions aligned.
        (doc, _unused, index, term_id, term, assignment) = fields

        self.doc = int(doc)
        self.index = int(index)
        self.term_id = int(term_id)
        self.term = term
        self.assignment = int(assignment)
# # wacky_reducer.py # # File to turn protocol buffers into a test-only input file readable by # mapreduce implementation of syntactic topic model. from collections import defaultdict from nltk import FreqDist from topicmod.corpora.proto.corpus_pb2 import * from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper from topicmod.util import flags from parse_reader import * flags.define_int("docs_per_file", 100, "Number of documents per file") flags.define_int("vocab_size", 5000, "Maximum vocabulary size") flags.define_bool("remove_stop", False, "remove stopwords") flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens") flags.define_bool("use_relation", False, "Use relation (synset) instead of pos") flags.define_glob("vocab_source", None, "Where we get initial vocabulary") flags.define_string("output_path", None, "Where we write the translated corpuss") flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index") flags.define_int("min_length", 100, "Number of characters in document line") class CorpusTranslator: def __init__(self, output_path, use_lemma, docs_per_file): self.output_base = output_path self.document_list = [] self.use_lemma = use_lemma
# Script: read the New York Times corpus with NewYorkTimesReader and write it
# out through write_proto.

from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/",
                    "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")


def main():
    """Parse the flags, load the editorial file list and write the corpus."""
    flags.InitFlags()

    reader = NewYorkTimesReader(flags.nyt_base,
                                flags.doc_limit,
                                flags.bigram_limit)

    # NOTE(review): this list path is hard-coded and does not follow
    # --nyt_base; confirm whether that is intentional.
    reader.add_language_list("../../data/new_york_times/editorial_file_list")

    # The output flag is used as a plain string prefix, so it must end with a
    # path separator for "numeric" to land inside that directory.
    destination = flags.output + "numeric"
    reader.write_proto(destination, "nyt", 1000)


if __name__ == "__main__":
    main()
# Script: collect the command-line settings of a syntactic topic model run
# into a SyntopParameters message.

from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

# NOTE(review): "vocab" is a string flag although its help text says
# "Size of vocabulary" -- confirm whether a path or a count is expected.
flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Numer of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False,
                  "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5,
                 "Number of e-step rounds per-document")

# These hyperparameters are real-valued (their defaults are 1.0 and 0.1), so
# they are declared as float flags.  Declaring them with define_int meant a
# fractional value such as --vocab_sigma=0.1 could not be supplied as an
# integer flag value.
flags.define_float("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_float("alpha_trans", 1.0,
                   "DP parameter for transition distributions")
flags.define_float("alpha_top", 1.0,
                   "DP parameter for top-level stick-breaking distribution")
flags.define_float("vocab_sigma", 0.1, "Vocab hyperparametersx")

if __name__ == "__main__":
    flags.InitFlags()

    params = SyntopParameters()
if __name__ == "__main__":
    from topicmod.util import flags

    # Per-user scratch directories; mkdir and USER are defined earlier in
    # this file.
    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "",
                        "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")

    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True,
                      "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()

    # Read the template through a context manager so the file handle is
    # closed immediately instead of being left to the garbage collector.
    with open(flags.template) as template_file:
        template = template_file.read()

    # Build the substitution table.  Later assignments win: defaults first,
    # then the wall/mem flags, then explicit --args, then --name if given.
    # Note that d aliases flags.defaults, so that dict is updated in place.
    d = flags.defaults
    d["wall"] = flags.wall
    d["mem"] = flags.mem
    for ii in flags.args:
        d[ii] = flags.args[ii]

    if flags.name:
        d["name"] = flags.name
# # wacky_reducer.py # # File to turn protocol buffers into a test-only input file readable by # mapreduce implementation of syntactic topic model. from collections import defaultdict from nltk import FreqDist from topicmod.corpora.proto.corpus_pb2 import * from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper from topicmod.util import flags from parse_reader import * flags.define_int("docs_per_file", 100, "Number of documents per file") flags.define_int("vocab_size", 5000, "Maximum vocabulary size") flags.define_bool("remove_stop", False, "remove stopwords") flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens") flags.define_bool("use_relation", False, "Use relation (synset) instead of pos") flags.define_glob("vocab_source", None, "Where we get initial vocabulary") flags.define_string("output_path", None, "Where we write the translated corpuss") flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index") flags.define_int("min_length", 100, "Number of characters in document line") class CorpusTranslator: def __init__(self, output_path, use_lemma, docs_per_file):
output_file.write(tmp) tmp = 'MERGE_' for word in merge2: tmp += '\t' + word tmp += '\n' output_file.write(tmp) output_file.close() flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab") flags.define_string("stats", None, "Where we find the stat_file") flags.define_string("model", "", "The model files folder of topic models") flags.define_string("output", "constraints/tmp", "Output filename") flags.define_int("topics_cutoff", 30, "Number of topic words") flags.define_float("tfidf_thresh", 0, "threshold for tfidf") if __name__ == "__main__": flags.InitFlags() # getting statistics: slower version, full statistics, memory cost #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \ # = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \ # flags.topics_cutoff, flags.window_size, flags.train_only) # getting statistics: faster version, partial statistics, memory efficient print "Reading vocab" [vocab_word_index, vocab_index_word] = readVocab(flags.vocab) vocab_size = len(vocab_word_index)
rank = tfidf else: rank = frequency o = codecs.open(outputname, 'w', 'utf-8') for ii in rank: count = 0 for jj in rank[ii]: count += 1 if count <= vocab_limit and frequency[ii][jj] >= freq_limit: word = vocab[ii][jj] o.write(u"%i\t%s\t%f\t%i\n" % (ii, word, tfidf[ii][jj], frequency[ii][jj])) o.close() flags.define_string("proto_corpus", None, "The proto files") flags.define_bool("lemma", False, "Use lemma or tokens") flags.define_bool("select_tfidf", False, "select the vocab by tfidf or frequency") flags.define_string("output", "", "Where we output the preprocessed data") flags.define_string("vocab", None, "Where we output the vocab") flags.define_int("vocab_limit", 10000, "The vocab size") flags.define_int("freq_limit", 20, "The minimum frequency of each word") if __name__ == "__main__": flags.InitFlags() [vocab, tfidf, frequency] = gen_files(flags.proto_corpus, flags.output, flags.lemma) gen_vocab(vocab, tfidf, frequency, flags.select_tfidf, flags.vocab, flags.vocab_limit, flags.freq_limit)
# Script: feed every corpus part to a VocabCompiler and write the resulting
# vocabulary.

from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")

flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")

flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")

flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")


def main():
    """Parse the flags, accumulate vocab from each part, then write it out."""
    flags.InitFlags()

    # Stemming and bigrams are mutually exclusive options.
    assert not flags.stem or not flags.bigram, "Can't use stem and bigram"

    compiler = VocabCompiler()
    for part in flags.corpus_parts:
        # Echo each part as it is processed.
        print(part)
        compiler.addVocab(part,
                          flags.exclude_stop,
                          flags.special_stop,
                          flags.exclude_punc,
                          flags.exclude_digits,
                          flags.stem,
                          flags.bigram,
                          flags.min_length)

    compiler.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)


if __name__ == "__main__":
    main()
# Script: read a flat corpus from --base with FlatCorpus and write it out
# through write_proto under the name "yn_toy".

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

# from topicmod.corpora.flat import FlatEmailCorpus

# The help text used to be split with a backslash inside the string literal,
# which embedded the continuation (and the next line's indentation) in the
# message shown to the user.  It is now a single clean sentence.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/", "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric",
                    "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    corpus = FlatCorpus(flags.base, flags.doc_limit)
    # NOTE(review): "*" presumably matches every file under the base
    # directory; all of it is registered as English -- confirm in FlatCorpus.
    corpus.add_language("*", ENGLISH)

    # Single-argument parenthesised print emits the same text under
    # Python 2 and also parses under Python 3.
    print(flags.output)
    corpus.write_proto(flags.output, "yn_toy")
from random import random from collections import defaultdict import os import numpy from numpy import zeros from numpy.random.mtrand import dirichlet from numpy.random import multinomial from numpy.random import normal from math import isnan, isinf from topicmod.util import flags from topicmod.corpora.proto.corpus_pb2 import * flags.define_int("num_docs", 500, "Number of documents") flags.define_int("num_topics", 5, "Number of topics") flags.define_int("doc_length", 5, "Length of every document") flags.define_int("num_langs", 2, "Number of languages") flags.define_float("variance", 0.5, "Variance of distribution") flags.define_float("gamma", 1.0, "Vocabulary hyperparameter") flags.define_float("alpha", 0.1, "Document topic hyperparameter") flags.define_string("output_base", "data/synthetic", "Where we write the data") flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas") flags.define_int("num_groups", 2, "Number of splits") flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab") flags.define_int("topic_output_size", 15, "Number of words to display when we output topics") ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"], 1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"], 2:["toro", "mariposa", "gato", "vaca", "donkey", "burro", "caballo", "mosquito", "arana", "pavo"]},
# Imports and command-line flag declarations for the ontology-building
# script; the code that consumes these flags is not part of this excerpt.

from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
# NOTE(review): this help text is identical to the "dictionary" flag's; the
# default suggests this flag is really the dictionary directory -- confirm.
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab.  Not every word of the original vocab ends up in
# the generated wordnet, so a new vocab is written that contains only the
# words that are in the wordnet.
flags.define_string("updated_vocab", "", "generate a new vocab")
# Imports and command-line flag declarations for the ontology-building
# script; the code that consumes these flags is not part of this excerpt.

from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
# NOTE(review): this help text is identical to the "dictionary" flag's; the
# default suggests this flag is really the dictionary directory -- confirm.
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab.  Not every word of the original vocab ends up in
# the generated wordnet, so a new vocab is written that contains only the
# words that are in the wordnet.
infile = open(infilename, 'r') vocab = defaultdict(FreqDist) for line in infile: line = line.strip() ww = line.split('\t') lang = ww[0] if source[lang][ww[1]] == 0: print source[lang][ww[1]], ww[1] vocab[lang].inc(ww[1], source[lang][ww[1]]) infile.close() outfile = codecs.open(outfilename, 'w', 'utf-8') for ii in vocab: for jj in vocab[ii]: outfile.write(u"%s\t%s\n" % (ii, jj)) #outfile.write(u"%s\t%s\t%f\t%i\n" % (ii, jj, tfidf[ii][jj], frequency[ii][jj])) outfile.close() flags.define_string("stats_vocab", None, "The proto files") flags.define_string("input_vocab", None, "Where we get the original vocab") flags.define_int("option", 0, "1: tfidf; others: frequency") flags.define_string("sorted_vocab", None, "Where we output the vocab") if __name__ == "__main__": flags.InitFlags() [tfidf, frequency] = readStats(flags.stats_vocab) sortVocab(flags.input_vocab, tfidf, frequency, flags.option, flags.sorted_vocab)
from random import random from collections import defaultdict import os import numpy from numpy import zeros from numpy.random.mtrand import dirichlet from numpy.random import multinomial from numpy.random import normal from math import isnan, isinf from topicmod.util import flags from topicmod.corpora.proto.corpus_pb2 import * flags.define_int("num_docs", 500, "Number of documents") flags.define_int("num_topics", 5, "Number of topics") flags.define_int("doc_length", 5, "Length of every document") flags.define_int("num_langs", 2, "Number of languages") flags.define_float("variance", 0.5, "Variance of distribution") flags.define_float("gamma", 1.0, "Vocabulary hyperparameter") flags.define_float("alpha", 0.1, "Document topic hyperparameter") flags.define_string("output_base", "data/synthetic", "Where we write the data") flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas") flags.define_int("num_groups", 2, "Number of splits") flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab") flags.define_int("topic_output_size", 15, "Number of words to display when we output topics") ml_vocab = [{
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None,
                    "The state file that create the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:
    """One whitespace-separated assignment line, parsed into typed fields.

    The line must hold exactly six fields, in this order: doc, an ignored
    field, index, term_id, term, assignment.  All kept fields except
    ``term`` are converted to ``int``; ``term`` stays a string.

    Raises ValueError when the line does not have six fields or when a
    numeric field cannot be parsed as an integer.
    """

    def __init__(self, line, debug=False):
        fields = line.split()

        if debug:
            # Dump "<position> <field>" for each field before parsing, so a
            # malformed line can be inspected even if unpacking fails below.
            for position, field in enumerate(fields):
                print("%s %s" % (position, field))

        # The second field is read only to keep the positions aligned.
        (doc, _unused, index, term_id, term, assignment) = fields

        self.doc = int(doc)
        self.index = int(index)
        self.term_id = int(term_id)
        self.term = term
        self.assignment = int(assignment)
# write coccurance: outputfile = self._output_dir + "/cooccurance.txt" outfile = open(outputfile, 'w') for w1 in self._cooccur.keys(): for w2 in self._cooccur[w1].keys(): if self._cooccur[w1][w2] != 0: tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n" outfile.write(tmp) outfile.close() flags.define_string("corpus", None, "Where we find the input corpora") flags.define_string("proto_corpus", None, "Where we find the input proto corpora") flags.define_string("vocab", "", "The model files folder of topic models") flags.define_int("window_size", 10, "Size of window for computing coocurrance") flags.define_string("output", "PMI_stat/20_news", "PMI stat output filename") flags.define_int("option", "2", "0: 20_news; 1: wikipedia") if __name__ == "__main__": flags.InitFlags() # {0: 'english', 1: 'german'} lang = 0 cp = corpusParser(lang, flags.vocab, flags.corpus, flags.window_size, flags.output) if flags.option == 0: cp.parseCorpus20news() get_tfidf(flags.proto_corpus, flags.vocab, flags.output) elif flags.option == 1: cp.parseCorpusWiki() get_tfidf(flags.proto_corpus, flags.vocab, flags.output) elif flags.option == 2: