def nyt_tfidf():
    """Call ``get_tfidf`` with the hard-coded NYT paths.

    The proto-corpus directory, vocabulary file and output directory are
    fixed; the function takes no arguments and returns nothing.
    """
    get_tfidf(
        "output/nyt/iter_1_all/model_topic_assign/",  # proto corpus directory
        "vocab/nyt.voc",                              # vocabulary file
        "PMI_stat/nyt/",                              # output directory
    )
def get_20news_tfidf():
    """Call ``get_tfidf`` with the hard-coded 20 Newsgroups (stemmed) paths.

    The proto-corpus directory, vocabulary file and output directory are
    fixed; the function takes no arguments and returns nothing.
    """
    # An alternative proto-corpus directory that was previously tried here:
    #   "../../data/20_news_date/numeric"
    get_tfidf(
        "output/20_news_stem_tfidf/iter_1_all/model_topic_assign/",  # proto corpus directory
        "vocab/20_news_stem_all.voc",                                # vocabulary file
        "PMI_stat/20_news_stem_tfidf/",                              # output directory
    )
# NOTE(review): the loop below is the tail of a method whose beginning lies
# outside this view. Its tokens are reproduced unchanged; re-indent it to the
# enclosing method's body depth when merging.
for w1 in self._cooccur.keys():
    for w2 in self._cooccur[w1].keys():
        if self._cooccur[w1][w2] != 0:
            tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
            outfile.write(tmp)
outfile.close()


# Command-line flags for the script entry point below.
flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("proto_corpus", None,
                    "Where we find the input proto corpora")
flags.define_string("vocab", "", "The model files folder of topic models")
flags.define_int("window_size", 10,
                 "Size of window for computing coocurrance")
flags.define_string("output", "PMI_stat/20_news", "PMI stat output filename")
# The default is an int (it used to be the string "2") because the value is
# compared against the ints 0/1/2 below. The help text now lists option 2.
flags.define_int("option", 2, "0: 20_news; 1: wikipedia; 2: nyt")

if __name__ == "__main__":
    flags.InitFlags()

    # Maps each supported --option value to the corpusParser method that
    # parses the corresponding corpus.
    parser_method_by_option = {
        0: "parseCorpus20news",
        1: "parseCorpusWiki",
        2: "parseCorpusNyt",
    }

    # Fail loudly on an unsupported value instead of silently doing nothing.
    if flags.option not in parser_method_by_option:
        raise SystemExit(
            "Unknown --option value %r; expected one of %s"
            % (flags.option, sorted(parser_method_by_option)))

    # {0: 'english', 1: 'german'}
    lang = 0
    cp = corpusParser(lang, flags.vocab, flags.corpus,
                      flags.window_size, flags.output)

    # Parse the selected corpus, then compute tf-idf; the tf-idf call is the
    # same for every corpus.
    getattr(cp, parser_method_by_option[flags.option])()
    get_tfidf(flags.proto_corpus, flags.vocab, flags.output)