# NOTE(review): this chunk was collapsed onto one physical line; formatting
# below is reconstructed.  Only comments were added -- code tokens unchanged.
from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

# Command-line options controlling how the corpus is translated/reduced.
flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    # NOTE(review): the class body continues past this chunk; __init__ itself
    # may be truncated (the docs_per_file parameter is never stored in the
    # visible portion -- confirm against the full source).

    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
        self.roles = defaultdict(dict)
        # Corpus comes from the corpus_pb2 star import above.
        self.output_corpus = Corpus()
# NOTE(review): this chunk is a shorter prefix of the same file as the chunk
# above (collapsed onto one line); formatting reconstructed, comments only.
from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

# Command-line options controlling how the corpus is translated/reduced.
flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    # NOTE(review): chunk ends inside __init__; the rest of the method and
    # class are outside this view.

    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
# NOTE(review): this chunk begins mid-function -- the statements below are the
# tail of a resume/reassignment routine whose def-line is outside this view.
# Indentation depth is reconstructed (assumed one level); confirm against the
# full source.
    path_assignments_out.write(path_assign)

    # Release all six file handles opened earlier in the (unseen) function body.
    topic_assignments_in.close()
    topic_assignments_out.close()
    path_assignments_in.close()
    path_assignments_out.close()
    docs_in.close()
    docs_out.close()

    # new_topics holds the highest topic id assigned, so +1 is the topic count.
    return new_topics + 1


# Command-line options for the resume/update driver below.
flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("mapping", None, "Filename of mapping")
flags.define_string("cons_file", "", "Constraints filename")
flags.define_glob("wordnet", "wn/output.0", "contraint source")
flags.define_string("input_base", "output/nih", "Input filename")
flags.define_string("output_base", "output/nih_ned", "Output filename")
flags.define_string("resume_type", "clear", "resume type: clear or split")
flags.define_string("update_strategy", "doc", "update strategy: term or doc")
flags.define_int("doc_limit", -1, "Number of documents to process")
flags.define_int("num_topics", 0, "Current number of topics")

if __name__ == "__main__":
    flags.InitFlags()

    # Translate the textual update-strategy flag into an integer code
    # (1 = per-document, 0 = per-term).
    if re.search("doc", flags.update_strategy):
        update_strategy = 1
    elif re.search("term", flags.update_strategy):
        update_strategy = 0
    else:
        # NOTE(review): chunk ends here; the else-branch body (presumably an
        # error path for an unrecognized strategy) is outside this view.
from topicmod.util import flags from topicmod.corpora.vocab_compiler import VocabCompiler flags.define_glob("corpus_parts", None, "Where we look for vocab") flags.define_filename("output", None, "Where we write the new vocab") flags.define_int("min_freq", 10, "Minimum frequency for inclusion") flags.define_int("vocab_limit", 5000, "Maximum vocab size") flags.define_bool("exclude_stop", True, "Do we throw out stop words") flags.define_bool("exclude_punc", True, "Do we exclude punctuation") flags.define_bool("exclude_digits", True, "Do we exclude digits") flags.define_list("special_stop", [], "Special stop words") flags.define_int("min_length", 3, "Minimum length for tokens") flags.define_bool("stem", False, "Stem words") flags.define_bool("bigram", False, "Use bigrams") if __name__ == "__main__": flags.InitFlags() assert not (flags.stem and flags.bigram), "Can't use stem and bigram" v = VocabCompiler() for ii in flags.corpus_parts: print ii v.addVocab(ii, flags.exclude_stop, flags.special_stop, \ flags.exclude_punc, flags.exclude_digits, \ flags.stem, flags.bigram, flags.min_length) v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
# NOTE(review): chunk collapsed onto one line in SOURCE; formatting
# reconstructed, comments/docstring added, code tokens unchanged.
from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import Vocab
from topicmod.corpora.ml_vocab import MultilingualVocab
import codecs
from collections import defaultdict

flags.define_glob("wordnet", "", "The wordnet files")
flags.define_string("vocab", "", "The vocab file")
flags.define_glob("docs", "", "The documents we want to view")
flags.define_glob("doc_roots", "", "The document vocab")


def print_doc(filename, full, flat, wn):
    """Dump a serialized Document protobuf in human-readable form.

    Reads the protobuf from *filename* and prints each token as
    |language:token_id:surface| using *full* (a vocabulary with a
    get_word(language, token) lookup).  NOTE(review): the function is
    truncated in this view -- the *flat* and *wn* parameters are not used in
    the visible portion, presumably by code past the end of this chunk.
    """
    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    print "-------------------------------"
    print "Original document:"
    for sent in doc.sentences:
        for word in sent.words:
            # Trailing comma keeps tokens of a sentence on one output line;
            # non-ASCII surface forms are dropped rather than crashing.
            print "|%i:%i:%s|" % (doc.language, word.token, \
                                  full.get_word(doc.language, \
                                                word.token).encode("ascii", \
                                                                   'ignore')),
        # NOTE(review): placement of this newline-print (per sentence vs.
        # after all sentences) is reconstructed -- confirm against full source.
        print ""
# NOTE(review): chunk collapsed onto one line in SOURCE; formatting
# reconstructed, comments/docstring added, code tokens unchanged.
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab
from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

# Map language codes onto the proto enum constants (ENGLISH/GERMAN/CHINESE
# come from the corpus_pb2 star import above).
kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
    """Build a bag-of-words line for LDA from a serialized Document.

    Accumulates per-word counts in *d* while walking every sentence/word of
    the protobuf read from *filename*.  NOTE(review): truncated in this view
    -- the chunk ends just after the word lookup, so the counting/formatting
    logic (and use of *filtered_vocab*) is outside what is visible here.
    """
    d = defaultdict(int)
    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())
    num_words = 0
    for sent in doc.sentences:
        for word in sent.words:
            new_word = full_vocab.get_word(doc.language, word.token)
# NOTE(review): this chunk is a shorter prefix of the same file as the chunk
# above (collapsed onto one line); formatting reconstructed, comments only.
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab
from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

# Map language codes onto the proto enum constants (ENGLISH/GERMAN/CHINESE
# come from the corpus_pb2 star import above).
kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
    """Begin building a bag-of-words line from a serialized Document.

    NOTE(review): the chunk ends at the inner loop header below, so the
    entire loop body and the function's remaining logic are outside view.
    """
    d = defaultdict(int)
    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())
    num_words = 0
    for sent in doc.sentences:
        for word in sent.words: