# NOTE(review): chunk boundary — the statements below are the tail of a
# dictionary-building function whose `def` line is outside this view; the
# indentation is reconstructed (source arrived collapsed onto one line).
    # Record both surface forms under the English headword's entry
    # (presumably d maps word -> language -> set of forms — TODO confirm
    # against the function header, which is not visible here).
    d[en][ENGLISH].add(en)
    d[en][CHINESE].add(zh)
    return d

if __name__ == "__main__":
    flags.InitFlags()

    # Target languages for the dictionary, selected by command-line flags.
    langs = []
    if flags.german:
        langs.append(GERMAN)
    if flags.chinese:
        langs.append(CHINESE)

    # Optional vocabulary used later to restrict the dictionary's entries.
    if flags.filter_vocab:
        filter_list = MultilingualVocab(flags.filter_vocab)

    if flags.greedy_matching:
        # Greedy matching is only supported for German (asserted below).
        # NOTE(review): this branch uses filter_list, which is only bound
        # when --filter_vocab was given — NameError otherwise; confirm the
        # flags are validated elsewhere.
        assert flags.german and not flags.chinese
        mapping = greedy_german_matching(filter_list, flags.limit, flags.stem)
    else:
        mapping = createDictionary(flags.dictionary, flags.dic_dir,
                                   flags.translation, langs, flags.stem)

    # Cull entries not present in the filter vocabulary, if one was given.
    if flags.filter_vocab:
        mapping = filterDictionary(mapping, filter_list)

    # Emit either WordNet-style translated output or plain translations.
    if flags.wordnet:
        writeTranslatedWN(mapping, flags.output)
    else:
        writeTranslations(mapping, flags.output, flags.updated_vocab)
# NOTE(review): chunk boundary — `o1` is opened before this view begins and
# the trailing `else:` branch continues past it; indentation reconstructed
# (source arrived collapsed onto one line).
o2 = open(flags.output + ".rat", 'w')  # one rating per kept document
o3 = open(flags.output + ".doc", 'w')  # one title per kept document
num_docs = 0
skipped = 0
# Vocabulary used to filter tokens, restricted to the selected language.
filter_vocab = Vocab(flags.vocab, kLANGUAGE_ID[flags.language])
for root in flags.doc_roots:
    print "Reading root", root
    corpus = Corpus()
    # NOTE(review): handle is never closed — a with-block would be safer,
    # but the fix is out of scope for this truncated fragment.
    f = open(root, "rb")
    corpus.ParseFromString(f.read())
    # This allows possibly inconsistent ways of assigning numbers of words, but
    # the final voc should make them consistent
    superset_vocab = MultilingualVocab()
    for ii in corpus.tokens:
        for jj in ii.terms:
            # print ii.language, jj.id, jj.original
            superset_vocab.set(ii.language, jj.id, jj.original)
    for ii in corpus.doc_filenames:
        # lda_line presumably returns (rating, formatted line, title,
        # filtered word count) for one document — confirm at its definition.
        rating, line, title, num_words = lda_line(flags.location + ii, \
                                                  superset_vocab, \
                                                  filter_vocab)
        # Keep only documents long enough after vocabulary filtering.
        if num_words >= flags.min_length:
            num_docs += 1
            o1.write(line)  # o1 is opened before this chunk begins
            o2.write("%i\n" % rating)
            o3.write("%s\n" % title)
        else:
            # (truncated: the skip branch — likely where `skipped` is
            # incremented — continues beyond this chunk)
# NOTE(review): chunk boundary — this is the tail of a function that prints
# token ids (WordNet members starred); its `def` line and the bindings of
# `doc`, `full`, `flat`, and `wn` are outside this view. Indentation is
# reconstructed (source arrived collapsed onto one line).
    for sent in doc.sentences:
        for word in sent.words:
            # Map the raw token through the full vocab, then look up its id
            # in the flat (culled) vocabulary; -1 means not in the cull.
            new_word = full.get_word(doc.language, word.token)
            new_id = flat.get_id(doc.language, new_word)
            if new_id != -1:
                # Ids whose word appears in WordNet get a leading '*'.
                if new_word in wn[doc.language]:
                    print "*%i" % new_id,
                else:
                    print new_id,
        # NOTE(review): newline placement reconstructed — could equally
        # belong after the sentence loop (one line per document); confirm
        # against the original formatting.
        print ""

if __name__ == "__main__":
    flags.InitFlags()

    # Get the corpus vocab
    superset_vocab = MultilingualVocab()
    for root in flags.doc_roots:
        print "Reading root", root
        corpus = Corpus()
        f = open(root, "rb")
        corpus.ParseFromString(f.read())
        for ii in corpus.tokens:
            for jj in ii.terms:
                superset_vocab.set(ii.language, jj.id, jj.original)

    # Get the culled vocab
    filtered_vocab = MultilingualVocab(flags.vocab)

    # Get the wn vocab, limited by the vocabulary
    # (truncated: the code populating wn_vocab continues beyond this chunk)
    wn_vocab = defaultdict(set)