Code Example #1
                    d[en][ENGLISH].add(en)
                    d[en][CHINESE].add(zh)

    return d


if __name__ == "__main__":
    flags.InitFlags()
    langs = []
    if flags.german:
        langs.append(GERMAN)
    if flags.chinese:
        langs.append(CHINESE)

    if flags.filter_vocab:
        filter_list = MultilingualVocab(flags.filter_vocab)

    if flags.greedy_matching:
        assert flags.german and not flags.chinese
        # NOTE: this branch also needs --filter_vocab; filter_list is
        # unbound if that flag was not supplied.
        mapping = greedy_german_matching(filter_list, flags.limit, flags.stem)
    else:
        mapping = createDictionary(flags.dictionary, flags.dic_dir,
                                   flags.translation, langs, flags.stem)

        if flags.filter_vocab:
            mapping = filterDictionary(mapping, filter_list)

    if flags.wordnet:
        writeTranslatedWN(mapping, flags.output)
    else:
        writeTranslations(mapping, flags.output, flags.updated_vocab)
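All of the examples on this page lean on a MultilingualVocab that maps (language, id) pairs to surface strings and back. The class itself lives elsewhere in the NetBUG/topicmod project; the sketch below is only an assumed interface reconstructed from the call sites (set, get_word, get_id, the optional filename argument, and a -1 sentinel for missing words), not the project's actual implementation:

from collections import defaultdict

class MultilingualVocab:
    """Assumed interface for the vocab object used in these examples."""

    def __init__(self, filename=None):
        self._word = defaultdict(dict)  # language -> id -> word
        self._id = defaultdict(dict)    # language -> word -> id
        # The real class can also load itself from a vocab file, as in
        # MultilingualVocab(flags.filter_vocab) above; that file format is
        # project-specific and omitted here.

    def set(self, language, word_id, word):
        # Register a (language, id, word) triple, as done when reading a Corpus
        self._word[language][word_id] = word
        self._id[language][word] = word_id

    def get_word(self, language, word_id):
        return self._word[language].get(word_id)

    def get_id(self, language, word):
        # Call sites treat -1 as "not in the vocabulary"
        return self._id[language].get(word, -1)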
Code Example #2
  for sent in doc.sentences:
    for word in sent.words:
      # Map the corpus-wide token id to its word, then to the culled vocab id
      new_word = full.get_word(doc.language, word.token)
      new_id = flat.get_id(doc.language, new_word)
      if new_id != -1:
        # Star the id when the word is also in the WordNet vocabulary
        if new_word in wn[doc.language]:
          print "*%i" % new_id,
        else:
          print new_id,
  print ""

if __name__ == "__main__":
  flags.InitFlags()

  # Get the corpus vocab
  superset_vocab = MultilingualVocab()
  for root in flags.doc_roots:
    print "Reading root", root
    corpus = Corpus()
    f = open(root, "rb")
    corpus.ParseFromString(f.read())

    for ii in corpus.tokens:
      for jj in ii.terms:
        superset_vocab.set(ii.language, jj.id, jj.original)

  # Get the culled vocab
  filtered_vocab = MultilingualVocab(flags.vocab)

  # Get the wn vocab, limited by the vocabulary
  from collections import defaultdict  # imported at the top of the original file
  wn_vocab = defaultdict(set)
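The nested loop at the top of this example remaps every token from the corpus-wide vocabulary (full) into the culled vocabulary (flat), printing the culled id and prefixing a * when the word is also in the WordNet vocabulary. A toy run of that remap-and-flag step, reusing the interface sketch after Example #1 with made-up data:

from collections import defaultdict

full = MultilingualVocab()   # corpus-wide ids
flat = MultilingualVocab()   # culled, renumbered ids
wn = defaultdict(set)        # language -> set of WordNet words

full.set(0, 7, "dog")        # corpus-wide: id 7 is "dog" in language 0
flat.set(0, 2, "dog")        # the culled vocab renumbers "dog" to 2
wn[0].add("dog")             # "dog" is also a WordNet word

word = full.get_word(0, 7)       # -> "dog"
new_id = flat.get_id(0, word)    # -> 2
if word in wn[0]:
    print "*%i" % new_id         # prints "*2", matching the starred output above
else:
    print new_id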
Code Example #3
File: ldac_format_writer.py  Project: NetBUG/topicmod
  o2 = open(flags.output + ".rat", 'w')
  o3 = open(flags.output + ".doc", 'w')

  num_docs = 0
  skipped = 0

  filter_vocab = Vocab(flags.vocab, kLANGUAGE_ID[flags.language])
  for root in flags.doc_roots:
    print "Reading root", root
    corpus = Corpus()
    f = open(root, "rb")
    corpus.ParseFromString(f.read())

    # Word ids may be assigned inconsistently across roots, but the
    # final vocabulary makes them consistent
    superset_vocab = MultilingualVocab()
    for ii in corpus.tokens:
      for jj in ii.terms:
        superset_vocab.set(ii.language, jj.id, jj.original)

    for ii in corpus.doc_filenames:
      rating, line, title, num_words = lda_line(flags.location + ii,
                                                superset_vocab,
                                                filter_vocab)
      if num_words >= flags.min_length:
        num_docs += 1
        o1.write(line)
        o2.write("%i\n" % rating)
        o3.write("%s\n" % title)
      else:
        skipped += 1
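Because the writer emits exactly one line to each of the .rat and .doc files per retained document, line i of every output file describes the same document. A minimal reader that relies on that invariant (Python 2, like the snippets; the read_sidecars helper and the "reviews" prefix are made up for illustration):

import itertools

def read_sidecars(output_prefix):
    # Hypothetical helper: walk the .rat and .doc files in lockstep
    ratings = open(output_prefix + ".rat")
    titles = open(output_prefix + ".doc")
    for rating, title in itertools.izip(ratings, titles):
        yield int(rating), title.strip()

for rating, title in read_sidecars("reviews"):
    print "%s (rating %i)" % (title, rating)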