コード例 #1
0
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus
#from topicmod.corpora.flat import FlatEmailCorpus

# Command-line flags for converting the yn_toy data set.
# The help text is written on one logical line: a backslash-newline inside a
# string literal splices the following line's indentation into the message.
# NOTE(review): doc_limit = -1 presumably means "no limit" -- confirm in
# FlatCorpus.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/",
                    "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric",
                    "Where we write output")

if __name__ == "__main__":
  flags.InitFlags()

  # Build the corpus from the base directory, passing the document limit
  # straight through to FlatCorpus.
  corpus = FlatCorpus(flags.base, flags.doc_limit)
  # Register the catch-all pattern "*" under the ENGLISH constant.
  corpus.add_language("*", ENGLISH)

  # Single-argument parenthesised form prints the same text on Python 2 and
  # Python 3; the bare print statement is a SyntaxError on Python 3.
  print(flags.output)

  corpus.write_proto(flags.output, "yn_toy")
コード例 #2
0
ファイル: yn_toy_corpus.py プロジェクト: NetBUG/topicmod
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

# from topicmod.corpora.flat import FlatEmailCorpus

# Command-line flags for converting the yn_toy data set.
# The doc_limit help text is a single plain literal: the previous
# backslash-newline inside the string embedded a long run of spaces in the
# message.
# NOTE(review): doc_limit = -1 presumably means "no limit" -- confirm in
# FlatCorpus.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/", "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    # Build the corpus from the base directory, passing the document limit
    # straight through to FlatCorpus.
    corpus = FlatCorpus(flags.base, flags.doc_limit)
    # Register the catch-all pattern "*" under the ENGLISH constant.
    corpus.add_language("*", ENGLISH)

    # Single-argument parenthesised form prints the same text on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print(flags.output)

    corpus.write_proto(flags.output, "yn_toy")
コード例 #3
0
ファイル: values_turk.py プロジェクト: hxsebastien/topicmod
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
#from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.flat import FlatCorpus

# Flag declarations for the values_turk corpus conversion script.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string(
    "base",
    "../../data/values_turk/",
    "Where we look for data",
)
flags.define_string("output", "/tmp/", "Where we write output")


def main():
  """Parse the flags, build the values_turk corpus and write it out.

  Documents are registered through the patterns "1/*" and "2/*". No language
  constant is passed, so whatever default FlatCorpus.add_language uses
  applies (not visible from this script).
  """
  flags.InitFlags()
  turk_corpus = FlatCorpus(flags.base, flags.doc_limit)
  for pattern in ("1/*", "2/*"):
    turk_corpus.add_language(pattern)

  # The output flag is used as a prefix: "numeric" is appended directly.
  destination = flags.output + "numeric"
  turk_corpus.write_proto(destination, "values_turk")


if __name__ == "__main__":
  main()
コード例 #4
0
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

# Command-line flags for converting the de_news data set.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/de_news/",
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()

    # Build the corpus from the base directory, passing the document limit
    # straight through to FlatCorpus.
    corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*.en.txt", ENGLISH)
    # The German side previously reused the English glob, so English files
    # were registered under both languages and no German file was matched.
    # NOTE(review): assumes German files are named "*.de.txt", mirroring the
    # English naming -- confirm against the data directory.
    corpus.add_language("*.de.txt", GERMAN)

    # The output flag is used as a prefix: "numeric" is appended directly.
    # NOTE(review): the corpus name "20_news" looks copied from another
    # script; it is kept unchanged because downstream consumers may rely on it.
    corpus.write_proto(flags.output + "numeric", "20_news")
コード例 #5
0
ファイル: de_news_corpus.py プロジェクト: NetBUG/topicmod
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

# Command-line flags for converting the de_news data set.
flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/de_news/",
                    "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
  flags.InitFlags()

  # Build the corpus from the base directory, passing the document limit
  # straight through to FlatCorpus.
  corpus = FlatCorpus(flags.base, flags.doc_limit)
  corpus.add_language("*.en.txt", ENGLISH)
  # The German side previously reused the English glob, so English files
  # were registered under both languages and no German file was matched.
  # NOTE(review): assumes German files are named "*.de.txt", mirroring the
  # English naming -- confirm against the data directory.
  corpus.add_language("*.de.txt", GERMAN)

  # The output flag is used as a prefix: "numeric" is appended directly.
  # NOTE(review): the corpus name "20_news" looks copied from another
  # script; it is kept unchanged because downstream consumers may rely on it.
  corpus.write_proto(flags.output + "numeric", "20_news")