Example #1
from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

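# Command-line flags for the corpus translation script, parsed by topicmod.util.flags.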
flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
        self.roles = defaultdict(dict)
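        # The protobuf Corpus message that will hold the translated output.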
        self.output_corpus = Corpus()
Example #2
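# Fragment: the enclosing function and imports (re, topicmod.util.flags) sit above
# this excerpt; the code below closes the assignment/document streams and returns
# the updated topic count before the script's flags are defined.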
      path_assignments_out.write(path_assign)

  topic_assignments_in.close()
  topic_assignments_out.close()
  path_assignments_in.close()
  path_assignments_out.close()
  docs_in.close()
  docs_out.close()

  return new_topics + 1


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("mapping", None, "Filename of mapping")
flags.define_string("cons_file", "", "Constraints filename")
flags.define_glob("wordnet", "wn/output.0", "contraint source")
flags.define_string("input_base", "output/nih", "Input filename")
flags.define_string("output_base", "output/nih_ned", "Output filename")
flags.define_string("resume_type", "clear", "resume type: clear or split")
flags.define_string("update_strategy", "doc", "update strategy: term or doc")
flags.define_int("doc_limit", -1, "Number of documents to process")
flags.define_int("num_topics", 0, "Current number of topics")

if __name__ == "__main__":
  flags.InitFlags()

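  # Map the textual update strategy onto integer codes: "doc" -> 1, "term" -> 0.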
  if re.search("doc", flags.update_strategy):
    update_strategy = 1
  elif re.search("term", flags.update_strategy):
    update_strategy = 0
  else:
Example #3
from topicmod.util import flags
from topicmod.corpora.vocab_compiler import VocabCompiler

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_filename("output", None, "Where we write the new vocab")
flags.define_int("min_freq", 10, "Minimum frequency for inclusion")
flags.define_int("vocab_limit", 5000, "Maximum vocab size")
flags.define_bool("exclude_stop", True, "Do we throw out stop words")
flags.define_bool("exclude_punc", True, "Do we exclude punctuation")
flags.define_bool("exclude_digits", True, "Do we exclude digits")
flags.define_list("special_stop", [], "Special stop words")
flags.define_int("min_length", 3, "Minimum length for tokens")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("bigram", False, "Use bigrams")

if __name__ == "__main__":
  flags.InitFlags()

  assert not (flags.stem and flags.bigram), "Can't use stem and bigram"

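  # Accumulate vocabulary statistics from every corpus part, then write the pruned vocab.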
  v = VocabCompiler()
  for ii in flags.corpus_parts:
    print ii
    v.addVocab(ii, flags.exclude_stop, flags.special_stop,
               flags.exclude_punc, flags.exclude_digits,
               flags.stem, flags.bigram, flags.min_length)
  v.writeVocab(flags.output, flags.vocab_limit, flags.min_freq)
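
All of these scripts share the same topicmod.util.flags pattern: declare typed flags at module scope, call flags.InitFlags() in the entry point, then read the parsed values back as module attributes. A minimal sketch of that pattern, using hypothetical flag names ("input", "limit"):

from topicmod.util import flags

# Hypothetical flags, declared the same way as in the examples above.
flags.define_string("input", None, "Where we read the input")
flags.define_int("limit", -1, "Maximum number of documents")

if __name__ == "__main__":
  # Parse the command line; every defined flag becomes a module attribute.
  flags.InitFlags()
  print flags.input
  print flags.limit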
Example #4
from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import Vocab
from topicmod.corpora.ml_vocab import MultilingualVocab

import codecs

from collections import defaultdict

flags.define_glob("wordnet", "", "The wordnet files")
flags.define_string("vocab", "", "The vocab file")
flags.define_glob("docs", "", "The documents we want to view")
flags.define_glob("doc_roots", "", "The document vocab")


def print_doc(filename, full, flat, wn):
  doc = Document()
  doc.ParseFromString(open(filename, 'rb').read())

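  # Print each token as |language:token-id:surface|, looking the surface form up in the full vocab.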
  print "-------------------------------"
  print "Original document:"
  for sent in doc.sentences:
    for word in sent.words:
      print "|%i:%i:%s|" % (doc.language, word.token, \
                              full.get_word(doc.language, \
                                              word.token).encode("ascii", \
                                                                   'ignore')),
  print ""
Example #5
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


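# lda_line: tallies per-word counts for one protobuf Document; the name and the
# count_line import suggest it emits one line of LDA-format input.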
def lda_line(filename, full_vocab, filtered_vocab):
    d = defaultdict(int)

    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    num_words = 0
    for sent in doc.sentences:
        for word in sent.words:
            new_word = full_vocab.get_word(doc.language, word.token)