Example #1
    else:
      topic_assignments_out.write(topic_assign)
      path_assignments_out.write(path_assign)

  topic_assignments_in.close()
  topic_assignments_out.close()
  path_assignments_in.close()
  path_assignments_out.close()
  docs_in.close()
  docs_out.close()

  return new_topics + 1


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("mapping", None, "Filename of mapping")
flags.define_string("cons_file", "", "Constraints filename")
flags.define_glob("wordnet", "wn/output.0", "contraint source")
flags.define_string("input_base", "output/nih", "Input filename")
flags.define_string("output_base", "output/nih_ned", "Output filename")
flags.define_string("resume_type", "clear", "resume type: clear or split")
flags.define_string("update_strategy", "doc", "update strategy: term or doc")
flags.define_int("doc_limit", -1, "Number of documents to process")
flags.define_int("num_topics", 0, "Current number of topics")

if __name__ == "__main__":
  flags.InitFlags()

  if re.search("doc", flags.update_strategy):
    update_strategy = 1
Example #2
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.amazon import AmazonCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_list("langs", ["en"], "Which langauges do we add")
flags.define_string("base", "../../data/multiling-sent/", \
                      "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

LANGUAGE_CONSTANTS = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE, \
                        "fr": FRENCH, "es": SPANISH, "ar": ARABIC}

if __name__ == "__main__":
    flags.InitFlags()
    corpus = AmazonCorpus(flags.base, flags.doc_limit)
    for ll in flags.langs:
        corpus.add_language("amzn-%s/*/*" % ll, LANGUAGE_CONSTANTS[ll])

    corpus.write_proto(flags.output + "numeric", "amazon")
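Example #3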
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab

from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all words in the original vocab will be
# included in the generated wordnet, so generate a new vocab that contains
# only the words in the wordnet.
Example #4
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus

# from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/yn_toy/", "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()
    corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*", ENGLISH)
    print flags.output

    corpus.write_proto(flags.output, "yn_toy")
Example #5
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.flat import FlatCorpus
#from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents \
                                   we add")
flags.define_string("base", "../../data/yn_toy/", \
                      "Where we look for data")
flags.define_string("output", "../../data/yn_toy/numeric", \
                      "Where we write output")

if __name__ == "__main__":
  flags.InitFlags()
  corpus = FlatCorpus(flags.base, flags.doc_limit)
  corpus.add_language("*", ENGLISH)
  print flags.output

  corpus.write_proto(flags.output, "yn_toy")
Example #6
        pmi_score /= len(word_pairs)

        tmp = str(tt) + "\t" + str(
            len(word_pairs)) + "\t" + str(pmi_score) + "\n"
        infile.write(tmp)

        total_pmi_score += pmi_score

    total_pmi_score /= len(topics.keys())
    tmp = "total" + "\t" + str(len(
        topics.keys())) + "\t" + str(total_pmi_score) + "\n"
    infile.write(tmp)
    infile.close()


flags.define_string("vocab", "", "Where we find the vocab")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_int("topics_cutoff", 30, "Number of topics")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "output/PMI_score", "PMI Output filename")

if __name__ == "__main__":

    flags.InitFlags()

    print "Reading vocab"
    [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
    vocab_size = len(vocab_word_index)

    print "Reading topic words"
Example #7
from topicmod.corpora.semcor import SemcorCorpus
from topicmod.util import flags

flags.define_string("semcor_base", "../../data/semcor-%s/", \
                      "Where we find the semcor corpus")
flags.define_string("wordnet_base", "../../data/wordnet/", \
                      "Where we find the wordnet corpus")
flags.define_string("version", "3.0", "Version of WordNet used")
flags.define_string("semcor_output", None, "Where we write the output")

if __name__ == "__main__":
    flags.InitFlags()
    semcor = SemcorCorpus(flags.semcor_base % flags.version)

    semcor.load_wn(flags.wordnet_base, flags.version)
    semcor.add_language("brown1/tagfiles/*")
    semcor.add_language("brown2/tagfiles/*")
    #semcor.add_language("brownv/tagfiles/br-e*")

    semcor.write_proto(flags.semcor_output, "semcor", 80)
Example #8
from topicmod.util import flags
import re
from numpy import zeros

flags.define_string("folder_base", "output/20_news/", \
                                  "Input file folder")
flags.define_string("output_base", "output/20_news/results_compare", \
                                  "Output file name")


if __name__ == "__main__":
  flags.InitFlags()

  # comparing 1

  filename = flags.output_base + 'results_compare_1.csv'
  outputfile = open(filename, 'w')

  folders = open(flags.folder_base + 'folders.txt', 'r')
  for folder in folders:
    folder = folder.strip()

    tmp = folder + '\n\n'
    outputfile.write(tmp)
    filename = folder.replace('results_', 'iter_100_')
    filename = flags.folder_base + folder + '/' + filename + '.txt'
    inputfile = open(filename, 'r')
    for line in inputfile:
      outputfile.write(line)
    inputfile.close()
    outputfile.write('\n\n\n')
Example #9
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.europarl import EuroparlCorpus

flags.define_string("base", "../../data/europarl/", "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
    flags.InitFlags()

    for ii in xrange(96, 107):
        year = ii % 100
        print " *** YEAR %i *** " % year
        corpus = EuroparlCorpus(flags.base, flags.doc_limit)
        corpus.add_language("english/ep-%02i-*.en" % year, ENGLISH)
        corpus.add_language("german/ep-%02i-*.de" % year, GERMAN)

        corpus.write_proto(flags.output + "numeric", "europarl%02i" % year,
                           1000)
Example #10
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.crossfire import CrossfireCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/crossfire/cf/clean/", "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()
    # corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus = CrossfireCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*.txt")

    corpus.write_proto(flags.output + "numeric", "crossfire")
Example #11
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.pang_lee_movie import PangLeeMovieCorpus

flags.define_string("base", "../../data/movies/", "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")
flags.define_string("response", "rating", "Which rating format we use")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
  flags.InitFlags()
  corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
  corpus.add_language("pang_lee/*/subj.*", flags.response, ENGLISH)
  corpus.write_proto(flags.output + "numeric", "movies", 1000)

  corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
  corpus.add_language("filmrezension.de_lines/*/subj.*", flags.response, \
                        GERMAN)
  corpus.write_proto(flags.output + "numeric", "movies", 100)
Example #12
  print path2

  common = set(path1).intersection(set(path2))
  first = min(common)
  assert(first >= len(word_list))
  first -= len(word_list) 
  cluster_root = Z[first][0]
  merge1 = findCluster(Z, cluster_root, word_list)
  cluster_root = Z[first][1]
  merge2 = findCluster(Z, cluster_root, word_list)

  print merge1
  print merge2


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("constraint", "constraints/tmp", "Original constraint file")
flags.define_int("topics_cutoff", 30, "Number of topic words")

if __name__ == "__main__":

  # test()

  flags.InitFlags()

  # getting statistics: faster version, partial statistics, memory efficient
  print "Reading vocab"
  [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
  vocab_size = len(vocab_word_index)
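Example #13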
from numpy import *

from topicmod.external.moremath import *

from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Toleranceg for convergence")

NEGATIVE_INFINITY = -float("inf")

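# l_alpha evaluates the alpha-dependent part of the LDA variational bound:
# M * [lngamma(sum(alpha)) - sum_k lngamma(alpha_k)] plus the dot product of
# alpha with gamma_grad, where gamma_grad[k] sums digamma(gamma_dk) -
# digamma(sum_j gamma_dj) over documents (see compute_gamma_gradient below);
# this matches the standard bound up to terms constant in alpha.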
def l_alpha(alpha, M, K, gamma_grad):
  val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
  val *= M
  for ii in xrange(K):
    val += alpha[ii] * gamma_grad[ii]
  return val

def compute_gamma_gradient(gamma, K):
  """
  Compute the components of the derivative that gamma contributes to. 
  """

  grad = zeros(K)

  for gamma_d in gamma:
    digam_gamma_sum = digamma(sum(gamma_d))
    for ii in xrange(K):
      grad[ii] += digamma(gamma_d[ii]) - digam_gamma_sum
Example #14
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
#from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.flat import FlatCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/values_turk/", \
                      "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
  flags.InitFlags()
  #corpus = FlatCorpus(flags.base, flags.doc_limit)
  corpus = FlatCorpus(flags.base, flags.doc_limit)
  corpus.add_language("1/*")
  corpus.add_language("2/*")

  corpus.write_proto(flags.output + "numeric", "values_turk")
Example #15
from topicmod.util import flags
import re
from numpy import zeros

flags.define_string("folder_base", "output/20_news/", \
                                  "Input file folder")
flags.define_string("output_base", "output/20_news/results_compare", \
                                  "Output file name")

if __name__ == "__main__":
    flags.InitFlags()

    # comparing 1

    filename = flags.output_base + 'results_compare_1.csv'
    outputfile = open(filename, 'w')

    folders = open(flags.folder_base + 'folders.txt', 'r')
    for folder in folders:
        folder = folder.strip()

        tmp = folder + '\n\n'
        outputfile.write(tmp)
        filename = folder.replace('results_', 'iter_100_')
        filename = flags.folder_base + folder + '/' + filename + '.txt'
        inputfile = open(filename, 'r')
        for line in inputfile:
            outputfile.write(line)
        inputfile.close()
        outputfile.write('\n\n\n')
Example #16
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.pang_lee_movie import PangLeeMovieCorpus

flags.define_string("base", "../../data/movies/", "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")
flags.define_string("response", "rating", "Which rating format we use")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
    flags.InitFlags()
    corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    corpus.add_language("pang_lee/*/subj.*", flags.response, ENGLISH)
    corpus.write_proto(flags.output + "numeric", "movies", 1000)

    corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    corpus.add_language("filmrezension.de_lines/*/subj.*", flags.response, \
                          GERMAN)
    corpus.write_proto(flags.output + "numeric", "movies", 100)
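Example #17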
from numpy import *

from topicmod.external.moremath import *

from topicmod.util import flags

flags.define_string("alpha", None, "The current value of alpha")
flags.define_string("gamma", None, "The current gamma matrix")
flags.define_float("tolerance", 0.001, "Toleranceg for convergence")

NEGATIVE_INFINITY = -float("inf")


def l_alpha(alpha, M, K, gamma_grad):
    val = lngamma(sum(alpha)) - sum(lngamma(x) for x in alpha)
    val *= M
    for ii in xrange(K):
        val += alpha[ii] * gamma_grad[ii]
    return val


def compute_gamma_gradient(gamma, K):
    """
  Compute the components of the derivative that gamma contributes to. 
  """

    grad = zeros(K)

    for gamma_d in gamma:
        digam_gamma_sum = digamma(sum(gamma_d))
        for ii in xrange(K):
Example #18
from collections import defaultdict
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
import re

flags.define_string("vocab", "vocab/semcor.voc", \
                      "the vocabulary used for building the tree")
flags.define_string("wnname", "wn/wordnet.wn", "Where we write output")
flags.define_string(
    "constraints", "",
    "where we get the constraints, " + "one tab-delimited constraint per line")

flags.define_bool("write_constraints", False, "Write out example constraint")
flags.define_bool("write_wordnet", False, "Write out wordnet")
flags.define_bool("write_toy", False, "Write out a toy wordnet")
flags.define_bool("merge_constraints", True,
                  "Put duplicate constraints into" + " a single constraint")


def orderedTraversal(wn, pos='n', limit_depth=-1, reverse_depth=False):
    """
    Given a wordnet object, give the synsets in order of internal nodes first,
    followed by leaves.

    @param pos Which part of speech we search
    @param limit_depth Don't consider nodes deeper than this
    @param reverse_depth Reverse the order of the search (leaves first)
    """

    # Find the max depth synset
Example #19
from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False,
                  "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None,
                    "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index",
                    "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
        self.roles = defaultdict(dict)
        self.output_corpus = Corpus()
Example #20
from topicmod.corpora.proto.corpus_pb2 import Corpus
from topicmod.util import flags

flags.define_glob("corpus_parts", None, "Where we look for vocab")
flags.define_string("output", None, "Where we write the mapping")

if __name__ == "__main__":
    flags.InitFlags()

    mapping = {}

    for ii in flags.corpus_parts:
        print ii
        cp = Corpus()
        cp.ParseFromString(open(ii, 'r').read())

        for ii in cp.authors.terms:
            if ii.id in mapping:
                assert mapping[ii.id] == ii.original
            mapping[ii.id] = ii.original

    o = open(flags.output, 'w')
    for ii in xrange(max(mapping)):
        o.write("%s\n" % mapping[ii])
Example #21
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.pang_lee_movie import PangLeeMovieCorpus

flags.define_string("base", "../../data/rdd/moviestyledata/",
                    "Where we look for data")
flags.define_string("output", "../../data/rdd/moviestyleproto/numeric/",
                    "Where we write output")
flags.define_string("response", "rating", "Which rating format we use")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
    flags.InitFlags()
    corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*/subj.*", flags.response, DIXIE)
    corpus.write_proto(flags.output + "numeric", "richmond", 1000)

    #corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
    #corpus.add_language("filmrezension.de_lines/*/subj.*", flags.response, \
    #                     GERMAN)
    #corpus.write_proto(flags.output + "numeric", "richmond", 100)
Example #22
from topicmod.corpora.wacky import *
from topicmod.util import flags

flags.define_string("wackypedia_base", "../../data/wackypedia/compressed/",
                    "Where we find the wackypedia corpus")
flags.define_string("output", "/tmp/jbg/wackypedia/", "Where we write output")
flags.define_int("doc_limit", 10, "Max number of docs")
flags.define_list("langs", ["en"], "Which languages")

if __name__ == "__main__":
  flags.InitFlags()
  wacky = WackyCorpus(flags.wackypedia_base, flags.doc_limit)
  for ii in flags.langs:
    wacky.add_language("wackypedia_%s*.gz" % ii)

  wacky.write_proto(flags.output + "numeric",
                    "wpdia", 10000)
from topicmod.util import flags

flags.define_string("input_base", "output/20_news/iter_100_PMI_", \
                                  "Input file folder")
flags.define_string("output_base", "output/20_news/iter_100_PMI", \
                                  "Output file name")
flags.define_string("PMI_file", "PMI_score", \
                                  "Output file name")
flags.define_int("round_num", "5", "Number of iteractive rounds")

if __name__ == "__main__":
  flags.InitFlags()

  results = dict()
  rounds = flags.round_num + 1
  for ii in range(0, rounds):
    filename = flags.input_base + str(ii) + "/" + flags.PMI_file
    inputfile = open(filename, 'r')
    for line in inputfile:
      line = line.strip()
      words = line.split('\t')
      if words[0].find('total') >= 0:
        word_key = -1
      else:
        word_key = int(words[0])
      if word_key not in results.keys():
        results[word_key] = []
      results[word_key].append(words[2])

  outputfile = open(flags.output_base, 'w')
  for tt in results.keys():
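Example #24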
    word_senses_count[word] = 0
    count_word += 1
    tmp = word
    for pos in multipaths[word]:
      tmp += '\t' + pos
      for index in multipaths[word][pos]:
        word_senses_count[word] += 1
        count_sense += 1
        tmp += '\t' + str(index)
    if word_senses_count[word] > 1:
      im_words += word + " "
    outfile.write(tmp + '\n')
  outfile.write("\nThe total number of cons words: " + str(count_word) + "\n")
  outfile.write("\nThe total number of cons words senses: " + str(count_sense) + "\n")
  outfile.write("\nInteresting words: " + im_words + "\n")
  outfile.close()


flags.define_string("vocab", None, "The input vocab")
flags.define_string("output", None, "The output constraint file")
flags.define_int("num_cons", 0, "The number of constraints we want")

if __name__ == "__main__":

  flags.InitFlags()
  wordnet_path = "../../../data/wordnet/" 
  eng_wn = load_wn("3.0", wordnet_path, "wn")
  vocab = readVocab(flags.vocab)
  generateCons(vocab, eng_wn, flags.output, flags.num_cons)
  
Example #25
import sys
import os

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")

flags.define_string("corpus", None, "The source corpus")

flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True, "Do we merge doc step results (and compute new topics)")
flags.define_bool("update_global", True, "Do we compute new transition and DP variational parameters")

class Array:
  def __init__(self, name):
    self._rows = {}
    self._name = name

  def __getitem__(self, index):
    if not index in self._rows:
      self._rows[index] = defaultdict(float)
    return self._rows[index]

  def __iter__(self):
    for ii in self._rows:
      yield self._rows[ii]

  def parse(self, key, val):
Example #26
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
  d = defaultdict(int)

  doc = Document()
  doc.ParseFromString(open(filename, 'rb').read())

  num_words = 0
  for sent in doc.sentences:
    for word in sent.words:
Example #27
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
#from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/20_news_date/", \
                      "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
  flags.InitFlags()
  #corpus = FlatCorpus(flags.base, flags.doc_limit)
  corpus = FlatEmailCorpus(flags.base, flags.doc_limit)
  corpus.add_language("train/*/*")
  corpus.add_language("test/*/*")

  corpus.write_proto(flags.output + "numeric", "20_news_date")
Example #28
      pmi_score += pmi

    pmi_score /= len(word_pairs)

    tmp = str(tt) + "\t" + str(len(word_pairs)) + "\t" + str(pmi_score) + "\n"
    infile.write(tmp)

    total_pmi_score += pmi_score

  total_pmi_score /= len(topics.keys())
  tmp = "total" + "\t" + str(len(topics.keys())) + "\t" + str(total_pmi_score) + "\n"
  infile.write(tmp)
  infile.close()


flags.define_string("vocab", "", "Where we find the vocab")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_int("topics_cutoff", 30, "Number of topics")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "output/PMI_score", "PMI Output filename")

if __name__ == "__main__":

  flags.InitFlags()

  print "Reading vocab"
  [vocab_word_index, vocab_index_word] = readVocab(flags.vocab)
  vocab_size = len(vocab_word_index)

  print "Reading topic words"
Example #29
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta",
                    "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc",
                    "Where we write vocab")
flags.define_int("topic_output_size", 15,
                 "Number of words to display when we output topics")

ml_vocab = [{
    0: ["dog", "cat", "moose", "butterfly"],
    1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"],
    2: [
        "toro", "mariposa", "gato", "vaca", "donkey", "burro", "caballo",
        "mosquito", "arana", "pavo"
    ]
Example #30
      tmp = 'SPLIT_\t' + w1 + '\t' + w2 + '\n'
      output_file.write(tmp)
      count += 1

  count = 0
  for (w1, w2) in must.keys():
    if count < must_links_num:
      pmi = must[(w1, w2)]
      tmp = 'MERGE_\t' + w1 + '\t' + w2 + '\n'
      output_file.write(tmp)
      count += 1

  output_file.close()

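# The constraint file written above has one tab-delimited link per line:
# lines beginning with SPLIT_ are cannot-links and lines beginning with
# MERGE_ are must-links between the two words that follow.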

flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", "", "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_int("cannot_links", 0, "Number of cannot links that we want")
flags.define_int("must_links", 0, "Number of must links that we want")

flags.define_int("num_topics", 20, "Number of topics")
flags.define_bool("train_only", False, "Using only train data to \
                                        generate the constraints")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_float("tfidf_thresh", 0.0, "threshold for tfidf")

if __name__ == "__main__":
Example #31
from collections import defaultdict

from nltk import FreqDist

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.corpus_vocab_wrapper import CorpusVocabWrapper
from topicmod.util import flags
from parse_reader import *

flags.define_int("docs_per_file", 100, "Number of documents per file")
flags.define_int("vocab_size", 5000, "Maximum vocabulary size")
flags.define_bool("remove_stop", False, "remove stopwords")
flags.define_bool("use_lemma", False, "Use lemmas instead of raw tokens")
flags.define_bool("use_relation", False, "Use relation (synset) instead of pos")
flags.define_glob("vocab_source", None, "Where we get initial vocabulary")
flags.define_string("output_path", None, "Where we write the translated corpuss")
flags.define_string("output_filename", "wacky_en_reduced.index", "Filename of index")
flags.define_int("min_length", 100, "Number of characters in document line")


class CorpusTranslator:
    def __init__(self, output_path, use_lemma, docs_per_file):
        self.output_base = output_path
        self.document_list = []
        self.use_lemma = use_lemma

        # A lookup for each language
        self.vocab = defaultdict(dict)
        self.roles = defaultdict(dict)
        self.output_corpus = Corpus()
Example #32
import codecs

from collections import defaultdict

from topicmod.util import flags
from topicmod.ling.dictionary import DingEntries

flags.define_string("vocab", "", "Where we read vocab")
flags.define_float("smoothing", 0.001, "Smoothing amount")
flags.define_float("hit", 1.0, "Value if there's a hit")
flags.define_string("output", "lda/lambda", "Lambda output")

if __name__ == "__main__":
  flags.InitFlags()

  vocab = defaultdict(dict)
  index = defaultdict(int)

  for ii in codecs.open(flags.vocab):
    lang, word = ii.split("\t")
    lang = int(lang)
    vocab[lang][word.strip()] = index[lang]
    index[lang] += 1

  trans = defaultdict(set)
  sum = defaultdict(float)
  for ii in vocab[0]:
    for jj in vocab[1]:
      if ii == jj:
        if vocab[1][jj] % 100 == 0:
Example #33
        print "Submitting ...", filename,
        qsub(filename, max_jobs)

    if delete:
        print "deleted"
        os.remove(filename)
    print(filename)
    print ""

if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()
    d = flags.defaults

    d["wall"] = flags.wall
Example #34
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
from topicmod.corpora.pang_lee_movie import PangLeeMovieCorpus

flags.define_string("base", "../../data/rdd/moviestyledata/",
                    "Where we look for data")
flags.define_string("output", "../../data/rdd/moviestyleproto/numeric/",
                    "Where we write output")
flags.define_string("response", "rating", "Which rating format we use")
flags.define_int("doc_limit", -1, "How many documents we add")

if __name__ == "__main__":
  flags.InitFlags()
  corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
  corpus.add_language("*/subj.*", flags.response, DIXIE)
  corpus.write_proto(flags.output + "numeric", "richmond", 1000)

  #corpus = PangLeeMovieCorpus(flags.base, flags.doc_limit)
  #corpus.add_language("filmrezension.de_lines/*/subj.*", flags.response, \
   #                     GERMAN)
  #corpus.write_proto(flags.output + "numeric", "richmond", 100)
Example #35
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Numer of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False,
                  "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5,
                 "Number of e-step rounds per-document")
flags.define_int("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_int("alpha_trans", 1.0,
                 "DP parameter for transition distributions")
flags.define_int("alpha_top", 1.0,
                 "DP parameter for top-level stick-breaking distribution")
flags.define_int("vocab_sigma", 0.1, "Vocab hyperparametersx")

if __name__ == "__main__":
    flags.InitFlags()

    params = SyntopParameters()
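Example #36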
from math import log
from random import random

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_string("vocab", None, "Size of vocabulary")
flags.define_int("num_docs", None, "Numer of documents")
flags.define_int("num_topics", 128, "Number topics")
flags.define_string("model_name", "output/model", "Name of model")

flags.define_bool("finite", False, "Use finite model")
flags.define_bool("ignore_trans", False, "Use only documents")
flags.define_bool("ignore_docs", False, "Use only syntax")
flags.define_bool("shortcut_gsl", False, "Use closed form updates when possible")

flags.define_int("max_doc_iterations", 5, "Number of e-step rounds per-document")
flags.define_int("alpha_doc", 1.0, "DP parameter for doc distributions")
flags.define_int("alpha_trans",  1.0, "DP parameter for transition distributions")
flags.define_int("alpha_top", 1.0, "DP parameter for top-level stick-breaking distribution")
flags.define_int("vocab_sigma", 0.1, "Vocab hyperparametersx")

if __name__ == "__main__":
  flags.InitFlags()

  params = SyntopParameters()

  params.finite = flags.finite
  params.ignore_trans = flags.ignore_trans
  params.ignore_docs = flags.ignore_docs
  params.shortcut_gsl = flags.shortcut_gsl
Example #37
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
#from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.crossfire import CrossfireCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/crossfire/cf/clean/", \
                      "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()
    #corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus = CrossfireCorpus(flags.base, flags.doc_limit)
    corpus.add_language("*.txt")

    corpus.write_proto(flags.output + "numeric", "crossfire")
Example #38
from topicmod.util import flags
from topicmod.util.sets import count_line
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import MultilingualVocab
from topicmod.corpora.ml_vocab import Vocab

from collections import defaultdict
import codecs

flags.define_string("output", "", "Where we write output")
flags.define_glob("doc_roots", "", "The document vocab")
flags.define_string("vocab", "", "The vocab file")
flags.define_string("location", "", "Where the data live")
flags.define_int("min_length", 50, "Minimum number of tokens")
flags.define_int("num_docs", -1, "Number of documents we write")
flags.define_string("language", "en", "What language this is")

kLANGUAGE_ID = {"en": ENGLISH, "de": GERMAN, "zh": CHINESE}


def lda_line(filename, full_vocab, filtered_vocab):
    d = defaultdict(int)

    doc = Document()
    doc.ParseFromString(open(filename, 'rb').read())

    num_words = 0
    for sent in doc.sentences:
        for word in sent.words:
            new_word = full_vocab.get_word(doc.language, word.token)
Example #39
import sys
import os

from topicmod.util import flags
from syntop_parameters_pb2 import *

flags.define_int("num_iterations", 1, "Number of iterations")
flags.define_string("model_name", "output/model", "Where we find data")

flags.define_string("corpus", None, "The source corpus")

flags.define_bool("hadoop", False, "Do we use hadoop or local batch")
flags.define_bool("doc_step", True, "Do we call the document-centric parts")
flags.define_bool("merge_step", True,
                  "Do we merge doc step results (and compute new topics)")
flags.define_bool(
    "update_global", True,
    "Do we compute new transition and DP variational parameters")


class Array:
    def __init__(self, name):
        self._rows = {}
        self._name = name

    def __getitem__(self, index):
        if not index in self._rows:
            self._rows[index] = defaultdict(float)
        return self._rows[index]

    def __iter__(self):
Example #40
import os
import gzip

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None, \
                    "The state file that create the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


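# MalletAssignment parses one line of a MALLET topic-state file; the six
# whitespace-separated fields are document id, source, token position,
# term id, term string, and topic assignment.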
class MalletAssignment:

  def __init__(self, line, debug=False):
    if debug:
      for ii in xrange(len(line.split())):
        print ii, line.split()[ii]
    self.doc, foo, self.index, self.term_id, self.term, self.assignment = \
      line.split()
    self.doc = int(self.doc)
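Example #41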
    rank = tfidf
  else:
    rank = frequency

  o = codecs.open(outputname, 'w', 'utf-8')
  for ii in rank:
    count = 0
    for jj in rank[ii]:
      count += 1
      if count <= vocab_limit and frequency[ii][jj] >= freq_limit:
        word = vocab[ii][jj]
        o.write(u"%i\t%s\t%f\t%i\n" % (ii, word, tfidf[ii][jj], frequency[ii][jj]))
        
  o.close()


flags.define_string("proto_corpus", None, "The proto files")
flags.define_bool("lemma", False, "Use lemma or tokens")
flags.define_bool("select_tfidf", False, "select the vocab by tfidf or frequency")
flags.define_string("output", "", "Where we output the preprocessed data")
flags.define_string("vocab", None, "Where we output the vocab")
flags.define_int("vocab_limit", 10000, "The vocab size")
flags.define_int("freq_limit", 20, "The minimum frequency of each word")

if __name__ == "__main__":

  flags.InitFlags()  
  [vocab, tfidf, frequency] = gen_files(flags.proto_corpus, flags.output, flags.lemma)
  gen_vocab(vocab, tfidf, frequency, flags.select_tfidf, flags.vocab, flags.vocab_limit, flags.freq_limit)

Example #42
from topicmod.corpora.nyt_reader import *
from topicmod.util import flags

flags.define_string("nyt_base", "../../data/new_york_times/", "Where we find the nyt corpus")
flags.define_int("doc_limit", -1, "How many documents")
flags.define_string("output", "/tmp/jbg/nyt/", "Where we write data")
flags.define_float("bigram_limit", 0.9, "p-value for bigrams")

if __name__ == "__main__":
    flags.InitFlags()
    nyt = NewYorkTimesReader(flags.nyt_base, flags.doc_limit, flags.bigram_limit)
    nyt.add_language_list("../../data/new_york_times/editorial_file_list")

    nyt.write_proto(flags.output + "numeric", "nyt", 1000)
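Example #43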
import re
import os.path
from proto.corpus_pb2 import *
from proto.wordnet_file_pb2 import *
from topicmod.util import flags
from topicmod.util.sets import read_pickle, write_pickle

flags.define_int("option", 0, \
   "change the whole documents or just the topics of just the word")
flags.define_string("ldawnoutput", "output/nsf", "ldawn output directory")
flags.define_string("maps", "output/nsf", "mapping files directory")
flags.define_string("wordnet", "wn/output.0", "contraint source")
flags.define_string("assignment_path", None, "Where the assignments live")

def checkSame(cons, old_cons):
    if len(cons) != len(old_cons):
        return False
    for key in cons:
        if key not in old_cons:
            return False
    return True
  
  
def getMappingDicts_reGen(corpusdir, mapsdir, cons):
    # check the old constraint.dict exists or not
    cons_file = corpusdir + "/constraint.set"
    if (not os.path.exists(cons_file)):
        # Regenerate
        (word_wid_dic, wid_did_dic, did_doc_dic) = \
            getNewMappingDicts(corpusdir, mapsdir)
    else:
Example #44
        qsub(filename, max_jobs)

    if delete:
        print "deleted"
        os.remove(filename)
    print(filename)
    print ""


if __name__ == "__main__":
    from topicmod.util import flags

    mkdir("/tmp/%s" % USER)
    mkdir("/tmp/%s/qsub-scripts" % USER)

    flags.define_string("template", "", "Where we read the template file from")
    flags.define_dict("args", {}, "Substitute values for the template")
    flags.define_dict("defaults", {}, "Default args")
    flags.define_string("wall", "24:00:00", "The wall time")
    flags.define_string("name", "", "Name given to job on cluster")
    flags.define_string("mem", "4gb", "How much memory we give")
    flags.define_string("queue", "shallow", "Which queue do we submit to")
    flags.define_int("max_jobs", 16, "Number of simultaneous jobs on cluster")
    flags.define_bool("delete_scripts", True, "Do we delete after we're done?")
    flags.define_bool("submit", True, "Do we submit")

    flags.InitFlags()
    template = open(flags.template).read()
    d = flags.defaults

    d["wall"] = flags.wall
Example #45
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags
#from topicmod.corpora.flat import FlatCorpus
from topicmod.corpora.flat import FlatEmailCorpus

flags.define_int("doc_limit", -1, "How many documents we add")
flags.define_string("base", "../../data/20_news_date/", \
                      "Where we look for data")
flags.define_string("output", "/tmp/", "Where we write output")

if __name__ == "__main__":
    flags.InitFlags()
    #corpus = FlatCorpus(flags.base, flags.doc_limit)
    corpus = FlatEmailCorpus(flags.base, flags.doc_limit)
    corpus.add_language("train/*/*")
    corpus.add_language("test/*/*")

    corpus.write_proto(flags.output + "numeric", "20_news_date")
Example #46
    tmp = 'MERGE_'
    for word in merge1:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    tmp = 'MERGE_'
    for word in merge2:
        tmp += '\t' + word
    tmp += '\n'
    output_file.write(tmp)

    output_file.close()


flags.define_string("vocab", "vocab/20_news.voc", "Where we find the vocab")
flags.define_string("stats", None, "Where we find the stat_file")
flags.define_string("model", "", "The model files folder of topic models")
flags.define_string("output", "constraints/tmp", "Output filename")
flags.define_int("topics_cutoff", 30, "Number of topic words")
flags.define_float("tfidf_thresh", 0, "threshold for tfidf")

if __name__ == "__main__":

    flags.InitFlags()

    # getting statistics: slower version, full statistics, memory cost
    #[cooccur, wordcount, tfidf, vocab_index_word, topics, doc_num] \
    #      = get_statistics_all(flags.corpus, flags.num_topics, flags.model, \
    #             flags.topics_cutoff, flags.window_size, flags.train_only)
Example #47
from collections import defaultdict

from topicmod.util import flags
from topicmod.util.wordnet import load_wn
from topicmod.ling.dictionary import *
from topicmod.ling.snowball_wrapper import Snowball
from topicmod.corpora.ontology_writer import OntologyWriter
from topicmod.corpora.ontology_writer import orderedTraversal
from topicmod.corpora.ml_vocab import MultilingualVocab

from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("limit", 250, "How many items in our MuTo matching")
flags.define_bool("dictionary", False, "Use a dictionary")
flags.define_string("dic_dir", "../../data/dictionaries/", "Use a dictionary")
flags.define_bool("translation", False, "Use translation matrix")
flags.define_bool("greedy_matching", False, "Use a matching from dictionary")
flags.define_bool("wordnet", False, "Use WordNet as scaffold")
flags.define_bool("german", False, "Include German")
flags.define_bool("chinese", False, "Include Chinese")
flags.define_string("output", "", "Where we write ontology")
flags.define_float("trans_cutoff", 0.5, "Min value for using the translation")
flags.define_string("wn_version", "3.0", "Which version of WN we use")
flags.define_string("filter_vocab", "", "Filter entries based on vocabulary")
flags.define_bool("stem", False, "Stem words")
flags.define_bool("id_strings", False, "Add identical strings")

# Generate an updated vocab: not all words in the original vocab will be included
# in the generated wordnet, so generate a new vocab that contains only the words in the wordnet.
flags.define_string("updated_vocab", "", "generate a new vocab")
Example #48
from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.corpora.proto.wordnet_file_pb2 import *
from topicmod.corpora.ml_vocab import Vocab
from topicmod.corpora.ml_vocab import MultilingualVocab

import codecs

from collections import defaultdict

flags.define_glob("wordnet", "", "The wordnet files")
flags.define_string("vocab", "", "The vocab file")
flags.define_glob("docs", "", "The documents we want to view")
flags.define_glob("doc_roots", "", "The document vocab")


def print_doc(filename, full, flat, wn):
  doc = Document()
  doc.ParseFromString(open(filename, 'rb').read())

  print "-------------------------------"
  print "Original document:"
  for sent in doc.sentences:
    for word in sent.words:
      print "|%i:%i:%s|" % (doc.language, word.token, \
                              full.get_word(doc.language, \
                                              word.token).encode("ascii", \
                                                                   'ignore')),
  print ""
Example #49
  infile = open(infilename, 'r')
  vocab = defaultdict(FreqDist)
  for line in infile:
    line = line.strip()
    ww = line.split('\t')
    lang = ww[0]
    if source[lang][ww[1]] == 0:
      print source[lang][ww[1]], ww[1]
    vocab[lang].inc(ww[1], source[lang][ww[1]])
  infile.close()

  outfile = codecs.open(outfilename, 'w', 'utf-8')
  for ii in vocab:
    for jj in vocab[ii]:
      outfile.write(u"%s\t%s\n" % (ii, jj))
      #outfile.write(u"%s\t%s\t%f\t%i\n" % (ii, jj, tfidf[ii][jj], frequency[ii][jj]))
  outfile.close()


flags.define_string("stats_vocab", None, "The proto files")
flags.define_string("input_vocab", None, "Where we get the original vocab")
flags.define_int("option", 0, "1: tfidf; others: frequency")
flags.define_string("sorted_vocab", None, "Where we output the vocab")

if __name__ == "__main__":

  flags.InitFlags()  
  [tfidf, frequency] = readStats(flags.stats_vocab)
  
  sortVocab(flags.input_vocab, tfidf, frequency, flags.option, flags.sorted_vocab)
Example #50
from numpy.random.mtrand import dirichlet
from numpy.random import multinomial
from numpy.random import normal
from math import isnan, isinf

from topicmod.util import flags
from topicmod.corpora.proto.corpus_pb2 import *

flags.define_int("num_docs", 500, "Number of documents")
flags.define_int("num_topics", 5, "Number of topics")
flags.define_int("doc_length", 5, "Length of every document")
flags.define_int("num_langs", 2, "Number of languages")
flags.define_float("variance", 0.5, "Variance of distribution")
flags.define_float("gamma", 1.0, "Vocabulary hyperparameter")
flags.define_float("alpha", 0.1, "Document topic hyperparameter")
flags.define_string("output_base", "data/synthetic", "Where we write the data")
flags.define_string("doc_proportion", "synthetic.theta", "Where we write doc thetas")
flags.define_int("num_groups", 2, "Number of splits")
flags.define_string("vocab_output", "vocab/synthetic.voc", "Where we write vocab")
flags.define_int("topic_output_size", 15, "Number of words to display when we output topics")

ml_vocab = [{0: ["dog", "cat", "moose", "butterfly"], 
 1: ["hund", "katze", "spinner", "pferd", "maultier", "kuh"], 
 2:["toro", "mariposa", "gato", "vaca", "donkey", "burro", "caballo", "mosquito", "arana", "pavo"]},
{0: ["monday", "tuesday", "thursday", "friday", "saturday"],
 1: ["montag", "dienstag", "mitwoch", "donnerstag", "freitag", "samstag", "sontag"], 
 2: ["lunes", "martes", "miercoles", "jueves", "viernes", "sabado", "domingo"]},
{0: ["mop", "broom", "bucket", "rake"],
 1: ["mopp", "besen", "eimer", "moebelpolitur"],
 2: ["trapeador", "escoba", "cubeta", "rastrillo"]},
{0: ["north", "east", "south", "west"],
import os
import gzip

from topicmod.corpora.proto.corpus_pb2 import *
from topicmod.util import flags

# Input
flags.define_string("doc_filter", None, "Files to filter out")
flags.define_string("vocab", None, "The file that defines the vocab")
flags.define_string("state_file", None, \
                    "The state file that create the corpus")

# Output
flags.define_string("state_output", None, "Where we write state file")
flags.define_string("corpus_output_path", None, "Where we write the corpus")
flags.define_string("corpus_name", "NIH", "Name of the corpus")

# Options
flags.define_int("docs_per_index", 5000, "Number of docs per section")
flags.define_int("doc_limit", -1, "Cap on number of documents")


class MalletAssignment:
    def __init__(self, line, debug=False):
        if debug:
            for ii in xrange(len(line.split())):
                print ii, line.split()[ii]
        self.doc, foo, self.index, self.term_id, self.term, self.assignment = \
          line.split()
        self.doc = int(self.doc)
        self.index = int(self.index)
Example #52
      tmp = word + "\t" + str(self._wordcount[word]) + "\n"
      outfile.write(tmp)
    outfile.close()

    # write coccurance:
    outputfile = self._output_dir + "/cooccurance.txt"
    outfile = open(outputfile, 'w')
    for w1 in self._cooccur.keys():
      for w2 in self._cooccur[w1].keys():
        if self._cooccur[w1][w2] != 0:
          tmp = w1 + "\t" + w2 + "\t" + str(self._cooccur[w1][w2]) + "\n"
          outfile.write(tmp)
    outfile.close()


flags.define_string("corpus", None, "Where we find the input corpora")
flags.define_string("proto_corpus", None, "Where we find the input proto corpora")
flags.define_string("vocab", "", "The model files folder of topic models")
flags.define_int("window_size", 10, "Size of window for computing coocurrance")
flags.define_string("output", "PMI_stat/20_news", "PMI stat output filename")
flags.define_int("option", "2", "0: 20_news; 1: wikipedia")

if __name__ == "__main__":
  flags.InitFlags()
  # {0: 'english', 1: 'german'}
  lang = 0
  cp = corpusParser(lang, flags.vocab, flags.corpus, flags.window_size, flags.output)
  if flags.option == 0:
    cp.parseCorpus20news()
    get_tfidf(flags.proto_corpus, flags.vocab, flags.output)
  elif flags.option == 1: