Example #1
from collections import defaultdict
from itertools import count
from operator import itemgetter

def get_lexiconfile(taggedfile, lexiconfile='', poskeys='', windowsize=0):
  # cu (CoNLL utils), ru (file utils), replace_token and TAGGEDFILE_FIELDS
  # come from the surrounding module.
  FIELDS = TAGGEDFILE_FIELDS
  orig_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS)

  if poskeys == '':
    pos_key = (FIELDS.index('udpostag'), )  # assume the default scenario to be UD tags
  else:
    pos_key = tuple(FIELDS.index(key.strip()) for key in poskeys.split(','))

  # Reduce each token to a (normalized form, '+'-joined POS tags) pair.
  mod_sents = ([(replace_token(sent[idx]['form']),
                 "+".join(sent[idx][FIELDS[key]] for key in pos_key))
                for idx in range(len(sent))]
               for sent in orig_sents)
  # Pad sentences so every token has a full tag window on both sides.
  mod_sents = ([('BOS', 'BOS')] * windowsize + sent + [('EOS', 'EOS')] * windowsize
               for sent in mod_sents)
  word_tags = ((sent[idx][0],
                '_'.join(sent[idx + i][1] for i in range(-windowsize, windowsize + 1)))
               for sent in mod_sents
               for idx in range(windowsize, len(sent) - windowsize))

  freqs = defaultdict(int)
  # A shared count() per table hands out a fresh integer id on first access.
  forms_hash = defaultdict(count().__next__)
  tags_hash  = defaultdict(count().__next__)
  for word, tag in word_tags:
    freqs[(forms_hash[word], tags_hash[tag])] += 1

  inv_forms_hash = dict((wordidx, word) for word, wordidx in forms_hash.items())
  inv_tags_hash  = dict((tagidx, tag) for tag, tagidx in tags_hash.items())
  freqs = ("%d\t%s\t%s" % (freq, inv_forms_hash[key[0]], inv_tags_hash[key[1]])
           for key, freq in sorted(freqs.items(), key=itemgetter(1), reverse=True))
  ru.lines_to_file(lexiconfile, freqs)
  return
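
The forms_hash/tags_hash pattern above is a compact way to intern strings as consecutive integer ids: a defaultdict backed by a shared itertools.count hands out a fresh integer the first time a key is seen, and the same integer afterwards. A minimal standalone sketch of the idiom:

from collections import defaultdict
from itertools import count

ids = defaultdict(count().__next__)
print(ids['cat'], ids['dog'], ids['cat'])  # -> 0 1 0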
Example #2
from sys import stderr

import cvxpy as cvx
import numpy as np
from scipy import sparse

def clean_dictionary(phrase_file):
    # pt (phrase-table utils), convex_cleanup and random_utils come from the
    # surrounding module.
    lexicon = pt.getPhraseEntriesFromTable(phrase_file)
    lexicon = filter(pt.filterLex, lexicon)
    entries = [(entry['srcphrase'], entry['tgtphrase'],
                entry['probValues'][0], entry['probValues'][1],
                entry['probValues'][2], entry['probValues'][3])
               for entry in lexicon]

    # The translation direction could be chosen at random
    # (e.g. np.random.random() <= 0.5); it is fixed to src-to-tgt here.
    direction = True
    if direction:
        # src-to-tgt
        pprobs = np.asarray([X[2] for X in entries])
        lprobs = np.asarray([X[4] for X in entries])
        vocab = set(X[0] for X in entries)
        index = 0
    else:
        # tgt-to-src
        pprobs = np.asarray([X[3] for X in entries])
        lprobs = np.asarray([X[5] for X in entries])
        vocab = set(X[1] for X in entries)
        index = 1

    # Map each phrase to a row index, then build a group indicator matrix:
    # groups[v, e] = 1 iff entry e belongs to phrase v.
    vocab = dict((phrase, idx) for idx, phrase in enumerate(sorted(vocab)))
    groups = sparse.dok_matrix((len(vocab), len(entries)), dtype=float)
    for idx, entry in enumerate(entries):
        groups[vocab[entry[index]], idx] = 1
    groups = groups.tocsc()

    # Keep the candidate distribution with the highest entropy.
    # (cvx.sum_entries is the pre-1.0 cvxpy API; it is cvx.sum in 1.x.)
    sparse_dists = convex_cleanup(pprobs, lprobs, groups)
    global_sol = None
    global_entropy = -100
    for dist in sparse_dists:
        solution = dist.value
        entropy = cvx.sum_entries(cvx.entr(solution)).value
        if entropy > global_entropy:
            global_sol = solution
            global_entropy = entropy
        print(np.count_nonzero(solution),
              np.min(solution),
              np.max(solution),
              entropy,
              file=stderr)

    global_sol = list(global_sol.getA1())
    groups = groups.todok()
    pruned_dictionary = ("%s\t%s\t%.4f" % (entries[key[1]][0],
                                           entries[key[1]][1],
                                           prob)
                         for key, prob in zip(sorted(groups.keys()), global_sol))

    random_utils.lines_to_file('', pruned_dictionary)

    return
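
convex_cleanup is defined elsewhere in the source module, so only the group-matrix plumbing can be exercised standalone. A minimal sketch with made-up phrase pairs (the entries and values are illustrative, not from the source):

from scipy import sparse

entries = [('a', 'x'), ('a', 'y'), ('b', 'z')]  # hypothetical (src, tgt) pairs
vocab = {phrase: idx for idx, phrase in enumerate(sorted(set(e[0] for e in entries)))}
groups = sparse.dok_matrix((len(vocab), len(entries)), dtype=float)
for idx, entry in enumerate(entries):
    groups[vocab[entry[0]], idx] = 1
groups = groups.tocsc()
# Each row selects the entries sharing one source phrase, so groups @ dist
# sums a probability vector within each source-phrase group.
print(groups.toarray())  # [[1. 1. 0.] [0. 0. 1.]]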
Example #3
def add_tags(conllfile, taggedfile):
  # cu (CoNLL utils), ru (file utils) and updateTags come from the surrounding module.
  FIELDS1   = cu.CONLL07_COLUMNS
  FIELDS2   = ('form', 'postag', )
  cu.FIELDS = ('id', 'form', 'cpostag', 'postag', )

  orig_sents = cu.sentences_from_conll(ru.lines_from_file(conllfile), fields=FIELDS1)
  tag_sents  = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS2)
  # Merge the tagger output into the original sentences, sentence by sentence.
  new_sents  = map(updateTags, zip(orig_sents, tag_sents))
  ru.lines_to_file("", cu.sentences_to_conll(new_sents))
  return
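
updateTags itself is not shown in the snippet. A plausible implementation, assuming it overwrites each token's postag column with the tagger output (the behavior is inferred from the call site, not confirmed by the source):

def updateTags(sent_pair):
    # Hypothetical: copy the predicted tag onto the matching token;
    # assumes both sentences have the same length and token order.
    conll_sent, tag_sent = sent_pair
    for token, tagged in zip(conll_sent, tag_sent):
        token['postag'] = tagged['postag']
    return conll_sent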
Example #4
def preprocess_treebanks(conllfile, outconllfile):
  # cu (CoNLL utils), ru (file utils) and replace_token come from the surrounding module.
  def lc_num(conll_sent):
    # Copy each token, normalizing its form with replace_token.
    return [dict(X, form=replace_token(X['form'])) for X in conll_sent]

  inputStream = ru.lines_from_file(conllfile)
  cu.FIELDS = cu.CONLL07_COLUMNS
  conll_sents = cu.sentences_from_conll(inputStream)
  mod_conll_sents = map(lc_num, conll_sents)
  ru.lines_to_file(outconllfile, cu.sentences_to_conll07(mod_conll_sents))
  return
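
replace_token is also defined in the surrounding module; the inner helper's name lc_num hints at lowercasing plus digit normalization. A hypothetical version along those lines, for illustration only:

import re

def replace_token(form):
    # Hypothetical: lowercase and collapse digit runs to a placeholder,
    # as suggested by the name lc_num; the real behavior may differ.
    return re.sub(r'\d+', '<num>', form.lower())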
Example #5
from collections import defaultdict
from operator import itemgetter

def unigram_freqs(taggedfile, unigramsfile=''):
  # cu (CoNLL utils), ru (file utils), replace_token and TAGGEDFILE_FIELDS
  # come from the surrounding module.
  FIELDS = TAGGEDFILE_FIELDS
  orig_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS)
  word_forms = (replace_token(edge['form']) for sent in orig_sents for edge in sent)
  freqs = defaultdict(int)
  for word in word_forms:
    freqs[word] += 1
  # Emit word<TAB>count lines, most frequent first.
  freqs = ("%s\t%s" % (word, freq)
           for word, freq in sorted(freqs.items(), key=itemgetter(1), reverse=True))
  ru.lines_to_file(unigramsfile, freqs)
  return
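
The counting loop plus sort is what collections.Counter does in one call; a standalone sketch of the equivalent (the sample words are illustrative):

from collections import Counter

word_forms = ['the', 'cat', 'the']  # stand-in for the generator above
freqs = Counter(word_forms)
print(freqs.most_common())  # [('the', 2), ('cat', 1)] -- already sorted by count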
Example #6
from collections import defaultdict
from itertools import count

def emission_matrix(lexiconfile, threshold=10, matrixfile=''):
  # ru (file utils) comes from the surrounding module; lexicon lines are
  # "count<TAB>word<TAB>tag", as written by get_lexiconfile above.
  entries = ru.lines_from_file(lexiconfile)
  entries = map(lambda X: X.split('\t'), entries)
  entries = filter(lambda X: len(X) == 3, entries)
  entries = ((word, tag, int(freq)) for freq, word, tag in entries)
  # A shared count() per table hands out a fresh integer id on first access.
  forms_hash = defaultdict(count().__next__)
  tags_hash  = defaultdict(count().__next__)
  word_freqs = defaultdict(int)
  wordtag_freqs = defaultdict(int)
  for word, tag, freq in entries:
    word_freqs[forms_hash[word]] += freq
    wordtag_freqs[(forms_hash[word], tags_hash[tag])] += freq
  # Keep only words seen at least `threshold` times.
  filtered_vcb = dict((word, freq) for word, freq in word_freqs.items() if freq >= threshold)
  # Relative-frequency estimate of P(tag | word); true division is intended.
  emissions = ((word, tag, wordtag_freqs[(word, tag)] / word_freqs[word])
               for word, tag in wordtag_freqs if word in filtered_vcb)
  inv_forms_hash = dict((wordidx, word) for word, wordidx in forms_hash.items())
  inv_tags_hash  = dict((tagidx, tag) for tag, tagidx in tags_hash.items())
  emissions = (u"{0}\t{1}\t{2}".format(inv_forms_hash[word], inv_tags_hash[tag], prob)
               for word, tag, prob in sorted(emissions,
                 key=lambda wtp: (inv_forms_hash[wtp[0]], inv_tags_hash[wtp[1]], wtp[2])))
  ru.lines_to_file(matrixfile, emissions)
  return
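
The emission value is the relative frequency P(tag | word) = count(word, tag) / count(word): with counts dog/NOUN = 9 and dog/VERB = 1, the entries become 0.9 and 0.1. A tiny standalone check with made-up counts:

from collections import defaultdict

wordtag = {('dog', 'NOUN'): 9, ('dog', 'VERB'): 1}
word_totals = defaultdict(int)
for (w, t), c in wordtag.items():
    word_totals[w] += c
probs = {(w, t): c / word_totals[w] for (w, t), c in wordtag.items()}
print(probs)  # {('dog', 'NOUN'): 0.9, ('dog', 'VERB'): 0.1}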