Example 1
def add_tags(conllfile, taggedfile):
  FIELDS1   = cu.CONLL07_COLUMNS;
  FIELDS2   = ('form', 'postag', );
  cu.FIELDS = ('id', 'form', 'cpostag', 'postag', );

  orig_sents = cu.sentences_from_conll(ru.lines_from_file(conllfile), fields=FIELDS1);
  tag_sents  = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS2);
  new_sents  = map(updateTags, zip(orig_sents, tag_sents));
  ru.lines_to_file("", cu.sentences_to_conll(new_sents));
  return;
Example 2
def main():
    if len(sys.argv) < 5:
        print(
            "Error: ./%s <src-file> <tgt-file> <align-file> <src-parses-file>"
            % (sys.argv[0]),
            file=sys.stderr)
        sys.exit(1)

    srcsents = random_utils.lines_from_file(sys.argv[1])
    tgtsents = random_utils.lines_from_file(sys.argv[2])
    alignments = random_utils.lines_from_file(sys.argv[3])
    parsetrees = random_utils.lines_from_file(sys.argv[4])

    for phrase in extractLexicon(srcsents, tgtsents, alignments, parsetrees):
        print(phrasetable_utils.ppphrase(phrase))
    return
Example 3
def get_lexiconfile(taggedfile, lexiconfile='', poskeys='', windowsize=0):
  global TAGGEDFILE_FIELDS;
  FIELDS = TAGGEDFILE_FIELDS;
  orig_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS);
  
  if poskeys == '': 
    pos_key = (FIELDS.index('udpostag'), );  # assume the default scenario to be UD tags
  else:
    pos_key = tuple(FIELDS.index(key.strip()) for key in poskeys.split(','));
  
  # map every token to (normalised form, '+'-joined POS tags from the selected columns)
  mod_sents = ([(replace_token(sent[idx]['form']), "+".join(sent[idx][FIELDS[key]] for key in pos_key)) for idx in range(len(sent))] for sent in orig_sents);
  # pad with BOS/EOS dummies so that every real token has a full context window
  mod_sents = ([('BOS', 'BOS')]*windowsize+sent+[('EOS', 'EOS')]*windowsize for sent in mod_sents);
  #word_tags = ((sent[idx][0], '_'.join('%d:%s' %(i, sent[idx+i][1]) for i in range(-windowsize, windowsize+1))) for sent in mod_sents for idx in range(windowsize, len(sent)-windowsize));
  # for each real token, join the tags in its window into a single context string
  word_tags = ((sent[idx][0], '_'.join('%s' %(sent[idx+i][1]) for i in range(-windowsize, windowsize+1))) for sent in mod_sents for idx in range(windowsize, len(sent)-windowsize));
  
  freqs = defaultdict(int);
  # hand out a fresh integer id for every unseen form/tag (assuming `counter` is itertools.count or similar)
  forms_hash = defaultdict(counter().next);
  tags_hash  = defaultdict(counter().next);
  for word, tag in word_tags:
    freqs[(forms_hash[word], tags_hash[tag])] += 1;

  inv_forms_hash = dict((wordidx, word) for word, wordidx in forms_hash.items());
  inv_tags_hash  = dict((wordidx, word) for word, wordidx in tags_hash.items());
  freqs = ("%d\t%s\t%s" %(freq, inv_forms_hash[word[0]], inv_tags_hash[word[1]]) for word, freq in sorted(freqs.iteritems(), key=itemgetter(1), reverse=True));
  ru.lines_to_file(lexiconfile, freqs);
  return;
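The padding and windowing are the densest part of get_lexiconfile, so here is a standalone sketch of the same idea on an invented toy sentence (it does not depend on the repository's cu/ru helpers):

windowsize = 1
sent = [('the', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')]

# pad so that every real token has `windowsize` neighbours on both sides
padded = [('BOS', 'BOS')] * windowsize + sent + [('EOS', 'EOS')] * windowsize

# join the tags of each token's window into one context string
word_tags = [(padded[idx][0],
              '_'.join(padded[idx + i][1] for i in range(-windowsize, windowsize + 1)))
             for idx in range(windowsize, len(padded) - windowsize)]

print(word_tags)
# [('the', 'BOS_DET_NOUN'), ('cat', 'DET_NOUN_VERB'), ('sleeps', 'NOUN_VERB_EOS')]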
Example 4
def split_conll():
    original_conllfile = sys.argv[1]
    # strip a possible .gz/.bz2 suffix before taking the real extension
    if original_conllfile.endswith('.gz') or original_conllfile.endswith(
            '.bz2'):
        conllfilename, cext = os.path.splitext(original_conllfile)
    else:
        conllfilename, cext = original_conllfile, ''
    output_prefix, ext = os.path.splitext(conllfilename)

    import itertools
    splits = 20
    # blank lines separate CoNLL sentences, so counting them counts sentences
    sent_count = sum(
        itertools.imap(lambda x: 0 if x else 1,
                       random_utils.lines_from_file(original_conllfile)))
    split_size = sent_count // splits + 1

    with random_utils.smart_open(original_conllfile) as infile:
        conll_utils.FIELDS = conll_utils.CONLL09_COLUMNS
        foldlen = len(str(splits))
        conll_sents = (
            sentence for sentence in conll_utils.sentences_from_conll(infile))
        for foldidx in xrange(splits):
            outfilepath = '%s-split%s%s%s' % (
                output_prefix, str(foldidx + 1).zfill(foldlen), ext, cext)
            with random_utils.smart_open(outfilepath, 'w') as outfile:
                try:
                    conll_utils.sentences_to_conll09(
                        outfile,
                        (conll_sents.next() for idx in xrange(split_size)))
                except StopIteration:
                    pass
    return
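The fold loop above repeatedly pulls at most split_size sentences from one shared generator; a minimal standalone sketch of that chunking pattern (Python 3 spelling, toy data):

from itertools import islice

def chunks(iterable, size):
    # repeatedly slice `size` items off one shared iterator until it is exhausted
    it = iter(iterable)
    while True:
        block = list(islice(it, size))
        if not block:
            break
        yield block

sentences = ['sent%d' % i for i in range(7)]
for foldidx, fold in enumerate(chunks(sentences, 3), start=1):
    print(foldidx, fold)
# 1 ['sent0', 'sent1', 'sent2']
# 2 ['sent3', 'sent4', 'sent5']
# 3 ['sent6']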
Example 5
def getCleanPhraseEntries(lexicontablefile):
    # keep characters that are neither punctuation nor digits
    alphaChar = lambda x: x not in string.punctuation and x not in string.digits
    # keep everything except this punctuation set (hyphen and apostrophe are allowed)
    puncChar = lambda x: x not in '!"#$%&()*+,./:;<=>?@[\\]^_`{|}~'
    # require at least one whitespace-delimited token
    lenFilter = lambda token: len(token.split()) > 0
    #categoryFilter = lambda tagsList: set.intersection(set(tagsList), set(['NN', 'NNS', 'NNP', 'NNPS']));
    #categoryFilter = lambda tagsList: not set.intersection(set(tagsList), set(['CC', 'CD', 'DT', 'FW', 'IN', 'MD', 'RP', 'SYM', 'TO', 'UH']));
    # no category filtering by default
    categoryFilter = lambda x: True

    for line in random_utils.lines_from_file(lexicontablefile):
        srcphrase, tgtphrase, values = line.strip().split('\t', 2)
        srcphrase = moses_detokenize(srcphrase)
        if categoryFilter(srcphrase) and filter(alphaChar, srcphrase).strip():
            srcphraseRepr = ' '.join(filter(puncChar,
                                            srcphrase).split()).strip(' -')
            if not lenFilter(srcphraseRepr):
                continue
        else:
            continue
        tgtphrase = moses_detokenize(tgtphrase)
        if categoryFilter(tgtphrase) and filter(alphaChar, tgtphrase).strip():
            tgtphraseRepr = ' '.join(filter(puncChar,
                                            tgtphrase).split()).strip(' -')
            if not lenFilter(tgtphraseRepr):
                continue
        else:
            continue
        yield '%s\t%s\t%s' % (srcphraseRepr, tgtphraseRepr, values)
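For reference, the character-level cleaning applied to each phrase can be sketched standalone on an invented phrase (Python 3 spelling; on Python 2, filter() on a str already returns a str, which is what the original code relies on):

import string

alpha_char = lambda c: c not in string.punctuation and c not in string.digits
punc_char = lambda c: c not in '!"#$%&()*+,./:;<=>?@[\\]^_`{|}~'

phrase = 'state-of-the-art , (2014)'
# the phrase must keep at least some non-punctuation, non-digit characters
has_content = ''.join(filter(alpha_char, phrase)).strip()
# representation: drop most punctuation (but keep - and '), re-join, trim stray spaces/hyphens
phrase_repr = ' '.join(''.join(filter(punc_char, phrase)).split()).strip(' -')

print(bool(has_content), phrase_repr)
# True state-of-the-art 2014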
Example 6
def preprocess_treebanks(conllfile, outconllfile):
  def lc_num(conll_sent):
    return map(lambda X: dict(X.items()+[('form', replace_token(X['form']))]),
      conll_sent);

  inputStream = ru.lines_from_file(conllfile);
  cu.FIELDS = cu.CONLL07_COLUMNS;
  conll_sents = cu.sentences_from_conll(inputStream);
  mod_conll_sents = map(lc_num, conll_sents);
  ru.lines_to_file(outconllfile, cu.sentences_to_conll07(mod_conll_sents));
  return;
Example 7
def unigram_freqs(taggedfile, unigramsfile=''):
  global TAGGEDFILE_FIELDS;
  FIELDS = TAGGEDFILE_FIELDS;
  orig_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS);
  word_forms = (replace_token(edge['form']) for sent in orig_sents for edge in sent);
  freqs = defaultdict(int);
  for word in word_forms:
    freqs[word] += 1;
  freqs = ("%s\t%s" %(word, freq) for word, freq in sorted(freqs.iteritems(), key=itemgetter(1), reverse=True));
  ru.lines_to_file(unigramsfile, freqs);
  return;
Example 8
def emission_matrix(lexiconfile, threshold=10, matrixfile=''):
  entries = ru.lines_from_file(lexiconfile);
  entries = map(lambda X: X.split('\t'), entries);
  entries = filter(lambda X: len(X) == 3, entries);
  # lexicon lines are "count<TAB>word<TAB>tag" (the format written by get_lexiconfile)
  entries = ((word, tag, int(count)) for count, word, tag in entries);
  #entries = [(replace_token(word), tag, count) for word, tag, count in entries];
  # fresh integer id for every unseen form/tag (assuming `counter` is itertools.count or similar)
  forms_hash = defaultdict(counter().next);
  tags_hash  = defaultdict(counter().next);
  word_freqs = defaultdict(int);
  wordtag_freqs = defaultdict(int);
  for word, tag, count in entries:
    word_freqs[forms_hash[word]] += count;
    wordtag_freqs[(forms_hash[word], tags_hash[tag])] += count;
  filtered_vcb = filter(lambda X: X[1] >= threshold, word_freqs.items());
  filtered_vcb = dict(filtered_vcb);
  # float() guards against integer division under Python 2
  emission_matrix = ((word, tag, wordtag_freqs[(word, tag)]/float(word_freqs[word])) for word, tag in wordtag_freqs if word in filtered_vcb);
  #print("Vcb-size: %d\tFil.Vcb-size: %d\tEmi-size: %d" \
  #    %(len(entries), len(filtered_vcb), len(emission_matrix)), file=stderr);
  inv_forms_hash = dict((wordidx, word) for word, wordidx in forms_hash.items());
  inv_tags_hash  = dict((wordidx, word) for word, wordidx in tags_hash.items());
  emission_matrix = (u"{0}\t{1}\t{2}".format(inv_forms_hash[word], inv_tags_hash[tag], prob) for word, tag, prob in sorted(emission_matrix, key=lambda (wi, ti, p): (inv_forms_hash[wi], inv_tags_hash[ti], p)));
  ru.lines_to_file(matrixfile, emission_matrix);
  return;
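The estimate itself is a relative frequency, P(tag | word) = count(word, tag) / count(word), restricted to words at or above the threshold. A self-contained sketch with invented counts:

from collections import defaultdict

entries = [('bank', 'NOUN', 12), ('bank', 'VERB', 3), ('run', 'VERB', 5)]

word_freqs = defaultdict(int)
wordtag_freqs = defaultdict(int)
for word, tag, count in entries:
    word_freqs[word] += count
    wordtag_freqs[(word, tag)] += count

threshold = 10  # drop rare words, as emission_matrix does
emissions = {(w, t): c / float(word_freqs[w])
             for (w, t), c in wordtag_freqs.items()
             if word_freqs[w] >= threshold}
print(emissions)
# {('bank', 'NOUN'): 0.8, ('bank', 'VERB'): 0.2}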
Example 9
def getLexiconEntries(phraseTableFile, entryParser=None):
  entryParser = parseentry if not entryParser else entryParser;
  fields = ['srcphrase', 'tgtphrase', 'logprob', 'pmival', 'pmivalvar'];
  return map(entryParser, random_utils.lines_from_file(phraseTableFile), replicate(fields), replicate('\t'));
Example 10
def par_getPhraseEntriesFromTable(phraseTableFile, entryParser=None, cores=0):
  entryParser = parseentry if not entryParser else entryParser;
  cores = multiprocessing.cpu_count() if not cores else cores;
  with multiprocessing.Pool(cores) as jobs:
    # materialise the results before the with-block terminates the pool; otherwise
    # the lazy imap_unordered iterator would be consumed after its workers are gone
    return list(jobs.imap_unordered(entryParser,
        random_utils.lines_from_file(phraseTableFile), chunksize=100000));
Example 11
def getPhraseEntriesFromTable_alt(phraseTableFile, entryParser=None):
  entryParser = parseentry if not entryParser else entryParser;
  for line in random_utils.lines_from_file(phraseTableFile):
    yield entryParser(line.strip());
Example 12
def getPhraseEntriesFromTable(phraseTableFile, entryParser=None):
  entryParser = parseentry if not entryParser else entryParser;
  return map(entryParser, random_utils.lines_from_file(phraseTableFile));