import multiprocessing
import os
import string
import sys
from collections import defaultdict
from itertools import count, islice
from operator import itemgetter

import conll_utils as cu
import phrasetable_utils
import random_utils as ru


def add_tags(conllfile, taggedfile):
    """Merge the POS tags from taggedfile into the sentences of conllfile."""
    FIELDS1 = cu.CONLL07_COLUMNS
    FIELDS2 = ('form', 'postag')
    # Output columns for sentences_to_conll below.
    cu.FIELDS = ('id', 'form', 'cpostag', 'postag')
    orig_sents = cu.sentences_from_conll(ru.lines_from_file(conllfile), fields=FIELDS1)
    tag_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS2)
    new_sents = map(updateTags, zip(orig_sents, tag_sents))
    # An empty path presumably tells lines_to_file to write to stdout.
    ru.lines_to_file("", cu.sentences_to_conll(new_sents))

def main():
    if len(sys.argv) < 5:
        print("Error: ./%s <src-file> <tgt-file> <align-file> <src-parses-file>"
              % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    srcsents = ru.lines_from_file(sys.argv[1])
    tgtsents = ru.lines_from_file(sys.argv[2])
    alignments = ru.lines_from_file(sys.argv[3])
    parsetrees = ru.lines_from_file(sys.argv[4])
    for phrase in extractLexicon(srcsents, tgtsents, alignments, parsetrees):
        print(phrasetable_utils.ppphrase(phrase))

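# Expected invocation of main() via this script (file names are placeholders):
#   python <this-script> corpus.src corpus.tgt corpus.align corpus.src-parses
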
def get_lexiconfile(taggedfile, lexiconfile='', poskeys='', windowsize=0):
    """Count (form, tag-context) pairs and write them as freq<TAB>form<TAB>tags."""
    global TAGGEDFILE_FIELDS
    FIELDS = TAGGEDFILE_FIELDS
    orig_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS)
    if poskeys == '':
        # Assume the default scenario to be UD tags.
        pos_key = (FIELDS.index('udpostag'),)
    else:
        pos_key = tuple(FIELDS.index(key.strip()) for key in poskeys.split(','))
    mod_sents = ([(replace_token(sent[idx]['form']),
                   "+".join(sent[idx][FIELDS[key]] for key in pos_key))
                  for idx in range(len(sent))]
                 for sent in orig_sents)
    # Pad each sentence so every token has a full tag window.
    mod_sents = ([('BOS', 'BOS')] * windowsize + sent + [('EOS', 'EOS')] * windowsize
                 for sent in mod_sents)
    # An alternative encoding kept the window offsets:
    # '_'.join('%d:%s' % (i, sent[idx + i][1]) for i in range(-windowsize, windowsize + 1))
    word_tags = ((sent[idx][0],
                  '_'.join(sent[idx + i][1] for i in range(-windowsize, windowsize + 1)))
                 for sent in mod_sents
                 for idx in range(windowsize, len(sent) - windowsize))
    freqs = defaultdict(int)
    # Intern strings as integer ids; each unseen key gets the next id on lookup.
    forms_hash = defaultdict(count().__next__)
    tags_hash = defaultdict(count().__next__)
    for word, tag in word_tags:
        freqs[(forms_hash[word], tags_hash[tag])] += 1
    inv_forms_hash = dict((idx, form) for form, idx in forms_hash.items())
    inv_tags_hash = dict((idx, tag) for tag, idx in tags_hash.items())
    freqs = ("%d\t%s\t%s" % (freq, inv_forms_hash[key[0]], inv_tags_hash[key[1]])
             for key, freq in sorted(freqs.items(), key=itemgetter(1), reverse=True))
    ru.lines_to_file(lexiconfile, freqs)

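# A minimal, self-contained sketch (not called anywhere) of the interning idiom
# used in get_lexiconfile and emission_matrix: defaultdict(count().__next__)
# hands each previously unseen key the next integer id on first lookup. The
# function name below is illustrative only.
def _intern_sketch():
    ids = defaultdict(count().__next__)
    assert [ids[w] for w in ('the', 'cat', 'the')] == [0, 1, 0]
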
def split_conll():
    original_conllfile = sys.argv[1]
    if original_conllfile.endswith('.gz') or original_conllfile.endswith('.bz2'):
        conllfilename, cext = os.path.splitext(original_conllfile)
        output_prefix, ext = os.path.splitext(conllfilename)
    else:
        # Uncompressed input: no compression extension to strip.
        output_prefix, ext = os.path.splitext(original_conllfile)
        cext = ''
    splits = 20
    # Sentences in CoNLL files are separated by empty lines.
    sent_count = sum(0 if line else 1 for line in ru.lines_from_file(original_conllfile))
    split_size = sent_count // splits + 1
    with ru.smart_open(original_conllfile) as infile:
        cu.FIELDS = cu.CONLL09_COLUMNS
        foldlen = len(str(splits))
        conll_sents = cu.sentences_from_conll(infile)
        for foldidx in range(splits):
            outfilepath = '%s-split%s%s%s' % (
                output_prefix, str(foldidx + 1).zfill(foldlen), ext, cext)
            with ru.smart_open(outfilepath, 'w') as outfile:
                # islice stops cleanly when the shared iterator is exhausted,
                # instead of letting StopIteration escape from a generator
                # expression (an error under PEP 479).
                cu.sentences_to_conll09(outfile, islice(conll_sents, split_size))

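# A toy sketch of the islice-based splitting used in split_conll: repeatedly
# slicing a shared iterator consumes it in consecutive, non-overlapping chunks.
# Names and sizes below are illustrative only.
def _islice_sketch():
    sents = iter(range(7))
    chunks = [list(islice(sents, 3)) for _ in range(3)]
    assert chunks == [[0, 1, 2], [3, 4, 5], [6]]
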
def getCleanPhraseEntries(lexicontablefile):
    alphaChar = lambda x: x not in string.punctuation and x not in string.digits
    puncChar = lambda x: x not in '!"#$%&()*+,./:;<=>?@[\\]^_`{|}~'
    lenFilter = lambda token: len(token.split()) > 0
    #categoryFilter = lambda tagsList: set.intersection(set(tagsList), set(['NN', 'NNS', 'NNP', 'NNPS']))
    #categoryFilter = lambda tagsList: not set.intersection(set(tagsList), set(['CC', 'CD', 'DT', 'FW', 'IN', 'MD', 'RP', 'SYM', 'TO', 'UH']))
    categoryFilter = lambda x: True
    for line in ru.lines_from_file(lexicontablefile):
        srcphrase, tgtphrase, values = line.strip().split('\t', 2)
        srcphrase = moses_detokenize(srcphrase)
        # filter() over a str yields characters in Python 3; re-join the result
        # before applying string methods.
        if categoryFilter(srcphrase) and ''.join(filter(alphaChar, srcphrase)).strip():
            srcphraseRepr = ' '.join(''.join(filter(puncChar, srcphrase)).split()).strip(' -')
            if not lenFilter(srcphraseRepr):
                continue
        else:
            continue
        tgtphrase = moses_detokenize(tgtphrase)
        if categoryFilter(tgtphrase) and ''.join(filter(alphaChar, tgtphrase)).strip():
            tgtphraseRepr = ' '.join(''.join(filter(puncChar, tgtphrase)).split()).strip(' -')
            if not lenFilter(tgtphraseRepr):
                continue
        else:
            continue
        yield '%s\t%s\t%s' % (srcphraseRepr, tgtphraseRepr, values)

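# A small sketch of the Python 3 behaviour worked around in
# getCleanPhraseEntries: filter() over a str yields characters, so the result
# must be re-joined before string methods apply. Illustrative only.
def _charfilter_sketch():
    kept = ''.join(filter(lambda c: c not in string.digits, 'a1b2'))
    assert kept == 'ab'
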
def preprocess_treebanks(conllfile, outconllfile):
    def lc_num(conll_sent):
        # Rebuild each token dict with a normalized surface form.
        return [dict(edge, form=replace_token(edge['form'])) for edge in conll_sent]

    inputStream = ru.lines_from_file(conllfile)
    cu.FIELDS = cu.CONLL07_COLUMNS
    conll_sents = cu.sentences_from_conll(inputStream)
    mod_conll_sents = map(lc_num, conll_sents)
    ru.lines_to_file(outconllfile, cu.sentences_to_conll07(mod_conll_sents))

def unigram_freqs(taggedfile, unigramsfile=''):
    global TAGGEDFILE_FIELDS
    FIELDS = TAGGEDFILE_FIELDS
    orig_sents = cu.sentences_from_conll(ru.lines_from_file(taggedfile), fields=FIELDS)
    word_forms = (replace_token(edge['form']) for sent in orig_sents for edge in sent)
    freqs = defaultdict(int)
    for word in word_forms:
        freqs[word] += 1
    freqs = ("%s\t%s" % (word, freq)
             for word, freq in sorted(freqs.items(), key=itemgetter(1), reverse=True))
    ru.lines_to_file(unigramsfile, freqs)

def emission_matrix(lexiconfile, threshold=10, matrixfile=''):
    """Estimate P(tag | word) by relative frequency for words seen >= threshold times."""
    entries = ru.lines_from_file(lexiconfile)
    entries = map(lambda line: line.split('\t'), entries)
    entries = filter(lambda fields: len(fields) == 3, entries)
    # Lexicon lines are freq<TAB>form<TAB>tag (see get_lexiconfile).
    entries = ((word, tag, int(freq)) for freq, word, tag in entries)
    forms_hash = defaultdict(count().__next__)
    tags_hash = defaultdict(count().__next__)
    word_freqs = defaultdict(int)
    wordtag_freqs = defaultdict(int)
    for word, tag, cnt in entries:
        word_freqs[forms_hash[word]] += cnt
        wordtag_freqs[(forms_hash[word], tags_hash[tag])] += cnt
    filtered_vcb = dict((word, freq) for word, freq in word_freqs.items()
                        if freq >= threshold)
    # True division: the integer counts must yield float probabilities.
    emissions = ((word, tag, wordtag_freqs[(word, tag)] / word_freqs[word])
                 for word, tag in wordtag_freqs if word in filtered_vcb)
    inv_forms_hash = dict((idx, form) for form, idx in forms_hash.items())
    inv_tags_hash = dict((idx, tag) for tag, idx in tags_hash.items())
    emissions = ("{0}\t{1}\t{2}".format(inv_forms_hash[word], inv_tags_hash[tag], prob)
                 for word, tag, prob in sorted(
                     emissions,
                     key=lambda wtp: (inv_forms_hash[wtp[0]], inv_tags_hash[wtp[1]], wtp[2])))
    ru.lines_to_file(matrixfile, emissions)

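# A toy check (with made-up counts) of the relative-frequency estimate that
# emission_matrix computes: P(tag | word) = count(word, tag) / count(word).
def _emission_sketch():
    wordtag = {('bank', 'NOUN'): 6, ('bank', 'VERB'): 4}
    word = {'bank': 10}
    probs = dict(((w, t), c / word[w]) for (w, t), c in wordtag.items())
    assert probs[('bank', 'NOUN')] == 0.6
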
def getLexiconEntries(phraseTableFile, entryParser=None):
    entryParser = entryParser or parseentry
    fields = ['srcphrase', 'tgtphrase', 'logprob', 'pmival', 'pmivalvar']
    # replicate() is assumed to behave like itertools.repeat, supplying the
    # same fields/separator arguments for every line.
    return map(entryParser, ru.lines_from_file(phraseTableFile),
               replicate(fields), replicate('\t'))

def par_getPhraseEntriesFromTable(phraseTableFile, entryParser=None, cores=0):
    entryParser = entryParser or parseentry
    cores = cores or multiprocessing.cpu_count()
    # Returning pool.imap_unordered() from inside a `with Pool(...)` block hands
    # back an iterator whose pool has already been terminated; keep the pool
    # alive until the results are fully consumed instead.
    pool = multiprocessing.Pool(cores)
    try:
        for entry in pool.imap_unordered(entryParser,
                                         ru.lines_from_file(phraseTableFile),
                                         chunksize=100000):
            yield entry
    finally:
        pool.close()
        pool.join()

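# A usage sketch for the parallel reader above. The path is a placeholder, and
# the parser must be a picklable module-level callable, which
# multiprocessing.Pool requires of its workers.
def _par_sketch():
    for entry in par_getPhraseEntriesFromTable('phrase-table.gz', cores=4):
        print(entry)
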
def getPhraseEntriesFromTable_alt(phraseTableFile, entryParser=None):
    entryParser = entryParser or parseentry
    for line in ru.lines_from_file(phraseTableFile):
        yield entryParser(line.strip())

def getPhraseEntriesFromTable(phraseTableFile, entryParser=None):
    entryParser = entryParser or parseentry
    return map(entryParser, ru.lines_from_file(phraseTableFile))