def main():
    """Re-tokenize an MMAX document to match a pre-tokenized text file.

    Command-line contract (unchanged):
        argv[1] -- output directory (``Basedata``/``Markables`` created inside)
        argv[2] -- path to the input ``.mmax`` file
        argv[3] -- re-tokenized text, one space-separated sentence per line

    Exits with status 1 and a usage message on wrong argument count.
    """
    if len(sys.argv) != 4:
        print('Usage: %s outdir input.mmax retok.txt' % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    out_dir = sys.argv[1]
    in_mmax = sys.argv[2]
    retok_txt = sys.argv[3]

    mmax_dir, mmax_file = os.path.split(in_mmax)
    mmax_id = os.path.splitext(mmax_file)[0]

    mmax_sent = mmax.get_sentences_from_mmax(mmax_dir, mmax_id)
    with open(retok_txt, 'r') as f:
        # One sentence per line, tokens separated by single spaces.
        retok_sent = [line.rstrip('\n').split(' ') for line in f]

    # makedirs(..., exist_ok=True) avoids the check-then-create race of the
    # old `if not exists: mkdir` pair; os.path.join is portable.
    for d in ['Basedata', 'Markables']:
        os.makedirs(os.path.join(out_dir, d), exist_ok=True)

    # Align each MMAX sentence with its re-tokenized counterpart and build a
    # global map: old MMAX token index -> new token index.  Unaligned tokens
    # (None on either side) are dropped from the mapping.
    # NOTE(review): `t[1]` suggests MMAX tokens are (id, word) pairs -- confirm.
    mmax_start = 0
    retok_start = 0
    translated = {}
    for m, p in zip(mmax_sent, retok_sent):
        alig = tokalign.align([t[1] for t in m], p)
        for m_idx, p_idx in alig:
            if m_idx is not None and p_idx is not None:
                translated[mmax_start + m_idx] = retok_start + p_idx
        mmax_start += len(m)
        retok_start += len(p)

    write_basedata(mmax.words_file(out_dir, mmax_id), retok_sent)
    write_sentences(mmax.sentences_file(out_dir, mmax_id), retok_sent)
    translate_coref(mmax.coref_file(mmax_dir, mmax_id),
                    mmax.coref_file(out_dir, mmax_id),
                    translated)
def main():
    """Re-tokenize an MMAX document to match Penn-tokenized WMT XML.

    Command-line contract (unchanged):
        argv[1] -- output directory (``Basedata``/``Markables`` created inside)
        argv[2] -- path to the input ``.mmax`` file
        argv[3] -- WMT XML file providing the Penn tokenization

    Exits with status 1 and a usage message on wrong argument count.
    """
    if len(sys.argv) != 4:
        print('Usage: %s outdir input.mmax input.xml' % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    out_dir = sys.argv[1]
    in_mmax = sys.argv[2]
    in_wmt = sys.argv[3]

    mmax_dir, mmax_file = os.path.split(in_mmax)
    mmax_id = os.path.splitext(mmax_file)[0]

    mmax_sent = mmax.get_sentences_from_mmax(mmax_dir, mmax_id)
    penn_sent = get_penntok_from_wmt_xml(in_wmt)

    # makedirs(..., exist_ok=True) avoids the check-then-create race of the
    # old `if not exists: mkdir` pair; os.path.join is portable.
    for d in ['Basedata', 'Markables']:
        os.makedirs(os.path.join(out_dir, d), exist_ok=True)

    # Custom alignment cost model: identical tokens and 's-apostrophe matches
    # are free, substitutions cost 1, skips cost 2.
    operations = [
        tokalign.LinkSame(0.0),
        AlignApostropheS(0.0),
        tokalign.LinkDifferent(1.0),
        tokalign.Skip1(2.0),
        tokalign.Skip2(2.0)
    ]

    # Align each MMAX sentence with its Penn-tokenized counterpart and build a
    # global map: old MMAX token index -> new token index.  Unaligned tokens
    # (None on either side) are dropped from the mapping.
    # NOTE(review): `t[1]` suggests MMAX tokens are (id, word) pairs -- confirm.
    mmax_start = 0
    penn_start = 0
    translated = {}
    for m, p in zip(mmax_sent, penn_sent):
        alig = tokalign.align([t[1] for t in m], p, operations=operations)
        for m_idx, p_idx in alig:
            if m_idx is not None and p_idx is not None:
                translated[mmax_start + m_idx] = penn_start + p_idx
        mmax_start += len(m)
        penn_start += len(p)

    write_basedata(mmax.words_file(out_dir, mmax_id), penn_sent)
    write_sentences(mmax.sentences_file(out_dir, mmax_id), penn_sent)
    translate_coref(mmax.coref_file(mmax_dir, mmax_id),
                    mmax.coref_file(out_dir, mmax_id),
                    translated)
def main():
    """Re-tokenize an MMAX document to match a Penn-tokenized text file.

    Command-line contract (unchanged):
        argv[1] -- output directory (``Basedata``/``Markables`` created inside)
        argv[2] -- path to the input ``.mmax`` file (also copied to outdir)
        argv[3] -- plain-text file providing the Penn tokenization

    Exits with status 1 and a usage message on wrong argument count.
    """
    if len(sys.argv) != 4:
        print('Usage: %s outdir input.mmax input.txt' % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    out_dir = sys.argv[1]
    in_mmax = sys.argv[2]
    in_txt = sys.argv[3]

    mmax_dir, mmax_file = os.path.split(in_mmax)
    mmax_id = os.path.splitext(mmax_file)[0]

    mmax_sent = mmax.get_sentences_from_mmax(mmax_dir, mmax_id)
    penn_sent = get_penntok_from_txt(in_txt)

    # makedirs(..., exist_ok=True) avoids the check-then-create race of the
    # old `if not exists: mkdir` pair; os.path.join is portable.
    for d in ['Basedata', 'Markables']:
        os.makedirs(os.path.join(out_dir, d), exist_ok=True)

    # Align each MMAX sentence with its Penn-tokenized counterpart and build a
    # global map: old MMAX token index -> new token index.  Unaligned tokens
    # (None on either side) are dropped from the mapping.
    # NOTE(review): `t[1]` suggests MMAX tokens are (id, word) pairs -- confirm.
    mmax_start = 0
    penn_start = 0
    translated = {}
    for m, p in zip(mmax_sent, penn_sent):
        alig = tokalign.align([t[1] for t in m], p)
        for m_idx, p_idx in alig:
            if m_idx is not None and p_idx is not None:
                translated[mmax_start + m_idx] = penn_start + p_idx
        mmax_start += len(m)
        penn_start += len(p)

    # Copy the .mmax project file itself so the output directory is a
    # complete, loadable MMAX project.
    shutil.copy(in_mmax, out_dir)
    write_basedata(mmax.words_file(out_dir, mmax_id), penn_sent)
    write_sentences(mmax.sentences_file(out_dir, mmax_id), penn_sent)
    translate_coref(mmax.coref_file(mmax_dir, mmax_id),
                    mmax.coref_file(out_dir, mmax_id),
                    translated)
def get_coref_chain_boundaries(mmax_dir, mmax_id):
    """Collect CoNLL-style coreference bracket strings per token position.

    Reads the MMAX sentence and coreference markable files for document
    *mmax_id* under *mmax_dir* and returns a dict mapping a global token
    index to a rendered boundary string such as ``(3``, ``3)``, ``(3)``,
    several of these joined with ``|``, or ``-`` when every chain at that
    position was filtered out.  Chains that have any clause/VP mention are
    excluded entirely from the output.
    """
    # Pass 0: map every token index to the orderid of its sentence, so
    # cross-sentence mentions can be detected below.
    with open(mmax.sentences_file(mmax_dir, mmax_id), 'r') as f:
        s_soup = bs4.BeautifulSoup(f, 'xml')
    sentence_id = {}
    for mrk in s_soup.find_all('markable'):
        # NOTE(review): parse_span presumably yields a (start, end-exclusive)
        # pair usable with range() -- confirm against mmax.parse_span.
        for i in range(*mmax.parse_span(mrk['span'])):
            sentence_id[i] = mrk['orderid']
    with open(mmax.coref_file(mmax_dir, mmax_id), 'r') as f:
        soup = bs4.BeautifulSoup(f, 'xml')
    # '__next__' holds the next chain number to assign (consumed by
    # lookup_chain, defined elsewhere in this file).
    directory = {'__next__': 1}
    boundaries = {}
    # Chain indices that have at least one clause or VP mention; all of a
    # tainted chain's mentions are dropped in the rendering pass, which is
    # why filtering must wait until the whole coref file has been read.
    clause_or_vp = set()
    for mrk in soup.find_all('markable'):
        # Skip markables with no usable coreference class.
        if not mrk.has_attr('coref_class') or not mrk['coref_class'] or mrk['coref_class'] == 'empty':
            continue
        chain_idx = lookup_chain(directory, mrk['coref_class'])
        if mrk['mention'] in ('clause', 'vp'):
            clause_or_vp.add(chain_idx)
        # A markable's span may be discontinuous (comma-separated segments);
        # each segment gets its own bracket pair.
        for s in mrk['span'].split(','):
            start, end = mmax.parse_span(s)
            if start == end - 1:
                # Single-token segment: open and close in one string.
                append(boundaries, start, ('(%d)', chain_idx))
            else:
                if sentence_id[start] != sentence_id[end - 1]:
                    # Brackets cannot span sentences; report and skip.
                    print('%s: Skipped cross-sentence mention (%d): %s' % (mmax_id, end - start, str(mrk)), file=sys.stderr)
                else:
                    # Opening bracket at the first token, closing at the last.
                    append(boundaries, start, ('(%d', chain_idx))
                    append(boundaries, end - 1, ('%d)', chain_idx))
    # Rendering pass: apply the format strings, dropping clause/VP chains.
    str_boundaries = {}
    for pos, chains in boundaries.items():
        str_chains = [fmt % idx for fmt, idx in chains if idx not in clause_or_vp]
        if len(str_chains):
            str_boundaries[pos] = '|'.join(str_chains)
        else:
            # Every chain at this position was filtered; keep an explicit
            # '-' placeholder rather than omitting the key.
            str_boundaries[pos] = '-'
    return str_boundaries