Example #1
def main():
    # Re-project an MMAX coreference document onto the retokenised text in
    # retok.txt: align the two tokenisations, then rewrite the word, sentence
    # and coreference layers into out_dir.  Assumes module-level imports of
    # os and sys plus the project's mmax and tokalign helpers.
    if len(sys.argv) != 4:
        print('Usage: %s outdir input.mmax retok.txt' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    out_dir = sys.argv[1]
    in_mmax = sys.argv[2]
    retok_txt = sys.argv[3]

    # Split the .mmax path into the MMAX project directory and document id.
    mmax_dir, mmax_file = os.path.split(in_mmax)
    mmax_id = os.path.splitext(mmax_file)[0]

    # Original MMAX tokens per sentence vs. the retokenised sentences
    # (one space-separated sentence per line in retok.txt).
    mmax_sent = mmax.get_sentences_from_mmax(mmax_dir, mmax_id)
    with open(retok_txt, 'r') as f:
        retok_sent = [line.rstrip('\n').split(' ') for line in f]

    # Create the MMAX output layout if it does not exist yet.
    for d in ['Basedata', 'Markables']:
        if not os.path.exists(os.path.join(out_dir, d)):
            os.mkdir(os.path.join(out_dir, d))

    # Align each MMAX sentence with its retokenised counterpart and build a
    # global map from old token offsets to new ones; tokens that could not be
    # aligned (None on either side) are dropped.
    mmax_start = 0
    retok_start = 0
    translated = {}

    for m, p in zip(mmax_sent, retok_sent):
        alig = tokalign.align([t[1] for t in m], p)  # t[1] is the token string
        for m_idx, p_idx in alig:
            if m_idx is not None and p_idx is not None:
                translated[mmax_start + m_idx] = retok_start + p_idx
        mmax_start += len(m)
        retok_start += len(p)

    # Write the new word and sentence layers, then remap the coreference spans.
    write_basedata(mmax.words_file(out_dir, mmax_id), retok_sent)
    write_sentences(mmax.sentences_file(out_dir, mmax_id), retok_sent)
    translate_coref(mmax.coref_file(mmax_dir, mmax_id),
                    mmax.coref_file(out_dir, mmax_id), translated)
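
The translated dictionary built above is only a token-index map; the actual span rewriting happens inside translate_coref, which is not shown here. As a rough illustration of what that remapping amounts to, here is a minimal, hypothetical helper (remap_span is not part of the repository) that carries a half-open token span across the alignment:

# Hypothetical sketch, not the repository's translate_coref: carry a
# half-open (start, end) token span from the MMAX tokenisation over to the
# retokenised text using the index map built in main().
def remap_span(span, translated):
    start, end = span
    if start in translated and (end - 1) in translated:
        return translated[start], translated[end - 1] + 1
    return None  # an endpoint was never aligned, so the span is dropped

# With MMAX tokens 4..6 aligned one-to-one to new tokens 5..7:
# remap_span((4, 7), {4: 5, 5: 6, 6: 7}) -> (5, 8)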
Example #2
def main():
    # Same projection as in Example #1, but the target tokenisation is
    # Penn-style and is read from a WMT XML file, and the aligner is given an
    # explicit cost table (operations) instead of the default set.
    if len(sys.argv) != 4:
        print('Usage: %s outdir input.mmax input.xml' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    out_dir = sys.argv[1]
    in_mmax = sys.argv[2]
    in_wmt = sys.argv[3]

    mmax_dir, mmax_file = os.path.split(in_mmax)
    mmax_id = os.path.splitext(mmax_file)[0]

    mmax_sent = mmax.get_sentences_from_mmax(mmax_dir, mmax_id)
    penn_sent = get_penntok_from_wmt_xml(in_wmt)

    for d in ['Basedata', 'Markables']:
        if not os.path.exists(os.path.join(out_dir, d)):
            os.mkdir(os.path.join(out_dir, d))

    mmax_start = 0
    penn_start = 0
    translated = {}

    # Alignment cost table: identical tokens and "'s"-style splits
    # (AlignApostropheS) align for free, linking two different tokens costs
    # 1.0, and skipping a token on either side costs 2.0.
    operations = [
        tokalign.LinkSame(0.0),
        AlignApostropheS(0.0),
        tokalign.LinkDifferent(1.0),
        tokalign.Skip1(2.0),
        tokalign.Skip2(2.0)
    ]

    for m, p in zip(mmax_sent, penn_sent):
        alig = tokalign.align([t[1] for t in m], p, operations=operations)
        for m_idx, p_idx in alig:
            if m_idx is not None and p_idx is not None:
                translated[mmax_start + m_idx] = penn_start + p_idx
        mmax_start += len(m)
        penn_start += len(p)

    write_basedata(mmax.words_file(out_dir, mmax_id), penn_sent)
    write_sentences(mmax.sentences_file(out_dir, mmax_id), penn_sent)
    translate_coref(mmax.coref_file(mmax_dir, mmax_id),
                    mmax.coref_file(out_dir, mmax_id), translated)
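
The tokalign.align call returns pairs of token indices, with None marking a token that was skipped on one side; the operations list above only changes the costs the aligner works with. As a rough, self-contained illustration of that kind of cost-based alignment (an assumption about the general technique, not the tokalign implementation), a minimal dynamic-programming aligner could look like this:

# Minimal illustration of cost-based token alignment: matching identical
# tokens is free, linking two different tokens costs 1.0 and skipping a token
# on either side costs 2.0, mirroring the cost table above.  This is a
# sketch, not the tokalign module.
def toy_align(a, b, link_same=0.0, link_diff=1.0, skip=2.0):
    n, m = len(a), len(b)
    # cost[i][j] = cheapest alignment of a[:i] with b[:j]
    cost = [[0.0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        cost[i][0] = i * skip
    for j in range(1, m + 1):
        cost[0][j] = j * skip
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            link = link_same if a[i - 1] == b[j - 1] else link_diff
            cost[i][j] = min(cost[i - 1][j - 1] + link,
                             cost[i - 1][j] + skip,
                             cost[i][j - 1] + skip)
    # Trace back to recover (i, j) pairs; None marks a skipped token.
    pairs, i, j = [], n, m
    while i > 0 or j > 0:
        if i > 0 and j > 0:
            link = link_same if a[i - 1] == b[j - 1] else link_diff
            if cost[i][j] == cost[i - 1][j - 1] + link:
                pairs.append((i - 1, j - 1))
                i, j = i - 1, j - 1
                continue
        if i > 0 and cost[i][j] == cost[i - 1][j] + skip:
            pairs.append((i - 1, None))
            i -= 1
        else:
            pairs.append((None, j - 1))
            j -= 1
    return list(reversed(pairs))

# toy_align(['does', "n't"], ['does', 'not']) == [(0, 0), (1, 1)]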
Example #3
def main():
    # As in Example #1, but the Penn-tokenised target text comes from a plain
    # text file and the top-level .mmax project file is copied into out_dir.
    if len(sys.argv) != 4:
        print('Usage: %s outdir input.mmax input.txt' % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    out_dir = sys.argv[1]
    in_mmax = sys.argv[2]
    in_txt = sys.argv[3]

    mmax_dir, mmax_file = os.path.split(in_mmax)
    mmax_id = os.path.splitext(mmax_file)[0]

    mmax_sent = mmax.get_sentences_from_mmax(mmax_dir, mmax_id)
    penn_sent = get_penntok_from_txt(in_txt)

    for d in ['Basedata', 'Markables']:
        if not os.path.exists(os.path.join(out_dir, d)):
            os.mkdir(os.path.join(out_dir, d))

    mmax_start = 0
    penn_start = 0
    translated = {}

    for m, p in zip(mmax_sent, penn_sent):
        alig = tokalign.align([t[1] for t in m], p)
        for m_idx, p_idx in alig:
            if m_idx is not None and p_idx is not None:
                translated[mmax_start + m_idx] = penn_start + p_idx
        mmax_start += len(m)
        penn_start += len(p)

    # Copy the top-level .mmax project file alongside the regenerated layers.
    shutil.copy(in_mmax, out_dir)
    write_basedata(mmax.words_file(out_dir, mmax_id), penn_sent)
    write_sentences(mmax.sentences_file(out_dir, mmax_id), penn_sent)
    translate_coref(mmax.coref_file(mmax_dir, mmax_id),
                    mmax.coref_file(out_dir, mmax_id), translated)
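
get_penntok_from_txt is not shown in this example. A plausible reading, by analogy with how retok.txt is parsed in Example #1, is one pre-tokenised sentence per line with tokens separated by single spaces; the body below is an assumption, not the repository's definition:

# Assumed sketch of get_penntok_from_txt: one Penn-tokenised sentence per
# line, tokens separated by single spaces.
def get_penntok_from_txt(path):
    with open(path, 'r') as f:
        return [line.rstrip('\n').split(' ') for line in f]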
Example #4
def get_coref_chain_boundaries(mmax_dir, mmax_id):
    # Turn the MMAX coreference markables into CoNLL-style boundary strings
    # ('(3', '3)', '(3)') keyed by token position.  Chains containing clause
    # or VP mentions are excluded, and cross-sentence mentions are skipped.
    with open(mmax.sentences_file(mmax_dir, mmax_id), 'r') as f:
        s_soup = bs4.BeautifulSoup(f, 'xml')

    # Map each token position to the id of its sentence so that
    # cross-sentence mentions can be detected below.
    sentence_id = {}
    for mrk in s_soup.find_all('markable'):
        for i in range(*mmax.parse_span(mrk['span'])):
            sentence_id[i] = mrk['orderid']

    with open(mmax.coref_file(mmax_dir, mmax_id), 'r') as f:
        soup = bs4.BeautifulSoup(f, 'xml')

    # directory maps each coref_class label to an integer chain id, with the
    # '__next__' entry holding the next free id; clause_or_vp collects chains
    # anchored by clause or VP mentions so they can be filtered out later.
    directory = {'__next__': 1}
    boundaries = {}
    clause_or_vp = set()
    for mrk in soup.find_all('markable'):
        if not mrk.has_attr('coref_class') or not mrk['coref_class'] or mrk['coref_class'] == 'empty':
            continue

        chain_idx = lookup_chain(directory, mrk['coref_class'])

        if mrk['mention'] in ('clause', 'vp'):
            clause_or_vp.add(chain_idx)

        # A markable span may be discontinuous (comma-separated pieces); emit
        # '(n)' for a single-token piece and '(n' / 'n)' at the start and end
        # of a longer one.  parse_span returns a half-open (start, end) range.
        for s in mrk['span'].split(','):
            start, end = mmax.parse_span(s)
            if start == end - 1:
                append(boundaries, start, ('(%d)', chain_idx))
            else:
                if sentence_id[start] != sentence_id[end - 1]:
                    print('%s: Skipped cross-sentence mention (%d): %s'
                          % (mmax_id, end - start, str(mrk)), file=sys.stderr)
                else:
                    append(boundaries, start, ('(%d', chain_idx))
                    append(boundaries, end - 1, ('%d)', chain_idx))

    # Render one string per token position; positions whose chains were all
    # filtered out (clause/VP) fall back to '-'.
    str_boundaries = {}
    for pos, chains in boundaries.items():
        str_chains = [fmt % idx for fmt, idx in chains if idx not in clause_or_vp]
        if str_chains:
            str_boundaries[pos] = '|'.join(str_chains)
        else:
            str_boundaries[pos] = '-'

    return str_boundaries
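
Two small helpers used above, append and lookup_chain, are not part of this example. Their behaviour can be inferred from the call sites; the bodies below are hedged reconstructions rather than the repository's code:

# Inferred from the call sites above; these bodies are assumptions, not the
# repository's definitions.
def append(dictionary, key, value):
    # Collect several boundary markers under the same token position.
    dictionary.setdefault(key, []).append(value)

def lookup_chain(directory, coref_class):
    # Give each coref_class label a stable integer chain id, handing out new
    # ids from the '__next__' counter (initialised to 1 above).
    if coref_class not in directory:
        directory[coref_class] = directory['__next__']
        directory['__next__'] += 1
    return directory[coref_class]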