def translate_coref(infile, outfile, translated):
    """Rewrite the markable spans of a coreference file through a word map.

    Every ``markable`` element in ``infile`` has the components of its
    ``span`` attribute re-expressed in terms of the word positions given by
    ``translated``, and the resulting document is written to ``outfile``.

    Args:
        infile: Path of the XML markable file to read.
        outfile: Path the rewritten XML is written to.
        translated: Mapping from a word position (as returned by
            ``mmax.parse_span``) to the corresponding position in the target
            document. One is added to a mapped position to form the
            ``word_N`` identifier.

    Span components whose first or last word has no entry in ``translated``
    are dropped and reported on stderr. The summary line counts each
    affected markable once, however many of its components were dropped.
    """
    with open(infile, 'r') as f:
        soup = bs4.BeautifulSoup(f, 'xml')

    total = 0
    skipped = 0
    for mrk in soup.find_all('markable'):
        total += 1
        span_parts = []
        unaligned = False
        for in_span in mrk['span'].split(','):
            # Spans are treated as half-open here: the last word of the
            # component is ``to_idx - 1``.
            from_idx, to_idx = mmax.parse_span(in_span)

            if from_idx not in translated:
                print('Unaligned start word: ' + str(mrk), file=sys.stderr)
                unaligned = True
                continue

            if to_idx - from_idx <= 1:
                # Single-word component (one word under the half-open
                # convention); the degenerate ``from_idx == to_idx`` case is
                # kept on this branch as before.
                span_parts.append('word_%d' % (translated[from_idx] + 1))
                continue

            last_idx = to_idx - 1
            if last_idx not in translated:
                print('Unaligned end word: ' + str(mrk), file=sys.stderr)
                unaligned = True
                continue

            span_parts.append('word_%d..word_%d' % (
                translated[from_idx] + 1, translated[last_idx] + 1))

        if unaligned:
            # Count the markable once so the figure is comparable to `total`.
            skipped += 1

        # NOTE(review): if every component was unaligned this writes an empty
        # span attribute; confirm downstream readers tolerate that.
        mrk['span'] = ','.join(span_parts)

    print('Skipped %d out of %d markables.' % (skipped, total),
          file=sys.stderr)

    with open(outfile, 'w') as f:
        print(soup.prettify(), file=f)
def get_coref_chain_boundaries(mmax_dir, mmax_id):
    """Collect bracket-style coreference boundary labels per word position.

    Reads the sentence-level and coreference-level markable files located via
    ``mmax.sentences_file`` and ``mmax.coref_file``. For every coreference
    markable with a non-empty ``coref_class`` other than ``'empty'``, each
    span component contributes ``(N)`` for a one-word span, or ``(N`` on its
    first word and ``N)`` on its last word otherwise, where ``N`` is the
    value ``lookup_chain`` returns for that class. Multi-word components
    whose first and last word lie in different sentences are reported on
    stderr and left out.

    Args:
        mmax_dir: Directory argument forwarded to the ``mmax`` path helpers.
        mmax_id: Document identifier forwarded to the ``mmax`` path helpers.

    Returns:
        A dict mapping a word position to its labels joined with ``'|'``.
        Labels of chains that contain a markable whose ``mention`` attribute
        is ``'clause'`` or ``'vp'`` are omitted; a position left with no
        labels maps to ``'-'``.
    """
    with open(mmax.sentences_file(mmax_dir, mmax_id), 'r') as sent_fh:
        sentence_soup = bs4.BeautifulSoup(sent_fh, 'xml')

    # Word position -> orderid of the sentence markable covering it.
    sentence_of = {}
    for sentence in sentence_soup.find_all('markable'):
        for position in range(*mmax.parse_span(sentence['span'])):
            sentence_of[position] = sentence['orderid']

    with open(mmax.coref_file(mmax_dir, mmax_id), 'r') as coref_fh:
        coref_soup = bs4.BeautifulSoup(coref_fh, 'xml')

    chain_directory = {'__next__': 1}
    marks_at = {}
    hidden_chains = set()
    for mention in coref_soup.find_all('markable'):
        if mention.has_attr('coref_class'):
            coref_class = mention['coref_class']
        else:
            coref_class = None
        if not coref_class or coref_class == 'empty':
            continue

        chain = lookup_chain(chain_directory, coref_class)
        if mention['mention'] in ('clause', 'vp'):
            hidden_chains.add(chain)

        for component in mention['span'].split(','):
            first, stop = mmax.parse_span(component)
            last = stop - 1
            if first == last:
                append(marks_at, first, ('(%d)', chain))
            elif sentence_of[first] != sentence_of[last]:
                print('%s: Skipped cross-sentence mention (%d): %s' %
                      (mmax_id, stop - first, str(mention)),
                      file=sys.stderr)
            else:
                append(marks_at, first, ('(%d', chain))
                append(marks_at, last, ('%d)', chain))

    # Hidden chains are filtered only now because a chain can be marked as
    # clause/vp by a markable that comes after its other mentions.
    rendered = {}
    for position, marks in marks_at.items():
        visible = [template % chain_no
                   for template, chain_no in marks
                   if chain_no not in hidden_chains]
        rendered[position] = '|'.join(visible) if visible else '-'
    return rendered
def compare_mmax(dir1, dir2, mmax_dir, mmax_id):
    """Compare the markable texts of one document in two directory trees.

    The words files and the ``sentence`` and ``coref`` markable levels of the
    document are read from ``dir1/mmax_dir`` and ``dir2/mmax_dir``. On each
    level the markables of both versions are sorted by their parsed spans and
    paired up; for each pair the text covered by corresponding span
    components is compared after mapping the quote tokens ```` `` ```` and
    ``''`` to ``"``. Every differing component of the second version is
    collected and, if there is at least one, handed to
    ``create_checks_level``.

    Args:
        dir1: Root directory of the first version.
        dir2: Root directory of the second version.
        mmax_dir: Document directory relative to both roots.
        mmax_id: Document identifier used to build the file names.

    A level whose markable counts differ, or a markable pair whose number of
    span components differs, is reported on stderr and not compared further.
    """
    first_root = os.path.join(dir1, mmax_dir)
    second_root = os.path.join(dir2, mmax_dir)

    def load_words(root):
        with open(mmax.words_file(root, mmax_id), 'r') as handle:
            word_soup = bs4.BeautifulSoup(handle, 'xml')
            return [word.string for word in word_soup.find_all('word')]

    def load_markables(root, level_name):
        path = '%s/Markables/%s_%s_level.xml' % (root, mmax_id, level_name)
        with open(path, 'r') as handle:
            return list(bs4.BeautifulSoup(handle, 'xml').find_all('markable'))

    def span_order(markable):
        return [mmax.parse_span(part) for part in markable['span'].split(',')]

    def unify_quotes(text):
        # Quotes are changed by the tokeniser
        return text.replace("``", '"').replace("''", '"')

    first_words = load_words(first_root)
    second_words = load_words(second_root)

    mismatches = []
    for level in ('sentence', 'coref'):
        first_markables = sorted(load_markables(first_root, level),
                                 key=span_order)
        second_markables = sorted(load_markables(second_root, level),
                                  key=span_order)

        if len(first_markables) != len(second_markables):
            print('%s/%s: Number of markables does not match (%d != %d)' %
                  (mmax_dir, mmax_id,
                   len(first_markables), len(second_markables)),
                  file=sys.stderr)
            continue

        for left, right in zip(first_markables, second_markables):
            left_parts = left['span'].split(',')
            right_parts = right['span'].split(',')
            if len(left_parts) != len(right_parts):
                print(
                    '%s/%s: Number of span components does not match (%s / %s)'
                    % (mmax_dir, mmax_id, left['span'], right['span']),
                    file=sys.stderr)
                continue

            for left_span, right_span in zip(left_parts, right_parts):
                left_from, left_to = mmax.parse_span(left_span)
                right_from, right_to = mmax.parse_span(right_span)
                left_text = unify_quotes(
                    ''.join(first_words[left_from:left_to]))
                right_text = unify_quotes(
                    ''.join(second_words[right_from:right_to]))
                if left_text != right_text:
                    mismatches.append((level, right_span))

    if mismatches:
        print(mmax_dir, mmax_id, file=sys.stderr)
        create_checks_level(second_root, mmax_id, mismatches)
def sort_key(mrk):
    """Return the parsed span components of a markable, for use as a sort key.

    The ``span`` attribute of ``mrk`` is split on commas and each component
    is passed to ``mmax.parse_span``; the results are returned as a list in
    their original order.
    """
    components = mrk['span'].split(',')
    return list(map(mmax.parse_span, components))