def parse(texts, data):
    """Yield one DeeppavlovMarkup per text paired with its (chunks, tags).

    ``texts`` and ``data`` are walked in lockstep via ``strict_zip``; each
    item of ``data`` is unpacked as a ``(chunks, tags)`` pair.  A text that
    is empty or whitespace-only gets an empty span list and its chunks/tags
    are not inspected at all.
    """
    for text, (chunks, tags) in strict_zip(texts, data):
        # see patch_texts
        if text.strip():
            # Locate every chunk inside the text, then fold the per-token
            # tags into spans.
            located = list(find_tokens(chunks, text))
            spans = list(bio_spans(located, tags))
        else:
            spans = []
        yield DeeppavlovMarkup(text, spans)
def parse_conll(lines):
    """Build a GareevMarkup from tab-separated ``chunk<TAB>tag`` lines.

    Each line is split on its first tab only, so the tag part keeps any
    further tabs.  The text is reconstructed by joining the chunks with
    single spaces, and the chunks are then located in that text to get
    tokens for ``bio_spans``.

    Raises ValueError when a line contains no tab (the split yields a
    single field that cannot be unpacked into chunk and tag).
    """
    # Split every line once; a pair is [chunk, tag].
    pairs = [line.split('\t', 1) for line in lines]
    chunks = [chunk for chunk, _ in pairs]
    tags = [tag for _, tag in pairs]

    text = ' '.join(chunks)
    located = list(find_tokens(chunks, text))
    return GareevMarkup(text, list(bio_spans(located, tags)))
def parse(text, data):
    """Build a MitieMarkup for ``text`` from a ``(chunks, spans)`` pair.

    The chunks are located in ``text`` with ``find_tokens`` (using the
    ``MITIE_STRIP`` setting for its ``strip`` option), and the raw spans
    from ``data`` are converted against those tokens by ``parse_spans``.
    """
    chunks, raw_spans = data
    located = list(find_tokens(chunks, text, strip=MITIE_STRIP))
    resolved = list(parse_spans(located, raw_spans))
    return MitieMarkup(text, resolved)
def test_find_tokens(chunks, text, etalon):
    """Check that find_tokens(chunks, text) yields exactly ``etalon``."""
    produced = list(find_tokens(chunks, text))
    assert produced == etalon