def calculate(disamb_path, pred_path, ambig_path):
    """Collect reference, predicted, and input segmentation offsets, keyed by paragraph text."""
    if 'jsonl' in disamb_path:
        reference_paragraphs = []
        with jsonlines.open(disamb_path) as reader:
            for data in reader:
                ktext = KText.load(data)
                reference_paragraphs.append(ktext)
        refs = {}
        for ref in reference_paragraphs:
            ref_offsets, text = get_reference_offsets(ref)
            refs[text] = set(ref_offsets)
    elif 'tsv' in disamb_path:
        reference_paragraphs = list(get_input_paragraphs(disamb_path))
        refs = {}
        for ref in reference_paragraphs:
            input_offsets, text = get_input_offsets(ref)
            refs[text] = set(input_offsets)
    else:
        raise ValueError(f'Unsupported reference format: {disamb_path}')

    predicted_paragraphs = list(paragraphs(pred_path))
    assert len(predicted_paragraphs) == len(reference_paragraphs)

    input_paragraphs = list(get_input_paragraphs(ambig_path))
    assert len(predicted_paragraphs) == len(input_paragraphs)

    preds = {}
    input_refs = {}
    unambigs = {}
    for pred in predicted_paragraphs:
        pred_offsets, text = get_predicted_offsets(pred)
        preds[text] = set(pred_offsets)
    for paragraph in input_paragraphs:
        unambig_offsets, text = get_input_unambig_offsets(paragraph)
        unambigs[text] = set(unambig_offsets)
    for paragraph in input_paragraphs:
        input_offsets, text = get_input_offsets(paragraph)
        input_refs[text] = set(input_offsets)

    # Print any paragraphs that appear on only one side before the sanity checks fire.
    print("\n".join(sorted(preds.keys() - refs.keys())))
    print('---')
    print("\n".join(sorted(refs.keys() - preds.keys())))
    assert not (preds.keys() - refs.keys())
    assert not (refs.keys() - preds.keys())
    assert not (refs.keys() - unambigs.keys())
    assert not (unambigs.keys() - refs.keys())
    return refs, preds, unambigs, input_refs
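# Usage sketch (the paths are hypothetical; assumes the helper functions used
# above are importable in this module):
#
#     refs, preds, unambigs, input_refs = calculate('gold.jsonl', 'pred.txt', 'ambig.tsv')
#     for text, gold in refs.items():
#         recovered = len(gold & preds[text])
#         print(f'{recovered}/{len(gold)} reference boundaries recovered')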
def morfeusz_tokenize(text: str, original_ktext: KText):
    """Analyse `text` with Morfeusz and rebuild it as a KText with all interpretations."""
    ktext = KText(original_ktext.id)
    ktext.text = text
    ktext.year = original_ktext.year
    output = morfeusz.analyse(text)
    for start_position, end_position, interpretation in output:
        form, pseudo_lemma, combined_tags, _, _ = interpretation
        # Strip the Morfeusz homonym/qualifier suffix (e.g. ':s1', ':v2') from the lemma.
        lemma = regex.sub(r':[abcdijnopqsv][0-9]?$', '', pseudo_lemma)
        kinterpretation = KInterpretation(lemma, combined_tags, disamb=False, manual=False)
        if ktext.tokens and ktext.tokens[-1].start_position == start_position \
                and ktext.tokens[-1].end_position == end_position:
            # Same DAG node as the previous token: attach as an alternative interpretation.
            ktext.tokens[-1].add_interpretation(kinterpretation)
        else:
            ktoken = KToken(form, space_before=None, start_offset=None, end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(kinterpretation)
            ktext.add_token(ktoken)
    return ktext
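# Usage sketch (assumes the module-level `morfeusz` analyzer defined below and a
# KText record `original` loaded from JSONL; the `interpretations` attribute is
# an assumption about KToken's interface):
#
#     ktext = morfeusz_tokenize(original.text, original)
#     for token in ktext.tokens:
#         print(token.form, token.start_position, token.end_position,
#               [i.lemma for i in token.interpretations])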
# Get all tokens from the character-level predictions and align them with the
# merged JSONL records.
with jsonlines.open(args.merged_path) as reader, open(args.output_path, 'w') as writer:
    for data, segments in zip(reader, paragraphs(args.pred_path, args.input_path)):
        segmentation_tokens = segments_to_tokens(segments)
        ktext = KText.load(data)
        ktext.tokens = sorted(ktext.tokens, key=lambda t: (t.start_position, t.end_position))
        ktext.fix_offsets3()
        # Index each non-manual token by its offsets in the text without spaces.
        plain_tokens = {}
        for token in ktext.tokens:
            if token.manual:
                continue
            plain_tokens[(token.form, token.start_offset2, token.end_offset2)] = token
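# For reference, `plain_tokens` maps a (form, start, end) triple over the
# whitespace-free text to its KToken; the concrete values below are made up:
#
#     plain_tokens[('kota', 3, 7)]  # -> the KToken whose form is 'kota'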
parser = ArgumentParser(description='Tokenize JSONL texts with Morfeusz')
parser.add_argument('jsonl_path', help='path to JSONL for getting text')
parser.add_argument('--dict_dir', default=None, help='path to directory with dict')
parser.add_argument('--dict_name', default=None, help='dict name')
parser.add_argument('output_path', help='path to merged JSONL')
args = parser.parse_args()

# Defaults: dict_name=None, dict_path=None,
# e.g. --dict_dir /home/kwrobel/repos/poleval2020-task2/data/ --dict_name morfeusz-f19
morfeusz = morfeusz2.Morfeusz(generate=False, expand_tags=True, dict_name=args.dict_name,
                              dict_path=args.dict_dir)

with jsonlines.open(args.jsonl_path) as reader, jsonlines.open(args.output_path, mode='w') as writer:
    for data in reader:
        original_ktext = KText.load(data)
        text = original_ktext.text
        ktext = morfeusz_tokenize(text, original_ktext)
        ktext.fix_offsets2()
        writer.write(ktext.save())
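# Example invocation (the script name is hypothetical; the flags match the
# parser above):
#
#     python morfeusz_tokenize.py input.jsonl output.jsonl \
#         --dict_dir /home/kwrobel/repos/poleval2020-task2/data/ --dict_name morfeusz-f19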
def convert_to_ktagger(path):
    """Convert a DAG corpus file into KText paragraphs, verifying offsets and serialization."""
    file_name = os.path.basename(path)
    paragraphs = read_dag(path)
    for paragraph_index, paragraph in enumerate(paragraphs):
        if args.only_disamb:
            # Keep only tokens that carry a disambiguated interpretation.
            tokens = [token for token in paragraph[TOKENS] if is_disamb(token)]
            paragraph[TOKENS] = tokens
        paragraph_id = f"{corpus}▁{file_name}▁{paragraph_index}"
        ktext = KText(paragraph_id)
        years = paragraph[YEARS]
        year_feature = years[:2]
        ktext.year = year_feature
        text = original_text(paragraph)
        ktext.text = text
        dag_offsets(paragraph)
        for token in paragraph[TOKENS]:
            ktoken = KToken(token[SEGMENT], token[SPACE_BEFORE], token[START_OFFSET],
                            token[END_OFFSET])
            ktext.add_token(ktoken)
            ktoken.start_position = token[START_POSITION]
            ktoken.end_position = token[END_POSITION]
            for interpretation in token[INTERPRETATIONS]:
                disamb = 'disamb' in interpretation[DISAMB]
                if args.only_disamb and not disamb:
                    continue
                manual = 'manual' in interpretation[DISAMB]
                kinterpretation = KInterpretation(interpretation[LEMMA], interpretation[TAG],
                                                  disamb, manual)
                ktoken.add_interpretation(kinterpretation)
        assert text == ktext.infer_original_text()
        ktext.check_offsets()
        # Round-trip sanity check: loading a saved KText must reproduce it exactly.
        payload = json.loads(ktext.save2())
        k = KText.load(payload)
        assert ktext.save2() == k.save2()
        assert payload == ktext.save()
        yield ktext
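# Usage sketch (the DAG path is hypothetical; `corpus` and `args` are
# module-level names the function relies on):
#
#     for ktext in convert_to_ktagger('data/train.dag'):
#         print(ktext.save())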
def read_xces(file_path: str, corpus_name: str = '', only_disamb: bool = False):
    """Parse an XCES corpus file and yield KText paragraphs."""
    paragraphs_defined = True
    ns = False  # "no separator": the next token has no space before it
    first_chunk = True
    paragraph_index = 0
    for event, elem in ET.iterparse(file_path, events=("start", "end")):
        if first_chunk and event == "start" and elem.tag in ('chunk', 'sentence'):
            # If the first chunk is already a sentence, the file has no paragraph level.
            if elem.get('type') == 's' or elem.tag == 'sentence':
                paragraphs_defined = False
            first_chunk = False
        elif event == "end" and elem.tag in ('chunk', 'sentence'):
            xml_sentences = []
            paragraph = KText(f"{corpus_name}▁{file_path}▁{paragraph_index}")
            paragraph_index += 1
            start_position = 0
            if paragraphs_defined and elem.tag == 'chunk' and elem.get('type') != 's':
                xml_sentences = list(elem)
            elif (not paragraphs_defined) and (
                    (elem.tag == 'chunk' and elem.get('type') == 's') or elem.tag == 'sentence'):
                xml_sentences = [elem]
            else:
                continue
            for sentence_index, xml_sentence in enumerate(xml_sentences):
                for token_index, xml_token in enumerate(list(xml_sentence)):
                    if xml_token.tag == 'ns':
                        # Omit the first ns in a paragraph.
                        if token_index > 0 or sentence_index > 0:
                            ns = True
                    elif xml_token.tag == 'tok':
                        token = KToken(None, None, None, None,
                                       start_position=start_position,
                                       end_position=start_position + 1)
                        start_position += 1
                        token.space_before = not ns
                        for xml_node in list(xml_token):
                            if xml_node.tag == 'orth':
                                orth = xml_node.text
                                if orth is not None:
                                    orth = orth.replace('\xa0', ' ')  # normalize NBSP, e.g. "a j e n t a"
                                token.form = orth
                            elif xml_node.tag == 'lex':
                                disamb = xml_node.get('disamb') == '1'
                                base = xml_node.find('base').text
                                ctag = xml_node.find('ctag').text
                                form = KInterpretation(base, ctag, disamb=False)
                                if disamb:
                                    form.disamb = True
                                    form.manual = True  # TODO
                                if disamb or not only_disamb:
                                    token.add_interpretation(form)
                            elif xml_node.tag == 'ann':
                                continue
                            else:
                                logging.error(f'Error 1 {xml_token}')
                        if token.form:
                            paragraph.add_token(token)
                        ns = False
                    else:
                        logging.error(f'Error 2 {xml_token}')
                # Mark the last token of each sentence.
                paragraph.tokens[-1].sentence_end = True
            paragraph.text = paragraph.infer_original_text()
            paragraph.fix_offsets(paragraph.text)
            yield paragraph
            elem.clear()
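# Usage sketch (the file path and corpus name are hypothetical):
#
#     for paragraph in read_xces('nkjp/morph.xml', corpus_name='nkjp', only_disamb=True):
#         print(paragraph.text)
#         for token in paragraph.tokens:
#             print(' ', token.form, token.space_before)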