def convert_to_ktagger(path):
    """Read the DAG file at *path* and yield one KText per paragraph.

    Paragraph ids are built as "<corpus>▁<file_name>▁<index>".  When
    ``args.only_disamb`` is set, tokens and interpretations without a
    'disamb' marker are dropped.  Each produced KText is sanity-checked:
    the inferred original text, the token offsets, and a full
    save2/load/save round trip must all be consistent.
    """
    file_name = os.path.basename(path)
    paragraphs = read_dag(path)
    for paragraph_index, paragraph in enumerate(paragraphs):
        if args.only_disamb:
            # Keep only manually disambiguated tokens.
            paragraph[TOKENS] = [t for t in paragraph[TOKENS] if is_disamb(t)]

        ktext = KText(f"{corpus}▁{file_name}▁{paragraph_index}")
        ktext.year = paragraph[YEARS][:2]
        text = original_text(paragraph)
        ktext.text = text

        dag_offsets(paragraph)
        for token in paragraph[TOKENS]:
            ktoken = KToken(token[SEGMENT], token[SPACE_BEFORE],
                            token[START_OFFSET], token[END_OFFSET])
            ktext.add_token(ktoken)
            ktoken.start_position = token[START_POSITION]
            ktoken.end_position = token[END_POSITION]
            for interpretation in token[INTERPRETATIONS]:
                disamb = 'disamb' in interpretation[DISAMB]
                if args.only_disamb and not disamb:
                    continue
                manual = 'manual' in interpretation[DISAMB]
                ktoken.add_interpretation(
                    KInterpretation(interpretation[LEMMA],
                                    interpretation[TAG], disamb, manual))

        # Sanity checks: reconstruction and serialization must be lossless.
        assert text == ktext.infer_original_text()
        ktext.check_offsets()
        payload = json.loads(ktext.save2())
        k = KText.load(payload)
        assert ktext.save2() == k.save2()
        assert payload == ktext.save()
        yield ktext
def morfeusz_tokenize(text: str, original_ktext: KText):
    """Analyse *text* with Morfeusz and return a new KText.

    The result carries over the id and year from *original_ktext*.
    Consecutive analyses covering the same (start, end) position span are
    merged into a single token with multiple interpretations.  Offsets are
    left unset; only DAG positions are filled in.
    """
    ktext = KText(original_ktext.id)
    ktext.text = text
    ktext.year = original_ktext.year

    for start_position, end_position, analysis in morfeusz.analyse(text):
        form, pseudo_lemma, combined_tags, _, _ = analysis
        # Strip Morfeusz's trailing lemma qualifier (e.g. ":a1") — presumably
        # a homonym/sense marker; TODO confirm against Morfeusz docs.
        lemma = regex.sub(r':[abcdijnopqsv][0-9]?$', '', pseudo_lemma)
        interp = KInterpretation(lemma, combined_tags, disamb=False, manual=False)

        last = ktext.tokens[-1] if ktext.tokens else None
        same_span = (last is not None
                     and last.start_position == start_position
                     and last.end_position == end_position)
        if same_span:
            # Ambiguous analysis of the same segment: attach to existing token.
            last.add_interpretation(interp)
        else:
            ktoken = KToken(form, space_before=None,
                            start_offset=None, end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(interp)
            ktext.add_token(ktoken)
    return ktext