Example #1

Converting DAG-format corpus files into KText objects, then re-analysing the
text with the Morfeusz analyser:

import json
import os

import regex

# The following names are assumed to be defined elsewhere in this module:
# `args` (parsed CLI options), `corpus` (corpus name), `morfeusz` (a Morfeusz
# analyser instance), the DAG field constants (TOKENS, YEARS, SEGMENT,
# SPACE_BEFORE, START_OFFSET, END_OFFSET, START_POSITION, END_POSITION,
# INTERPRETATIONS, DISAMB, LEMMA, TAG), the helpers read_dag(), is_disamb(),
# original_text() and dag_offsets(), and the KText/KToken/KInterpretation
# classes.


def convert_to_ktagger(path):
    """Convert one DAG-format corpus file into KText objects, one per paragraph."""
    file_name = os.path.basename(path)
    paragraphs = read_dag(path)

    for paragraph_index, paragraph in enumerate(paragraphs):
        if args.only_disamb:
            # Keep only tokens that carry a disambiguated (gold) interpretation.
            tokens = [token for token in paragraph[TOKENS] if is_disamb(token)]
            paragraph[TOKENS] = tokens

        # Paragraph ID: corpus name, file name and paragraph index joined
        # with "▁" (U+2581).
        paragraph_id = f"{corpus}▁{file_name}▁{paragraph_index}"
        ktext = KText(paragraph_id)
        years = paragraph[YEARS]
        # Coarse temporal feature: the first two characters of the year value.
        year_feature = years[:2]
        ktext.year = year_feature

        text = original_text(paragraph)
        ktext.text = text

        # Compute character offsets for every token in the DAG.
        dag_offsets(paragraph)

        for token in paragraph[TOKENS]:
            ktoken = KToken(token[SEGMENT], token[SPACE_BEFORE],
                            token[START_OFFSET], token[END_OFFSET])
            ktext.add_token(ktoken)
            ktoken.start_position = token[START_POSITION]
            ktoken.end_position = token[END_POSITION]
            # Copy every interpretation, marking gold ('disamb') and manually
            # corrected ('manual') readings.
            for interpretation in token[INTERPRETATIONS]:
                disamb = 'disamb' in interpretation[DISAMB]
                if args.only_disamb and not disamb:
                    continue
                manual = 'manual' in interpretation[DISAMB]
                kinterpretation = KInterpretation(interpretation[LEMMA],
                                                  interpretation[TAG],
                                                  disamb, manual)
                ktoken.add_interpretation(kinterpretation)

        # Sanity checks: token offsets must reproduce the original text, and
        # serialization must round-trip through KText.load().
        assert text == ktext.infer_original_text()
        ktext.check_offsets()

        payload = json.loads(ktext.save2())
        k = KText.load(payload)
        assert ktext.save2() == k.save2()
        assert payload == ktext.save()
        yield ktext
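
# A minimal, hypothetical usage sketch (not part of the original script):
# "corpus/sample.dag" is a placeholder path, and `args`/`corpus` must already
# be set by the module's CLI setup.
#
#     for ktext in convert_to_ktagger("corpus/sample.dag"):
#         print(ktext.save2())  # one JSON document per paragraph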

def morfeusz_tokenize(text: str, original_ktext: KText):
    """Re-analyse `text` with Morfeusz, keeping the id and year of the
    original KText; every returned interpretation is undisambiguated."""
    ktext = KText(original_ktext.id)
    ktext.text = text
    ktext.year = original_ktext.year

    # `morfeusz` is a module-level Morfeusz analyser instance (assumed).
    output = morfeusz.analyse(text)

    for start_position, end_position, interpretation in output:
        form, pseudo_lemma, combined_tags, _, _ = interpretation

        # Strip Morfeusz's homonym marker from the lemma (e.g. a trailing ':s1').
        lemma = regex.sub(r':[abcdijnopqsv][0-9]?$', '', pseudo_lemma)

        kinterpretation = KInterpretation(lemma, combined_tags, disamb=False, manual=False)
        last = ktext.tokens[-1] if ktext.tokens else None
        if (last is not None
                and last.start_position == start_position
                and last.end_position == end_position):
            # Same span in the analysis graph: an ambiguous reading of the
            # previous token, so attach it there instead of adding a new token.
            last.add_interpretation(kinterpretation)
        else:
            # Morfeusz reports graph positions only; spacing and character
            # offsets are unknown at this point.
            ktoken = KToken(form, space_before=None, start_offset=None, end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(kinterpretation)
            ktext.add_token(ktoken)
    return ktext
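
# Hypothetical end-to-end sketch (not in the original): assumes the module's
# CLI has set `args` and `corpus`, that `morfeusz = morfeusz2.Morfeusz()` was
# created at module level, and that "corpus/sample.dag" is a placeholder path.
if __name__ == "__main__":
    for gold in convert_to_ktagger("corpus/sample.dag"):
        analysed = morfeusz_tokenize(gold.text, gold)
        print(analysed.save2())  # analyser readings only, none disambiguated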