Example #1
def calculate(disamb_path, pred_path, ambig_path):
    """Collect per-paragraph offset sets from the disambiguated reference file,
    the prediction file and the ambiguous input file, keyed by paragraph text."""
    if 'jsonl' in disamb_path:
        reference_paragraphs = []
        with jsonlines.open(disamb_path) as reader:
            for data in reader:
                ktext = KText.load(data)
                # ktext.find_ambiguous_end_offsets()
                reference_paragraphs.append(ktext)

        refs = {}
        for ref in reference_paragraphs:
            ref_offsets, text = get_reference_offsets(ref)
            refs[text] = set(ref_offsets)
    elif 'tsv' in disamb_path:
        reference_paragraphs = list(get_input_paragraphs(disamb_path))
        refs = {}
        for ref in reference_paragraphs:
            input_offsets, text = get_input_offsets(ref)
            refs[text] = set(input_offsets)

    predicted_paragraphs = list(paragraphs(pred_path))
    assert len(predicted_paragraphs) == len(reference_paragraphs)
    input_paragraphs = list(get_input_paragraphs(ambig_path))
    assert len(predicted_paragraphs) == len(input_paragraphs)

    preds = {}

    input_refs = {}
    unambigs = {}
    for pred in predicted_paragraphs:
        pred_offsets, text = get_predicted_offsets(pred)
        preds[text] = set(pred_offsets)

    for para in input_paragraphs:
        unambig_offsets, text = get_input_unambig_offsets(para)
        unambigs[text] = set(unambig_offsets)

    for para in input_paragraphs:
        input_offsets, text = get_input_offsets(para)
        input_refs[text] = set(input_offsets)

    print("\n".join(sorted(preds.keys() - refs.keys())))
    print('---')
    print("\n".join(sorted(refs.keys() - preds.keys())))

    assert not (preds.keys() - refs.keys())
    assert not (refs.keys() - preds.keys())
    assert not (refs.keys() - unambigs.keys())
    assert not (unambigs.keys() - refs.keys())

    return refs, preds, unambigs, input_refs
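The function only gathers the offset sets; it does not score them. A minimal sketch of how the returned dictionaries could be scored, assuming a plain precision/recall over the collected offsets (the score helper and the metric are illustrative, not part of the original script):

def score(refs, preds):
    # refs and preds map paragraph text -> set of offsets; the keys match (asserted above)
    tp = fp = fn = 0
    for text, ref_offsets in refs.items():
        pred_offsets = preds[text]
        tp += len(ref_offsets & pred_offsets)
        fp += len(pred_offsets - ref_offsets)
        fn += len(ref_offsets - pred_offsets)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1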
Example #2
def morfeusz_tokenize(text: str, original_ktext: KText):
    """Analyse `text` with Morfeusz and return a new KText carrying every
    Morfeusz interpretation (nothing is disambiguated here)."""
    ktext = KText(original_ktext.id)
    ktext.text = text
    ktext.year = original_ktext.year

    output = morfeusz.analyse(text)

    for start_position, end_position, interpretation in output:
        form, pseudo_lemma, combined_tags, _, _ = interpretation
        
        # drop the homonym/variant marker Morfeusz appends to lemmas (e.g. ':s1', ':v2')
        lemma = regex.sub(r':[abcdijnopqsv][0-9]?$', '', pseudo_lemma)
        
        kinterpretation = KInterpretation(lemma, combined_tags, disamb=False, manual=False)
        if (ktext.tokens
                and ktext.tokens[-1].start_position == start_position
                and ktext.tokens[-1].end_position == end_position):
            # same segmentation span as the previous token: just attach another interpretation
            ktext.tokens[-1].add_interpretation(kinterpretation)
        else:
            ktoken = KToken(form, space_before=None, start_offset=None, end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(kinterpretation)
            ktext.add_token(ktoken)
    return ktext
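The snippet relies on the KText/KToken/KInterpretation containers used throughout these examples. If that code is not at hand, a stripped-down stand-in covering only the attributes touched above is enough to exercise morfeusz_tokenize; the classes below are an assumption inferred from these calls, not the real definitions:

# Simplified stand-ins, inferred from the calls in these snippets (assumption,
# not the real classes): just enough to build tokens and attach interpretations.
class KInterpretation:
    def __init__(self, lemma, tag, disamb=False, manual=False):
        self.lemma = lemma
        self.tag = tag
        self.disamb = disamb
        self.manual = manual


class KToken:
    def __init__(self, form, space_before, start_offset, end_offset):
        self.form = form
        self.space_before = space_before
        self.start_offset = start_offset
        self.end_offset = end_offset
        self.start_position = None
        self.end_position = None
        self.interpretations = []

    def add_interpretation(self, interpretation):
        self.interpretations.append(interpretation)


class KText:
    def __init__(self, id):
        self.id = id
        self.text = None
        self.year = None
        self.tokens = []

    def add_token(self, token):
        self.tokens.append(token)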
Example #3
# get all tokens from the character-level predictions

# for segments in paragraphs(args.pred_path):
#     tokens=segments_to_tokens(segments)
#     print(tokens)
#     # break

with jsonlines.open(args.merged_path) as reader, open(args.output_path,
                                                      'w') as writer:
    for data, segments in zip(reader,
                              paragraphs(args.pred_path, args.input_path)):
        segmentation_tokens = segments_to_tokens(segments)
        # print(segmentation_tokens)

        ktext = KText.load(data)

        ktext.tokens = sorted(ktext.tokens,
                              key=lambda t: (t.start_position, t.end_position))

        ktext.fix_offsets3()
        # find the offsets of each token in the text without spaces

        # index the automatically analysed (non-manual) tokens by form and offsets
        plain_tokens = {}
        for token in ktext.tokens:
            if token.manual:
                continue
            # print(token.form, token.start_offset2, token.end_offset2, token.space_before)
            plain_tokens[(token.form, token.start_offset2,
                          token.end_offset2)] = token
Example #4
        if (ktext.tokens
                and ktext.tokens[-1].start_position == start_position
                and ktext.tokens[-1].end_position == end_position):
            ktext.tokens[-1].add_interpretation(kinterpretation)
        else:
            ktoken = KToken(form, space_before=None, start_offset=None, end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(kinterpretation)
            ktext.add_token(ktoken)
    return ktext


parser = ArgumentParser(description='Train')
parser.add_argument('jsonl_path', help='path to JSONL for getting text')
parser.add_argument('--dict_dir', default=None, help='path to directory with dict')
parser.add_argument('--dict_name', default=None, help='dict name')
parser.add_argument('output_path', help='path to merged JSONL')
args = parser.parse_args()

morfeusz = morfeusz2.Morfeusz(generate=False, expand_tags=True,
                              dict_name=args.dict_name, dict_path=args.dict_dir)
# e.g. --dict_dir /home/kwrobel/repos/poleval2020-task2/data/ --dict_name morfeusz-f19

with jsonlines.open(args.jsonl_path) as reader, jsonlines.open(args.output_path, mode='w') as writer:
    for data in reader:
        original_ktext = KText.load(data)
        text = original_ktext.text

        ktext = morfeusz_tokenize(text, original_ktext)
        ktext.fix_offsets2()
        writer.write(ktext.save())
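Given the argument parser above, the script would be run roughly as follows; the script name is hypothetical and the dictionary values mirror the comment in the source:

python morfeusz_reanalyse.py input.jsonl merged.jsonl \
    --dict_dir /home/kwrobel/repos/poleval2020-task2/data/ \
    --dict_name morfeusz-f19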
Example #5
def convert_to_ktagger(path):
    """Read one DAG file and yield a validated KText per paragraph."""
    file_name = os.path.basename(path)
    paragraphs = read_dag(path)
    # print(path, len(paragraphs))

    for paragraph_index, paragraph in enumerate(paragraphs):
        if args.only_disamb:
            tokens = [token for token in paragraph[TOKENS] if is_disamb(token)]
            paragraph[TOKENS] = tokens

        paragraph_id = f"{corpus}▁{file_name}▁{paragraph_index}"
        ktext = KText(paragraph_id)
        years = paragraph[YEARS]
        year_feature = years[:2]
        ktext.year = year_feature

        text = original_text(paragraph)
        ktext.text = text

        dag_offsets(paragraph)

        for token in paragraph[TOKENS]:
            ktoken = KToken(token[SEGMENT], token[SPACE_BEFORE], token[START_OFFSET], token[END_OFFSET])
            ktext.add_token(ktoken)
            ktoken.start_position = token[START_POSITION]
            ktoken.end_position = token[END_POSITION]
            for interpretation in token[INTERPRETATIONS]:
                disamb = 'disamb' in interpretation[DISAMB]
                if args.only_disamb and not disamb:
                    continue
                manual = 'manual' in interpretation[DISAMB]
                kinterpretation = KInterpretation(interpretation[LEMMA], interpretation[TAG], disamb, manual)
                ktoken.add_interpretation(kinterpretation)

        assert text == ktext.infer_original_text()
        ktext.check_offsets()

        # print(ktext.save())

        # sanity check: KText serialisation survives a JSON round trip
        payload = json.loads(ktext.save2())
        k = KText.load(payload)
        # print(k)

        # print(ktext.save())
        # print(k.save())
        assert ktext.save2() == k.save2()
        # print(payload)
        assert payload == ktext.save()
        yield ktext
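convert_to_ktagger only yields KText objects; a hedged sketch of a driver that converts a set of DAG files into one JSONL file, reusing the writer pattern from Example #4 (file paths and the glob pattern are illustrative):

import glob

import jsonlines

with jsonlines.open('train.jsonl', mode='w') as writer:
    for path in sorted(glob.glob('dags/*.dag')):
        for ktext in convert_to_ktagger(path):
            writer.write(ktext.save())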
Example #6
def read_xces(file_path: str,
              corpus_name: str = '',
              only_disamb: bool = False):
    """Stream KText paragraphs from an XCES file, one per paragraph-level chunk,
    or one per sentence when the file defines no paragraphs."""
    paragraphs_defined = True
    ns = False  # a <ns/> (no separator) element precedes the next token
    first_chunk = True

    paragraph_index = 0
    for event, elem in ET.iterparse(file_path, events=(
            "start",
            "end",
    )):
        if first_chunk and event == "start" and elem.tag in ('chunk',
                                                             'sentence'):
            if elem.get('type') == 's' or elem.tag == 'sentence':
                paragraphs_defined = False
            first_chunk = False
        elif event == "end" and elem.tag in ('chunk', 'sentence'):
            xml_sentences = []
            paragraph = KText(f"{corpus_name}▁{file_path}▁{paragraph_index}")
            paragraph_index += 1
            start_position = 0
            if paragraphs_defined and elem.tag == 'chunk' and elem.get(
                    'type') != 's':
                xml_sentences = list(elem)
            elif (not paragraphs_defined) and (
                (elem.tag == 'chunk' and elem.get('type') == 's')
                    or elem.tag == 'sentence'):
                xml_sentences = [elem]
            else:
                continue

            for sentence_index, xml_sentence in enumerate(xml_sentences):
                # sentence=Sentence()
                # paragraph.add_sentence(sentence)
                for token_index, xml_token in enumerate(xml_sentence):
                    if xml_token.tag == 'ns':
                        if token_index > 0 or sentence_index > 0:  # omit first ns in paragraph
                            ns = True
                    elif xml_token.tag == 'tok':
                        token = KToken(None,
                                       None,
                                       None,
                                       None,
                                       start_position=start_position,
                                       end_position=start_position + 1)
                        start_position += 1
                        token.space_before = not ns

                        for xml_node in xml_token:
                            if xml_node.tag == 'orth':
                                orth = xml_node.text
                                if orth is not None:
                                    orth = orth.replace(' ', ' ')  # e.g. letter-spaced "a j e n t a"
                                token.form = orth
                            elif xml_node.tag == 'lex':
                                if xml_node.get('disamb') == '1':
                                    disamb = True
                                else:
                                    disamb = False

                                base = xml_node.find('base').text
                                ctag = xml_node.find('ctag').text

                                form = KInterpretation(base,
                                                       ctag,
                                                       disamb=False)
                                if disamb:
                                    form.disamb = True
                                    form.manual = True  #TODO
                                    # if token.gold_form is not None:
                                    #     logging.warning(f'More than 1 disamb {file_path} {orth}')
                                    # token.gold_form=form
                                if disamb or not only_disamb:
                                    token.add_interpretation(form)
                            elif xml_node.tag == 'ann':
                                continue
                            else:
                                logging.error(f'Error 1 {xml_token}')
                        if token.form:
                            paragraph.add_token(token)
                        ns = False
                    else:
                        logging.error(f'Error 2 {xml_token}')
                paragraph.tokens[-1].sentence_end = True

            paragraph.text = paragraph.infer_original_text()
            paragraph.fix_offsets(paragraph.text)
            yield paragraph
            elem.clear()
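A small, illustrative way to drive read_xces: stream paragraphs from an XCES file and count what was read (the file path and corpus name are placeholders):

n_paragraphs = 0
n_tokens = 0
for paragraph in read_xces('corpus.xml', corpus_name='nkjp', only_disamb=True):
    n_paragraphs += 1
    n_tokens += len(paragraph.tokens)
print(f'{n_paragraphs} paragraphs, {n_tokens} tokens')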