コード例 #1
0
def main():
    """Parse a sample CoNLL-U file into a spaCy Doc and print its sentences.

    For every sentence, the string stored in the ``conll_str`` extension is
    printed first, followed by one line per token showing the token and its
    dependency label, and finally an empty separator line.
    """
    # Initialise English spaCy parser, already including the ConllFormatter as a pipeline component
    nlp = init_parser("en_core_web_sm", "spacy", include_headers=True)
    parser = ConllParser(nlp)
    # Path to a CoNLL-U test file, located in the "tests" directory two levels above this file
    path = Path(__file__).parent.parent / "tests" / "en_ewt-ud-dev.conllu-sample.txt"
    doc = parser.parse_conll_file_as_spacy(path, "utf-8")
    # The running sentence index was never used, so iterate the sentences directly.
    for sent in doc.sents:
        print(sent._.conll_str)
        for word in sent:
            print(word, word.dep_)
        print()
コード例 #2
0
def test_conllf_to_spacy(spacy_conllparser: ConllParser, conll_testfile: Path):
    """Check that a CoNLL file read from disk becomes a two-sentence, annotated Doc."""
    doc = spacy_conllparser.parse_conll_file_as_spacy(conll_testfile, input_encoding="utf-8")
    sentences = list(doc.sents)

    assert len(sentences) == 2
    # Dependency, tag and morphology annotations must all be present on the Doc.
    for annotation in ("DEP", "TAG", "MORPH"):
        assert doc.has_annotation(annotation)
コード例 #3
0
def test_conllstr_to_spacy(spacy_conllparser: ConllParser, conll_testfile: Path):
    """Check that CoNLL text passed as a string becomes a two-sentence, annotated Doc."""
    # Read the fixture file ourselves so the parser receives raw text rather than a path.
    conll_text = conll_testfile.read_text(encoding="utf-8")
    doc = spacy_conllparser.parse_conll_text_as_spacy(conll_text)
    sentence_count = len(list(doc.sents))

    assert sentence_count == 2
    # Dependency, tag and morphology annotations must all be present on the Doc.
    for annotation in ("DEP", "TAG", "MORPH"):
        assert doc.has_annotation(annotation)
コード例 #4
0
ファイル: parse.py プロジェクト: BramVanroy/spacy_conll
def parse(args: Namespace):
    """Parse a file or a string into CoNLL output and write the result.

    The result is written to ``args.output_file`` when it is given, and to
    standard output otherwise. When writing to a file and ``args.verbose`` is
    truthy, the result is additionally printed to standard output.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``input_str``, ``input_file``, ``input_encoding``,
            ``model_or_lang``, ``parser``, ``is_tokenized``, ``disable_sbd``,
            ``include_headers``, ``n_process``, ``no_force_counting``,
            ``ignore_pipe_errors``, ``no_split_on_newline``, ``output_file``,
            ``output_encoding`` and ``verbose``.

    Raises:
        ValueError: If neither ``input_str`` nor ``input_file`` is given
            (an empty value counts as not given).
    """
    if not args.input_str and not args.input_file:
        raise ValueError("'input_str' or 'input_file' must be given")

    nlp = init_parser(
        args.model_or_lang,
        args.parser,
        is_tokenized=args.is_tokenized,
        disable_sbd=args.disable_sbd,
        disable_pandas=True,
        include_headers=args.include_headers,
    )

    parser = ConllParser(nlp, is_tokenized=args.is_tokenized)

    # Options shared by both entry points; gathered once so they cannot drift apart.
    parse_options = {
        "n_process": args.n_process,
        "no_force_counting": args.no_force_counting,
        "ignore_pipe_errors": args.ignore_pipe_errors,
        "no_split_on_newline": args.no_split_on_newline,
    }

    # A given input file takes precedence over an input string.
    if args.input_file:
        conll_str = parser.parse_file_as_conll(args.input_file, args.input_encoding, **parse_options)
    else:
        conll_str = parser.parse_text_as_conll(args.input_str, **parse_options)

    if args.output_file is None:
        stdout.write(conll_str)
        return

    # Use a context manager so the output file is flushed and closed deterministically
    # instead of leaking the handle until garbage collection.
    with Path(args.output_file).open("w", encoding=args.output_encoding) as fhout:
        fhout.write(conll_str)

    if args.verbose:
        # end='' to avoid adding yet another newline
        print(conll_str, end="")
コード例 #5
0
def spacy_conllparser():
    """Return a ConllParser wrapping the "spacy" parser created with ``include_headers=True``."""
    nlp = get_parser("spacy", include_headers=True)
    return ConllParser(nlp)
コード例 #6
0
def pretokenized_conllparser(request):
    """Yield a ConllParser configured for pre-tokenized input.

    ``request.param`` is forwarded to ``get_parser`` as its first argument;
    both the underlying parser and the ConllParser get ``is_tokenized=True``.
    """
    nlp = get_parser(request.param, is_tokenized=True, include_headers=True)
    yield ConllParser(nlp, is_tokenized=True)
コード例 #7
0
def conllparser(request):
    """Yield a ConllParser built from ``get_parser(request.param, include_headers=True)``."""
    nlp = get_parser(request.param, include_headers=True)
    yield ConllParser(nlp)
コード例 #8
0
def conllparser_parse_conllfile(spacy_vanila):
    """Parse the sample CoNLL-U file next to this module and return the result.

    The given ``spacy_vanila`` object is wrapped in a ConllParser, and the
    sample file is read as UTF-8.
    """
    conll_parser = ConllParser(spacy_vanila)
    sample_path = Path(__file__).parent.joinpath("en_ewt-ud-dev.conllu-sample.txt")
    return conll_parser.parse_conll_as_spacy(sample_path, input_encoding="utf-8")