def main():
    """Parse a sample CoNLL-U file into a spaCy Doc and print it per sentence.

    For every sentence, the value of ``sent._.conll_str`` is printed first,
    followed by one line per token showing the token and its ``dep_`` label,
    and finally an empty line.
    """
    # Initialise English spaCy parser, already including the ConllFormatter as a pipeline component
    nlp = init_parser("en_core_web_sm", "spacy", include_headers=True)
    parser = ConllParser(nlp)

    # Path to a CoNLL-U test file
    path = Path(__file__).parent.parent / "tests" / "en_ewt-ud-dev.conllu-sample.txt"
    doc = parser.parse_conll_file_as_spacy(path, "utf-8")

    # The sentence counter was never read, so iterate over the sentences directly.
    for sent in doc.sents:
        print(sent._.conll_str)
        for word in sent:
            print(word, word.dep_)
        print()
def test_conllf_to_spacy(spacy_conllparser: ConllParser, conll_testfile: Path):
    """Parsing the CoNLL test file yields two sentences with DEP, TAG and MORPH set."""
    parsed = spacy_conllparser.parse_conll_file_as_spacy(conll_testfile, input_encoding="utf-8")

    sentences = list(parsed.sents)
    assert len(sentences) == 2

    # Every annotation layer below must be present on the resulting Doc.
    for layer in ("DEP", "TAG", "MORPH"):
        assert parsed.has_annotation(layer)
def test_conllstr_to_spacy(spacy_conllparser: ConllParser, conll_testfile: Path):
    """Parsing the CoNLL test file's text yields two sentences with DEP, TAG and MORPH set."""
    conll_text = conll_testfile.read_text(encoding="utf-8")
    parsed = spacy_conllparser.parse_conll_text_as_spacy(conll_text)

    sentences = list(parsed.sents)
    assert len(sentences) == 2

    # Every annotation layer below must be present on the resulting Doc.
    for layer in ("DEP", "TAG", "MORPH"):
        assert parsed.has_annotation(layer)
def parse(args: Namespace):
    """Parse a text or a file and emit the resulting CoNLL string.

    The input is taken from ``args.input_file`` when it is set, otherwise from
    ``args.input_str``. The result is written to ``args.output_file`` when that
    is not ``None``, otherwise to stdout. When writing to a file and
    ``args.verbose`` is set, the result is also printed.

    :param args: namespace holding the options read here: ``input_str``,
        ``input_file``, ``input_encoding``, ``model_or_lang``, ``parser``,
        ``is_tokenized``, ``disable_sbd``, ``include_headers``, ``n_process``,
        ``no_force_counting``, ``ignore_pipe_errors``, ``no_split_on_newline``,
        ``output_file``, ``output_encoding`` and ``verbose``
    :raises ValueError: when neither ``input_str`` nor ``input_file`` is given
    """
    if not args.input_str and not args.input_file:
        raise ValueError("'input_str' or 'input_file' must be given")

    nlp = init_parser(
        args.model_or_lang,
        args.parser,
        is_tokenized=args.is_tokenized,
        disable_sbd=args.disable_sbd,
        disable_pandas=True,
        include_headers=args.include_headers,
    )
    parser = ConllParser(nlp, is_tokenized=args.is_tokenized)

    # Options that are passed identically to both parse entry points.
    pipe_kwargs = {
        "n_process": args.n_process,
        "no_force_counting": args.no_force_counting,
        "ignore_pipe_errors": args.ignore_pipe_errors,
        "no_split_on_newline": args.no_split_on_newline,
    }

    if args.input_file:
        conll_str = parser.parse_file_as_conll(args.input_file, args.input_encoding, **pipe_kwargs)
    else:
        conll_str = parser.parse_text_as_conll(args.input_str, **pipe_kwargs)

    if args.output_file is not None:
        # The context manager guarantees the file is flushed and closed,
        # which the previous bare ``open`` call never did.
        with Path(args.output_file).open("w", encoding=args.output_encoding) as fhout:
            fhout.write(conll_str)

        if args.verbose:
            # end='' to avoid adding yet another newline
            print(conll_str, end="")
    else:
        stdout.write(conll_str)
def spacy_conllparser():
    """Return a ConllParser wrapping the "spacy" parser from ``get_parser`` with headers enabled."""
    nlp = get_parser("spacy", include_headers=True)
    return ConllParser(nlp)
def pretokenized_conllparser(request):
    """Yield a ConllParser in tokenized mode for the parser named by ``request.param``."""
    nlp = get_parser(request.param, is_tokenized=True, include_headers=True)
    yield ConllParser(nlp, is_tokenized=True)
def conllparser(request):
    """Yield a ConllParser for the parser named by ``request.param``, with headers enabled."""
    nlp = get_parser(request.param, include_headers=True)
    yield ConllParser(nlp)
def conllparser_parse_conllfile(spacy_vanila):
    """Parse the sample CoNLL-U file located next to this module.

    A ConllParser is built around ``spacy_vanila`` and the file is read as UTF-8.
    """
    conll_parser = ConllParser(spacy_vanila)
    sample_file = Path(__file__).parent / "en_ewt-ud-dev.conllu-sample.txt"
    # NOTE(review): other callers visible here use `parse_conll_file_as_spacy`;
    # confirm `parse_conll_as_spacy` exists in the targeted spacy_conll version.
    return conll_parser.parse_conll_as_spacy(sample_file, input_encoding="utf-8")