def get_parser(name, **kwargs):
    model_or_lang = "en_core_web_sm" if name == "spacy" else "en"
    if f"{name}-{kwargs}" not in PARSERS:
        PARSERS[f"{name}-{kwargs}"] = init_parser(model_or_lang, name, **kwargs)
    return PARSERS[f"{name}-{kwargs}"]
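For reference, a minimal sketch of how the cache above behaves, assuming PARSERS is a module-level dict as the helper implies; the demo function name is hypothetical and not part of the original code.

# Hypothetical demo: identical name/kwargs map to the same cache key, so the
# pipeline is initialised only once and subsequent calls return the same object.
PARSERS = {}

def demo_parser_cache():
    nlp_first = get_parser("spacy", include_headers=True)
    nlp_second = get_parser("spacy", include_headers=True)
    assert nlp_first is nlp_second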
def spacy_ext_names():
    nlp = init_parser(ext_names={"conll": "conllu", "conll_str": "conll_text", "conll_pd": "pandas"})
    return nlp
def main():
    # Initialise English spaCy parser, already including the ConllFormatter as a pipeline component
    nlp = init_parser("en_core_web_sm", "spacy", include_headers=True)
    parser = ConllParser(nlp)

    # Path to a CoNLL-U test file
    path = Path(__file__).parent.parent / "tests" / "en_ewt-ud-dev.conllu-sample.txt"
    doc = parser.parse_conll_file_as_spacy(path, "utf-8")

    for sent_id, sent in enumerate(doc.sents, 1):
        print(sent._.conll_str)
        for word in sent:
            print(word, word.dep_)
        print()
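As a hedged illustration of what each printed token line contains: conll_str follows the standard ten-column, tab-separated CoNLL-U layout. The values below are invented; the real output depends on the parsed file.

# Illustration only: the ten CoNLL-U fields that make up a single token line.
example_token_line = "\t".join([
    "1",             # ID: token index within the sentence
    "cookies",       # FORM: the token as it appears in the text
    "cookie",        # LEMMA
    "NOUN",          # UPOS: universal part-of-speech tag
    "NNS",           # XPOS: language-specific tag
    "Number=Plur",   # FEATS: morphological features
    "2",             # HEAD: index of the syntactic head (0 for the root)
    "nsubj",         # DEPREL: dependency relation to the head
    "_",             # DEPS: enhanced dependencies (unused here)
    "_",             # MISC
])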
def main(): # Initialise English parser, already including the ConllFormatter as a pipeline component nlp = init_parser("spacy", "en") # Parse a given string doc = nlp( "A cookie is a baked or cooked food that is typically small, flat and sweet. It usually contains flour," " sugar and some type of oil or fat. It may include other ingredients such as raisins, oats, chocolate" " chips, nuts, etc." ) # Write the conll representation of each sentence to its own file # Note that .conll_pd is only present if pandas is installed for sent_idx, sent in enumerate(doc.sents, 1): sent._.conll_pd.to_csv(f"sentence-{sent_idx}.txt", index=False, sep="\t")
def main():
    # Initialise English parser, already including the ConllFormatter as a pipeline component.
    # Indicate that we want to get the CoNLL headers in the string output.
    # `use_gpu` and `verbose` are specific to stanza (and stanfordnlp). These keyword arguments
    # are passed onto their Pipeline() initialisation.
    nlp = init_parser(
        "stanza",
        "en",
        parser_opts={"use_gpu": True, "verbose": False},
        include_headers=True,
    )
    # Parse a given string
    doc = nlp(
        "A cookie is a baked or cooked food that is typically small, flat and sweet. It usually contains flour,"
        " sugar and some type of oil or fat. It may include other ingredients such as raisins, oats, chocolate"
        " chips, nuts, etc."
    )
    # Get the CoNLL representation of the whole document, including headers
    conll = doc._.conll_str
    print(conll)
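If per-sentence output is preferred over the whole-document string, a hedged variant of the last step could iterate the sentences instead; it relies only on the ._.conll_str extension already used in the other snippets.

def print_per_sentence(doc):
    # Print the CoNLL representation sentence by sentence instead of for the whole document
    for sent in doc.sents:
        print(sent._.conll_str)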
def parse(args: Namespace):
    if not args.input_str and not args.input_file:
        raise ValueError("'input_str' or 'input_file' must be given")

    nlp = init_parser(
        args.model_or_lang,
        args.parser,
        is_tokenized=args.is_tokenized,
        disable_sbd=args.disable_sbd,
        disable_pandas=True,
        include_headers=args.include_headers,
    )
    parser = ConllParser(nlp, is_tokenized=args.is_tokenized)

    if args.input_file:
        conll_str = parser.parse_file_as_conll(
            args.input_file,
            args.input_encoding,
            n_process=args.n_process,
            no_force_counting=args.no_force_counting,
            ignore_pipe_errors=args.ignore_pipe_errors,
            no_split_on_newline=args.no_split_on_newline,
        )
    else:
        conll_str = parser.parse_text_as_conll(
            args.input_str,
            n_process=args.n_process,
            no_force_counting=args.no_force_counting,
            ignore_pipe_errors=args.ignore_pipe_errors,
            no_split_on_newline=args.no_split_on_newline,
        )

    fhout = Path(args.output_file).open("w", encoding=args.output_encoding) if args.output_file is not None else stdout
    fhout.write(conll_str)

    if fhout is not stdout and args.verbose:
        # end='' to avoid adding yet another newline
        print(conll_str, end="")
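The function above only reads attributes from an argparse Namespace. A hypothetical, trimmed-down argument parser that supplies exactly those attributes might look like this; defaults and help texts in the real CLI may differ.

from argparse import ArgumentParser

def build_arg_parser() -> ArgumentParser:
    # Hypothetical sketch: only the attributes that parse(args) reads are defined here
    cparser = ArgumentParser(description="Parse text to CoNLL-U")
    cparser.add_argument("--input_file", default=None)
    cparser.add_argument("--input_str", default=None)
    cparser.add_argument("--input_encoding", default="utf-8")
    cparser.add_argument("--output_file", default=None)
    cparser.add_argument("--output_encoding", default="utf-8")
    cparser.add_argument("--parser", default="spacy")
    cparser.add_argument("--model_or_lang", default=None)
    cparser.add_argument("--is_tokenized", action="store_true")
    cparser.add_argument("--disable_sbd", action="store_true")
    cparser.add_argument("--include_headers", action="store_true")
    cparser.add_argument("--no_force_counting", action="store_true")
    cparser.add_argument("--ignore_pipe_errors", action="store_true")
    cparser.add_argument("--no_split_on_newline", action="store_true")
    cparser.add_argument("--n_process", type=int, default=1)
    cparser.add_argument("--verbose", action="store_true")
    return cparser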
def spacy_disabled_pandas(): return init_parser("en_core_web_sm", "spacy", disable_pandas=True)
def spacy_conversion_map(): return init_parser("en_core_web_sm", "spacy", conversion_maps={"lemma": {"-PRON-": "PRON"}})
def spacy_ext_names(): return init_parser("en_core_web_sm", "spacy", ext_names={"conll": "conllu", "conll_str": "conll_text", "conll_pd": "pandas"} )
def spacy_disabled_pandas():
    nlp = init_parser(disable_pandas=True)
    return nlp
def spacy_conversion_map():
    nlp = init_parser(conversion_maps={"lemma": {"-PRON-": "PRON"}})
    return nlp
def get_parser(name, **kwargs): if f"{name}-{kwargs}" not in PARSERS: PARSERS[f"{name}-{kwargs}"] = init_parser(name, **kwargs) return PARSERS[f"{name}-{kwargs}"]
def parse(
    input_file: Optional[str] = None,
    input_encoding: str = getpreferredencoding(),
    input_str: Optional[str] = None,
    is_tokenized: bool = False,
    output_file: Optional[str] = None,
    output_encoding: str = getpreferredencoding(),
    parser: str = "spacy",
    model_or_lang: Optional[str] = None,
    disable_sbd: bool = False,
    include_headers: bool = False,
    no_force_counting: bool = False,
    n_process: int = 1,
    verbose: bool = False,
):
    """Parse an input string or input file to CoNLL-U format

    :param input_file: path to file with sentences to parse. Has precedence over 'input_str'
    :param input_encoding: encoding of the input file. Default value is system default
    :param input_str: input string to parse
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated).
        Setting this option has different consequences for different parsers:
        - spaCy will simply not do any further tokenisation: the tokens are split on whitespace,
          and sentence segmentation still works as usual
        - stanfordnlp and stanza will not tokenize and, in addition, will only do sentence
          splitting on newlines. No additional sentence segmentation is done.
        - for UDPipe we also simply disable tokenisation and use whitespace-split tokens
          (works from 0.3.0 upwards). No further sentence segmentation is done.
    :param output_file: path to output file. If not specified, the output will be printed on standard output
    :param output_encoding: encoding of the output file. Default value is system default
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately.
        Valid options are 'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy wrappers
        of those libraries need to be installed, e.g. spacy-stanza.
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param disable_sbd: disables spaCy's automatic sentence boundary detection (only works for spaCy)
    :param include_headers: whether to include headers before the output of every sentence
    :param no_force_counting: whether to disable force counting the 'sent_id', starting from 1 and
        increasing for each sentence
    :param n_process: number of processes to use in nlp.pipe(). -1 will use as many cores as available
    :param verbose: whether to print the output to stdout, regardless of 'output_file'
    """
    if not input_str and not input_file:
        raise ValueError("'input_file' or 'input_str' must be given. Use parse-as-conll -h for help.")

    # disable_pandas to prevent multiprocessing issues
    nlp = init_parser(
        parser,
        model_or_lang,
        is_tokenized=is_tokenized,
        disable_sbd=disable_sbd,
        include_headers=include_headers,
        disable_pandas=True,
    )

    # Gather input: collect lines in 'lines', taking 'is_tokenized' into account
    lines = []
    if input_str:
        lines = [l.strip() for l in input_str.split("\n")]
    elif input_file:
        with Path(input_file).open(encoding=input_encoding) as fhin:
            lines = [l.strip() for l in fhin.readlines()]

    if is_tokenized:
        # UDPipe allows pretokenized text since version 0.3.0
        if parser == "spacy":
            lines = [l.split(" ") for l in lines]
        elif parser == "udpipe":
            if version.parse(
                pkg_resources.get_distribution("spacy_udpipe").version
            ) >= version.parse("0.3.0"):
                # UDPipe uses List[str] for presegmented text, and List[List[str]] for pretokenized text
                lines = [[l.split(" ") for l in lines]]
            else:
                logging.warning(
                    "UDPipe should have version 0.3.0 or higher when using '--is_tokenized'."
                    " Continuing with tokenizer."
                )

    # Write to output:
    # - if 'output_file' is given, write to that file (and, if 'verbose' is also given, to stdout)
    # - else write to stdout
    fhout = (
        Path(output_file).open("w", encoding=output_encoding)
        if output_file is not None
        else stdout
    )

    # 'n_process' argument is only supported from spaCy 2.2.2 onwards
    _nlpgen = None
    if version.parse(spacy.__version__) >= version.parse("2.2.2"):
        _nlpgen = nlp.pipe(lines, n_process=n_process)
    else:
        _nlpgen = nlp.pipe(lines)

    conll_idx = 0
    for doc_idx, doc in enumerate(_nlpgen):
        for sent in doc.sents:
            conll_idx += 1
            sent_as_conll = sent._.conll_str
            if include_headers and not no_force_counting:
                # nlp.pipe returns separate docs, meaning that the sentence indices generated by
                # ConllFormatter are not consecutive (they reset for each new doc).
                # We can do a regex replace to fix that, though.
                sent_as_conll = re.sub(SENT_ID_RE, str(conll_idx), sent_as_conll, 1)

            # Newline madness when writing to file and printing to stdout at the same time:
            # prepend an additional newline for all except the very first string.
            if not (doc_idx == 0 and sent.start == 0):
                sent_as_conll = "\n" + sent_as_conll

            fhout.write(sent_as_conll)
            if fhout is not stdout and verbose:
                # end='' to avoid adding yet another newline
                print(sent_as_conll, end="")

    # Only close the handle when we opened a file ourselves; do not close stdout
    if fhout is not stdout:
        fhout.close()
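A hedged usage sketch for parse() above: parse a literal string with the default spaCy backend and default English model, write the result to a file, and echo it to stdout. The output filename is arbitrary; all keyword arguments shown exist in the signature above.

if __name__ == "__main__":
    parse(
        input_str="This is a sentence. This is another one.",
        include_headers=True,
        output_file="out.conllu",
        verbose=True,
    )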