Example #1
def get_parser(name, **kwargs):
    model_or_lang = "en_core_web_sm" if name == "spacy" else "en"

    if f"{name}-{kwargs}" not in PARSERS:
        PARSERS[f"{name}-{kwargs}"] = init_parser(model_or_lang, name, **kwargs)

    return PARSERS[f"{name}-{kwargs}"]
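The snippet above presupposes a module-level PARSERS cache and an init_parser import; a minimal sketch of that setup and of the caching behaviour (the calls are illustrative and assume the en_core_web_sm model is installed):

from spacy_conll import init_parser

# Assumed module-level cache mapping "name-kwargs" keys to loaded pipelines
PARSERS = {}

# Repeated calls with identical keyword arguments reuse the cached pipeline
nlp_first = get_parser("spacy", include_headers=True)
nlp_again = get_parser("spacy", include_headers=True)
assert nlp_first is nlp_again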
Example #2
def spacy_ext_names():
    # Relies on init_parser's default parser and model (an English spaCy pipeline in older spacy_conll versions)
    nlp = init_parser(ext_names={
        "conll": "conllu",
        "conll_str": "conll_text",
        "conll_pd": "pandas"
    })
    return nlp
Example #3
def main():
    # Initialise English spaCy parser, already including the ConllFormatter as a pipeline component
    nlp = init_parser("en_core_web_sm", "spacy", include_headers=True)
    parser = ConllParser(nlp)
    # Path to a CoNLL-U test file
    path = Path(__file__).parent.parent / "tests" / "en_ewt-ud-dev.conllu-sample.txt"
    doc = parser.parse_conll_file_as_spacy(path, "utf-8")
    for sent_id, sent in enumerate(doc.sents, 1):
        print(sent._.conll_str)
        for word in sent:
            print(word, word.dep_)
        print()
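For reference, each sentence's conll_str is plain CoNLL-U text; with include_headers=True it should look roughly like the sketch below (the header values and token rows are illustrative, not actual parser output):

# sent_id = 1
# text = <original sentence text>
# ...followed by one tab-separated row per token with the ten CoNLL-U columns:
# ID  FORM  LEMMA  UPOS  XPOS  FEATS  HEAD  DEPREL  DEPS  MISC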
Example #4
def main():
    # Initialise English parser, already including the ConllFormatter as a pipeline component
    nlp = init_parser("spacy", "en")
    # Parse a given string
    doc = nlp(
        "A cookie is a baked or cooked food that is typically small, flat and sweet. It usually contains flour,"
        " sugar and some type of oil or fat. It may include other ingredients such as raisins, oats, chocolate"
        " chips, nuts, etc."
    )

    # Write the conll representation of each sentence to its own file
    # Note that .conll_pd is only present if pandas is installed
    for sent_idx, sent in enumerate(doc.sents, 1):
        sent._.conll_pd.to_csv(f"sentence-{sent_idx}.txt", index=False, sep="\t")
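Since ._.conll_pd is a regular pandas DataFrame, the files written above are ordinary tab-separated tables; a small sketch of reading one back (the file name follows the loop above):

import pandas as pd

# Load one of the per-sentence tables written by the loop above
df = pd.read_csv("sentence-1.txt", sep="\t")
print(df.head())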
Example #5
def main():
    # Initialise English parser, already including the ConllFormatter as a pipeline component.
    # Indicate that we want to get the CoNLL headers in the string output.
    # `use_gpu` and `verbose` are specific to stanza (and stanfordnlp). These keyword arguments
    # are passed on to the Pipeline() initialisation
    nlp = init_parser("stanza",
                      "en",
                      parser_opts={
                          "use_gpu": True,
                          "verbose": False
                      },
                      include_headers=True)
    # Parse a given string
    doc = nlp(
        "A cookie is a baked or cooked food that is typically small, flat and sweet. It usually contains flour,"
        " sugar and some type of oil or fat. It may include other ingredients such as raisins, oats, chocolate"
        " chips, nuts, etc.")

    # Get the CoNLL representation of the whole document, including headers
    conll = doc._.conll_str
    print(conll)
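The stanza backend assumes that the spacy-stanza wrapper is installed (see also the parser notes in the docstring of Example #13) and that the English stanza models have been downloaded; a one-off setup sketch:

import stanza

# Download the English models once before initialising the stanza-based parser
stanza.download("en")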
Example #6
def parse(args: Namespace):
    if not args.input_str and not args.input_file:
        raise ValueError("'input_str' or 'input_file' must be given")

    nlp = init_parser(
        args.model_or_lang,
        args.parser,
        is_tokenized=args.is_tokenized,
        disable_sbd=args.disable_sbd,
        disable_pandas=True,
        include_headers=args.include_headers,
    )

    parser = ConllParser(nlp, is_tokenized=args.is_tokenized)

    if args.input_file:
        conll_str = parser.parse_file_as_conll(
            args.input_file,
            args.input_encoding,
            n_process=args.n_process,
            no_force_counting=args.no_force_counting,
            ignore_pipe_errors=args.ignore_pipe_errors,
            no_split_on_newline=args.no_split_on_newline,
        )
    else:
        conll_str = parser.parse_text_as_conll(
            args.input_str,
            n_process=args.n_process,
            no_force_counting=args.no_force_counting,
            ignore_pipe_errors=args.ignore_pipe_errors,
            no_split_on_newline=args.no_split_on_newline,
        )

    fhout = Path(args.output_file).open("w", encoding=args.output_encoding) if args.output_file is not None else stdout
    fhout.write(conll_str)

    if fhout is not stdout and args.verbose:
        # end='' to avoid adding yet another newline
        print(conll_str, end="")
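Because parse() only relies on attribute access, it can also be driven outside a CLI with a hand-built Namespace; an illustrative call using exactly the attribute names the function reads (all values are examples):

from argparse import Namespace

parse(Namespace(
    input_str="A cookie is a baked or cooked food.",
    input_file=None,
    input_encoding="utf-8",
    output_file=None,
    output_encoding="utf-8",
    parser="spacy",
    model_or_lang="en_core_web_sm",
    is_tokenized=False,
    disable_sbd=False,
    include_headers=False,
    no_force_counting=False,
    ignore_pipe_errors=False,
    no_split_on_newline=False,
    n_process=1,
    verbose=False,
))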
Example #7
def spacy_disabled_pandas():
    # Pipeline with pandas integration disabled (no ._.conll_pd DataFrame output)
    return init_parser("en_core_web_sm", "spacy", disable_pandas=True)
Example #8
def spacy_conversion_map():
    return init_parser("en_core_web_sm", "spacy", conversion_maps={"lemma": {"-PRON-": "PRON"}})
Example #9
def spacy_ext_names():
    return init_parser("en_core_web_sm", "spacy",
                       ext_names={"conll": "conllu", "conll_str": "conll_text", "conll_pd": "pandas"}
                       )
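ext_names renames the custom extension attributes set by the ConllFormatter, so the output of this pipeline is read through the new names; a brief sketch:

nlp = spacy_ext_names()
doc = nlp("A short test sentence.")
# The default conll_str and conll_pd attributes are now exposed under the renamed extensions
print(doc._.conll_text)  # instead of doc._.conll_str
print(doc._.pandas)      # instead of doc._.conll_pd (only set if pandas is installed)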
Example #10
def spacy_disabled_pandas():
    nlp = init_parser(disable_pandas=True)
    return nlp
Example #11
def spacy_conversion_map():
    nlp = init_parser(conversion_maps={"lemma": {"-PRON-": "PRON"}})
    return nlp
Example #12
def get_parser(name, **kwargs):
    if f"{name}-{kwargs}" not in PARSERS:
        PARSERS[f"{name}-{kwargs}"] = init_parser(name, **kwargs)

    return PARSERS[f"{name}-{kwargs}"]
Example #13
def parse(
    input_file: Optional[str] = None,
    input_encoding: str = getpreferredencoding(),
    input_str: Optional[str] = None,
    is_tokenized: bool = False,
    output_file: Optional[str] = None,
    output_encoding: str = getpreferredencoding(),
    parser: str = "spacy",
    model_or_lang: Optional[str] = None,
    disable_sbd: bool = False,
    include_headers: bool = False,
    no_force_counting: bool = False,
    n_process: int = 1,
    verbose: bool = False,
):
    """ Parse an input string or input file to CoNLL-U format

        :param input_file: path to file with sentences to parse (used only when 'input_str' is not given)
        :param input_encoding: encoding of the input file. Default value is system default
        :param input_str: input string to parse
        :param is_tokenized: indicates whether your text has already been tokenized (space-separated). Setting this
               option has different consequences for different parsers:
               - spaCy will not do any further tokenisation: the tokens are split on whitespace, and sentence
                 segmentation still works as usual
               - stanfordnlp and Stanza will not tokenise and, in addition, will only split sentences on newlines.
                 No additional sentence segmentation is done.
               - For UDPipe we also disable tokenisation and use the whitespace-split tokens (works from 0.3.0
                 upwards). No further sentence segmentation is done.
        :param output_file: path to output file. If not specified, the output will be printed on standard output
        :param output_encoding: encoding of the output file. Default value is system default
        :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options
               are 'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need
               to be installed, e.g. spacy-stanza.
        :param model_or_lang: language model to use (must be installed). Defaults to an English model
        :param disable_sbd: disables spaCy's automatic sentence boundary detection (only works for spaCy)
        :param include_headers: to include headers before the output of every sentence
        :param no_force_counting: to disable the forced re-counting of 'sent_id', which otherwise starts at 1 and
               increases for each sentence
        :param n_process: number of processes to use in nlp.pipe(). -1 will use as many cores as available
        :param verbose: to print the output to stdout, regardless of 'output_file'
        """
    if not input_str and not input_file:
        raise ValueError(
            "'input_file' or 'input_str' must be given. Use parse-as-conll -h for help."
        )

    # disable_pandas to prevent multiprocessing issues
    nlp = init_parser(
        parser,
        model_or_lang,
        is_tokenized=is_tokenized,
        disable_sbd=disable_sbd,
        include_headers=include_headers,
        disable_pandas=True,
    )

    # Gather input:
    # Collect lines in 'lines' variable, taking into account 'is_tokenized'
    lines = []
    if input_str:
        lines = [l.strip() for l in input_str.split("\n")]
    elif input_file:
        with Path(input_file).open(encoding=input_encoding) as fhin:
            lines = [l.strip() for l in fhin.readlines()]

    if is_tokenized:
        # UDPipe allows pretokenized text since version 0.3.0
        if parser == "spacy":
            lines = [l.split(" ") for l in lines]
        elif parser == "udpipe":
            if version.parse(
                pkg_resources.get_distribution("spacy_udpipe").version
            ) >= version.parse("0.3.0"):
                # UDPipe uses List[str] for presegmented text, and List[List[str]] for pretokenized text
                lines = [[l.split(" ") for l in lines]]
            else:
                logging.warning(
                    "UDPipe should have version 0.3.0 or higher when using '--is_tokenized'."
                    " Continuing with tokenizer."
                )

    # Write to output:
    # If 'output_file' given, write to that file - if, also, 'verbose' is given, also write to stdout
    # Else write to stdout
    fhout = (
        Path(output_file).open("w", encoding=output_encoding)
        if output_file is not None
        else stdout
    )

    # 'n_process' argument is only supported from spaCy 2.2.2 onwards
    _nlpgen = None
    if version.parse(spacy.__version__) >= version.parse("2.2.2"):
        _nlpgen = nlp.pipe(lines, n_process=n_process)
    else:
        _nlpgen = nlp.pipe(lines)

    conll_idx = 0
    for doc_idx, doc in enumerate(_nlpgen):
        for sent in doc.sents:
            conll_idx += 1

            sent_as_conll = sent._.conll_str
            if include_headers and not no_force_counting:
                # nlp.pipe returns different docs, meaning that the generated sentence indices
                # by ConllFormatter are not consecutive (they reset for each new doc)
                # We can do a regex replace to fix that, though.
                sent_as_conll = re.sub(SENT_ID_RE, str(conll_idx), sent_as_conll, 1)

            # Newline madness dealing with writing to file and printing to stdout at the same time:
            # Prepend additional newline for all except the very first string.
            if not (doc_idx == 0 and sent.start == 0):
                sent_as_conll = "\n" + sent_as_conll

            fhout.write(sent_as_conll)
            if fhout is not stdout and verbose:
                # end='' to avoid adding yet another newline
                print(sent_as_conll, end="")

    fhout.close()
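Since the full signature is given above, parse() can also be called directly from Python rather than through the CLI; an illustrative call:

# Illustrative direct call: parse a short string with the default English spaCy model
# and write the CoNLL-U output (with headers) to a file
parse(
    input_str="A cookie is a baked or cooked food that is typically small, flat and sweet.",
    parser="spacy",
    include_headers=True,
    output_file="cookie.conllu",
)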