Example 1
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
    docs_idx = en_texts[0].index("docs")
    de_doc = de_tokenizer(de_text)
    expected = (True, None, None, None)
    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected
    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))

    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    m_doc = Doc.from_docs(en_docs)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    assert str(m_doc) == " ".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    with pytest.raises(AttributeError):
        # not callable, because it was not set via set_extension
        m_doc[2]._.is_ambiguous
    assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there

    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
    assert str(m_doc) == "".join(en_texts)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx

    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert str(m_doc) == " ".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
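The behavior the test above checks can be reproduced in a few lines outside the test suite. A minimal sketch, assuming a blank English pipeline instead of the tokenizer fixtures, showing that Doc.from_docs concatenates texts and that ensure_whitespace controls whether a space is inserted between docs that do not end in whitespace:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # plain tokenizer, no trained components needed
doc1 = nlp("Merging the docs is fun.")
doc2 = nlp("They don't think alike.")

# By default a space is inserted between docs whose text does not end in whitespace.
merged = Doc.from_docs([doc1, doc2])
assert merged.text == "Merging the docs is fun. They don't think alike."

# With ensure_whitespace=False the raw texts are concatenated as-is.
merged_raw = Doc.from_docs([doc1, doc2], ensure_whitespace=False)
assert merged_raw.text == "Merging the docs is fun.They don't think alike."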
Example 2
def prepare_data(
    params: Params,
    verbose: bool = True,
) -> Dict[str, Doc]:
    """
    return a single spacy doc for each age.

    warning: if corpus binary is not on disk already, it will be saved to disk.
    this means the corpus should never be modified - else, the binary will also contain unexpected modifications
    """

    # try loading transcripts from disk
    fn = params.corpus_name + '.spacy'
    bin_path = configs.Dirs.corpora / fn
    if bin_path.exists():
        doc_bin = DocBin().from_disk(bin_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
    # load raw transcripts + process them
    else:
        print(
            f'WARNING: Did not find binary file associated with {params.corpus_name}. Preprocessing corpus...'
        )
        transcripts = load_transcripts(params)
        docs: List[Doc] = [doc for doc in nlp.pipe(transcripts)]
        # WARNING: only save to disk if we know that corpus has not been modified
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(bin_path)

    # group docs by age
    ages = load_ages(params)
    if len(ages) != len(docs):
        raise RuntimeError(f'Mismatch: num docs={len(docs)} but num ages={len(ages)}')
    age2docs = {}
    for age in SortedSet(ages):
        if age == EXCLUDED_AGE:
            continue
        docs_at_age = [docs[n] for n, ai in enumerate(ages) if ai == age]
        age2docs[age] = docs_at_age
        if verbose:
            print(
                f'Processed {len(age2docs[age]):>6} transcripts for age={age}')

    # combine all documents at same age
    age2doc = {}
    for age, docs_at_age in age2docs.items():
        doc_combined = Doc.from_docs(docs_at_age)
        age2doc[age] = doc_combined
        print(f'Num tokens at age={age} is {len(doc_combined):,}')

    return age2doc
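The caching branch above relies on spaCy's DocBin serialization. A minimal sketch of that round trip in isolation; the pipeline and file path here are placeholders, not the ones prepare_data actually uses:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # placeholder pipeline; prepare_data uses its own global nlp
docs = list(nlp.pipe(["first transcript", "second transcript"]))

# Serialize the processed docs once...
DocBin(docs=docs).to_disk("corpus.spacy")  # placeholder path

# ...and restore them later without re-running the pipeline.
restored = list(DocBin().from_disk("corpus.spacy").get_docs(nlp.vocab))
assert [d.text for d in restored] == [d.text for d in docs]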
Example 3
def merge_docs(language=None, docs=None, docbin=None, text=None):
    # Copy the incoming list (and avoid a mutable default argument) so the
    # caller's list is never modified in place.
    docs = list(docs) if docs else []
    if docbin:
        assert language is not None
        model = settings.LANGUAGE_MODELS[language]
        docs = docs + list(docbin.get_docs(model.vocab))
    if text:
        docs.append(text_to_doc(text))
    if not docs:
        return -1, None
    # All docs must share the same language before they can be merged.
    if not all(docs[0].lang_ == doc.lang_ for doc in docs):
        return -2, None
    if len(docs) == 1:
        doc = docs[0]
    else:
        doc = Doc.from_docs(docs)
        _init_doc(doc)
    return 0, doc
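The docs=None default above replaces the original docs=[]. A short, plain-Python sketch of why a mutable default argument is risky in a function that appends to the list:

def buggy(items=[]):          # the default list is created once, at function definition
    items.append("x")
    return items

assert buggy() == ["x"]
assert buggy() == ["x", "x"]  # the same list leaks state across calls

def fixed(items=None):
    items = list(items) if items else []  # fresh list on every call
    items.append("x")
    return items

assert fixed() == ["x"]
assert fixed() == ["x"]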
Example 4
def make_doc_from_text_chunks(
    text: str,
    lang: types.LangLike,
    chunk_size: int = 100000,
) -> Doc:
    """
    Make a single spaCy-processed document from 1 or more chunks of ``text``.
    This is a workaround for processing very long texts, for which spaCy
    is unable to allocate enough RAM.

    Args:
        text: Text document to be chunked and processed by spaCy.
        lang: Language with which spaCy processes ``text``, represented as
            the full name of or path on disk to the pipeline, or
            an already instantiated pipeline instance.
        chunk_size: Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller).
            For best performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural-language segmentation, so spaCy's models may
               make mistakes around every ``chunk_size``-character boundary.

    Returns:
        A single processed document, built from concatenated text chunks.
    """
    utils.deprecated(
        "This function is deprecated, and will be removed in a future version. "
        "Instead, use the usual :func:`textacy.make_spacy_doc()` "
        "and specify a non-null `chunk_size`",
        action="once",
    )
    lang = resolve_langlike(lang)
    text_chunks = (text[i:i + chunk_size]
                   for i in range(0, len(text), chunk_size))
    docs = list(lang.pipe(text_chunks))
    return Doc.from_docs(docs)
Example 5
def _make_spacy_doc_from_text_chunks(text: str, lang: Language, chunk_size: int) -> Doc:
    text_chunks = (text[i : i + chunk_size] for i in range(0, len(text), chunk_size))
    return Doc.from_docs(list(lang.pipe(text_chunks)))
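Examples 4 and 5 implement the same chunking pattern. A minimal usage sketch, assuming a blank English pipeline rather than textacy's resolve_langlike machinery:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
long_text = "One sentence after another. " * 50_000  # stand-in for a very long document

# Slice by character count, pipe the chunks, then merge the per-chunk docs.
chunk_size = 100_000
chunks = (long_text[i:i + chunk_size] for i in range(0, len(long_text), chunk_size))
doc = Doc.from_docs(list(nlp.pipe(chunks)))
# Note: where a chunk boundary splits a word, ensure_whitespace (the default)
# inserts a space, which is the chunk-edge caveat from Example 4's docstring.
print(len(doc), "tokens from", len(long_text), "characters")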
Example 6
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = [
        "Merging the docs is fun.",
        "",
        "They don't think alike. ",
        "",
        "Another doc.",
    ]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
    en_docs[0].spans["group"] = [en_docs[0][1:4]]
    en_docs[2].spans["group"] = [en_docs[2][1:4]]
    en_docs[4].spans["group"] = [en_docs[4][0:1]]
    span_group_texts = sorted(
        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
    )
    de_doc = de_tokenizer(de_text)
    Token.set_extension("is_ambiguous", default=False)
    en_docs[0][2]._.is_ambiguous = True  # docs
    en_docs[2][3]._.is_ambiguous = True  # think
    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))

    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    m_doc = Doc.from_docs(en_docs)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[2]._.is_ambiguous is True
    assert m_doc[9].idx == think_idx
    assert m_doc[9]._.is_ambiguous is True
    assert not any([t._.is_ambiguous for t in m_doc[3:8]])
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
    assert bool(m_doc[11].whitespace_)

    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(m_doc.text) == sum(len(t) for t in en_texts)
    assert m_doc.text == "".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
    assert bool(m_doc[11].whitespace_)

    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    # can merge empty docs
    doc = Doc.from_docs([en_tokenizer("")] * 10)

    # empty but set spans keys are preserved
    en_docs = [en_tokenizer(text) for text in en_texts]
    m_doc = Doc.from_docs(en_docs)
    assert "group" not in m_doc.spans
    for doc in en_docs:
        doc.spans["group"] = []
    m_doc = Doc.from_docs(en_docs)
    assert "group" in m_doc.spans
    assert len(m_doc.spans["group"]) == 0
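This newer version of the test also checks that span groups and custom token extensions survive the merge. A minimal sketch of that behavior in recent spaCy versions, assuming a blank English pipeline and a hypothetical extension name:

import spacy
from spacy.tokens import Doc, Token

nlp = spacy.blank("en")
if not Token.has_extension("flagged"):  # hypothetical extension, registered once
    Token.set_extension("flagged", default=False)

doc1, doc2 = nlp("First doc here."), nlp("Second doc here.")
doc1[0]._.flagged = True
doc1.spans["group"] = [doc1[0:2]]
doc2.spans["group"] = [doc2[1:3]]

merged = Doc.from_docs([doc1, doc2])
assert merged[0]._.flagged is True  # extension values are carried over
assert sorted(s.text for s in merged.spans["group"]) == ["First doc", "doc here"]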
Example 7
    def parse_conll_text_as_spacy(
        self,
        text: str,
        ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
        ner_map: Dict[str, str] = None,
    ) -> Doc:
        """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence section must be separated by a new line (\n\n).
        Note that we do our best to retain as much information as possible but that not all CoNLL-U fields are
        supported in spaCy. We add a Token._.conll_misc_field extension to save CoNLL-U MISC field, and a
        Token._.conll_deps_graphs_field extension to save CoNLL-U DEPS field. The metadata (lines starting with #)
        is saved in Span._.conll_metadata of sentence Spans.

        This method has been adapted from the work by spaCy.
        See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

        Multi-word tokens and empty nodes are not supported.

        :param text: CoNLL-U formatted text
        :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
        :param ner_map: Map old NER tag names to new ones, '' maps to O
        :return: a spacy Doc containing all the tokens and sentences from the CoNLL file including
         the custom CoNLL extensions
        """
        if not Token.has_extension("conll_misc_field"):
            Token.set_extension("conll_misc_field", default="_")
        if not Token.has_extension("conll_deps_graphs_field"):
            Token.set_extension("conll_deps_graphs_field", default="_")
        if not Span.has_extension("conll_metadata"):
            Span.set_extension("conll_metadata", default=None)

        docs = []
        for chunk in text.split("\n\n"):
            lines = [
                l for l in chunk.splitlines() if l and not l.startswith("#")
            ]
            words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
            heads, deps, deps_graphs = [], [], []
            for i in range(len(lines)):
                line = lines[i]
                parts = line.split("\t")

                if any(not p for p in parts):
                    raise ValueError(
                        "According to the CoNLL-U Format, fields cannot be empty. See"
                        " https://universaldependencies.org/format.html")

                id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts

                if any(" " in f
                       for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                    raise ValueError(
                        "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                        " spaces. See https://universaldependencies.org/format.html"
                    )

                if "." in id_ or "-" in id_:
                    raise NotImplementedError(
                        "Multi-word tokens and empty nodes are not supported in spacy_conll"
                    )

                words.append(word)

                if "SpaceAfter=No" in misc:
                    spaces.append(False)
                else:
                    spaces.append(True)

                id_ = int(id_) - 1
                lemmas.append(lemma)
                poses.append(pos)
                tags.append(pos if tag == "_" else tag)
                morphs.append(morph if morph != "_" else "")
                heads.append((int(head) - 1) if head not in ("0",
                                                             "_") else id_)
                deps.append("ROOT" if dep == "root" else dep)
                deps_graphs.append(deps_graph)
                miscs.append(misc)

            doc = Doc(
                self.nlp.vocab,
                words=words,
                spaces=spaces,
                tags=tags,
                pos=poses,
                morphs=morphs,
                lemmas=lemmas,
                heads=heads,
                deps=deps,
            )

            # Set custom Token extensions
            for i in range(len(doc)):
                doc[i]._.conll_misc_field = miscs[i]
                doc[i]._.conll_deps_graphs_field = deps_graphs[i]

            ents = get_entities(lines, ner_tag_pattern, ner_map)
            doc.ents = spans_from_biluo_tags(doc, ents)

            # The DEPREL relations ensure that this CoNLL chunk is one sentence.
            # DEPREL therefore cannot be empty, or each word is considered a separate sentence.
            if len(list(doc.sents)) != 1:
                raise ValueError(
                    "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                    " requirements. See https://universaldependencies.org/format.html. Particularly make"
                    " sure that the DEPREL field is filled in.")

            # Save the metadata in a custom sentence Span attribute so that the formatter can use it
            metadata = "\n".join(
                [l for l in chunk.splitlines() if l.startswith("#")])
            # We really only expect one sentence
            for sent in doc.sents:
                sent._.conll_metadata = f"{metadata}\n" if metadata else ""

            docs.append(doc)

        # Add CoNLL custom extensions
        return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
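The per-sentence construction above uses the spaCy Doc constructor directly. A stripped-down sketch of that constructor with hand-written values in place of parsed CoNLL-U fields (the labels are illustrative only):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")

# heads are absolute token indices; a token whose head is itself is the root.
doc = Doc(
    nlp.vocab,
    words=["This", "is", "one", "sentence", "."],
    spaces=[True, True, True, False, False],
    heads=[1, 1, 3, 1, 1],
    deps=["nsubj", "ROOT", "nummod", "attr", "punct"],
)
assert len(list(doc.sents)) == 1  # the dependency heads define the sentence boundary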
Example 8
def pdf_reader(
    pdf_path: str,
    nlp: spacy.Language,
    pdf_parser: BaseParser = pdfminer.PdfminerParser,
    verbose: bool = False,
    **kwargs: Any,
) -> spacy.tokens.Doc:
    """Convert a PDF document to a spaCy Doc object.

    Args:
        pdf_path: Path to a PDF file.
        nlp: A spaCy Language object with a loaded pipeline. For example
            `spacy.load("en_core_web_sm")`.
        pdf_parser: The parser used to convert the PDF file to text. Read the
            docs for more details. Defaults to pdfminer.PdfminerParser.
        verbose: If True, details will be printed to the terminal. By default,
            False.
        **kwargs: Arbitrary keyword arguments.

    Returns:
        A spacy Doc object with the custom extensions.

    Examples:
        By default pdfminer is used to extract text from the PDF.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp)

        To be more explicit import `PdfminerParser` and pass it into the
        `pdf_reader` function.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pdfminer import PdfminerParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PdfminerParser)

        Alternative parsers can be used as well such as pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser)

        For more fine tuning you can pass in additional parameters to
        pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> params = {"nice": 1}
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser, **params)
    """
    if verbose:
        console.print(f"PDF to text engine: [blue bold]{pdf_parser.name}[/]...")

    pdf_path = os.path.normpath(pdf_path)
    num_pages = _get_number_of_pages(pdf_path)

    # Convert pdf to text.
    if verbose:
        console.print(f"Extracting text from {num_pages} pdf pages...")
    texts = []
    for page_num in range(1, num_pages + 1):
        parser = pdf_parser(pdf_path, page_num)
        text = parser.pdf_to_text(**kwargs)
        texts.append(text)

    # Convert text to spaCy Doc objects.
    if verbose:
        console.print("Converting text to [blue bold]spaCy[/] Doc...")

    docs = [doc for doc in nlp.pipe(texts)]
    for idx, doc in enumerate(docs):
        page_num = idx + 1
        for token in doc:
            token._.page_number = page_num

    combined_doc = Doc.from_docs(docs)
    combined_doc._.pdf_file_name = pdf_path

    if verbose:
        console.print(":white_check_mark: [green]Complete!")

    return combined_doc
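Once merged, the per-page extensions set above can be used to slice the combined document. A short usage sketch, reusing the pipeline and test PDF from the docstring examples and the extensions spacypdfreader registers:

import spacy
from spacypdfreader import pdf_reader

nlp = spacy.load("en_core_web_sm")
doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp)

# Every token remembers the page it came from, and the Doc keeps the file name.
print(doc._.pdf_file_name)
first_page_tokens = [t.text for t in doc if t._.page_number == 1]
print(len(first_page_tokens), "tokens on page 1")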