def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_texts = ["Merging the docs is fun.", "", "They don't think alike."] en_texts_without_empty = [t for t in en_texts if len(t)] de_text = "Wie war die Frage?" en_docs = [en_tokenizer(text) for text in en_texts] docs_idx = en_texts[0].index("docs") de_doc = de_tokenizer(de_text) expected = (True, None, None, None) en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected assert Doc.from_docs([]) is None assert de_doc is not Doc.from_docs([de_doc]) assert str(de_doc) == str(Doc.from_docs([de_doc])) with pytest.raises(ValueError): Doc.from_docs(en_docs + [de_doc]) m_doc = Doc.from_docs(en_docs) assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) assert str(m_doc) == " ".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") assert m_doc[9].idx == think_idx with pytest.raises(AttributeError): # not callable, because it was not set via set_extension m_doc[2]._.is_ambiguous assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(str(m_doc)) == sum(len(t) for t in en_texts) assert str(m_doc) == "".join(en_texts) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and not bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think") assert m_doc[9].idx == think_idx m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing assert str(m_doc) == " ".join(en_texts_without_empty) p_token = m_doc[len(en_docs[0]) - 1] assert p_token.text == "." and bool(p_token.whitespace_) en_docs_tokens = [t for doc in en_docs for t in doc] assert len(m_doc) == len(en_docs_tokens) think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think") assert m_doc[9].idx == think_idx
def prepare_data(
    params: Params,
    verbose: bool = True,
) -> Dict[str, Doc]:
    """
    Return a single spaCy doc for each age.

    Warning: if the corpus binary is not on disk already, it will be saved to disk.
    This means the corpus should never be modified - otherwise, the binary will also
    contain the unexpected modifications.
    """

    # try loading transcripts from disk
    fn = params.corpus_name + '.spacy'
    bin_path = configs.Dirs.corpora / fn
    if bin_path.exists():
        doc_bin = DocBin().from_disk(bin_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
    # load raw transcripts + process them
    else:
        print(
            f'WARNING: Did not find binary file associated with {params.corpus_name}. Preprocessing corpus...'
        )
        transcripts = load_transcripts(params)
        docs: List[Doc] = list(nlp.pipe(transcripts))
        # WARNING: only save to disk if we know that corpus has not been modified
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(bin_path)

    # group docs by age
    ages = load_ages(params)
    if len(ages) != len(docs):
        raise RuntimeError(f'Num docs={len(docs)} does not match num ages={len(ages)}')
    age2docs = {}
    for age in SortedSet(ages):
        if age == EXCLUDED_AGE:
            continue
        docs_at_age = [docs[n] for n, ai in enumerate(ages) if ai == age]
        age2docs[age] = docs_at_age
        if verbose:
            print(f'Processed {len(age2docs[age]):>6} transcripts for age={age}')

    # combine all documents at same age
    age2doc = {}
    for age, docs in age2docs.items():
        doc_combined = Doc.from_docs(docs)
        age2doc[age] = doc_combined
        print(f'Num tokens at age={age} is {len(doc_combined):,}')

    return age2doc
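# Minimal sketch of the DocBin caching round trip used above, with a hypothetical
# file name "corpus.spacy"; a real caller would use the project's configured path.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = list(nlp.pipe(["a first transcript", "a second transcript"]))

DocBin(docs=docs).to_disk("corpus.spacy")                       # cache processed docs
cached = list(DocBin().from_disk("corpus.spacy").get_docs(nlp.vocab))
assert [d.text for d in cached] == [d.text for d in docs]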
def merge_docs(language=None, docs=None, docbin=None, text=None):
    # copy to avoid mutating the caller's list (and avoid a mutable default argument)
    docs = list(docs) if docs else []
    if docbin:
        assert language is not None
        model = settings.LANGUAGE_MODELS[language]
        docs.extend(docbin.get_docs(model.vocab))
    if text:
        docs.append(text_to_doc(text))
    if not docs:
        return -1, None
    # Doc.from_docs requires that all docs share the same language
    if not all(doc.lang_ == docs[0].lang_ for doc in docs):
        return -2, None
    if len(docs) == 1:
        doc = docs[0]
    else:
        doc = Doc.from_docs(docs)
    _init_doc(doc)
    return 0, doc
def make_doc_from_text_chunks(
    text: str,
    lang: types.LangLike,
    chunk_size: int = 100000,
) -> Doc:
    """
    Make a single spaCy-processed document from 1 or more chunks of ``text``.
    This is a workaround for processing very long texts, for which spaCy
    is unable to allocate enough RAM.

    Args:
        text: Text document to be chunked and processed by spaCy.
        lang: Language with which spaCy processes ``text``, represented as
            the full name of or path on disk to the pipeline, or
            an already instantiated pipeline instance.
        chunk_size: Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller).
            For best performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural language segmentation, which means that
               every ``chunk_size`` characters, spaCy's models may make mistakes.

    Returns:
        A single processed document, built from concatenated text chunks.
    """
    utils.deprecated(
        "This function is deprecated, and will be removed in a future version. "
        "Instead, use the usual :func:`textacy.make_spacy_doc()` "
        "and specify a non-null `chunk_size`",
        action="once",
    )
    lang = resolve_langlike(lang)
    text_chunks = (text[i:i + chunk_size] for i in range(0, len(text), chunk_size))
    docs = list(lang.pipe(text_chunks))
    return Doc.from_docs(docs)
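# The replacement suggested by the deprecation message above: a sketch, assuming a
# textacy version whose make_spacy_doc() accepts a chunk_size keyword and that
# en_core_web_sm is installed.
import textacy

long_text = "A fairly long text. " * 10_000
doc = textacy.make_spacy_doc(long_text, lang="en_core_web_sm", chunk_size=100_000)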
def _make_spacy_doc_from_text_chunks(text: str, lang: Language, chunk_size: int) -> Doc:
    text_chunks = (text[i : i + chunk_size] for i in range(0, len(text), chunk_size))
    return Doc.from_docs(list(lang.pipe(text_chunks)))
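# Hypothetical usage of the chunking helper above, assuming a blank English
# pipeline; chunk boundaries fall on arbitrary characters, so Doc.from_docs may
# insert a space wherever a chunk ends mid-token.
import spacy

nlp = spacy.blank("en")
long_text = "This is a sentence. " * 10_000          # 200,000 characters
doc = _make_spacy_doc_from_text_chunks(long_text, nlp, chunk_size=100_000)
# Here 100,000 is a multiple of the 20-character sentence, so every chunk ends on
# a space and no extra whitespace is inserted between chunks.
assert doc.text == long_text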
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = [
        "Merging the docs is fun.",
        "",
        "They don't think alike. ",
        "",
        "Another doc.",
    ]
    en_texts_without_empty = [t for t in en_texts if len(t)]
    de_text = "Wie war die Frage?"
    en_docs = [en_tokenizer(text) for text in en_texts]
    en_docs[0].spans["group"] = [en_docs[0][1:4]]
    en_docs[2].spans["group"] = [en_docs[2][1:4]]
    en_docs[4].spans["group"] = [en_docs[4][0:1]]
    span_group_texts = sorted(
        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
    )
    de_doc = de_tokenizer(de_text)
    Token.set_extension("is_ambiguous", default=False)
    en_docs[0][2]._.is_ambiguous = True  # docs
    en_docs[2][3]._.is_ambiguous = True  # think

    assert Doc.from_docs([]) is None
    assert de_doc is not Doc.from_docs([de_doc])
    assert str(de_doc) == str(Doc.from_docs([de_doc]))

    with pytest.raises(ValueError):
        Doc.from_docs(en_docs + [de_doc])

    m_doc = Doc.from_docs(en_docs)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[2]._.is_ambiguous is True
    assert m_doc[9].idx == think_idx
    assert m_doc[9]._.is_ambiguous is True
    assert not any([t._.is_ambiguous for t in m_doc[3:8]])
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
    assert bool(m_doc[11].whitespace_)

    m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
    assert len(en_texts_without_empty) == len(list(m_doc.sents))
    assert len(m_doc.text) == sum(len(t) for t in en_texts)
    assert m_doc.text == "".join(en_texts_without_empty)
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and not bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
    assert bool(m_doc[11].whitespace_)

    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
    p_token = m_doc[len(en_docs[0]) - 1]
    assert p_token.text == "." and bool(p_token.whitespace_)
    en_docs_tokens = [t for doc in en_docs for t in doc]
    assert len(m_doc) == len(en_docs_tokens)
    think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
    assert m_doc[9].idx == think_idx
    assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])

    # can merge empty docs
    doc = Doc.from_docs([en_tokenizer("")] * 10)

    # empty but set spans keys are preserved
    en_docs = [en_tokenizer(text) for text in en_texts]
    m_doc = Doc.from_docs(en_docs)
    assert "group" not in m_doc.spans
    for doc in en_docs:
        doc.spans["group"] = []
    m_doc = Doc.from_docs(en_docs)
    assert "group" in m_doc.spans
    assert len(m_doc.spans["group"]) == 0
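# Sketch of the span-group behaviour checked by the test above, assuming a spaCy
# version recent enough that Doc.from_docs carries over doc.spans (which is what
# that test asserts); built on a blank English pipeline.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1, doc2 = nlp("Merging the docs is fun."), nlp("Another doc.")
doc1.spans["group"] = [doc1[1:3]]   # "the docs"
doc2.spans["group"] = [doc2[0:2]]   # "Another doc"

merged = Doc.from_docs([doc1, doc2])
assert sorted(s.text for s in merged.spans["group"]) == ["Another doc", "the docs"]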
def parse_conll_text_as_spacy(
    self,
    text: str,
    ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
    ner_map: Dict[str, str] = None,
) -> Doc:
    """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence sections must be
    separated by a new line (\n\n). Note that we do our best to retain as much information
    as possible, but not all CoNLL-U fields are supported in spaCy. We add a
    Token._.conll_misc_field extension to save the CoNLL-U MISC field, and a
    Token._.conll_deps_graphs_field extension to save the CoNLL-U DEPS field. The metadata
    (lines starting with #) is saved in Span._.conll_metadata of sentence Spans.

    This method has been adapted from the work by spaCy.
    See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179

    Multi-word tokens and empty nodes are not supported.

    :param text: CoNLL-U formatted text
    :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
    :param ner_map: Map old NER tag names to new ones, '' maps to O
    :return: a spaCy Doc containing all the tokens and sentences from the CoNLL file,
        including the custom CoNLL extensions
    """
    if not Token.has_extension("conll_misc_field"):
        Token.set_extension("conll_misc_field", default="_")
    if not Token.has_extension("conll_deps_graphs_field"):
        Token.set_extension("conll_deps_graphs_field", default="_")
    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)

    docs = []
    for chunk in text.split("\n\n"):
        lines = [l for l in chunk.splitlines() if l and not l.startswith("#")]
        words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
        heads, deps, deps_graphs = [], [], []
        for i in range(len(lines)):
            line = lines[i]
            parts = line.split("\t")
            if any(not p for p in parts):
                raise ValueError(
                    "According to the CoNLL-U Format, fields cannot be empty. See"
                    " https://universaldependencies.org/format.html"
                )
            id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts
            if any(" " in f for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                raise ValueError(
                    "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                    " spaces. See https://universaldependencies.org/format.html"
                )
            if "." in id_ or "-" in id_:
                raise NotImplementedError(
                    "Multi-word tokens and empty nodes are not supported in spacy_conll"
                )

            words.append(word)
            if "SpaceAfter=No" in misc:
                spaces.append(False)
            else:
                spaces.append(True)

            id_ = int(id_) - 1
            lemmas.append(lemma)
            poses.append(pos)
            tags.append(pos if tag == "_" else tag)
            morphs.append(morph if morph != "_" else "")
            heads.append((int(head) - 1) if head not in ("0", "_") else id_)
            deps.append("ROOT" if dep == "root" else dep)
            deps_graphs.append(deps_graph)
            miscs.append(misc)

        doc = Doc(
            self.nlp.vocab,
            words=words,
            spaces=spaces,
            tags=tags,
            pos=poses,
            morphs=morphs,
            lemmas=lemmas,
            heads=heads,
            deps=deps,
        )

        # Set custom Token extensions
        for i in range(len(doc)):
            doc[i]._.conll_misc_field = miscs[i]
            doc[i]._.conll_deps_graphs_field = deps_graphs[i]

        ents = get_entities(lines, ner_tag_pattern, ner_map)
        doc.ents = spans_from_biluo_tags(doc, ents)

        # The deprel relations ensure that this CoNLL chunk is one sentence.
        # Deprel therefore cannot be empty, or each word is considered a separate sentence.
        if len(list(doc.sents)) != 1:
            raise ValueError(
                "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                " requirements. See https://universaldependencies.org/format.html. Particularly make"
                " sure that the DEPREL field is filled in."
            )

        # Save the metadata in a custom sentence Span attribute so that the formatter can use it
        metadata = "\n".join([l for l in chunk.splitlines() if l.startswith("#")])
        # We really only expect one sentence
        for sent in doc.sents:
            sent._.conll_metadata = f"{metadata}\n" if metadata else ""

        docs.append(doc)

    # Add CoNLL custom extensions via the formatter pipe
    return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
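# Hedged usage sketch for the method above. It assumes the enclosing class is
# spacy_conll's ConllParser and that init_parser/ConllParser are available as in
# that package's README; treat those names and the model as assumptions.
from spacy_conll import init_parser
from spacy_conll.parser import ConllParser

parser = ConllParser(init_parser("en_core_web_sm", "spacy"))

# A minimal, valid CoNLL-U chunk: 10 tab-separated fields per token, "_" for unset fields.
conll_str = "\n".join([
    "# text = Hello world.",
    "\t".join(["1", "Hello", "hello", "INTJ", "UH", "_", "2", "discourse", "_", "_"]),
    "\t".join(["2", "world", "world", "NOUN", "NN", "_", "0", "root", "_", "SpaceAfter=No"]),
    "\t".join(["3", ".", ".", "PUNCT", ".", "_", "2", "punct", "_", "_"]),
])
doc = parser.parse_conll_text_as_spacy(conll_str)
assert [t.text for t in doc] == ["Hello", "world", "."]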
def pdf_reader(
    pdf_path: str,
    nlp: spacy.Language,
    pdf_parser: BaseParser = pdfminer.PdfminerParser,
    verbose: bool = False,
    **kwargs: Any,
) -> spacy.tokens.Doc:
    """Convert a PDF document to a spaCy Doc object.

    Args:
        pdf_path: Path to a PDF file.
        nlp: A spaCy Language object with a loaded pipeline. For example
            `spacy.load("en_core_web_sm")`.
        pdf_parser: The parser used to convert the PDF file to text. Read the docs
            for more details. Defaults to `pdfminer.PdfminerParser`.
        verbose: If True, details will be printed to the terminal. By default, False.
        **kwargs: Arbitrary keyword arguments.

    Returns:
        A spaCy Doc object with the custom extensions.

    Examples:
        By default pdfminer is used to extract text from the PDF.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp)

        To be more explicit, import `PdfminerParser` and pass it into the
        `pdf_reader` function.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pdfminer import PdfminerParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PdfminerParser)

        Alternative parsers can be used as well, such as pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser)

        For more fine tuning you can pass in additional parameters to pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> params = {"nice": 1}
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser, **params)
    """
    if verbose:
        console.print(f"PDF to text engine: [blue bold]{pdf_parser.name}[/]...")

    pdf_path = os.path.normpath(pdf_path)
    num_pages = _get_number_of_pages(pdf_path)

    # Convert pdf to text.
    if verbose:
        console.print(f"Extracting text from {num_pages} pdf pages...")
    texts = []
    for page_num in range(1, num_pages + 1):
        parser = pdf_parser(pdf_path, page_num)
        text = parser.pdf_to_text(**kwargs)
        texts.append(text)

    # Convert text to spaCy Doc objects.
    if verbose:
        console.print("Converting text to [blue bold]spaCy[/] Doc...")
    docs = [doc for doc in nlp.pipe(texts)]
    for idx, doc in enumerate(docs):
        page_num = idx + 1
        for token in doc:
            token._.page_number = page_num
    combined_doc = Doc.from_docs(docs)
    combined_doc._.pdf_file_name = pdf_path

    if verbose:
        console.print(":white_check_mark: [green]Complete!")

    return combined_doc