def __init__(self, lang, texts=None, docs=None, metadatas=None):
    """
    Initialize a corpus from a language (name or spaCy pipeline) and,
    optionally, an initial batch of documents.

    Args:
        lang (str or ``spacy.Language``): Language name (e.g. 'en') from
            which a spaCy pipeline is loaded, or an already-instantiated
            spaCy pipeline.
        texts (Iterable[str], optional): Raw texts to process and add.
        docs (Iterable[Doc], optional): Already-processed docs to add.
            Mutually exclusive with ``texts``.
        metadatas (Iterable[dict], optional): Per-document metadata,
            aligned 1:1 with ``texts`` or ``docs``.

    Raises:
        ValueError: If ``lang`` is neither a string nor a spaCy pipeline,
            or if both ``texts`` and ``docs`` are given.
    """
    if isinstance(lang, unicode_type):
        self.lang = lang
        self.spacy_lang = data.load_spacy(self.lang)
    elif isinstance(lang, SpacyLang):
        self.lang = lang.lang
        self.spacy_lang = lang
    else:
        msg = '`lang` must be {}, not "{}"'.format(
            {unicode_type, SpacyLang}, type(lang))
        raise ValueError(msg)
    self.spacy_vocab = self.spacy_lang.vocab
    self.spacy_stringstore = self.spacy_vocab.strings
    self.docs = []
    self.n_docs = 0
    self.n_tokens = 0
    # sentence counting requires a parser in the pipeline; None marks it
    # as unavailable rather than misleadingly reporting 0
    self.n_sents = 0 if self.spacy_lang.parser else None
    # BUG FIX: compare against None rather than relying on truthiness so
    # that passing *both* arguments is always rejected (previously an
    # empty iterable for one of them slipped past the guard), and an
    # explicitly-passed empty `texts`/`docs` is still honored below
    if texts is not None and docs is not None:
        msg = 'Corpus may be initialized with either `texts` or `docs`, but not both.'
        raise ValueError(msg)
    if texts is not None:
        self.add_texts(texts, metadatas=metadatas)
    elif docs is not None:
        if metadatas is not None:
            for doc, metadata in zip(docs, metadatas):
                self.add_doc(doc, metadata=metadata)
        else:
            for doc in docs:
                self.add_doc(doc)
def from_texts(cls, lang, texts, metadata=None, n_threads=2, batch_size=1000):
    """
    Convenience function for creating a
    :class:`TextCorpus <textacy.texts.TextCorpus>` from an iterable of
    text strings.

    Args:
        lang (str)
        texts (iterable(str))
        metadata (iterable(dict), optional)
        n_threads (int, optional)
        batch_size (int, optional)

    Returns:
        :class:`TextCorpus <textacy.texts.TextCorpus>`
    """
    corpus = cls(lang=lang)
    pipeline = corpus.spacy_pipeline
    processed = pipeline.pipe(
        texts, n_threads=n_threads, batch_size=batch_size)
    # with no metadata, every doc simply gets metadata=None
    if metadata is None:
        for sdoc in processed:
            corpus.add_doc(
                TextDoc(sdoc, lang=lang, spacy_pipeline=pipeline,
                        metadata=None))
    else:
        for sdoc, md in zip(processed, metadata):
            corpus.add_doc(
                TextDoc(sdoc, lang=lang, spacy_pipeline=pipeline,
                        metadata=md))
    return corpus
def add_texts(self, texts, metadatas=None, n_threads=4, batch_size=1000):
    """
    Process a stream of texts (and, optionally, a corresponding stream of
    metadata dicts) in parallel with spaCy, adding the results to the
    corpus as :class:`textacy.Doc <textacy.doc.Doc>` s.

    Args:
        texts (Iterable[str]): Stream of texts to add to corpus as ``Doc`` s
        metadatas (Iterable[dict]): Stream of dictionaries of relevant
            document metadata. **Note:** This stream must align exactly
            with ``texts``, or metadata will be mis-assigned to texts.
            More concretely, the first item in ``metadatas`` will be
            assigned to the first item in ``texts``, and so on from there.
        n_threads (int): Number of threads to use when processing ``texts``
            in parallel, if available.
        batch_size (int): Number of texts to process at a time.

    See Also:
        :func:`fileio.split_record_fields()`
        http://spacy.io/docs/#multi-threaded
    """
    stream = self.spacy_lang.pipe(
        texts, n_threads=n_threads, batch_size=batch_size)
    # guard-first: the no-metadata case is the simpler of the two
    if not metadatas:
        for sdoc in stream:
            self._add_textacy_doc(
                Doc(sdoc, lang=self.spacy_lang, metadata=None))
    else:
        for sdoc, meta in zip(stream, metadatas):
            self._add_textacy_doc(
                Doc(sdoc, lang=self.spacy_lang, metadata=meta))
def add_texts(self, texts, metadatas=None, n_threads=4, batch_size=1000):
    """
    Process a stream of texts (and, optionally, a corresponding stream of
    metadata dicts) in parallel with spaCy, adding the results to the
    corpus as :class:`textacy.Doc <textacy.doc.Doc>` s.

    Args:
        texts (Iterable[str]): Stream of texts to add to corpus as ``Doc`` s
        metadatas (Iterable[dict]): Stream of dictionaries of relevant
            document metadata. **Note:** This stream must align exactly
            with ``texts``, or metadata will be mis-assigned to texts.
            More concretely, the first item in ``metadatas`` will be
            assigned to the first item in ``texts``, and so on from there.
        n_threads (int): Number of threads to use when processing ``texts``
            in parallel, if available.
        batch_size (int): Number of texts to process at a time.

    See Also:
        :func:`fileio.split_record_fields()`
        http://spacy.io/docs/#multi-threaded
    """
    spacy_docs = self.spacy_lang.pipe(
        texts, n_threads=n_threads, batch_size=batch_size)
    # normalize into a single stream of (doc, metadata) pairs so the
    # add-loop below is written only once
    if metadatas:
        pairs = zip(spacy_docs, metadatas)
    else:
        pairs = ((sdoc, None) for sdoc in spacy_docs)
    for spacy_doc, metadata in pairs:
        self._add_textacy_doc(
            Doc(spacy_doc, lang=self.spacy_lang, metadata=metadata))
def load(cls, path, fname_prefix=None, compression=None):
    """
    Load serialized content and metadata from disk, and initialize
    a TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            prepended to standard filenames 'spacy_docs.bin' and
            'metadatas.json' when saving to disk
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # NOTE: docstring directive fixed from invalid `.. warn::` (which
    # Sphinx silently drops) to `.. warning::`
    if fname_prefix:
        info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    # map compression scheme to the suffix used when the file was saved;
    # None (or anything unrecognized) adds no suffix, matching the
    # original chained-conditional behavior
    meta_fname += {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}.get(compression, '')
    # on Python 2, compressed files must be read in binary mode
    meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this TextCorpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded TextCorpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                    lang=lang, metadata=metadata))
    return textcorpus
def load(cls, path, name=None, compression=None):
    """
    Load content and metadata from disk, and initialize a ``Corpus``.

    Args:
        path (str): Directory on disk where content + metadata are saved.
        name (str): Identifying/uniquifying name prepended to the default
            filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
            used when corpus was saved to disk via :meth:`Corpus.save()`.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file when saved, if any.

    Returns:
        :class:`textacy.Corpus <Corpus>`

    .. warning:: If the ``spacy.Vocab`` object used to save this document
        is not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # build the three expected filenames, with an optional uniquifying prefix
    prefix = name + '_' if name else ''
    info_fname = os.path.join(path, prefix + 'info.json')
    meta_fname = os.path.join(path, prefix + 'metadatas.json')
    docs_fname = os.path.join(path, prefix + 'spacy_docs.bin')
    # append the suffix that matches how the metadata file was compressed
    if compression == 'gzip':
        meta_fname += '.gz'
    elif compression == 'bz2':
        meta_fname += '.bz2'
    elif compression == 'lzma':
        meta_fname += '.xz'
    # compressed metadata must be read in binary mode on Python 2
    meta_mode = 'rb' if (PY2 is not False and compression is not None) else 'rt'
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this Corpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded Corpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    corpus = Corpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
def load(cls, path, fname_prefix=None, compression=None):
    """
    Load serialized content and metadata from disk, and initialize
    a TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            prepended to standard filenames 'spacy_docs.bin' and
            'metadatas.json' when saving to disk
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # filenames get the optional prefix joined with '_', mirroring save()
    if fname_prefix:
        info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    # the metadata file carries a suffix matching its compression scheme;
    # None (or unrecognized) means no suffix
    meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                               else '.bz2' if compression == 'bz2'
                               else '.xz' if compression == 'lzma'
                               else '')
    # on Python 2, compressed files must be opened in binary mode
    meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
    # info.json holds a single record describing the saved package
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    # spaCy version mismatch may invalidate the serialized vocab/docs,
    # so warn loudly but still attempt the load
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this TextCorpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded TextCorpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode,)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # metadata records are assumed to align 1:1 with the serialized docs
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                    lang=lang, metadata=metadata))
    return textcorpus
def load(cls, path, fname_prefix=None):
    """
    Load serialized content and metadata from disk, and initialize
    a TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            prepended to standard filenames 'spacy_docs.bin' and
            'metadatas.json' when saving to disk

    Returns:
        :class:`textacy.TextCorpus`
    """
    # resolve the three expected filenames, honoring the optional prefix
    if fname_prefix:
        info_fname = os.path.join(path, fname_prefix + '_info.json')
        meta_fname = os.path.join(path, fname_prefix + '_metadatas.json')
        docs_fname = os.path.join(path, fname_prefix + '_spacy_docs.bin')
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    # info.json contains one record describing the saved package
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    # warn (but proceed) when the installed spaCy differs from the one
    # used at save time, since the serialized vocab may be incompatible
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this TextCorpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded TextCorpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    for sdoc, meta in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(sdoc, spacy_pipeline=textcorpus.spacy_pipeline,
                    lang=lang, metadata=meta))
    return textcorpus
def from_texts(cls, lang_or_pipeline, texts, metadata=None, n_threads=2, batch_size=1000):
    """
    Convenience function for creating a
    :class:`TextCorpus <textacy.texts.TextCorpus>` from an iterable of
    text strings.

    Args:
        lang_or_pipeline ({'en', 'de'} or :class:`spacy.<lang>.<Language>`)
        texts (iterable(str))
        metadata (iterable(dict), optional)
        n_threads (int, optional)
        batch_size (int, optional)

    Returns:
        :class:`TextCorpus <textacy.texts.TextCorpus>`
    """
    corpus = cls(lang_or_pipeline)
    pipeline = corpus.spacy_pipeline
    stream = pipeline.pipe(texts, n_threads=n_threads, batch_size=batch_size)

    def _wrap(sdoc, md):
        # package a processed spaCy doc as a TextDoc for this corpus
        return TextDoc(sdoc, lang=corpus.lang, spacy_pipeline=pipeline,
                       metadata=md)

    if metadata is None:
        for sdoc in stream:
            corpus.add_doc(_wrap(sdoc, None))
    else:
        for sdoc, md in zip(stream, metadata):
            corpus.add_doc(_wrap(sdoc, md))
    return corpus