Example #1
0
    def load(cls, path, fname_prefix=None):
        """
        Read a TextDoc's serialized spaCy doc and its metadata from disk, and
        build a new instance from them.

        Args:
            path (str): directory on disk holding the saved content + metadata
            fname_prefix (str, optional): identifying string that was joined
                (with an underscore) onto the standard filenames 'spacy_doc.bin'
                and 'metadata.json' when saving to disk

        Returns:
            :class:`textacy.TextDoc`
        """
        meta_basename = 'metadata.json'
        docs_basename = 'spacy_doc.bin'
        if fname_prefix:
            meta_basename = '_'.join([fname_prefix, meta_basename])
            docs_basename = '_'.join([fname_prefix, docs_basename])
        meta_fname = os.path.join(path, meta_basename)
        docs_fname = os.path.join(path, docs_basename)

        # the language and spaCy version are stored alongside the metadata;
        # pop them so they aren't handed on as part of the doc's metadata
        metadata = list(fileio.read_json(meta_fname))[0]
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this TextDoc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextDoc may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        # only the first doc in the binary file is used
        vocab = data.load_spacy(lang).vocab
        spacy_doc = list(fileio.read_spacy_docs(vocab, docs_fname))[0]
        return cls(spacy_doc, lang=lang, metadata=metadata)
Example #2
0
 def test_read_write_json(self):
     # round trip: write one record per sentence to a JSON file, read the
     # file back with an empty prefix, and compare against what was written
     records = []
     for idx, sent in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sent.text})
     json_path = os.path.join(self.tempdir, 'test_read_write_json.json')
     fileio.write_json(records, json_path)
     # the first item yielded is expected to be the full list of records
     items = list(fileio.read_json(json_path, prefix=''))
     self.assertEqual(items[0], records)
Example #3
0
 def test_read_write_json_prefix(self):
     # write one record per sentence, then read back only each record's
     # 'sent' value by passing the 'item.sent' prefix to the reader
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     json_path = os.path.join(
         self.tempdir, 'test_read_write_json_prefix.json')
     fileio.write_json(records, json_path)
     sents_read = list(fileio.read_json(json_path, prefix='item.sent'))
     sents_written = [record['sent'] for record in records]
     self.assertEqual(sents_read, sents_written)
Example #4
0
    def load(cls, path, fname_prefix=None, compression=None):
        """
        Load serialized content and metadata from disk, and initialize a TextCorpus.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version), 'metadatas.json' (read as a stream of JSON lines, optionally
        compressed), and 'spacy_docs.bin' (the serialized spaCy docs).

        Args:
            path (str): directory on disk where content + metadata are saved
            fname_prefix (str, optional): additional identifying information
                prepended (joined by an underscore) to the standard filenames
                'info.json', 'spacy_docs.bin' and 'metadatas.json' when saving
                to disk
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                used to reduce size of metadatas json file; any other value
                leaves the filename without a compression extension

        Returns:
            :class:`textacy.TextCorpus`: instance of the class on which ``load``
            was called, holding one TextDoc per (spacy doc, metadata) pair read

        .. warning:: If the `spacy.Vocab` object used to save this corpus is not
            the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        if fname_prefix:
            info_fname = os.path.join(path,
                                      '_'.join([fname_prefix, 'info.json']))
            meta_fname = os.path.join(
                path, '_'.join([fname_prefix, 'metadatas.json']))
            docs_fname = os.path.join(
                path, '_'.join([fname_prefix, 'spacy_docs.bin']))
        else:
            info_fname = os.path.join(path, 'info.json')
            meta_fname = os.path.join(path, 'metadatas.json')
            docs_fname = os.path.join(path, 'spacy_docs.bin')
        # only the metadatas file carries a compression-specific extension
        meta_fname = meta_fname + ('.gz' if compression == 'gzip' else
                                   '.bz2' if compression == 'bz2' else
                                   '.xz' if compression == 'lzma' else '')
        # bytes mode is used only when PY2 is not False and compression is set
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)
        # instantiate through `cls` rather than a hard-coded class name, so
        # that calling load() on a subclass yields an instance of that subclass
        # (assumes this method is declared as a classmethod, as `cls` suggests)
        textcorpus = cls(lang)
        metadata_stream = fileio.read_json_lines(
            meta_fname,
            mode=meta_mode,
        )
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # NOTE: zip() stops at the shorter stream, so a mismatch between the
        # number of docs and metadata records is silently truncated
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            textcorpus.add_doc(
                TextDoc(spacy_doc,
                        spacy_pipeline=textcorpus.spacy_pipeline,
                        lang=lang,
                        metadata=metadata))
        return textcorpus
Example #5
0
 def test_read_write_json(self):
     # JSON round trip: what is read back from disk (using an empty prefix)
     # should equal the list of per-sentence records that was written
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     json_path = os.path.join(self.tempdir, 'test_read_write_json.json')
     fileio.write_json(records, json_path)
     items = list(fileio.read_json(json_path, prefix=''))
     first_item = items[0]
     self.assertEqual(first_item, records)
Example #6
0
 def test_read_write_json_prefix(self):
     # for each record field, write the records to disk and check that
     # reading with the 'item.<field>' prefix yields just that field's values
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     json_path = os.path.join(
         self.tempdir, 'test_read_write_json_prefix.json')
     for field in ('idx', 'sent'):
         # the file is rewritten on every pass, exactly as many times as
         # there are fields to check
         fileio.write_json(records, json_path, auto_make_dirs=True)
         values_read = list(
             fileio.read_json(json_path, prefix='item.' + field))
         values_written = [record[field] for record in records]
         self.assertEqual(values_read, values_written)
Example #7
0
    def load(cls, path, name=None, compression=None):
        """
        Load content and metadata from disk, and initialize a ``Corpus``.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version), 'metadatas.json' (read as a stream of JSON lines, optionally
        compressed), and 'spacy_docs.bin' (the serialized spaCy docs).

        Args:
            path (str): Directory on disk where content + metadata are saved.
            name (str): Identifying/uniquifying name prepended to the default
                filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
                used when corpus was saved to disk via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                used to reduce size of 'metadatas.json' file when saved, if any.
                Any other value leaves the filename without a compression
                extension.

        Returns:
            :class:`textacy.Corpus <Corpus>`: Instance of the class on which
            ``load`` was called, holding one ``Doc`` per (spacy doc, metadata)
            pair read from disk.

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        if name:
            info_fname = os.path.join(path, '_'.join([name, 'info.json']))
            meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
            docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
        else:
            info_fname = os.path.join(path, 'info.json')
            meta_fname = os.path.join(path, 'metadatas.json')
            docs_fname = os.path.join(path, 'spacy_docs.bin')
        # only the metadatas file carries a compression-specific extension
        meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                                   else '.bz2' if compression == 'bz2'
                                   else '.xz' if compression == 'lzma'
                                   else '')
        # bytes mode is used only when PY2 is not False and compression is set
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)
        # instantiate through `cls` rather than a hard-coded class name, so
        # that calling load() on a subclass yields an instance of that subclass
        # (assumes this method is declared as a classmethod, as `cls` suggests)
        corpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        # NOTE: zip() stops at the shorter stream, so a mismatch between the
        # number of docs and metadata records is silently truncated
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            corpus.add_doc(
                Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
        return corpus
Example #8
0
    def load(cls, path, name=None, compression=None):
        """
        Load content and metadata from disk, and initialize a ``Corpus``.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version), 'metadatas.json' (read as a stream of JSON lines, optionally
        compressed), and 'spacy_docs.bin' (the serialized spaCy docs).

        Args:
            path (str): Directory on disk where content + metadata are saved.
            name (str): Identifying/uniquifying name prepended to the default
                filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
                used when corpus was saved to disk via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                used to reduce size of 'metadatas.json' file when saved, if any.
                Any other value leaves the filename without a compression
                extension.

        Returns:
            :class:`textacy.Corpus <Corpus>`: Instance of the class on which
            ``load`` was called, holding one ``Doc`` per (spacy doc, metadata)
            pair read from disk.

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        if name:
            info_fname = os.path.join(path, '_'.join([name, 'info.json']))
            meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
            docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
        else:
            info_fname = os.path.join(path, 'info.json')
            meta_fname = os.path.join(path, 'metadatas.json')
            docs_fname = os.path.join(path, 'spacy_docs.bin')
        # only the metadatas file carries a compression-specific extension
        meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                                   else '.bz2' if compression == 'bz2'
                                   else '.xz' if compression == 'lzma'
                                   else '')
        # bytes mode is used only when PY2 is not False and compression is set
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)
        # instantiate through `cls` rather than a hard-coded class name, so
        # that calling load() on a subclass yields an instance of that subclass
        # (assumes this method is declared as a classmethod, as `cls` suggests)
        corpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        # NOTE: zip() stops at the shorter stream, so a mismatch between the
        # number of docs and metadata records is silently truncated
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            corpus.add_doc(
                Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
        return corpus
Example #9
0
    def load(cls, path, fname_prefix=None, compression=None):
        """
        Load serialized content and metadata from disk, and initialize a TextCorpus.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version), 'metadatas.json' (read as a stream of JSON lines, optionally
        compressed), and 'spacy_docs.bin' (the serialized spaCy docs).

        Args:
            path (str): directory on disk where content + metadata are saved
            fname_prefix (str, optional): additional identifying information
                prepended (joined by an underscore) to the standard filenames
                'info.json', 'spacy_docs.bin' and 'metadatas.json' when saving
                to disk
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                used to reduce size of metadatas json file; any other value
                leaves the filename without a compression extension

        Returns:
            :class:`textacy.TextCorpus`: instance of the class on which ``load``
            was called, holding one TextDoc per (spacy doc, metadata) pair read

        .. warning:: If the `spacy.Vocab` object used to save this corpus is not
            the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        if fname_prefix:
            info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
            meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
            docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
        else:
            info_fname = os.path.join(path, 'info.json')
            meta_fname = os.path.join(path, 'metadatas.json')
            docs_fname = os.path.join(path, 'spacy_docs.bin')
        # only the metadatas file carries a compression-specific extension
        meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                                   else '.bz2' if compression == 'bz2'
                                   else '.xz' if compression == 'lzma'
                                   else '')
        # bytes mode is used only when PY2 is not False and compression is set
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)
        # instantiate through `cls` rather than a hard-coded class name, so
        # that calling load() on a subclass yields an instance of that subclass
        # (assumes this method is declared as a classmethod, as `cls` suggests)
        textcorpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # NOTE: zip() stops at the shorter stream, so a mismatch between the
        # number of docs and metadata records is silently truncated
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            textcorpus.add_doc(
                TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                        lang=lang, metadata=metadata))
        return textcorpus
Example #10
0
 def test_read_write_json_unicode(self):
     # text-mode JSON round trip for a plain file and each compressed
     # variant; when PY2 is True, opening a compressed file in 'wt' mode is
     # expected to raise ValueError instead
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         json_path = os.path.join(
             self.tempdir, 'test_read_write_json_unicode' + ext)
         if PY2 is True and ext != '.json':
             with self.assertRaises(ValueError):
                 fileio.open_sesame(json_path, 'wt', None, True)
             continue
         fileio.write_json(
             records, json_path, mode='wt', auto_make_dirs=True)
         items = list(fileio.read_json(json_path, mode='rt', prefix=''))
         self.assertEqual(items[0], records)
Example #11
0
 def test_read_write_json_unicode(self):
     # text-mode JSON round trip for a plain file and each compressed
     # variant; when PY2 is True, opening a compressed file in 'wt' mode is
     # expected to raise ValueError instead
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         json_path = os.path.join(
             self.tempdir, 'test_read_write_json_unicode' + ext)
         if PY2 is not True or ext == '.json':
             fileio.write_json(
                 records, json_path, mode='wt', auto_make_dirs=True)
             items = list(
                 fileio.read_json(json_path, mode='rt', prefix=''))
             self.assertEqual(items[0], records)
         else:
             with self.assertRaises(ValueError):
                 fileio.open_sesame(json_path, 'wt', None, True)
Example #12
0
 def test_read_write_json_bytes(self):
     # bytes-mode JSON round trip for a plain file and each compressed
     # variant when PY2 is True; otherwise writing JSON in 'wb' mode is
     # expected to raise TypeError
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         json_path = os.path.join(
             self.tempdir, 'test_read_write_json_bytes' + ext)
         if PY2 is not True:
             with self.assertRaises(TypeError):
                 fileio.write_json(
                     records, json_path, 'wb', auto_make_dirs=True)
             continue
         fileio.write_json(
             records, json_path, mode='wb', auto_make_dirs=True)
         items = list(fileio.read_json(json_path, mode='rb', prefix=''))
         self.assertEqual(items[0], records)
Example #13
0
    def load(cls, path, fname_prefix=None):
        """
        Load serialized content and metadata from disk, and initialize a TextCorpus.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version), 'metadatas.json' (read as a stream of JSON lines), and
        'spacy_docs.bin' (the serialized spaCy docs).

        Args:
            path (str): directory on disk where content + metadata are saved
            fname_prefix (str, optional): additional identifying information
                prepended (joined by an underscore) to the standard filenames
                'info.json', 'spacy_docs.bin' and 'metadatas.json' when saving
                to disk

        Returns:
            :class:`textacy.TextCorpus`: instance of the class on which ``load``
            was called, holding one TextDoc per (spacy doc, metadata) pair read
        """
        if fname_prefix:
            info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
            meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
            docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
        else:
            info_fname = os.path.join(path, 'info.json')
            meta_fname = os.path.join(path, 'metadatas.json')
            docs_fname = os.path.join(path, 'spacy_docs.bin')
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)
        # instantiate through `cls` rather than a hard-coded class name, so
        # that calling load() on a subclass yields an instance of that subclass
        # (assumes this method is declared as a classmethod, as `cls` suggests)
        textcorpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # NOTE: zip() stops at the shorter stream, so a mismatch between the
        # number of docs and metadata records is silently truncated
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            textcorpus.add_doc(
                TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                        lang=lang, metadata=metadata))
        return textcorpus
Example #14
0
    def load(cls, path, fname_prefix=None):
        """
        Read a TextDoc's serialized spaCy doc and its metadata from disk, and
        build a new instance from them.

        Args:
            path (str): directory on disk holding the saved content + metadata
            fname_prefix (str, optional): identifying string that was joined
                (with an underscore) onto the standard filenames 'spacy_doc.bin'
                and 'metadata.json' when saving to disk

        Returns:
            :class:`textacy.TextDoc`

        .. warning:: Loading with a `spacy.Vocab` object other than the one
            used when saving will cause problems! This makes the functionality
            suitable for short-term storage only, not long-term storage.
        """
        meta_basename = 'metadata.json'
        docs_basename = 'spacy_doc.bin'
        if fname_prefix:
            meta_basename = '_'.join([fname_prefix, meta_basename])
            docs_basename = '_'.join([fname_prefix, docs_basename])
        meta_fname = os.path.join(path, meta_basename)
        docs_fname = os.path.join(path, docs_basename)

        # the language and spaCy version are stored alongside the metadata;
        # pop them so they aren't handed on as part of the doc's metadata
        metadata = list(fileio.read_json(meta_fname))[0]
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this TextDoc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextDoc may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        # only the first doc in the binary file is used
        vocab = data.load_spacy(lang).vocab
        spacy_doc = list(fileio.read_spacy_docs(vocab, docs_fname))[0]
        return cls(spacy_doc, lang=lang, metadata=metadata)
Example #15
0
    def load(cls, path, name=None):
        """
        Read a ``Doc``'s serialized spaCy doc and its metadata from disk, and
        build a new instance from them.

        Args:
            path (str): Directory on disk holding the saved content and metadata.
            name (str): Identifying/uniquifying name that was joined (with an
                underscore) onto the default filenames 'spacy_doc.bin' and
                'metadata.json' when the doc was saved via :meth:`Doc.save()`.

        Returns:
            :class:`textacy.Doc <Doc>`

        .. warning:: Loading with a ``spacy.Vocab`` object other than the one
            used when saving will cause problems! This makes the functionality
            suitable for short-term storage only, not long-term storage.
        """
        meta_basename = 'metadata.json'
        docs_basename = 'spacy_doc.bin'
        if name:
            meta_basename = '_'.join([name, meta_basename])
            docs_basename = '_'.join([name, docs_basename])
        meta_fname = os.path.join(path, meta_basename)
        docs_fname = os.path.join(path, docs_basename)

        # the language and spaCy version are stored alongside the metadata;
        # pop them so they aren't handed on as part of the doc's metadata
        metadata = list(fileio.read_json(meta_fname))[0]
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this Doc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Doc may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        # only the first doc in the binary file is used
        vocab = data.load_spacy(lang).vocab
        spacy_doc = list(fileio.read_spacy_docs(vocab, docs_fname))[0]
        return cls(spacy_doc, lang=lang, metadata=metadata)
Example #16
0
    def load(cls, path, name=None):
        """
        Read a ``Doc``'s serialized spaCy doc and its metadata from disk, and
        build a new instance from them.

        Args:
            path (str): Directory on disk holding the saved content and metadata.
            name (str): Identifying/uniquifying name that was joined (with an
                underscore) onto the default filenames 'spacy_doc.bin' and
                'metadata.json' when the doc was saved via :meth:`Doc.save()`.

        Returns:
            :class:`textacy.Doc <Doc>`

        .. warning:: Loading with a ``spacy.Vocab`` object other than the one
            used when saving will cause problems! This makes the functionality
            suitable for short-term storage only, not long-term storage.
        """
        meta_basename = 'metadata.json'
        docs_basename = 'spacy_doc.bin'
        if name:
            meta_basename = '_'.join([name, meta_basename])
            docs_basename = '_'.join([name, docs_basename])
        meta_fname = os.path.join(path, meta_basename)
        docs_fname = os.path.join(path, docs_basename)

        # the language and spaCy version are stored alongside the metadata;
        # pop them so they aren't handed on as part of the doc's metadata
        metadata = list(fileio.read_json(meta_fname))[0]
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this Doc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Doc may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        # only the first doc in the binary file is used
        vocab = data.load_spacy(lang).vocab
        spacy_doc = list(fileio.read_spacy_docs(vocab, docs_fname))[0]
        return cls(spacy_doc, lang=lang, metadata=metadata)
Example #17
0
 def test_read_write_json_bytes(self):
     # bytes-mode JSON handling for a plain file and each compressed variant:
     # - is_python2 not True: writing in 'wb' mode is expected to raise
     #   TypeError
     # - is_python2 True, '.json.xz': opening the file is expected to raise
     #   ValueError
     # - is_python2 True, anything else: a write/read round trip is checked
     records = [{'idx': idx, 'sent': sent.text}
                for idx, sent in enumerate(self.spacy_doc.sents)]
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         json_path = os.path.join(
             self.tempdir, 'test_read_write_json_bytes' + ext)
         if is_python2 is not True:
             with self.assertRaises(TypeError):
                 fileio.write_json(
                     records, json_path, 'wb', auto_make_dirs=True)
         elif ext == '.json.xz':
             with self.assertRaises(ValueError):
                 fileio.open_sesame(json_path, 'wb', 'utf-8', True)
         else:
             fileio.write_json(
                 records, json_path, mode='wb', auto_make_dirs=True)
             items = list(
                 fileio.read_json(json_path, mode='rb', prefix=''))
             self.assertEqual(items[0], records)