def load(cls, path, fname_prefix=None):
    """
    Rebuild a TextDoc from the content and metadata files saved on disk.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    metadata file differs from the currently installed one.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): identifying information that is joined
            with an underscore in front of the standard filenames
            'spacy_doc.bin' and 'metadata.json'

    Returns:
        :class:`textacy.TextDoc`
    """
    # standard filenames, optionally prefixed, then anchored under ``path``
    basenames = ['metadata.json', 'spacy_doc.bin']
    if fname_prefix:
        basenames = ['_'.join([fname_prefix, base]) for base in basenames]
    meta_fname, docs_fname = [os.path.join(path, base) for base in basenames]

    # the metadata file also carries the language and spaCy version used at
    # save time; both are popped so they don't end up in the doc's metadata
    metadata = list(fileio.read_json(meta_fname))[0]
    lang = metadata.pop('textacy_lang')
    saved_version = metadata.pop('spacy_version')
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        template = (
            " the spaCy version used to save this TextDoc to disk is not "
            "the same as the version currently installed ('{}' vs. '{}'); "
            "if the data underlying the associated `spacy.Vocab` has "
            "changed, this loaded TextDoc may not be valid! "
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    vocab = data.load_spacy(lang).vocab
    loaded_docs = list(fileio.read_spacy_docs(vocab, docs_fname))
    return cls(loaded_docs[0], lang=lang, metadata=metadata)
def test_read_write_json(self):
    """
    Write one ``{'idx', 'sent'}`` dict per sentence with
    ``fileio.write_json``, read the file back with an empty prefix, and
    check that the first item read equals the list that was written.
    """
    expected = [
        {'idx': position, 'sent': sentence.text}
        for position, sentence in enumerate(self.spacy_doc.sents)
    ]
    filename = os.path.join(self.tempdir, 'test_read_write_json.json')
    fileio.write_json(expected, filename)
    items_read = list(fileio.read_json(filename, prefix=''))
    observed = items_read[0]
    self.assertEqual(observed, expected)
def test_read_write_json_prefix(self):
    """
    Write one ``{'idx', 'sent'}`` dict per sentence as JSON, read the file
    back with the prefix 'item.sent', and check that the values read equal
    the 'sent' values of the written dicts, in the same order.
    """
    to_write = [
        {'idx': position, 'sent': sentence.text}
        for position, sentence in enumerate(self.spacy_doc.sents)
    ]
    filename = os.path.join(
        self.tempdir, 'test_read_write_json_prefix.json')
    fileio.write_json(to_write, filename)
    observed = list(fileio.read_json(filename, prefix='item.sent'))
    expected = [record['sent'] for record in to_write]
    self.assertEqual(observed, expected)
def load(cls, path, fname_prefix=None, compression=None):
    """
    Rebuild a TextCorpus from the content and metadata files saved on disk.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    info file differs from the currently installed one.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): identifying information that is joined
            with an underscore in front of the standard filenames
            'info.json', 'metadatas.json' and 'spacy_docs.bin'
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used on the metadatas json file; selects the '.gz', '.bz2' or
            '.xz' extension appended to its filename (any other value
            appends nothing)

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # standard filenames, optionally prefixed, then anchored under ``path``
    basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
    if fname_prefix:
        basenames = ['_'.join([fname_prefix, base]) for base in basenames]
    info_fname, meta_fname, docs_fname = [
        os.path.join(path, base) for base in basenames]

    # only the metadatas file is compressed; pick the matching extension
    if compression == 'gzip':
        extension = '.gz'
    elif compression == 'bz2':
        extension = '.bz2'
    elif compression == 'lzma':
        extension = '.xz'
    else:
        extension = ''
    meta_fname = meta_fname + extension

    # binary mode is used only for compressed files when PY2 is not False
    if PY2 is False or compression is None:
        meta_mode = 'rt'
    else:
        meta_mode = 'rb'

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    saved_version = package_info['spacy_version']
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        template = (
            " the spaCy version used to save this TextCorpus to disk is "
            "not the same as the version currently installed "
            "('{}' vs. '{}'); if the data underlying the associated "
            "`spacy.Vocab` has changed, this loaded TextCorpus may not be "
            "valid! \n"
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # docs and metadata are paired positionally as they are read
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textdoc = TextDoc(
            spacy_doc,
            spacy_pipeline=textcorpus.spacy_pipeline,
            lang=lang,
            metadata=metadata)
        textcorpus.add_doc(textdoc)
    return textcorpus
def test_read_write_json(self):
    """
    Round-trip a list of per-sentence dicts through ``fileio.write_json``
    and ``fileio.read_json`` (empty prefix); the first item read back must
    equal the list that was written.
    """
    sentences = enumerate(self.spacy_doc.sents)
    expected = [{'idx': i, 'sent': sent.text} for i, sent in sentences]
    filename = os.path.join(self.tempdir, 'test_read_write_json.json')

    fileio.write_json(expected, filename)
    read_back = list(fileio.read_json(filename, prefix=''))

    self.assertEqual(read_back[0], expected)
def test_read_write_json_prefix(self):
    """
    Write one ``{'idx', 'sent'}`` dict per sentence as JSON, then for each
    of the keys 'idx' and 'sent' read the file back with the prefix
    'item.<key>' and check that the values read equal that key's values in
    the written dicts, in the same order.
    """
    to_write = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    # The file's name and content do not depend on the prefix being read,
    # so it is written once up front instead of on every loop iteration.
    filename = os.path.join(
        self.tempdir, 'test_read_write_json_prefix.json')
    fileio.write_json(to_write, filename, auto_make_dirs=True)
    for prefix in ('idx', 'sent'):
        expected = [item[prefix] for item in to_write]
        observed = list(fileio.read_json(filename, prefix='item.' + prefix))
        self.assertEqual(observed, expected)
def load(cls, path, name=None, compression=None):
    """
    Read content and metadata from disk, and build a ``Corpus`` from them.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    info file differs from the currently installed one.

    Args:
        path (str): Directory on disk where content + metadata are saved.
        name (str): Identifying/uniquifying name joined with an underscore
            in front of the default filenames 'spacy_docs.bin',
            'metadatas.json', and 'info.json', used when corpus was saved to
            disk via :meth:`Corpus.save()`.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file when saved, if any.
            Selects the '.gz', '.bz2' or '.xz' extension appended to that
            filename; any other value appends nothing.

    Returns:
        :class:`textacy.Corpus <Corpus>`

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # default filenames, optionally prefixed, then anchored under ``path``
    basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
    if name:
        basenames = ['_'.join([name, base]) for base in basenames]
    info_fname, meta_fname, docs_fname = [
        os.path.join(path, base) for base in basenames]

    # only the metadatas file is compressed; pick the matching extension
    if compression == 'gzip':
        extension = '.gz'
    elif compression == 'bz2':
        extension = '.bz2'
    elif compression == 'lzma':
        extension = '.xz'
    else:
        extension = ''
    meta_fname = meta_fname + extension

    # binary mode is used only for compressed files when PY2 is not False
    if PY2 is False or compression is None:
        meta_mode = 'rt'
    else:
        meta_mode = 'rb'

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    saved_version = package_info['spacy_version']
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        template = (
            " the spaCy version used to save this Corpus to disk is not "
            "the same as the version currently installed ('{}' vs. '{}'); "
            "if the data underlying the associated `spacy.Vocab` has "
            "changed, this loaded Corpus may not be valid! \n"
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    corpus = Corpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
    # docs and metadata are paired positionally as they are read
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        doc = Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)
        corpus.add_doc(doc)
    return corpus
def load(cls, path, fname_prefix=None, compression=None):
    """
    Load serialized content and metadata from disk, and use them to
    populate a new TextCorpus.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    info file differs from the currently installed one.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            joined with an underscore in front of the standard filenames
            'info.json', 'metadatas.json' and 'spacy_docs.bin'
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file; determines the
            extension ('.gz', '.bz2', '.xz') appended to its filename, with
            nothing appended for any other value

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _full_path(basename):
        # prepend the optional prefix, then anchor the name under ``path``
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    info_fname = _full_path('info.json')
    meta_fname = _full_path('metadatas.json')
    docs_fname = _full_path('spacy_docs.bin')

    # first matching codec decides the metadatas file extension
    suffix = ''
    for codec, codec_suffix in (
            ('gzip', '.gz'), ('bz2', '.bz2'), ('lzma', '.xz')):
        if compression == codec:
            suffix = codec_suffix
            break
    meta_fname = meta_fname + suffix

    # binary mode only when PY2 is not False and the file is compressed
    compressed_on_py2 = PY2 is not False and compression is not None
    meta_mode = 'rb' if compressed_on_py2 else 'rt'

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    current_version = spacy.about.__version__
    if spacy_version != current_version:
        msg = (
            " the spaCy version used to save this TextCorpus to disk is "
            "not the same as the version currently installed "
            "('{}' vs. '{}'); if the data underlying the associated "
            "`spacy.Vocab` has changed, this loaded TextCorpus may not be "
            "valid! \n"
        ).format(spacy_version, current_version)
        warnings.warn(msg, UserWarning)

    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # each saved spaCy doc is matched with the metadata read alongside it
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(
                spacy_doc,
                spacy_pipeline=textcorpus.spacy_pipeline,
                lang=lang,
                metadata=metadata,
            )
        )
    return textcorpus
def test_read_write_json_unicode(self):
    """
    Round-trip per-sentence dicts through ``fileio.write_json`` /
    ``fileio.read_json`` in text mode ('wt' / 'rt') for plain, gzip, bz2
    and xz filenames.

    When ``PY2`` is True and the extension is not plain '.json', the test
    instead expects ``fileio.open_sesame`` called in 'wt' mode to raise
    ``ValueError``.
    """
    expected = [
        {'idx': position, 'sent': sentence.text}
        for position, sentence in enumerate(self.spacy_doc.sents)
    ]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_unicode' + ext)

        if PY2 is True and ext != '.json':
            self.assertRaises(
                ValueError,
                fileio.open_sesame, filename, 'wt', None, True)
            continue

        fileio.write_json(
            expected, filename, mode='wt', auto_make_dirs=True)
        items_read = list(fileio.read_json(filename, mode='rt', prefix=''))
        self.assertEqual(items_read[0], expected)
def test_read_write_json_bytes(self):
    """
    Exercise ``fileio.write_json`` / ``fileio.read_json`` in bytes mode
    ('wb' / 'rb') for plain, gzip, bz2 and xz filenames.

    When ``PY2`` is True the data must round-trip unchanged; otherwise
    writing in 'wb' mode is expected to raise ``TypeError``.
    """
    expected = [
        {'idx': position, 'sent': sentence.text}
        for position, sentence in enumerate(self.spacy_doc.sents)
    ]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_bytes' + ext)

        if PY2 is not True:
            # assertRaises calls write_json with these arguments itself
            self.assertRaises(
                TypeError,
                fileio.write_json,
                expected, filename, 'wb', auto_make_dirs=True)
            continue

        fileio.write_json(
            expected, filename, mode='wb', auto_make_dirs=True)
        items_read = list(fileio.read_json(filename, mode='rb', prefix=''))
        self.assertEqual(items_read[0], expected)
def load(cls, path, fname_prefix=None):
    """
    Rebuild a TextCorpus from the content and metadata files saved on disk.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    info file differs from the currently installed one.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): identifying information that is joined
            with an underscore in front of the standard filenames
            'info.json', 'metadatas.json' and 'spacy_docs.bin'

    Returns:
        :class:`textacy.TextCorpus`
    """
    # standard filenames, optionally prefixed, then anchored under ``path``
    basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
    if fname_prefix:
        basenames = ['_'.join([fname_prefix, base]) for base in basenames]
    info_fname, meta_fname, docs_fname = [
        os.path.join(path, base) for base in basenames]

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    saved_version = package_info['spacy_version']
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        template = (
            " the spaCy version used to save this TextCorpus to disk is "
            "not the same as the version currently installed "
            "('{}' vs. '{}'); if the data underlying the associated "
            "`spacy.Vocab` has changed, this loaded TextCorpus may not be "
            "valid! "
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # docs and metadata are paired positionally as they are read
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textdoc = TextDoc(
            spacy_doc,
            spacy_pipeline=textcorpus.spacy_pipeline,
            lang=lang,
            metadata=metadata)
        textcorpus.add_doc(textdoc)
    return textcorpus
def load(cls, path, fname_prefix=None):
    """
    Load serialized content and metadata from disk, and use them to
    construct a TextDoc.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    metadata file differs from the currently installed one.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            joined with an underscore in front of the standard filenames
            'spacy_doc.bin' and 'metadata.json'

    Returns:
        :class:`textacy.TextDoc`

    .. warning:: If the `spacy.Vocab` object used to save this document is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _full_path(basename):
        # prepend the optional prefix, then anchor the name under ``path``
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    meta_fname = _full_path('metadata.json')
    docs_fname = _full_path('spacy_doc.bin')

    # language and spaCy version are stored alongside the doc's metadata;
    # pop them so only the doc's own metadata is passed on
    metadata = list(fileio.read_json(meta_fname))[0]
    lang = metadata.pop('textacy_lang')
    spacy_version = metadata.pop('spacy_version')
    current_version = spacy.about.__version__
    if spacy_version != current_version:
        msg = (
            " the spaCy version used to save this TextDoc to disk is not "
            "the same as the version currently installed ('{}' vs. '{}'); "
            "if the data underlying the associated `spacy.Vocab` has "
            "changed, this loaded TextDoc may not be valid! "
        ).format(spacy_version, current_version)
        warnings.warn(msg, UserWarning)

    spacy_vocab = data.load_spacy(lang).vocab
    spacy_docs = list(fileio.read_spacy_docs(spacy_vocab, docs_fname))
    return cls(spacy_docs[0], lang=lang, metadata=metadata)
def load(cls, path, name=None):
    """
    Read content and metadata from disk, and build a ``Doc`` from them.

    A ``UserWarning`` is emitted when the spaCy version recorded in the
    metadata file differs from the currently installed one.

    Args:
        path (str): Directory on disk where content and metadata are saved.
        name (str): Identifying/uniquifying name joined with an underscore
            in front of the default filenames 'spacy_doc.bin' and
            'metadata.json', used when doc was saved to disk via
            :meth:`Doc.save()`.

    Returns:
        :class:`textacy.Doc <Doc>`

    .. warning:: If the ``spacy.Vocab`` object used to save this document is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # default filenames, optionally prefixed, then anchored under ``path``
    basenames = ['metadata.json', 'spacy_doc.bin']
    if name:
        basenames = ['_'.join([name, base]) for base in basenames]
    meta_fname, docs_fname = [os.path.join(path, base) for base in basenames]

    # language and spaCy version are stored alongside the doc's metadata;
    # pop them so only the doc's own metadata is passed on
    metadata = list(fileio.read_json(meta_fname))[0]
    lang = metadata.pop('textacy_lang')
    saved_version = metadata.pop('spacy_version')
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        template = (
            " the spaCy version used to save this Doc to disk is not the "
            "same as the version currently installed ('{}' vs. '{}'); if "
            "the data underlying the associated `spacy.Vocab` has changed, "
            "this loaded Doc may not be valid! "
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    vocab = data.load_spacy(lang).vocab
    loaded_docs = list(fileio.read_spacy_docs(vocab, docs_fname))
    return cls(loaded_docs[0], lang=lang, metadata=metadata)
def test_read_write_json_bytes(self):
    """
    Exercise ``fileio.write_json`` / ``fileio.read_json`` in bytes mode
    ('wb' / 'rb') for plain, gzip, bz2 and xz filenames.

    When ``is_python2`` is True, the '.json.xz' case expects
    ``fileio.open_sesame`` in 'wb' mode to raise ``ValueError`` and every
    other extension must round-trip unchanged; otherwise writing in 'wb'
    mode is expected to raise ``TypeError``.
    """
    expected = [
        {'idx': position, 'sent': sentence.text}
        for position, sentence in enumerate(self.spacy_doc.sents)
    ]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_bytes' + ext)

        if is_python2 is not True:
            # assertRaises calls write_json with these arguments itself
            self.assertRaises(
                TypeError,
                fileio.write_json,
                expected, filename, 'wb', auto_make_dirs=True)
            continue

        if ext == '.json.xz':
            self.assertRaises(
                ValueError,
                fileio.open_sesame, filename, 'wb', 'utf-8', True)
            continue

        fileio.write_json(
            expected, filename, mode='wb', auto_make_dirs=True)
        items_read = list(fileio.read_json(filename, mode='rb', prefix=''))
        self.assertEqual(items_read[0], expected)