def save(self, path, name=None):
    """
    Write this ``Doc``'s metadata and spaCy content to two files in ``path``.

    'metadata.json' receives the doc's metadata merged on top of the textacy
    language and spaCy version (metadata entries win on key collisions);
    'spacy_doc.bin' receives the underlying spaCy doc.

    Args:
        path (str): Directory on disk in which both files are written.
        name (str): If given and non-empty, it is joined to each default
            filename with an underscore, e.g. '<name>_metadata.json', to
            identify/uniquify this particular document.

    .. warning:: If the ``spacy.Vocab`` object used to save this document is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _target(basename):
        # Apply the optional name prefix, then anchor the file under `path`.
        if name:
            basename = '_'.join([name, basename])
        return os.path.join(path, basename)

    metadata_path = _target('metadata.json')
    spacy_doc_path = _target('spacy_doc.bin')

    versions = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(dict(versions, **self.metadata), metadata_path)
    fileio.write_spacy_docs(self.spacy_doc, spacy_doc_path)
def save(self, path, fname_prefix=None):
    """
    Write this TextCorpus to disk as three files inside ``path``.

    Files written, in order: 'info.json' (textacy language and spaCy
    version), 'metadatas.json' (each document's metadata, written via
    ``fileio.write_json_lines``) and 'spacy_docs.bin' (each document's
    spaCy doc).

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given and non-empty, every filename
            becomes '<fname_prefix>_<standard filename>'

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
    if fname_prefix:
        basenames = ['_'.join([fname_prefix, base]) for base in basenames]
    info_path, metadatas_path, docs_path = [
        os.path.join(path, base) for base in basenames]

    versions = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(versions, info_path)
    fileio.write_json_lines((doc.metadata for doc in self), metadatas_path)
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_path)
def save(self, path, fname_prefix=None):
    """
    Write this TextDoc's metadata and spaCy content to two files in ``path``.

    The metadata file holds the textacy language and spaCy version, updated
    with the document's own metadata; the second file holds the spaCy doc.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given and non-empty, the standard
            filenames 'spacy_doc.bin' and 'metadata.json' become
            '<fname_prefix>_spacy_doc.bin' and '<fname_prefix>_metadata.json'

    .. warning:: If the `spacy.Vocab` object used to save this document is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    basenames = ['metadata.json', 'spacy_doc.bin']
    if fname_prefix:
        basenames = ['_'.join([fname_prefix, base]) for base in basenames]
    metadata_path, spacy_doc_path = [
        os.path.join(path, base) for base in basenames]

    payload = dict(
        {'textacy_lang': self.lang,
         'spacy_version': spacy.about.__version__},
        **self.metadata)
    fileio.write_json(payload, metadata_path)
    fileio.write_spacy_docs(self.spacy_doc, spacy_doc_path)
def test_read_write_spacy_doc(self):
    """
    Round-trip ``self.spacy_doc`` through ``fileio`` and compare lemmas.

    The doc is written to a '.bin' file in ``self.tempdir``, read back with
    ``self.spacy_pipeline.vocab``, and the lemmas of all tokens read back
    must equal those of the original doc.
    """
    expected = [tok.lemma_ for tok in self.spacy_doc]
    filename = os.path.join(self.tempdir, 'test_read_write_spacy_doc.bin')
    fileio.write_spacy_docs(self.spacy_doc, filename)

    # Flatten the lemmas of every doc yielded by the reader into one list.
    observed = []
    for doc in fileio.read_spacy_docs(self.spacy_pipeline.vocab, filename):
        observed.extend(tok.lemma_ for tok in doc)

    self.assertEqual(observed, expected)
def save(self, path, fname_prefix=None, compression=None):
    """
    Write this TextCorpus to disk as three files inside ``path``.

    Files written, in order: 'info.json' (textacy language and spaCy
    version), 'metadatas.json' (each document's metadata as JSON lines) and
    'spacy_docs.bin' (each document's spaCy doc).

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given and non-empty, every filename
            becomes '<fname_prefix>_<standard filename>'
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file; appends '.gz',
            '.bz2' or '.xz' respectively to that filename (any other value
            appends nothing)

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _target(basename):
        # Apply the optional prefix, then anchor the file under `path`.
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    info_path = _target('info.json')
    metadatas_path = _target('metadatas.json')
    docs_path = _target('spacy_docs.bin')

    if compression == 'gzip':
        metadatas_path += '.gz'
    elif compression == 'bz2':
        metadatas_path += '.bz2'
    elif compression == 'lzma':
        metadatas_path += '.xz'

    # Binary mode is used only for a compressed metadatas file on Python 2.
    if PY2 is False or compression is None:
        metadatas_mode = 'wt'
    else:
        metadatas_mode = 'wb'

    versions = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(versions, info_path)
    fileio.write_json_lines(
        (doc.metadata for doc in self), metadatas_path,
        mode=metadatas_mode, ensure_ascii=False, separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_path)
def test_read_write_spacy_doc(self):
    """
    Check that a spaCy doc written by ``fileio`` reads back unchanged.

    Equality is judged on token lemmas: the doc is saved to a '.bin' file in
    ``self.tempdir``, reloaded using ``self.spacy_pipeline.vocab``, and the
    reloaded lemmas are compared with the original ones.
    """
    expected = [tok.lemma_ for tok in self.spacy_doc]

    filename = os.path.join(self.tempdir, 'test_read_write_spacy_doc.bin')
    fileio.write_spacy_docs(self.spacy_doc, filename)

    reloaded_docs = fileio.read_spacy_docs(
        self.spacy_pipeline.vocab, filename)
    observed = [tok.lemma_ for doc in reloaded_docs for tok in doc]

    self.assertEqual(observed, expected)
def test_read_write_spacy_doc(self):
    """
    Round-trip ``self.spacy_doc`` through ``fileio`` and compare lemmas.

    The doc is written to a '.bin' file inside a freshly created temporary
    directory, read back with ``self.spacy_pipeline.vocab``, and the lemmas
    of all tokens read back must equal those of the original doc. The
    temporary directory is removed whether or not the round trip succeeds.
    """
    # Local import: only this test's cleanup needs it.
    import shutil

    expected = [tok.lemma_ for tok in self.spacy_doc]
    tempdir = tempfile.mkdtemp()
    try:
        filename = os.path.join(tempdir, 'test_read_write_spacy_doc.bin')
        fileio.write_spacy_docs(self.spacy_doc, filename)
        observed = [
            tok.lemma_
            for doc in fileio.read_spacy_docs(
                self.spacy_pipeline.vocab, filename)
            for tok in doc
        ]
    finally:
        # Clean up even when writing or reading raises, so a failing run
        # does not leave the temporary directory behind.
        shutil.rmtree(tempdir, ignore_errors=True)
    self.assertEqual(observed, expected)
def test_read_write_spacy_docs(self):
    """
    Round-trip ``self.spacy_doc`` through ``fileio`` for several extensions.

    For each of '.pkl', '.pkl.gz', '.pkl.bz2' and '.pkl.xz' the doc is
    written to ``self.tempdir``, read back, and the token lemmas compared
    with the original. On Python 2 the '.pkl.xz' case instead asserts that
    ``fileio.open_sesame`` raises ``ValueError``.
    """
    expected = [tok.lemma_ for tok in self.spacy_doc]
    for ext in ('.pkl', '.pkl.gz', '.pkl.bz2', '.pkl.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_spacy_docs' + ext)

        if is_python2 is True and ext == '.pkl.xz':
            # Nothing is written in this case, so there is nothing to read.
            self.assertRaises(
                ValueError, fileio.open_sesame, filename, 'wb', None, True)
            continue

        fileio.write_spacy_docs(self.spacy_doc, filename, True)
        observed = []
        for doc in fileio.read_spacy_docs(filename):
            observed.extend(tok.lemma_ for tok in doc)
        self.assertEqual(observed, expected)
def test_read_write_spacy_docs(self):
    """
    Round-trip ``self.spacy_doc`` through ``fileio`` for several extensions.

    For each of '.bin', '.bin.gz', '.bin.bz2' and '.bin.xz' the doc is
    written to ``self.tempdir``, read back with
    ``self.spacy_pipeline.vocab``, and the token lemmas compared with the
    original. On Python 2 the '.bin.gz' case instead asserts that writing
    raises ``TypeError``.
    """
    expected = [tok.lemma_ for tok in self.spacy_doc]
    for ext in ('.bin', '.bin.gz', '.bin.bz2', '.bin.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_spacy_docs' + ext)

        if PY2 is True and ext == '.bin.gz':
            # Author's note kept from the original: no idea why this is
            # the case.
            self.assertRaises(
                TypeError, fileio.write_spacy_docs,
                self.spacy_doc, filename, True)
            continue

        fileio.write_spacy_docs(self.spacy_doc, filename, True)
        reloaded_docs = fileio.read_spacy_docs(
            self.spacy_pipeline.vocab, filename)
        observed = [tok.lemma_ for doc in reloaded_docs for tok in doc]
        self.assertEqual(observed, expected)
def test_read_write_spacy_docs(self):
    """
    Round-trip ``self.spacy_doc`` through ``fileio`` for several extensions.

    For each of '.bin', '.bin.gz', '.bin.bz2' and '.bin.xz' the doc is
    written to ``self.tempdir``, read back with ``self.spacy_lang.vocab``,
    and the token lemmas compared with the original. Two Python 2 cases
    assert an error instead: '.bin.xz' expects ``ValueError`` from
    ``fileio.open_sesame`` and '.bin.gz' expects ``TypeError`` from
    ``fileio.write_spacy_docs``.
    """
    expected = [tok.lemma_ for tok in self.spacy_doc]
    for ext in ('.bin', '.bin.gz', '.bin.bz2', '.bin.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_spacy_docs' + ext)

        if PY2 is True:
            if ext == '.bin.xz':
                self.assertRaises(
                    ValueError, fileio.open_sesame,
                    filename, 'wb', None, True)
                continue
            if ext == '.bin.gz':
                # Author's note kept from the original: no idea why this
                # is the case.
                self.assertRaises(
                    TypeError, fileio.write_spacy_docs,
                    self.spacy_doc, filename, True)
                continue

        fileio.write_spacy_docs(self.spacy_doc, filename, True)
        observed = []
        for doc in fileio.read_spacy_docs(self.spacy_lang.vocab, filename):
            observed.extend(tok.lemma_ for tok in doc)
        self.assertEqual(observed, expected)
def save(self, path, fname_prefix=None):
    """
    Write this TextDoc's metadata and spaCy content to two files in ``path``.

    The metadata file holds the textacy language and spaCy version, updated
    with the document's own metadata; the second file holds the spaCy doc.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given and non-empty, the standard
            filenames 'spacy_doc.bin' and 'metadata.json' become
            '<fname_prefix>_spacy_doc.bin' and '<fname_prefix>_metadata.json'
    """
    metadata_base = (
        '_'.join([fname_prefix, 'metadata.json']) if fname_prefix
        else 'metadata.json')
    spacy_doc_base = (
        '_'.join([fname_prefix, 'spacy_doc.bin']) if fname_prefix
        else 'spacy_doc.bin')
    metadata_path = os.path.join(path, metadata_base)
    spacy_doc_path = os.path.join(path, spacy_doc_base)

    versions = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(dict(versions, **self.metadata), metadata_path)
    fileio.write_spacy_docs(self.spacy_doc, spacy_doc_path)
def save(self, path, fname_prefix=None, compression=None):
    """
    Write this TextCorpus to disk as three files inside ``path``.

    Files written, in order: 'info.json' (textacy language and spaCy
    version), 'metadatas.json' (each document's metadata as JSON lines) and
    'spacy_docs.bin' (each document's spaCy doc).

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given and non-empty, every filename
            becomes '<fname_prefix>_<standard filename>'
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file; appends '.gz',
            '.bz2' or '.xz' respectively to that filename (any other value
            appends nothing)

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
    if fname_prefix:
        basenames = ['_'.join([fname_prefix, base]) for base in basenames]
    info_path, metadatas_path, docs_path = [
        os.path.join(path, base) for base in basenames]

    # Pick the filename suffix matching the requested compression, if any.
    suffix = ''
    for kind, ext in (('gzip', '.gz'), ('bz2', '.bz2'), ('lzma', '.xz')):
        if compression == kind:
            suffix = ext
            break
    metadatas_path = metadatas_path + suffix

    # Binary mode is used only for a compressed metadatas file on Python 2.
    use_binary = PY2 is not False and compression is not None
    metadatas_mode = 'wb' if use_binary else 'wt'

    versions = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(versions, info_path)
    fileio.write_json_lines(
        (doc.metadata for doc in self), metadatas_path,
        mode=metadatas_mode, ensure_ascii=False, separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_path)
def save(self, path, name=None, compression=None):
    """
    Save ``Corpus`` content and metadata to disk.

    Three files are written inside ``path``, in order: 'info.json' (textacy
    language and spaCy version), 'metadatas.json' (each document's metadata
    as JSON lines) and 'spacy_docs.bin' (each document's spaCy doc).

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): Prepend default filenames 'spacy_docs.bin',
            'metadatas.json', and 'info.json' with a name to
            identify/uniquify this particular corpus; the result is
            '<name>_<default filename>'.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file, if any; appends
            '.gz', '.bz2' or '.xz' respectively to that filename (any other
            value appends nothing).

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _target(basename):
        # Apply the optional name prefix, then anchor the file under `path`.
        if name:
            basename = '_'.join([name, basename])
        return os.path.join(path, basename)

    info_fname = _target('info.json')
    meta_fname = _target('metadatas.json')
    # Same '.bin' basename with or without `name`, as documented above
    # (the named variant previously used a '.pkl' extension).
    docs_fname = _target('spacy_docs.bin')

    if compression == 'gzip':
        meta_fname += '.gz'
    elif compression == 'bz2':
        meta_fname += '.bz2'
    elif compression == 'lzma':
        meta_fname += '.xz'

    # Binary mode is used only for a compressed metadatas file on Python 2.
    meta_mode = 'wt' if is_python2 is False or compression is None else 'wb'

    package_info = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(package_info, info_fname)
    fileio.write_json_lines(
        (doc.metadata for doc in self), meta_fname,
        mode=meta_mode, ensure_ascii=False, separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def save(self, path, name=None, compression=None):
    """
    Write this ``Corpus`` to disk as three files inside ``path``.

    Files written, in order: 'info.json' (textacy language and spaCy
    version), 'metadatas.json' (each document's metadata as JSON lines) and
    'spacy_docs.bin' (each document's spaCy doc).

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): If given and non-empty, it is joined to each default
            filename with an underscore, e.g. '<name>_info.json', to
            identify/uniquify this particular corpus.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file, if any; appends
            '.gz', '.bz2' or '.xz' respectively to that filename (any other
            value appends nothing).

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    info_base = '_'.join([name, 'info.json']) if name else 'info.json'
    metadatas_base = (
        '_'.join([name, 'metadatas.json']) if name else 'metadatas.json')
    docs_base = (
        '_'.join([name, 'spacy_docs.bin']) if name else 'spacy_docs.bin')

    info_path = os.path.join(path, info_base)
    metadatas_path = os.path.join(path, metadatas_base)
    docs_path = os.path.join(path, docs_base)

    # First suffix whose compression kind matches; empty when none does.
    suffixes = (('gzip', '.gz'), ('bz2', '.bz2'), ('lzma', '.xz'))
    metadatas_path += next(
        (ext for kind, ext in suffixes if compression == kind), '')

    # Binary mode is used only for a compressed metadatas file on Python 2.
    if PY2 is False or compression is None:
        metadatas_mode = 'wt'
    else:
        metadatas_mode = 'wb'

    versions = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(versions, info_path)
    fileio.write_json_lines(
        (doc.metadata for doc in self), metadatas_path,
        mode=metadatas_mode, ensure_ascii=False, separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_path)