def save(self, path, fname_prefix=None):
    """
    Write this TextDoc's metadata and serialized spaCy doc to files in ``path``.

    The metadata is written as JSON to 'metadata.json', combined with
    ``self.lang`` and ``spacy.about.__version__``; the spaCy doc is written
    to 'spacy_doc.bin'.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given (and non-empty), joined with
            an underscore in front of the standard filenames 'spacy_doc.bin'
            and 'metadata.json'

    .. warning:: If the ``spacy.Vocab`` object used to save this document is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    meta_basename = 'metadata.json'
    doc_basename = 'spacy_doc.bin'
    if fname_prefix:
        meta_basename = '_'.join([fname_prefix, meta_basename])
        doc_basename = '_'.join([fname_prefix, doc_basename])
    meta_fname = os.path.join(path, meta_basename)
    doc_fname = os.path.join(path, doc_basename)

    # keys in self.metadata win over the package info keys if they collide
    merged = dict(
        {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
        **self.metadata)
    fileio.write_json(merged, meta_fname)
    fileio.write_spacy_docs(self.spacy_doc, doc_fname)
def save(self, path, name=None):
    """
    Write this ``Doc``'s metadata and spaCy content to two files in ``path``.

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): If truthy, joined with an underscore in front of the
            default filenames 'spacy_doc.bin' and 'metadata.json' to
            identify/uniquify this particular document.

    .. warning:: If the ``spacy.Vocab`` object used to save this document
        is not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    basenames = ['metadata.json', 'spacy_doc.bin']
    if name:
        basenames = ['_'.join([name, basename]) for basename in basenames]
    meta_fname, doc_fname = [
        os.path.join(path, basename) for basename in basenames]

    # start from the package info, then layer the doc's own metadata on top
    to_dump = dict(
        {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
        **self.metadata)
    fileio.write_json(to_dump, meta_fname)
    fileio.write_spacy_docs(self.spacy_doc, doc_fname)
def save(self, path, fname_prefix=None):
    """
    Write this TextCorpus' package info, per-doc metadata and spaCy docs to
    three files in ``path``.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given (and non-empty), joined with
            an underscore in front of the standard filenames 'info.json',
            'metadatas.json' and 'spacy_docs.bin'

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    fnames = {}
    for key, basename in (('info', 'info.json'),
                          ('meta', 'metadatas.json'),
                          ('docs', 'spacy_docs.bin')):
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        fnames[key] = os.path.join(path, basename)

    fileio.write_json(
        {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
        fnames['info'])
    # the corpus is iterated twice: once for metadata, once for spaCy docs
    metadatas = (doc.metadata for doc in self)
    fileio.write_json_lines(metadatas, fnames['meta'])
    spacy_docs = (doc.spacy_doc for doc in self)
    fileio.write_spacy_docs(spacy_docs, fnames['docs'])
def test_read_write_json(self):
    # Round-trip one {'idx', 'sent'} record per sentence through
    # write_json / read_json and check nothing changed.
    expected = []
    for i, sent in enumerate(self.spacy_doc.sents):
        expected.append({'idx': i, 'sent': sent.text})
    filename = os.path.join(self.tempdir, 'test_read_write_json.json')
    fileio.write_json(expected, filename)
    # with an empty prefix, the first item read back is compared to the
    # whole list that was written
    items = list(fileio.read_json(filename, prefix=''))
    observed = items[0]
    self.assertEqual(observed, expected)
def save(self, path, fname_prefix=None, compression=None):
    """
    Write this TextCorpus' package info, per-doc metadata and spaCy docs to
    three files in ``path``.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given (and non-empty), joined with
            an underscore in front of the standard filenames 'info.json',
            'metadatas.json' and 'spacy_docs.bin'
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file; appends '.gz',
            '.bz2' or '.xz' respectively to that file's name, and any other
            value leaves the name unchanged

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _target(basename):
        # the prefix is only applied when one was given (and is non-empty)
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    info_fname = _target('info.json')
    meta_fname = _target('metadatas.json')
    docs_fname = _target('spacy_docs.bin')

    if compression == 'gzip':
        meta_fname += '.gz'
    elif compression == 'bz2':
        meta_fname += '.bz2'
    elif compression == 'lzma':
        meta_fname += '.xz'

    # binary mode is used only when PY2 is not False AND compression is set
    if PY2 is False or compression is None:
        meta_mode = 'wt'
    else:
        meta_mode = 'wb'

    fileio.write_json(
        {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
        info_fname)
    metadatas = (doc.metadata for doc in self)
    fileio.write_json_lines(
        metadatas, meta_fname, mode=meta_mode,
        ensure_ascii=False, separators=(',', ':'))
    spacy_docs = (doc.spacy_doc for doc in self)
    fileio.write_spacy_docs(spacy_docs, docs_fname)
def test_read_write_json_prefix(self):
    # Write a list of {'idx', 'sent'} records, then read back only the
    # 'sent' values by passing the 'item.sent' prefix to read_json.
    to_write = []
    expected = []
    for i, sent in enumerate(self.spacy_doc.sents):
        record = {'idx': i, 'sent': sent.text}
        to_write.append(record)
        expected.append(record['sent'])
    filename = os.path.join(
        self.tempdir, 'test_read_write_json_prefix.json')
    fileio.write_json(to_write, filename)
    sent_values = fileio.read_json(filename, prefix='item.sent')
    observed = list(sent_values)
    self.assertEqual(observed, expected)
def test_read_write_json(self):
    # Sentence records written with write_json must come back unchanged
    # from read_json.
    sents = self.spacy_doc.sents
    expected = [{'idx': i, 'sent': sent.text} for i, sent in enumerate(sents)]
    filename = os.path.join(self.tempdir, 'test_read_write_json.json')
    fileio.write_json(expected, filename)
    # read everything, then keep the first item yielded for the '' prefix
    read_back = list(fileio.read_json(filename, prefix=''))
    self.assertEqual(read_back[0], expected)
def test_read_write_json_prefix(self):
    # Write a list of {'idx', 'sent'} records once, then for each key read
    # back only that key's values via the 'item.<key>' prefix.
    to_write = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    # The target file and its content do not depend on the prefix being
    # checked, so the file is written a single time before the loop.
    filename = os.path.join(
        self.tempdir, 'test_read_write_json_prefix.json')
    fileio.write_json(to_write, filename, auto_make_dirs=True)
    for prefix in ('idx', 'sent'):
        expected = [item[prefix] for item in to_write]
        observed = list(fileio.read_json(filename, prefix='item.' + prefix))
        self.assertEqual(observed, expected)
def test_read_write_json_unicode(self):
    # Text-mode ('wt'/'rt') JSON round trip for plain and compressed files.
    expected = []
    for i, sent in enumerate(self.spacy_doc.sents):
        expected.append({'idx': i, 'sent': sent.text})
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_unicode' + ext)
        is_compressed = ext != '.json'
        if PY2 is True and is_compressed:
            # when PY2 is True, opening a compressed file in 'wt' mode is
            # expected to fail with ValueError
            # NOTE(review): the positional args after filename are presumably
            # mode, encoding, auto_make_dirs -- confirm against open_sesame
            self.assertRaises(
                ValueError, fileio.open_sesame, filename, 'wt', None, True)
            continue
        fileio.write_json(expected, filename, mode='wt', auto_make_dirs=True)
        read_back = list(fileio.read_json(filename, mode='rt', prefix=''))
        observed = read_back[0]
        self.assertEqual(observed, expected)
def test_read_write_json_bytes(self):
    # Binary-mode ('wb'/'rb') JSON round trip for plain and compressed files.
    sents = self.spacy_doc.sents
    expected = [{'idx': i, 'sent': sent.text} for i, sent in enumerate(sents)]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_bytes' + ext)
        if PY2 is not True:
            # when PY2 is not True, writing JSON in 'wb' mode is expected
            # to raise TypeError
            self.assertRaises(
                TypeError, fileio.write_json,
                expected, filename, 'wb', auto_make_dirs=True)
            continue
        fileio.write_json(expected, filename, mode='wb', auto_make_dirs=True)
        read_back = list(fileio.read_json(filename, mode='rb', prefix=''))
        observed = read_back[0]
        self.assertEqual(observed, expected)
def save(self, path, fname_prefix=None):
    """
    Write this TextDoc's metadata (as JSON) and its spaCy doc to two files
    in ``path``.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given (and non-empty), joined with
            an underscore in front of the standard filenames 'spacy_doc.bin'
            and 'metadata.json'
    """
    def _target(basename):
        # only decorate the filename when a non-empty prefix was passed
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    meta_fname = _target('metadata.json')
    doc_fname = _target('spacy_doc.bin')

    # self.metadata entries override the package info on key collisions
    payload = dict(
        {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
        **self.metadata)
    fileio.write_json(payload, meta_fname)
    fileio.write_spacy_docs(self.spacy_doc, doc_fname)
def save(self, path, fname_prefix=None, compression=None):
    """
    Save this TextCorpus to ``path`` as three files: package info
    ('info.json'), one metadata record per doc ('metadatas.json') and the
    spaCy docs ('spacy_docs.bin').

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): if given (and non-empty), joined with
            an underscore in front of each of the standard filenames
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file; selects the '.gz',
            '.bz2' or '.xz' suffix appended to its filename

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    basenames = ('info.json', 'metadatas.json', 'spacy_docs.bin')
    if fname_prefix:
        basenames = tuple(
            '_'.join([fname_prefix, basename]) for basename in basenames)
    info_fname, meta_fname, docs_fname = [
        os.path.join(path, basename) for basename in basenames]

    # first matching compression decides the suffix; no match adds nothing
    for kind, suffix in (('gzip', '.gz'), ('bz2', '.bz2'), ('lzma', '.xz')):
        if compression == kind:
            meta_fname = meta_fname + suffix
            break

    # 'wb' only when PY2 is not False and a compression was requested
    use_binary = PY2 is not False and compression is not None
    meta_mode = 'wb' if use_binary else 'wt'

    package_info = {}
    package_info['textacy_lang'] = self.lang
    package_info['spacy_version'] = spacy.about.__version__
    fileio.write_json(package_info, info_fname)

    metadatas = (doc.metadata for doc in self)
    fileio.write_json_lines(
        metadatas, meta_fname,
        mode=meta_mode, ensure_ascii=False, separators=(',', ':'))
    spacy_docs = (doc.spacy_doc for doc in self)
    fileio.write_spacy_docs(spacy_docs, docs_fname)
def save(self, path, name=None, compression=None):
    """
    Save ``Corpus`` content and metadata to disk.

    Three files are written into ``path``: 'info.json' (``self.lang`` and
    ``spacy.about.__version__``), 'metadatas.json' (one metadata record per
    doc, optionally compressed) and 'spacy_docs.bin' (the spaCy docs).

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): Prepend default filenames 'spacy_docs.bin',
            'metadatas.json', and 'info.json' with a name to identify/uniquify
            this particular corpus. The name and the default filename are
            joined with an underscore.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression used
            to reduce size of 'metadatas.json' file, if any. Appends '.gz',
            '.bz2' or '.xz' respectively to that file's name; any other value
            leaves the name unchanged.

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    if name:
        info_fname = os.path.join(path, '_'.join([name, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
        # same '.bin' basename as the unnamed case and as documented above,
        # so the extension does not depend on whether a name was given
        docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')

    if compression == 'gzip':
        meta_fname += '.gz'
    elif compression == 'bz2':
        meta_fname += '.bz2'
    elif compression == 'lzma':
        meta_fname += '.xz'

    # binary mode only when is_python2 is not False AND compression is set
    if is_python2 is False or compression is None:
        meta_mode = 'wt'
    else:
        meta_mode = 'wb'

    package_info = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__,
    }
    fileio.write_json(package_info, info_fname)
    # the corpus is iterated twice: once for metadata, once for spaCy docs
    fileio.write_json_lines(
        (doc.metadata for doc in self), meta_fname,
        mode=meta_mode, ensure_ascii=False, separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def save(self, path, name=None, compression=None):
    """
    Write this ``Corpus``' package info, per-doc metadata and spaCy docs to
    three files in ``path``.

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): If truthy, joined with an underscore in front of the
            default filenames 'spacy_docs.bin', 'metadatas.json', and
            'info.json' to identify/uniquify this particular corpus.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file, if any. Selects
            the '.gz', '.bz2' or '.xz' suffix appended to its filename.

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    # an empty/None name contributes no leading component to the filenames
    leading = [name] if name else []
    info_fname = os.path.join(path, '_'.join(leading + ['info.json']))
    meta_fname = os.path.join(path, '_'.join(leading + ['metadatas.json']))
    docs_fname = os.path.join(path, '_'.join(leading + ['spacy_docs.bin']))

    if compression == 'gzip':
        suffix = '.gz'
    elif compression == 'bz2':
        suffix = '.bz2'
    elif compression == 'lzma':
        suffix = '.xz'
    else:
        suffix = ''
    meta_fname = meta_fname + suffix

    # 'wb' only when PY2 is not False and a compression was requested
    meta_mode = 'wb' if (PY2 is not False and compression is not None) else 'wt'

    info = {'textacy_lang': self.lang,
            'spacy_version': spacy.about.__version__}
    fileio.write_json(info, info_fname)

    write_kwargs = {
        'mode': meta_mode,
        'ensure_ascii': False,
        'separators': (',', ':'),
    }
    metadatas = (doc.metadata for doc in self)
    fileio.write_json_lines(metadatas, meta_fname, **write_kwargs)
    spacy_docs = (doc.spacy_doc for doc in self)
    fileio.write_spacy_docs(spacy_docs, docs_fname)
def test_read_write_json_bytes(self):
    # Binary-mode ('wb'/'rb') JSON round trip for plain and compressed files,
    # with the failure modes expected on each side of the is_python2 flag.
    expected = []
    for i, sent in enumerate(self.spacy_doc.sents):
        expected.append({'idx': i, 'sent': sent.text})
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(self.tempdir,
                                'test_read_write_json_bytes' + ext)
        if is_python2 is not True:
            # writing JSON in 'wb' mode is expected to raise TypeError here
            self.assertRaises(
                TypeError,
                lambda: fileio.write_json(
                    expected, filename, 'wb', auto_make_dirs=True))
        elif ext == '.json.xz':
            # when is_python2 is True, opening an '.xz' file is expected to
            # fail with ValueError
            # NOTE(review): the positional args after filename are presumably
            # mode, encoding, auto_make_dirs -- confirm against open_sesame
            self.assertRaises(ValueError, fileio.open_sesame,
                              filename, 'wb', 'utf-8', True)
        else:
            fileio.write_json(expected, filename,
                              mode='wb', auto_make_dirs=True)
            read_back = list(
                fileio.read_json(filename, mode='rb', prefix=''))
            observed = read_back[0]
            self.assertEqual(observed, expected)