Exemple #1
0
    def save(self, path, fname_prefix=None):
        """
        Write this TextCorpus to disk as three files inside one directory:
        package info, per-document metadata, and serialized spaCy docs.

        Args:
            path (str): directory on disk that will hold the output files
            fname_prefix (str, optional): if given, joined with an underscore
                in front of each default filename ('info.json',
                'metadatas.json', 'spacy_docs.bin') to identify this corpus

        .. warning:: If the `spacy.Vocab` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Treat the saved files as short-term, not long-term, storage.
        """
        def _target(basename):
            # '<prefix>_<basename>' when a prefix was supplied, else the bare name
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        info_fname = _target('info.json')
        meta_fname = _target('metadatas.json')
        docs_fname = _target('spacy_docs.bin')

        # record the language and spaCy version alongside the content
        fileio.write_json(
            {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
            info_fname)
        metadatas = (doc.metadata for doc in self)
        fileio.write_json_lines(metadatas, meta_fname)
        spacy_docs = (doc.spacy_doc for doc in self)
        fileio.write_spacy_docs(spacy_docs, docs_fname)
Exemple #2
0
 def test_read_write_json_lines(self):
     """Write one record per sentence as JSON lines, read them back, compare."""
     records = []
     for idx, sentence in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sentence.text})
     target = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
     fileio.write_json_lines(records, target)
     # read_json_lines is consumed into a list so it can be compared directly
     roundtripped = list(fileio.read_json_lines(target))
     self.assertEqual(roundtripped, records)
Exemple #3
0
    def save(self, path, fname_prefix=None, compression=None):
        """
        Write this TextCorpus to disk as three files inside one directory:
        package info, per-document metadata, and serialized spaCy docs.

        Args:
            path (str): directory on disk that will hold the output files
            fname_prefix (str, optional): if given, joined with an underscore
                in front of each default filename ('info.json',
                'metadatas.json', 'spacy_docs.bin') to identify this corpus
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                for the metadatas json file; selects the extension appended
                to its filename ('.gz', '.bz2' or '.xz'). Any other value
                leaves the filename without an extra extension.

        .. warning:: If the `spacy.Vocab` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Treat the saved files as short-term, not long-term, storage.
        """
        def _target(basename):
            # '<prefix>_<basename>' when a prefix was supplied, else the bare name
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        info_fname = _target('info.json')
        meta_fname = _target('metadatas.json')
        docs_fname = _target('spacy_docs.bin')

        # only the metadata file's name carries a compression extension
        if compression == 'gzip':
            meta_fname = meta_fname + '.gz'
        elif compression == 'bz2':
            meta_fname = meta_fname + '.bz2'
        elif compression == 'lzma':
            meta_fname = meta_fname + '.xz'

        # binary mode is used only when PY2 is not False AND compression is set
        if PY2 is not False and compression is not None:
            meta_mode = 'wb'
        else:
            meta_mode = 'wt'

        fileio.write_json(
            {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
            info_fname)
        metadatas = (doc.metadata for doc in self)
        fileio.write_json_lines(
            metadatas, meta_fname, mode=meta_mode,
            ensure_ascii=False, separators=(',', ':'))
        spacy_docs = (doc.spacy_doc for doc in self)
        fileio.write_spacy_docs(spacy_docs, docs_fname)
Exemple #4
0
 def test_read_write_json_lines(self):
     """Check that sentence records survive a JSON-lines write/read cycle."""
     expected = []
     for position, sentence in enumerate(self.spacy_doc.sents):
         expected.append({'idx': position, 'sent': sentence.text})
     out_path = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
     fileio.write_json_lines(expected, out_path)
     # materialize whatever read_json_lines yields before comparing
     observed = [record for record in fileio.read_json_lines(out_path)]
     self.assertEqual(observed, expected)
 def setUp(self):
     """
     Create a temp directory next to this test file, write REDDIT_COMMENTS
     into 'RC_test.bz2' inside it, and point a RedditReader at that file.
     """
     here = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
     reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
     # text mode when PY2 is False, bytes mode otherwise
     write_mode = 'wt' if PY2 is False else 'wb'
     write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=write_mode,
                      auto_make_dirs=True)
     self.redditreader = RedditReader(reddit_fname)
Exemple #6
0
 def setUp(self):
     """
     Prepare the fixture: a fresh temp directory beside this test module,
     a 'RC_test.bz2' file holding REDDIT_COMMENTS, and a RedditReader on it.
     """
     test_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=test_dir)
     reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
     # bytes mode unless is_python2 is False, in which case text mode
     if is_python2 is False:
         file_mode = 'wt'
     else:
         file_mode = 'wb'
     write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=file_mode,
                      auto_make_dirs=True)
     self.redditreader = RedditReader(reddit_fname)
Exemple #7
0
 def test_read_write_json_lines_unicode(self):
     """
     Round-trip sentence records through text-mode ('wt'/'rt') JSON-lines
     files, once for the plain extension and once per compressed extension.

     When PY2 is True, the compressed extensions are instead expected to make
     ``fileio.open_sesame`` raise ValueError for mode 'wt'.
     """
     records = []
     for idx, sentence in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sentence.text})
     stem = 'test_read_write_json_lines_unicode'
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         filename = os.path.join(self.tempdir, stem + ext)
         if PY2 is True and ext != '.json':
             with self.assertRaises(ValueError):
                 fileio.open_sesame(filename, 'wt', None, True)
             continue
         fileio.write_json_lines(records, filename, mode='wt',
                                 auto_make_dirs=True)
         roundtripped = list(fileio.read_json_lines(filename, mode='rt'))
         self.assertEqual(roundtripped, records)
Exemple #8
0
 def test_read_write_json_lines_unicode(self):
     """
     For each of '.json', '.json.gz', '.json.bz2' and '.json.xz': write the
     sentence records in mode 'wt', read them back in mode 'rt', and require
     equality -- except that when PY2 is True, every extension other than
     '.json' must make ``fileio.open_sesame`` raise ValueError instead.
     """
     expected = []
     for position, sentence in enumerate(self.spacy_doc.sents):
         expected.append({'idx': position, 'sent': sentence.text})
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         filename = os.path.join(
             self.tempdir, 'test_read_write_json_lines_unicode' + ext)
         roundtrip_ok = not (PY2 is True and ext != '.json')
         if roundtrip_ok:
             fileio.write_json_lines(
                 expected, filename, mode='wt', auto_make_dirs=True)
             observed = list(fileio.read_json_lines(filename, mode='rt'))
             self.assertEqual(observed, expected)
         else:
             self.assertRaises(
                 ValueError, fileio.open_sesame, filename, 'wt', None, True)
Exemple #9
0
    def save(self, path, fname_prefix=None, compression=None):
        """
        Persist this TextCorpus to a directory as an info file, a metadata
        file, and a file of serialized spaCy docs.

        Args:
            path (str): directory on disk where the files will be written
            fname_prefix (str, optional): if given, joined with an underscore
                in front of each default filename ('info.json',
                'metadatas.json', 'spacy_docs.bin')
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                for the metadatas json file; determines the extension added
                to its filename ('.gz', '.bz2' or '.xz', otherwise none)

        .. warning:: If the `spacy.Vocab` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Consequently, use this for short-term storage only.
        """
        basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
        if fname_prefix:
            basenames = ['_'.join([fname_prefix, base]) for base in basenames]
        info_fname, meta_fname, docs_fname = [
            os.path.join(path, base) for base in basenames]

        # first matching compression kind decides the metadata file's suffix
        suffix = ''
        for kind, extension in (('gzip', '.gz'), ('bz2', '.bz2'), ('lzma', '.xz')):
            if compression == kind:
                suffix = extension
                break
        meta_fname = meta_fname + suffix

        # 'wb' only when PY2 is not False and a compression was requested
        if PY2 is False or compression is None:
            meta_mode = 'wt'
        else:
            meta_mode = 'wb'

        package_info = dict(
            textacy_lang=self.lang,
            spacy_version=spacy.about.__version__)
        fileio.write_json(package_info, info_fname)
        fileio.write_json_lines(
            (doc.metadata for doc in self), meta_fname,
            mode=meta_mode, ensure_ascii=False, separators=(',', ':'))
        fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
Exemple #10
0
    def save(self, path, fname_prefix=None):
        """
        Persist this TextCorpus to a directory as an info file, a metadata
        file, and a file of serialized spaCy docs.

        Args:
            path (str): directory on disk where the files will be written
            fname_prefix (str, optional): if given, joined with an underscore
                in front of each default filename ('info.json',
                'metadatas.json', 'spacy_docs.bin')
        """
        basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
        if fname_prefix:
            # names become '<prefix>_<basename>'
            basenames = ['_'.join([fname_prefix, base]) for base in basenames]
        info_fname, meta_fname, docs_fname = [
            os.path.join(path, base) for base in basenames]

        package_info = dict(
            textacy_lang=self.lang,
            spacy_version=spacy.about.__version__)
        fileio.write_json(package_info, info_fname)
        fileio.write_json_lines(
            (doc.metadata for doc in self),
            meta_fname)
        fileio.write_spacy_docs(
            (doc.spacy_doc for doc in self),
            docs_fname)
Exemple #11
0
    def save(self, path, name=None, compression=None):
        """
        Save ``Corpus`` content and metadata to disk.

        Three files are written into ``path``: 'info.json' (language and
        spaCy version), 'metadatas.json' (one ``doc.metadata`` per document,
        optionally with a compression extension), and 'spacy_docs.bin'
        (the documents' ``spacy_doc`` objects).

        Args:
            path (str): Directory on disk where content + metadata will be saved.
            name (str): Prepend default filenames 'spacy_docs.bin', 'metadatas.json',
                and 'info.json' with a name to identify/uniquify this particular
                corpus. Filenames become '<name>_<default filename>'.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                used to reduce size of 'metadatas.json' file, if any. Selects
                the extension appended to that filename ('.gz', '.bz2' or
                '.xz'); any other value appends nothing.

        .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _fname(basename):
            # Single place where the optional name prefix is applied, so the
            # named and unnamed variants can never use different basenames.
            if name:
                basename = '_'.join([name, basename])
            return os.path.join(path, basename)

        info_fname = _fname('info.json')
        meta_fname = _fname('metadatas.json')
        # Previously the named variant used 'spacy_docs.pkl' while the unnamed
        # variant (and the documented default) used 'spacy_docs.bin'; both now
        # use the documented '.bin' filename.
        docs_fname = _fname('spacy_docs.bin')

        meta_fname = meta_fname + ('.gz' if compression == 'gzip' else
                                   '.bz2' if compression == 'bz2' else
                                   '.xz' if compression == 'lzma' else '')
        # Bytes mode is used only on Python 2 with compression requested;
        # every other combination writes text.
        meta_mode = 'wt' if is_python2 is False or compression is None else 'wb'
        package_info = {
            'textacy_lang': self.lang,
            'spacy_version': spacy.about.__version__
        }
        fileio.write_json(package_info, info_fname)
        fileio.write_json_lines((doc.metadata for doc in self),
                                meta_fname,
                                mode=meta_mode,
                                ensure_ascii=False,
                                separators=(',', ':'))
        fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
Exemple #12
0
 def test_read_write_json_lines_bytes(self):
     """
     Exercise bytes-mode ('wb'/'rb') JSON-lines I/O for the plain and the
     compressed extensions.

     Expectations encoded below:
       * is_python2 is not True: ``fileio.write_json_lines`` in mode 'wb'
         must raise TypeError for every extension.
       * is_python2 is True and extension '.json.xz': ``fileio.open_sesame``
         must raise ValueError.
       * is_python2 is True otherwise: records written then read back must
         equal the originals.
     """
     records = []
     for idx, sentence in enumerate(self.spacy_doc.sents):
         records.append({'idx': idx, 'sent': sentence.text})
     stem = 'test_read_write_json_lines_bytes'
     for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
         filename = os.path.join(self.tempdir, stem + ext)
         if is_python2 is not True:
             self.assertRaises(TypeError, fileio.write_json_lines, records,
                               filename, 'wb', None, True)
             continue
         if ext == '.json.xz':
             self.assertRaises(ValueError, fileio.open_sesame, filename,
                               'wb', 'utf-8', True)
             continue
         fileio.write_json_lines(records, filename, mode='wb',
                                 auto_make_dirs=True)
         roundtripped = list(fileio.read_json_lines(filename, mode='rb'))
         self.assertEqual(roundtripped, records)
Exemple #13
0
    def save(self, path, name=None, compression=None):
        """
        Write this ``Corpus`` to a directory as three files: package info,
        per-document metadata, and serialized spaCy docs.

        Args:
            path (str): Directory on disk where the files will be written.
            name (str): If given, joined with an underscore in front of each
                default filename ('info.json', 'metadatas.json',
                'spacy_docs.bin') to identify/uniquify this particular corpus.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                for the 'metadatas.json' file, if any; selects the extension
                appended to its filename ('.gz', '.bz2' or '.xz').

        .. warning:: If the ``spacy.Vocab`` object used to save this corpus is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _target(basename):
            # '<name>_<basename>' when a name was supplied, else the bare name
            if name:
                basename = '_'.join([name, basename])
            return os.path.join(path, basename)

        info_fname = _target('info.json')
        meta_fname = _target('metadatas.json')
        docs_fname = _target('spacy_docs.bin')

        # only the metadata file's name carries a compression extension
        if compression == 'gzip':
            meta_fname = meta_fname + '.gz'
        elif compression == 'bz2':
            meta_fname = meta_fname + '.bz2'
        elif compression == 'lzma':
            meta_fname = meta_fname + '.xz'

        # binary mode is used only when PY2 is not False AND compression is set
        if PY2 is not False and compression is not None:
            meta_mode = 'wb'
        else:
            meta_mode = 'wt'

        fileio.write_json(
            {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
            info_fname)
        metadatas = (doc.metadata for doc in self)
        fileio.write_json_lines(
            metadatas, meta_fname, mode=meta_mode,
            ensure_ascii=False, separators=(',', ':'))
        spacy_docs = (doc.spacy_doc for doc in self)
        fileio.write_spacy_docs(spacy_docs, docs_fname)