def test_read_write_docs(self, tmpdir, spacy_doc):
    """Round-trip ``spacy_doc`` through each pickle-style file extension.

    For every extension in the list, the doc is written to a file under
    ``tmpdir`` via ``io.write_spacy_docs`` and read back via
    ``io.read_spacy_docs``; the ``lower_`` values of all tokens read back
    must equal those of the original doc.
    """
    want = [token.lower_ for token in spacy_doc]
    extensions = (".pkl", ".pkl.gz", ".pkl.bz2", ".pkl.xz")
    for suffix in extensions:
        target = str(tmpdir.join("test_read_write_spacy_docs" + suffix))
        io.write_spacy_docs(spacy_doc, target, make_dirs=True)
        # Flatten tokens across every doc yielded by the reader.
        got = []
        for doc in io.read_spacy_docs(target):
            got.extend(token.lower_ for token in doc)
        assert got == want
def test_read_write_docs_binary(self, tmpdir, spacy_doc):
    """Round-trip ``spacy_doc`` through the ``"binary"`` format.

    Writes the doc to a ``.bin`` file under ``tmpdir``, checks that pulling
    the first item from the reader with ``lang=None`` raises ``ValueError``,
    then reads with ``lang="en"`` and compares the ``lower_`` values of all
    tokens against the original doc.
    """
    want = [token.lower_ for token in spacy_doc]
    target = str(tmpdir.join("test_read_write_spacy_docs_binary.bin"))
    # Third positional argument is kept positional, exactly as the call was
    # written (presumably ``make_dirs`` -- confirm against the writer's signature).
    io.write_spacy_docs(spacy_doc, target, True, format="binary")

    # The reader is consumed with ``next`` so the error surfaces here.
    with pytest.raises(ValueError):
        next(io.read_spacy_docs(target, format="binary", lang=None))

    got = []
    for doc in io.read_spacy_docs(target, format="binary", lang="en"):
        got.extend(token.lower_ for token in doc)
    assert got == want
def test_read_write_docs_binary_exclude(self, tmpdir, spacy_doc):
    """Round-trip ``spacy_doc`` through the ``"binary"`` format with ``exclude``.

    Writes the doc with ``exclude=["sentiment", "user_data"]``, reads it back
    with ``lang="en"``, and asserts that the ``lower_`` values of all tokens
    read back equal those of the original doc.
    """
    want = [token.lower_ for token in spacy_doc]
    target = str(tmpdir.join("test_read_write_spacy_docs_binary_exclude.bin"))
    excluded = ["sentiment", "user_data"]
    io.write_spacy_docs(spacy_doc, target, True, format="binary", exclude=excluded)

    got = []
    for doc in io.read_spacy_docs(target, format="binary", lang="en"):
        got.extend(token.lower_ for token in doc)
    assert got == want
def test_read_write_docs(self, tmpdir, spacy_doc):
    """Round-trip ``spacy_doc`` through each pickle-style file extension.

    When ``compat.PY2`` is true and the extension is ``.pkl.xz``, the test
    only checks that ``io.open_sesame`` raises ``ValueError`` for that path.
    For every other combination the doc is written, read back, and the
    ``lower_`` values of all tokens are compared with the original doc.
    """
    want = [token.lower_ for token in spacy_doc]
    for suffix in (".pkl", ".pkl.gz", ".pkl.bz2", ".pkl.xz"):
        target = str(tmpdir.join("test_read_write_spacy_docs" + suffix))

        if compat.PY2 is True and suffix == ".pkl.xz":
            # Nothing is written in this case, so there is nothing to read back.
            with pytest.raises(ValueError):
                io.open_sesame(target, mode="wb", encoding=None, make_dirs=True)
            continue

        io.write_spacy_docs(spacy_doc, target, True)
        got = []
        for doc in io.read_spacy_docs(target):
            got.extend(token.lower_ for token in doc)
        assert got == want
def test_read_write_spacy_docs(tmpdir, spacy_doc):
    """Round-trip ``spacy_doc`` through each pickle-style file extension.

    When ``compat.is_python2`` is true and the extension is ``.pkl.xz``, the
    test only checks that ``io.open_sesame`` raises ``ValueError`` for that
    path. For every other combination the doc is written, read back, and the
    ``lemma_`` values of all tokens are compared with the original doc.
    """
    want = [token.lemma_ for token in spacy_doc]
    for suffix in ('.pkl', '.pkl.gz', '.pkl.bz2', '.pkl.xz'):
        target = str(tmpdir.join('test_read_write_spacy_docs' + suffix))

        if compat.is_python2 is True and suffix == '.pkl.xz':
            # Nothing is written in this case, so there is nothing to read back.
            with pytest.raises(ValueError):
                io.open_sesame(target, mode='wb', encoding=None, make_dirs=True)
            continue

        io.write_spacy_docs(spacy_doc, target, True)
        got = []
        for doc in io.read_spacy_docs(target):
            got.extend(token.lemma_ for token in doc)
        assert got == want
def test_read_write_docs_binary_attrs(self, tmpdir, spacy_doc):
    """Round-trip ``spacy_doc`` through the ``"binary"`` format with ``attrs``.

    Writes the doc with ``attrs=["ORTH", "TAG"]`` and
    ``store_user_data=False``, reads it back with ``lang="en_core_web_sm"``,
    and asserts that the ``tag_`` values of all tokens read back equal those
    of the original doc.
    """
    want = [token.tag_ for token in spacy_doc]
    # NOTE(review): the file name says "exclude" although this test exercises
    # ``attrs``; kept as-is here.
    target = str(tmpdir.join("test_read_write_spacy_docs_binary_exclude.bin"))
    kept_attrs = ["ORTH", "TAG"]
    io.write_spacy_docs(
        spacy_doc,
        target,
        make_dirs=True,
        format="binary",
        attrs=kept_attrs,
        store_user_data=False,
    )

    got = []
    for doc in io.read_spacy_docs(target, format="binary", lang="en_core_web_sm"):
        got.extend(token.tag_ for token in doc)
    assert got == want