def interpret_file(path, encoding='utf-8', readers: dict = None): """Read a file's using the proper loader from the extension""" path = Path(path).expanduser().resolve() s = path.suffix.lower() if readers is None: readers = {} elif not isinstance(readers, dict): assert callable(readers) readers = {s: readers} if s in readers: func = readers[s] assert callable(func) return func(path) elif s == '.json': return srsly.read_json(path) elif s == '.jsonl': return srsly.read_jsonl(path) elif s in ('.yml', '.yaml'): # return yaml.load(path.read_bytes(), Loader=YamlLoader) return yaml.load(path.read_bytes()) elif s in ('.pkl', '.bin', '.pickle'): return srsly.pickle_loads(path.read_text(encoding=encoding)) elif s not in _TEXT_EXT: return path.read_bytes() else: return path.read_text(encoding=encoding)
def test_pickle_with_flatten(linear): Xs = [linear.ops.alloc2f(2, 3), linear.ops.alloc2f(4, 3)] model = with_array(linear).initialize() pickled = srsly.pickle_dumps(model) loaded = srsly.pickle_loads(pickled) Ys = loaded.predict(Xs) assert len(Ys) == 2 assert Ys[0].shape == (Xs[0].shape[0], linear.get_dim("nO")) assert Ys[1].shape == (Xs[1].shape[0], linear.get_dim("nO"))
def test_pickle_string_store(text1, text2): stringstore = StringStore() store1 = stringstore[text1] store2 = stringstore[text2] data = srsly.pickle_dumps(stringstore, protocol=-1) unpickled = srsly.pickle_loads(data) assert unpickled[text1] == store1 assert unpickled[text2] == store2 assert len(stringstore) == len(unpickled)
def test_pickle_vocab(text1, text2): vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) vocab.set_vector("dog", numpy.ones((5, ), dtype="f")) lex1 = vocab[text1] lex2 = vocab[text2] assert lex1.norm_ == text1[:-1] assert lex2.norm_ == text2[:-1] data = srsly.pickle_dumps(vocab) unpickled = srsly.pickle_loads(data) assert unpickled[text1].orth == lex1.orth assert unpickled[text2].orth == lex2.orth assert unpickled[text1].norm == lex1.norm assert unpickled[text2].norm == lex2.norm assert unpickled[text1].norm != unpickled[text2].norm assert unpickled.vectors is not None assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
def test_pickle_vocab(text1, text2): vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) vocab.set_vector("dog", numpy.ones((5,), dtype="f")) lex1 = vocab[text1] lex2 = vocab[text2] assert lex1.norm_ == text1[:-1] assert lex2.norm_ == text2[:-1] data = srsly.pickle_dumps(vocab) unpickled = srsly.pickle_loads(data) assert unpickled[text1].orth == lex1.orth assert unpickled[text2].orth == lex2.orth assert unpickled[text1].norm == lex1.norm assert unpickled[text2].norm == lex2.norm assert unpickled[text1].norm != unpickled[text2].norm assert unpickled.vectors is not None assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
def test_pickle_doc(en_vocab): words = ["a", "b", "c"] deps = ["dep"] * len(words) heads = [0] * len(words) doc = Doc( en_vocab, words=words, deps=deps, heads=heads, ) data = srsly.pickle_dumps(doc) unpickled = srsly.pickle_loads(data) assert [t.text for t in unpickled] == words assert [t.dep_ for t in unpickled] == deps assert [t.head.i for t in unpickled] == heads assert list(doc.noun_chunks) == []
def test_phrase_matcher_pickle(en_vocab): matcher = PhraseMatcher(en_vocab) mock = Mock() matcher.add("TEST", [Doc(en_vocab, words=["test"])]) matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock) doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"]) assert len(matcher) == 2 b = srsly.pickle_dumps(matcher) matcher_unpickled = srsly.pickle_loads(b) # call after pickling to avoid recursion error related to mock matches = matcher(doc) matches_unpickled = matcher_unpickled(doc) assert len(matcher) == len(matcher_unpickled) assert matches == matches_unpickled # clunky way to vaguely check that callback is unpickled (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] assert isinstance(callbacks.get("TEST2"), Mock)
def train_epoch( model, sgd, hparams, train_X, train_y, dev_X, dev_y, device_id=-1, temperature=0.0 ): model, sgd, hparams = srsly.pickle_loads(srsly.pickle_dumps((model, sgd, hparams))) if device_id >= 0: model.to_gpu(device_id) sgd.ops = model.ops sgd.to_gpu() if isinstance(train_y, numpy.ndarray): train_y = model.ops.asarray(train_y) dev_y = model.ops.asarray(dev_y) hparams = resample_hyper_params(hparams, temperature) sgd.learn_rate = hparams["learn_rate"] sgd.beta1 = hparams["beta1"] sgd.beta2 = hparams["beta2"] sgd.L2 = hparams["L2"] train_acc = 0.0 train_n = 0 for X, y in minibatch( train_X, train_y, size=hparams["batch_size"], nr_update=hparams["nr_update"] ): yh, finish_update = model.begin_update(X, drop=hparams["dropout"]) if hasattr(y, "shape"): dy = (yh - y) / y.shape[0] train_acc += (y.argmax(axis=1) == yh.argmax(axis=1)).sum() train_n += y.shape[0] else: n_y = sum(len(y_i) for y_i in y) dy = [(yh[i] - y[i]) / n_y for i in range(len(yh))] for i in range(len(y)): train_acc += (y[i].argmax(axis=1) == yh[i].argmax(axis=1)).sum() train_n += n_y finish_update(dy, sgd=sgd) train_acc /= train_n with model.use_params(sgd.averages): dev_acc = model.evaluate(dev_X, dev_y) model.to_cpu() sgd.to_cpu() return device_id, ((model, sgd, hparams), float(train_acc), float(dev_acc))
def __setstate__(self, state_data): self.__dict__ = srsly.pickle_loads(state_data)
def from_bytes(self, bytes_data, **kwargs): pkls = srsly.pickle_loads(bytes_data) for field in self.serialization_fields: setattr(self, field, pkls[field]) return self
def pickle_loads(data): return srsly.pickle_loads(data)