def cpu_softmax(X, drop=0.0):
    ops = NumpyOps()

    def cpu_softmax_backward(dY, sgd=None):
        return dY

    return ops.softmax(X), cpu_softmax_backward
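# A minimal usage sketch of cpu_softmax above (hypothetical data; assumes
# cpu_softmax and its NumpyOps import are in scope, thinc 7.x style). The
# backward callback passes the gradient through unchanged, which is correct
# when the caller already computes the gradient with respect to the softmax
# inputs, e.g. (softmax output - one-hot target) for categorical cross-entropy.
import numpy

X = numpy.asarray([[1.0, 2.0, 3.0]], dtype="float32")
Y, backprop = cpu_softmax(X)
assert numpy.allclose(Y.sum(axis=1), 1.0)  # each row is a probability distribution
dX = backprop(Y - numpy.asarray([[0.0, 0.0, 1.0]], dtype="float32"))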
def mnist():
    train_data, dev_data, _ = datasets.mnist()
    train_X, train_y = NumpyOps().unzip(train_data)
    dev_X, dev_y = NumpyOps().unzip(dev_data)
    dev_y = to_categorical(dev_y, nb_classes=10)
    train_y = to_categorical(train_y, nb_classes=10)
    return (train_X[:1000], train_y[:1000]), (dev_X, dev_y)
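# Quick shape check for the loader above, grounded in its own code: inputs and
# labels are trimmed to 1000 rows and labels are one-hot over 10 classes, so
# train_y should come out as (1000, 10). (Assumes datasets.mnist and
# NumpyOps.unzip return array-like splits, as the function already relies on.)
(train_X, train_y), (dev_X, dev_y) = mnist()
assert train_y.shape == (1000, 10)
assert dev_y.shape[1] == 10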
def from_bytes(self, bytes_data):
    """Deserialize the DocBin's annotations from a bytestring.

    bytes_data (bytes): The data to load from.
    RETURNS (DocBin): The loaded DocBin.

    DOCS: https://spacy.io/api/docbin#from_bytes
    """
    msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
    self.attrs = msg["attrs"]
    self.strings = set(msg["strings"])
    lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
    flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
    flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
    shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
    flat_tokens = flat_tokens.reshape(shape)
    flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
    self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
    self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
    if self.store_user_data and "user_data" in msg:
        self.user_data = list(msg["user_data"])
    for tokens in self.tokens:
        assert len(tokens.shape) == 2, tokens.shape  # this should never happen
    return self
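# A minimal sketch of the flatten/unflatten round-trip the deserializer above
# relies on (hypothetical data): the serialized form stores one concatenated
# token array plus per-doc lengths, and NumpyOps.unflatten splits it back into
# one array per doc. The import path assumes thinc 7.x.
import numpy
from thinc.neural.ops import NumpyOps

ops = NumpyOps()
lengths = numpy.asarray([2, 3], dtype="int32")               # two docs: 2 and 3 tokens
flat = numpy.arange(5 * 4, dtype="uint64").reshape((5, 4))   # 5 tokens x 4 attrs
docs = ops.unflatten(flat, lengths)
assert [d.shape for d in docs] == [(2, 4), (3, 4)]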
def from_bytes(self, string):
    """Deserialize the binder's annotations from a byte string."""
    msg = srsly.msgpack_loads(gzip.decompress(string))
    self.attrs = msg["attrs"]
    self.strings = set(msg["strings"])
    lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
    flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
    flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
    shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
    flat_tokens = flat_tokens.reshape(shape)
    flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
    self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
    self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
    for tokens in self.tokens:
        assert len(tokens.shape) == 2, tokens.shape
    return self
def test_add_label(parser):
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == "left"
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 3
    parser.add_label("right")
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == "left"
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 3
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(10):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    doc = parser(doc)
    assert doc[0].dep_ == "right"
    assert doc[2].dep_ == "left"
def test_square_sequences():
    # square_sequences pads variable-length sequences into a single
    # (max_length, n_seqs, width) array; size_at_t[t] is how many sequences
    # are still active at timestep t, and unpad reverses the padding.
    ops = NumpyOps()
    seqs = [numpy.zeros((5, 4)), numpy.zeros((8, 4)), numpy.zeros((2, 4))]
    arr, size_at_t, unpad = ops.square_sequences(seqs)
    assert arr.shape == (8, 3, 4)
    assert size_at_t[0] == 3
    assert size_at_t[1] == 3
    assert size_at_t[2] == 2
    assert size_at_t[3] == 2
    assert size_at_t[4] == 2
    assert size_at_t[5] == 1
    assert size_at_t[6] == 1
    assert size_at_t[7] == 1
    unpadded = unpad(arr)
    assert unpadded[0].shape == (5, 4)
    assert unpadded[1].shape == (8, 4)
    assert unpadded[2].shape == (2, 4)
def forward(X, drop=0.0):
    if isinstance(X, numpy.ndarray):
        ops = NumpyOps()
    else:
        ops = CupyOps()
    output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

    def backward(y, sgd=None):
        dX = ops.allocate(X.shape)
        dX[:, idx] += y
        return dX

    return output, backward
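# Hypothetical usage of the column-slicing forward above, assuming idx is the
# list of column indices it captures from the enclosing scope. The backward
# pass scatters the incoming gradient back into a zero array of the input's
# shape, leaving the unselected columns at zero.
import numpy

idx = [0, 2]
X = numpy.arange(12, dtype="float32").reshape((3, 4))
Y, backprop = forward(X)
assert Y.shape == (3, 2)
dX = backprop(numpy.ones_like(Y))
assert dX.shape == X.shape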
def _train_parser(parser):
    fix_random_seed(1)
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(5):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
def parser(vocab):
    parser = DependencyParser(vocab)
    parser.cfg["token_vector_width"] = 4
    parser.cfg["hidden_width"] = 32
    # parser.add_label('right')
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
def load(cls, lang, filepath):
    """
    Load previously saved :class:`Corpus` binary data, reproduce the original
    :class:`spacy.tokens.Doc` tokens and annotations, and instantiate
    a new :class:`Corpus` from them.

    Args:
        lang (str or :class:`spacy.language.Language`)
        filepath (str): Full path to file on disk where :class:`Corpus` data
            was previously saved as a binary file.

    Returns:
        :class:`Corpus`

    See Also:
        :meth:`Corpus.save()`
    """
    spacy_lang = _get_spacy_lang(lang)
    with tio.open_sesame(filepath, mode="rb") as f:
        msg = srsly.msgpack_loads(f.read())
    if spacy_lang.meta != msg["meta"]:
        LOGGER.warning("the spacy langs are different!")
    for string in msg["strings"]:
        spacy_lang.vocab[string]
    attrs = msg["attrs"]
    lengths = np.frombuffer(msg["lengths"], dtype="int32")
    flat_tokens = np.frombuffer(msg["tokens"], dtype="uint64")
    flat_tokens = flat_tokens.reshape((flat_tokens.size // len(attrs), len(attrs)))
    tokens = np.asarray(NumpyOps().unflatten(flat_tokens, lengths))
    user_datas = msg["user_datas"]

    def _make_spacy_docs(tokens, user_datas):
        for toks, user_data in compat.zip_(tokens, user_datas):
            doc = spacy.tokens.Doc(
                spacy_lang.vocab,
                words=[spacy_lang.vocab.strings[orth] for orth in toks[:, 0]],
                spaces=np.ndarray.tolist(toks[:, 1]),
            )
            doc = doc.from_array(attrs[2:], toks[:, 2:])
            doc.user_data = user_data
            yield doc

    return cls(spacy_lang, data=_make_spacy_docs(tokens, user_datas))
class SpacyVectors(Model):
    """Project static spaCy vocab vectors through a learned weight matrix.

    Lookups and the matrix product run on CPU via NumpyOps; if the input ids
    arrive on GPU, activations and gradients are copied across devices.
    """

    ops = NumpyOps()
    name = "spacy-vectors"

    def __init__(self, nlp, nO):
        Model.__init__(self)
        self._id_map = {0: 0}
        self.nO = nO
        self.nM = nlp.vocab.vectors_length
        self.nlp = nlp

    @property
    def nV(self):
        return len(self.nlp.vocab)

    def begin_update(self, ids, drop=0.0):
        if not isinstance(ids, numpy.ndarray):
            ids = ids.get()  # copy GPU ids to host
            gpu_in = True
        else:
            gpu_in = False
        # Only fetch each distinct vector once; `inverse` maps rows back.
        uniqs, inverse = numpy.unique(ids, return_inverse=True)
        vectors = self.ops.allocate((uniqs.shape[0], self.nM))
        for i, orth in enumerate(uniqs):
            vectors[i] = self.nlp.vocab[orth].vector

        def finish_update(gradients, sgd=None):
            if gpu_in:
                gradients = gradients.get()
            self.d_W += self.ops.batch_outer(gradients, vectors[inverse])
            if sgd is not None:
                # Temporarily step the optimizer on CPU, where the weights live.
                ops = sgd.ops
                sgd.ops = self.ops
                sgd(self._mem.weights, self._mem.gradient, key=id(self._mem))
                sgd.ops = ops
            return None

        dotted = self.ops.batch_dot(vectors, self.W)
        if gpu_in:
            return cupy.asarray(dotted[inverse]), finish_update
        else:
            return dotted[inverse], finish_update
def cpu_ops():
    return NumpyOps()
def get_model(W_values, b_values):
    model = Affine(W_values.shape[0], W_values.shape[1], ops=NumpyOps())
    model.initialize_params()
    model.W[:] = W_values
    model.b[:] = b_values
    return model
def sgd():
    return SGD(NumpyOps(), 0.001)
def get_input(nr_batch, nr_in):
    ops = NumpyOps()
    return ops.allocate((nr_batch, nr_in))
    trainer = context['trainer']

    def each_epoch():
        global epoch_train_acc
        acc = model.evaluate(dev_X, dev_y)
        with model.use_params(trainer.optimizer.averages):
            avg_acc = model.evaluate(dev_X, dev_y)
        stats = (acc, avg_acc, float(epoch_train_acc) / n_train, trainer.dropout)
        print("%.3f (%.3f) dev acc, %.3f train acc, %.4f drop" % stats)
        epoch_train_acc = 0.0

    return each_epoch


remapping = layerize(remap_ids(NumpyOps()))


def preprocess(ops, data, nr_tag):
    Xs, ys = zip(*data)
    Xs = [ops.asarray(remapping(x)) for x in Xs]
    ys = [ops.asarray(to_categorical(y, nb_classes=nr_tag)) for y in ys]
    return Xs, ys


_i = 0


def debug(X, drop=0.0):
    global _i
    if _i % 1000 == 0:
def ops():
    return NumpyOps()
def get_ops():
    return NumpyOps()