Example #1
def cpu_softmax(X, drop=0.0):
    ops = NumpyOps()

    def cpu_softmax_backward(dY, sgd=None):
        return dY

    return ops.softmax(X), cpu_softmax_backward
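A brief usage sketch (assuming thinc 7.x, where NumpyOps lives in thinc.neural.ops): the helper follows the old thinc callback convention, returning the softmax activations together with an identity backward pass.

import numpy
from thinc.neural.ops import NumpyOps

X = numpy.random.uniform(-1.0, 1.0, (2, 3)).astype("float32")
Y, backprop = cpu_softmax(X)
assert numpy.allclose(Y.sum(axis=1), 1.0)  # each row is a probability distribution
dX = backprop(Y)  # identity backward: the gradient passes straight through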
Example #2
def mnist():
    train_data, dev_data, _ = datasets.mnist()
    train_X, train_y = NumpyOps().unzip(train_data)
    dev_X, dev_y = NumpyOps().unzip(dev_data)
    dev_y = to_categorical(dev_y, nb_classes=10)
    train_y = to_categorical(train_y, nb_classes=10)
    return (train_X[:1000], train_y[:1000]), (dev_X, dev_y)
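A brief usage sketch: the fixture returns a 1,000-example training slice plus the full dev set, with labels one-hot encoded over the 10 digit classes.

(train_X, train_y), (dev_X, dev_y) = mnist()
assert train_y.shape == (1000, 10)  # one-hot labels for the training slice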
Example #3
def cpu_softmax(X, drop=0.0):
    ops = NumpyOps()

    def cpu_softmax_backward(dY, sgd=None):
        return dY

    return ops.softmax(X), cpu_softmax_backward
Example #4
    def from_bytes(self, bytes_data):
        """Deserialize the DocBin's annotations from a bytestring.

        bytes_data (bytes): The data to load from.
        RETURNS (DocBin): The loaded DocBin.

        DOCS: https://spacy.io/api/docbin#from_bytes
        """
        msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
        self.attrs = msg["attrs"]
        self.strings = set(msg["strings"])
        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
        flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
        flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
        shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
        flat_tokens = flat_tokens.reshape(shape)
        flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
        self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
        self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
        if self.store_user_data and "user_data" in msg:
            self.user_data = list(msg["user_data"])
        for tokens in self.tokens:
            assert len(tokens.shape) == 2, tokens.shape  # this should never happen
        return self
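For context, a minimal round-trip sketch against spaCy's public DocBin API (spaCy 2.2+), where the from_bytes above is the counterpart of to_bytes:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(store_user_data=True)
doc_bin.add(nlp("Hello world"))
restored = DocBin().from_bytes(doc_bin.to_bytes())
docs = list(restored.get_docs(nlp.vocab))  # rebuilt Doc objects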
Example #5
    def from_bytes(self, string):
        """Deserialize the binder's annotations from a byte string."""
        msg = srsly.msgpack_loads(gzip.decompress(string))
        self.attrs = msg["attrs"]
        self.strings = set(msg["strings"])
        lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
        flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
        flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
        shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
        flat_tokens = flat_tokens.reshape(shape)
        flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
        self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
        self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
        for tokens in self.tokens:
            assert len(tokens.shape) == 2, tokens.shape
        return self
Example #6
def test_add_label(parser):
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == "left"
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 3
    parser.add_label("right")
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == "left"
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 3
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(10):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc,
                         heads=[1, 1, 3, 3],
                         deps=["right", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
    doc = parser(doc)
    assert doc[0].dep_ == "right"
    assert doc[2].dep_ == "left"
Example #7
def test_add_label(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == 'left'
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 3
    parser.add_label('right')
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert doc[0].head.i == 1
    assert doc[0].dep_ == 'left'
    assert doc[1].head.i == 1
    assert doc[2].head.i == 3
    assert doc[3].head.i == 3
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(10):
        losses = {}
        doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
        gold = GoldParse(doc,
                         heads=[1, 1, 3, 3],
                         deps=['right', 'ROOT', 'left', 'ROOT'])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc = parser(doc)
    assert doc[0].dep_ == 'right'
    assert doc[2].dep_ == 'left'
Example #8
def test_square_sequences():
    ops = NumpyOps()
    seqs = [numpy.zeros((5, 4)), numpy.zeros((8, 4)), numpy.zeros((2, 4))]
    arr, size_at_t, unpad = ops.square_sequences(seqs)
    assert arr.shape == (8, 3, 4)
    assert size_at_t[0] == 3
    assert size_at_t[1] == 3
    assert size_at_t[2] == 2
    assert size_at_t[3] == 2
    assert size_at_t[4] == 2
    assert size_at_t[5] == 1
    assert size_at_t[6] == 1
    assert size_at_t[7] == 1
    unpadded = unpad(arr)
    assert unpadded[0].shape == (5, 4)
    assert unpadded[1].shape == (8, 4)
    assert unpadded[2].shape == (2, 4)
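A numpy-only sketch of what size_at_t encodes, as implied by the asserts above: at timestep t it is the number of sequences longer than t, which is also why arr's leading dimension is 8, the length of the longest sequence.

import numpy

lengths = numpy.array([5, 8, 2])
size_at_t = [int((lengths > t).sum()) for t in range(lengths.max())]
assert size_at_t == [3, 3, 2, 2, 2, 1, 1, 1]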
Example #9
    def forward(X, drop=0.0):
        # Choose ops to match the array's device: numpy arrays on CPU,
        # anything else (e.g. cupy) on GPU.
        if isinstance(X, numpy.ndarray):
            ops = NumpyOps()
        else:
            ops = CupyOps()
        # `idx` comes from the enclosing scope: keep only those columns.
        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            # Scatter the gradient back into a full-width zero array.
            dX = ops.allocate(X.shape)
            dX[:, idx] += y
            return dX

        return output, backward
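The forward above closes over an idx defined in an enclosing factory that is not shown. A self-contained sketch of the same pattern (the factory name column_slice is hypothetical, assuming thinc 7.x):

import numpy
from thinc.neural.ops import NumpyOps

def column_slice(idx):
    # Hypothetical factory: slice columns on the way forward, scatter
    # the gradient back into a full-width array on the way back.
    ops = NumpyOps()

    def forward(X, drop=0.0):
        output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(dY, sgd=None):
            dX = ops.allocate(X.shape)
            dX[:, idx] += dY
            return dX

        return output, backward

    return forward

Y, backprop = column_slice([0, 2])(numpy.zeros((4, 3), dtype="float32"))
assert Y.shape == (4, 2)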
Example #10
def _train_parser(parser):
    fix_random_seed(1)
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

    for i in range(5):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc,
                         heads=[1, 1, 3, 3],
                         deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
Example #11
def parser(vocab):
    parser = DependencyParser(vocab)
    parser.cfg["token_vector_width"] = 4
    parser.cfg["hidden_width"] = 32
    # parser.add_label('right')
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
Example #12
    def load(cls, lang, filepath):
        """
        Load previously saved :class:`Corpus` binary data, reproduce the original
        :class:`spacy.tokens.Doc` tokens and annotations, and instantiate
        a new :class:`Corpus` from them.

        Args:
            lang (str or :class:`spacy.language.Language`)
            filepath (str): Full path to file on disk where :class:`Corpus` data
                was previously saved as a binary file.

        Returns:
            :class:`Corpus`

        See Also:
            :meth:`Corpus.save()`
        """
        spacy_lang = _get_spacy_lang(lang)
        with tio.open_sesame(filepath, mode="rb") as f:
            msg = srsly.msgpack_loads(f.read())
        if spacy_lang.meta != msg["meta"]:
            LOGGER.warning("the spacy langs are different!")
        for string in msg["strings"]:
            spacy_lang.vocab[string]  # intern the string into the vocab
        attrs = msg["attrs"]
        lengths = np.frombuffer(msg["lengths"], dtype="int32")
        flat_tokens = np.frombuffer(msg["tokens"], dtype="uint64")
        flat_tokens = flat_tokens.reshape(
            (flat_tokens.size // len(attrs), len(attrs)))
        tokens = np.asarray(NumpyOps().unflatten(flat_tokens, lengths))
        user_datas = msg["user_datas"]

        def _make_spacy_docs(tokens, user_datas):
            for toks, user_data in compat.zip_(tokens, user_datas):
                doc = spacy.tokens.Doc(
                    spacy_lang.vocab,
                    words=[
                        spacy_lang.vocab.strings[orth] for orth in toks[:, 0]
                    ],
                    spaces=np.ndarray.tolist(toks[:, 1]),
                )
                doc = doc.from_array(attrs[2:], toks[:, 2:])
                doc.user_data = user_data
                yield doc

        return cls(spacy_lang, data=_make_spacy_docs(tokens, user_datas))
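Hypothetical usage (the path is illustrative); this classmethod pairs with the matching Corpus.save():

corpus = Corpus.load("en", "/path/to/corpus.bin.gz")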
Example #13
def parser(vocab):
    parser = DependencyParser(vocab)
    parser.cfg['token_vector_width'] = 4
    parser.cfg['hidden_width'] = 32
    #parser.add_label('right')
    parser.add_label('left')
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
        gold = GoldParse(doc,
                         heads=[1, 1, 3, 3],
                         deps=['left', 'ROOT', 'left', 'ROOT'])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
Example #14
class SpacyVectors(Model):
    ops = NumpyOps()
    name = 'spacy-vectors'

    def __init__(self, nlp, nO):
        Model.__init__(self)
        self._id_map = {0: 0}
        self.nO = nO
        self.nM = nlp.vocab.vectors_length
        self.nlp = nlp

    @property
    def nV(self):
        return len(self.nlp.vocab)

    def begin_update(self, ids, drop=0.):
        if not isinstance(ids, numpy.ndarray):
            ids = ids.get()
            gpu_in = True
        else:
            gpu_in = False
        uniqs, inverse = numpy.unique(ids, return_inverse=True)
        vectors = self.ops.allocate((uniqs.shape[0], self.nM))
        for i, orth in enumerate(uniqs):
            vectors[i] = self.nlp.vocab[orth].vector

        def finish_update(gradients, sgd=None):
            if gpu_in:
                gradients = gradients.get()
            self.d_W += self.ops.batch_outer(gradients, vectors[inverse, ])
            if sgd is not None:
                ops = sgd.ops
                sgd.ops = self.ops
                sgd(self._mem.weights, self._mem.gradient, key=id(self._mem))
                sgd.ops = ops
            return None

        dotted = self.ops.batch_dot(vectors, self.W)
        if gpu_in:
            return cupy.asarray(dotted[inverse, ]), finish_update
        else:
            return dotted[inverse, ], finish_update
Example #15
def cpu_ops():
    return NumpyOps()
Example #16
def get_model(W_values, b_values):
    model = Affine(W_values.shape[0], W_values.shape[1], ops=NumpyOps())
    model.initialize_params()
    model.W[:] = W_values
    model.b[:] = b_values
    return model
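A brief usage sketch (assuming old-style thinc, where Affine(nO, nI) pairs with a weight matrix of shape (nO, nI)): these values build a layer mapping 3 inputs to 2 outputs.

import numpy

W = numpy.ones((2, 3), dtype="float32")
b = numpy.zeros((2,), dtype="float32")
model = get_model(W, b)
Y, _ = model.begin_update(numpy.zeros((4, 3), dtype="float32"))
assert Y.shape == (4, 2)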
Example #17
def sgd():
    return SGD(NumpyOps(), 0.001)
Example #18
def get_input(nr_batch, nr_in):
    ops = NumpyOps()
    return ops.allocate((nr_batch, nr_in))
Example #19
    trainer = context['trainer']

    def each_epoch():
        global epoch_train_acc
        acc = model.evaluate(dev_X, dev_y)
        with model.use_params(trainer.optimizer.averages):
            avg_acc = model.evaluate(dev_X, dev_y)
        stats = (acc, avg_acc, float(epoch_train_acc) / n_train,
                 trainer.dropout)
        print("%.3f (%.3f) dev acc, %.3f train acc, %.4f drop" % stats)
        epoch_train_acc = 0.

    return each_epoch


remapping = layerize(remap_ids(NumpyOps()))


def preprocess(ops, data, nr_tag):
    Xs, ys = zip(*data)
    Xs = [ops.asarray(remapping(x)) for x in Xs]
    ys = [ops.asarray(to_categorical(y, nb_classes=nr_tag)) for y in ys]
    return Xs, ys


_i = 0


def debug(X, drop=0.):
    global _i
    if _i % 1000 == 0:
Example #20
def ops():
    return NumpyOps()
Example #21
def get_ops():
    return NumpyOps()