def noop_models():
    return [
        with_padded(noop()),
        with_array(noop()),
        with_array2d(noop()),
        with_list(noop()),
        with_ragged(noop()),
    ]
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        if drop is not None:
            drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
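The `hstack(f(x), g(x))` semantics described in the docstring can be illustrated with plain NumPy. This is a hedged sketch of the intended behaviour for a single input, not part of the function above; `f_x` and `g_x` are stand-ins for the per-layer outputs.

import numpy

x = numpy.ones((4, 3), dtype="f")
f_x = x * 2.0  # stand-in for f(x), shape (4, 3)
g_x = x * 3.0  # stand-in for g(x), shape (4, 3)
# The combined model stacks the per-layer outputs along the feature axis.
combined = numpy.hstack([f_x, g_x])
assert combined.shape == (4, 6)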
def TransitionModel(
    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
    """Set up a stepwise transition-based model"""
    if upper is None:
        has_upper = False
        upper = noop()
    else:
        has_upper = True
    # don't define nO for this object, because we can't dynamically change it
    return Model(
        name="parser_model",
        forward=forward,
        dims={"nI": tok2vec.maybe_get_dim("nI")},
        layers=[tok2vec, lower, upper],
        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
        init=init,
        attrs={
            "has_upper": has_upper,
            "unseen_classes": set(unseen_classes),
            "resize_output": resize_output,
        },
    )
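The `upper is None` branch shows the common pattern of using `noop()` as a placeholder for an optional layer. A minimal standalone sketch of the same idea using only public Thinc layers; the helper name `build_stack` is hypothetical.

from typing import Optional
from thinc.api import Model, Linear, chain, noop

def build_stack(hidden: Optional[Model] = None) -> Model:
    # If no hidden layer is supplied, noop() keeps the composition valid
    # while passing activations through unchanged.
    hidden = hidden if hidden is not None else noop()
    return chain(Linear(nO=8, nI=4), hidden)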
def test_noop():
    data = numpy.asarray([1, 2, 3], dtype="f")
    model = noop(Linear(), Linear())
    model.initialize(data, data)
    Y, backprop = model(data, is_train=True)
    assert numpy.array_equal(Y, data)
    dX = backprop(Y)
    assert numpy.array_equal(dX, data)
def test_layerize_update_noop(model1, model2, nI):
    ones = numpy.ones((10, nI))
    model = layerize(noop(model1, model2))
    y, finish_update = model.begin_update(ones)
    assert_allclose(y, ones)
    grad_in = numpy.ones(y.shape) + 1.0
    grad_out = finish_update(grad_in)
    assert_allclose(grad_in, grad_out)
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
    import torch.nn
    from thinc.api import with_square_sequences
    from thinc.extra.wrappers import PyTorchWrapperRNN

    if depth == 0:
        return layerize(noop())
    model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
    return with_square_sequences(PyTorchWrapperRNN(model))
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
    if depth == 0:
        return layerize(noop())
    model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
    return with_square_sequences(PyTorchWrapperRNN(model))
def TorchBiLSTMEncoder(config):
    import torch.nn
    from thinc.extra.wrappers import PyTorchWrapperRNN

    width = config["width"]
    depth = config["depth"]
    if depth == 0:
        return layerize(noop())
    return with_square_sequences(
        PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
    )
def BiLSTMEncoder(
    width: int, depth: int, dropout: float
) -> Model[List[Floats2d], List[Floats2d]]:
    """Encode context using bidirectional LSTM layers. Requires PyTorch.

    width (int): The input and output width. These are required to be the same,
        to allow residual connections. This value will be determined by the
        width of the inputs. Recommended values are between 64 and 300.
    depth (int): The number of recurrent layers.
    dropout (float): Creates a Dropout layer on the outputs of each LSTM layer
        except the last layer. Set to 0 to disable this functionality.
    """
    if depth == 0:
        return noop()
    return with_padded(
        PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout)
    )
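A hedged usage sketch for the factory above, assuming PyTorch is installed; the widths and sequence lengths are illustrative only. With depth=0 the factory instead returns noop(), a parameter-free pass-through layer.

import numpy

encoder = BiLSTMEncoder(width=8, depth=1, dropout=0.0)
docs = [numpy.zeros((5, 8), dtype="f"), numpy.zeros((3, 8), dtype="f")]
encoder.initialize(X=docs)
outputs = encoder.predict(docs)  # one array per doc, same lengths, width 8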
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=128,
    depth=4,
    embed_rows=1000,
    dropout=0.2,
    nr_iter=10,
    seed=0,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to
    predict vectors which match the pre-trained ones. The weights are saved
    to a directory after each epoch. You can then pass a path to one of these
    pre-trained weights files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure all
    settings are the same between pretraining and training. The API and errors
    around this need some improvement.
    """
    config = dict(locals())
    output_dir = ensure_path(output_dir)
    random.seed(seed)
    numpy.random.seed(seed)
    if not output_dir.exists():
        output_dir.mkdir()
    with (output_dir / "config.json").open("w") as file_:
        file_.write(json.dumps(config))
    has_gpu = prefer_gpu()
    nlp = spacy.load(vectors_model)
    tok2vec = Tok2Vec_LSTM(width, embed_rows, depth, dropout)
    print(dir(tok2vec))
    model = create_pretraining_model(nlp, tok2vec)
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker()
    print("Epoch", "#Words", "Loss", "L/W", "w/s")
    texts = stream_texts() if texts_loc == "-" else load_texts(texts_loc)
    for epoch in range(nr_iter):
        for batch in minibatch(texts, size=256):
            docs = make_docs(nlp, batch, heads=False)
            loss = make_update(model, docs, optimizer, drop=dropout)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                print(*progress)
            if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 6:
                break
        with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
            # This is annoying -- work around how Parser expects this
            file_.write(chain(tok2vec, layerize(noop())).to_bytes())
        with (output_dir / "log.jsonl").open("a") as file_:
            file_.write(
                json.dumps(
                    {"nr_word": tracker.nr_word, "loss": tracker.loss, "epoch": epoch}
                )
            )
        if texts_loc != "-":
            texts = load_texts(texts_loc)
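A hypothetical invocation of the function above. The corpus path, vectors package, and output directory are all placeholders; "-" streams texts from stdin, and the file format otherwise depends on the load_texts helper referenced in the body.

pretrain(
    texts_loc="texts.jsonl",          # placeholder corpus path, or "-" for stdin
    vectors_model="en_core_web_md",   # any installed pipeline with pretrained vectors
    output_dir="pretrain_output",     # placeholder; created if it does not exist
    width=128,
    depth=4,
    embed_rows=1000,
    dropout=0.2,
    nr_iter=10,
)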
def test_tuplify_dulicates_input():
    model = tuplify(noop(), noop())
    ones = numpy.ones([10])
    out = model.predict(ones)
    assert out == (ones, ones)
def test_layerize_predict_noop(model1, model2, nI):
    ones = numpy.ones((10, nI))
    model = layerize(noop(model1, model2))
    y = model(ones)
    assert_allclose(y, ones)