def foreach_sentence(layer: Model, drop_factor: float = 1.0) -> Model:
    """Map a layer across sentences (assumes spaCy-esque .sents interface)"""

    def sentence_fwd(docs: List[Doc], drop: Dropout = 0.0) -> Tuple[Acts, Callable]:
        if not all(doc.is_sentenced for doc in docs):
            return layer.begin_update([d[:] for d in docs], drop=drop)
        sents = flatten_list([list(doc.sents) for doc in docs])
        words_per_doc = [len(d._.get(ATTRS.word_pieces)) for d in docs]
        words_per_sent = [len(s._.get(ATTRS.word_pieces)) for s in sents]
        sents_per_doc = [len(list(d.sents)) for d in docs]
        assert sum(words_per_doc) == sum(words_per_sent)
        acts, bp_acts = layer.begin_update(sents, drop=drop)
        # To go from "per sentence" activations to "per doc" activations, we
        # just have to tell it where the sequences end.
        acts.lh.lengths = words_per_doc
        acts.po.lengths = sents_per_doc

        def sentence_bwd(d_acts: Acts, sgd: Optional[Optimizer] = None) -> None:
            assert isinstance(d_acts, Acts)
            # Translate back to the per-sentence activations
            if d_acts.has_lh:
                assert d_acts.lh.data.shape[0] == sum(d_acts.lh.lengths)
                assert d_acts.lh.lengths == words_per_doc
                d_acts.lh.lengths = words_per_sent
                d_acts.po.lengths = [1 for _ in words_per_sent]
            d_ids = bp_acts(d_acts, sgd=sgd)
            if not (d_ids is None or all(ds is None for ds in d_ids)):
                raise ValueError("Expected gradient of sentence to be None")
            return d_ids

        return acts, sentence_bwd

    return wrap(sentence_fwd, layer)
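# Usage sketch for foreach_sentence (hypothetical names: `pytt_tok2vec` is any
# layer mapping a list of sentence Spans to Acts, and `docs` are spaCy Docs
# with sentence boundaries set). The wrapper runs the layer sentence-by-sentence
# and relabels the activation lengths so they read as per-document sequences:
#
#     per_sentence = foreach_sentence(pytt_tok2vec)
#     acts, backprop = per_sentence.begin_update(docs, drop=0.1)
#     backprop(d_acts)  # gradients of the word-piece ids are expected to be None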
def without_length_batching(model: PyTT_Wrapper, _: Any) -> Model:
    Input = List[Array]
    Output = List[Activations]
    Backprop = Callable[[Output, Optional[Optimizer]], Optional[Input]]
    model_begin_update: Callable[[Array, Dropout], Tuple[Activations, Backprop]]
    model_begin_update = model.begin_update

    def apply_model_padded(inputs: Input, drop: Dropout = 0.0) -> Tuple[Output, Backprop]:
        activs, get_dX = model_begin_update(pad_batch(inputs), drop)
        outputs = [
            activs.get_slice(i, slice(0, len(seq))) for i, seq in enumerate(inputs)
        ]

        def backprop_batched(d_outputs, sgd=None):
            d_activs = pad_batch_activations(d_outputs)
            dX = get_dX(d_activs, sgd=sgd)
            if dX is not None:
                d_inputs = [dX[i, : len(seq)] for i, seq in enumerate(d_outputs)]
            else:
                d_inputs = None
            return d_inputs

        return outputs, backprop_batched

    return wrap(apply_model_padded, model)
def without_length_batching(model: PyTT_Wrapper, _: Any) -> Model:
    Input = List[Array]
    Output = List[Activations]
    Backprop = Callable[[Output, Optional[Optimizer]], Optional[Input]]
    model_begin_update: Callable[[Array, Dropout], Tuple[Activations, Backprop]]
    model_begin_update = model.begin_update

    def apply_model_padded(inputs: Input, drop: Dropout = 0.0) -> Tuple[Output, Backprop]:
        activs, get_dX = model_begin_update(pad_batch(inputs), drop)
        last_hiddens = [activs.lh[i, : len(seq)] for i, seq in enumerate(inputs)]
        outputs = [Activations(y, [], [], []) for y in last_hiddens]

        def backprop_batched(d_outputs, sgd=None):
            d_last_hiddens = [x.lh for x in d_outputs]
            dY = pad_batch(d_last_hiddens)
            dY = dY.reshape(len(d_outputs), -1, dY.shape[-1])
            d_activs = Activations(dY, [], [], [], is_grad=True)
            dX = get_dX(d_activs, sgd=sgd)
            if dX is not None:
                d_inputs = [dX[i, : len(seq)] for i, seq in enumerate(d_outputs)]
            else:
                d_inputs = None
            return d_inputs

        return outputs, backprop_batched

    return wrap(apply_model_padded, model)
def apply_layers(*layers):
    """Take a sequence of layers and expect a tuple of inputs. Apply
    layers[0] to inputs[0], layers[1] to inputs[1], etc.
    """

    def apply_layers_forward(inputs, drop=0.0):
        assert len(inputs) == len(layers), (len(inputs), len(layers))
        outputs = []
        callbacks = []
        for layer, input_ in zip(layers, inputs):
            output, callback = layer.begin_update(input_, drop=drop)
            outputs.append(output)
            callbacks.append(callback)

        def apply_layers_backward(d_outputs, sgd=None):
            d_inputs = []
            for callback, d_output in zip(callbacks, d_outputs):
                if callback is None:
                    d_inputs.append(None)
                else:
                    d_inputs.append(callback(d_output, sgd=sgd))
            return d_inputs

        return tuple(outputs), apply_layers_backward

    return wrap(apply_layers_forward, *layers)
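# A minimal, self-contained sketch of apply_layers (assumes thinc 7.x and
# numpy; the Affine layers are stand-ins for whatever models you want to pair
# with each element of the input tuple).
import numpy
from thinc.v2v import Affine

paired = apply_layers(Affine(4, 3), Affine(5, 2))
X1 = numpy.zeros((8, 3), dtype="f")
X2 = numpy.zeros((8, 2), dtype="f")
(Y1, Y2), backprop = paired.begin_update((X1, X2))
# Gradients come back as one entry per input, in the same order.
dX1, dX2 = backprop((numpy.ones_like(Y1), numpy.ones_like(Y2)))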
def truncate_long_inputs(model, max_len):
    """Truncate inputs on the way into a model, and restore their shape on
    the way out.
    """

    def with_truncate_forward(X, drop=0.0):
        # Dim 1 should be batch, dim 2 sequence length
        if X.shape[1] < max_len:
            return model.begin_update(X, drop=drop)
        X_short = X[:, :max_len]
        Y_short, get_dX_short = model.begin_update(X_short, drop=drop)
        short_lh = Y_short.lh
        Y = model.ops.allocate((short_lh.shape[0], X.shape[1]) + short_lh.shape[2:])
        Y[:, :max_len] = short_lh
        outputs = Activations(Y, [], [], [])

        def with_truncate_backward(dY, sgd=None):
            dY_short = dY.get_slice(slice(0, None), slice(0, max_len))
            dX_short = get_dX_short(dY_short, sgd=sgd)
            if dX_short is None:
                return None
            dX = model.ops.allocate(
                (dX_short.shape[0], dY.shape[1]) + dY_short.shape[2:]
            )
            dX[:, :max_len] = dX_short
            return dX

        return outputs, with_truncate_backward

    return wrap(with_truncate_forward, model)
def truncate_long_inputs(model: PyTT_Wrapper, max_len: int) -> PyTT_Wrapper:
    """Truncate inputs on the way into a model, and restore their shape on
    the way out.
    """

    def with_truncate_forward(
        X: Array, drop: Dropout = 0.0
    ) -> Tuple[Activations, Callable]:
        # Dim 1 should be batch, dim 2 sequence length
        if X.shape[1] < max_len:
            return model.begin_update(X, drop=drop)
        X_short = X[:, :max_len]
        Y_short, get_dX_short = model.begin_update(X_short, drop=drop)
        outputs = pad_batch_activations([Y_short], to=X.shape[1])

        def with_truncate_backward(dY, sgd=None):
            dY_short = dY.get_slice(slice(0, None), slice(0, max_len))
            dX_short = get_dX_short(dY_short, sgd=sgd)
            if dX_short is None:
                return None
            dX = model.ops.allocate(
                (dX_short.shape[0], dY.shape[1]) + dY_short.shape[2:]
            )
            dX[:, :max_len] = dX_short
            return dX

        return outputs, with_truncate_backward

    return wrap(with_truncate_forward, model)
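# Usage sketch for truncate_long_inputs (hypothetical `pytt_wrapper`, a
# PyTT_Wrapper instance, and `word_piece_ids`, a padded id batch): cap the
# word-piece sequence length going into the transformer, then zero-pad the
# activations back out to the original length on the way up:
#
#     capped = truncate_long_inputs(pytt_wrapper, max_len=512)
#     acts, backprop = capped.begin_update(word_piece_ids)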
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        if drop is not None:
            drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
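# A usage sketch for concatenate_lists (assumes thinc 7.x and numpy; the
# with_flatten(Affine(...)) layers are stand-ins for any models that map a
# list of arrays to a list of arrays). The two outputs are hstacked per row.
import numpy
from thinc.api import with_flatten
from thinc.v2v import Affine

combined = concatenate_lists(with_flatten(Affine(4, 3)), with_flatten(Affine(6, 3)))
Xs = [numpy.zeros((5, 3), dtype="f"), numpy.zeros((2, 3), dtype="f")]
Ys, backprop = combined.begin_update(Xs)
assert [Y.shape for Y in Ys] == [(5, 10), (2, 10)]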
def block_gradients(model):
    """Wrap a layer so no gradient flows back through it: the forward pass
    runs as normal, but the backprop callback is replaced with None."""
    from thinc.api import wrap

    def forward(X, drop=0.0):
        Y, _ = model.begin_update(X, drop=drop)
        return Y, None

    return wrap(forward, model)
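# A small, self-contained check of block_gradients (assumes thinc 7.x and
# numpy; Affine is just a stand-in layer): the forward pass still runs, but
# the returned backprop callback is None, so no gradient reaches the layer.
import numpy
from thinc.v2v import Affine

frozen = block_gradients(Affine(4, 3))
Y, backprop = frozen.begin_update(numpy.zeros((2, 3), dtype="f"))
assert backprop is None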
def concatenate_ragged(*layers):
    """Concatenate the outputs of layers that take (data, lengths) input,
    passing the lengths through unchanged."""
    model = concatenate(*[chain(layer, getitem(0)) for layer in layers])

    def begin_update(X_lengths, drop=0.0):
        X, lengths = X_lengths
        Y, get_dX = model.begin_update(X_lengths, drop=drop)
        return (Y, lengths), get_dX

    output = wrap(begin_update, model)
    return output
def with_length_batching(model: PyTT_Wrapper, max_words: int) -> Model:
    """Wrapper that applies a model to variable-length sequences by first
    batching and padding the sequences. This allows us to group sequences of
    similar length together, making the padding less wasteful. If
    max_words < 1, batching by length is disabled and the whole batch is
    padded at once.
    """
    if max_words < 1:
        return without_length_batching(model, max_words)

    Input = List[Array]
    Output = List[Activations]
    Backprop = Callable[[Output, Optional[Optimizer]], Optional[Input]]

    def apply_model_to_batches(
        inputs: List[Array], drop: Dropout = 0.0
    ) -> Tuple[List[Activations], Backprop]:
        batches: List[List[int]] = batch_by_length(inputs, max_words)
        # Initialize this, so we can place the outputs back in order.
        unbatched: List[Optional[Activations]] = [None for _ in inputs]
        backprops = []
        for indices in batches:
            X: Array = pad_batch([inputs[i] for i in indices])
            activs, get_dX = model.begin_update(X, drop=drop)
            backprops.append(get_dX)
            for i, j in enumerate(indices):
                unbatched[j] = activs.get_slice(i, slice(0, len(inputs[j])))
        outputs: List[Activations] = [y for y in unbatched if y is not None]
        assert len(outputs) == len(unbatched)

        def backprop_batched(d_outputs: Output, sgd: Optimizer = None) -> Optional[Input]:
            d_inputs: List[Optional[Array]] = [None for _ in inputs]
            for indices, get_dX in zip(batches, backprops):
                d_activs = pad_batch_activations([d_outputs[i] for i in indices])
                dX = get_dX(d_activs, sgd=sgd)
                if dX is not None:
                    for i, j in enumerate(indices):
                        # As above, put things back in order, unpad.
                        d_inputs[j] = dX[i, : len(d_outputs[j])]
            not_none = [x for x in d_inputs if x is not None]
            if len(not_none) == 0:
                return None
            else:
                assert len(not_none) == len(d_inputs)
                return not_none

        return outputs, backprop_batched

    return wrap(apply_model_to_batches, model)
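# Usage sketch for with_length_batching (hypothetical `pytt_wrapper`, a
# PyTT_Wrapper instance, and `word_piece_id_arrays`, a list of id arrays of
# varying lengths): group sequences so each padded sub-batch holds roughly
# max_words word pieces, then restore the original ordering of the outputs:
#
#     batched = with_length_batching(pytt_wrapper, max_words=3000)
#     activations, backprop = batched.begin_update(word_piece_id_arrays)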
def with_cpu(ops, model):
    """Wrap a model that should run on CPU, transferring inputs and outputs
    as necessary."""
    model.to_cpu()

    def with_cpu_forward(inputs, drop=0.0):
        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
        gpu_outputs = _to_device(ops, cpu_outputs)

        def with_cpu_backprop(d_outputs, sgd=None):
            cpu_d_outputs = _to_cpu(d_outputs)
            return backprop(cpu_d_outputs, sgd=sgd)

        return gpu_outputs, with_cpu_backprop

    return wrap(with_cpu_forward, model)
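# Usage sketch for with_cpu (hypothetical `gpu_ops`, e.g. a CupyOps instance,
# and `heavy_model`, any thinc model with a to_cpu() method): run the heavy
# model on CPU while keeping its inputs and outputs on the device the rest of
# the pipeline uses:
#
#     cpu_bound = with_cpu(gpu_ops, heavy_model)
#     outputs, backprop = cpu_bound.begin_update(inputs)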
def masked_language_model(vocab, model, mask_prob=0.15):
    """Convert a model into a BERT-style masked language model"""
    random_words = _RandomWords(vocab)

    def mlm_forward(docs, drop=0.0):
        mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob)
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
        output, backprop = model.begin_update(docs, drop=drop)

        def mlm_backward(d_output, sgd=None):
            d_output *= 1 - mask
            return backprop(d_output, sgd=sgd)

        return output, mlm_backward

    return wrap(mlm_forward, model)
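# Usage sketch for masked_language_model (hypothetical `nlp` pipeline and
# `tok2vec`, a model over Docs): a proportion of tokens is masked or replaced
# before each update, and the gradient is zeroed for the positions that were
# not masked, BERT-style:
#
#     mlm = masked_language_model(nlp.vocab, tok2vec, mask_prob=0.15)
#     predictions, backprop = mlm.begin_update(docs, drop=0.2)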
def reapply(layer, n_times):
    """Apply the same layer n_times in sequence, feeding each output back in
    as the next input and summing the gradients from each application."""

    def reapply_fwd(X, drop=0.0):
        backprops = []
        for i in range(n_times):
            Y, backprop = layer.begin_update(X, drop=drop)
            X = Y
            backprops.append(backprop)

        def reapply_bwd(dY, sgd=None):
            dX = None
            for backprop in reversed(backprops):
                dY = backprop(dY, sgd=sgd)
                if dX is None:
                    dX = dY
                else:
                    dX += dY
            return dX

        return Y, reapply_bwd

    return wrap(reapply_fwd, layer)
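# A small, self-contained sketch of reapply (assumes thinc 7.x and numpy; the
# layer must map a width onto itself so its output can be fed back in):
import numpy
from thinc.v2v import Affine

tripled = reapply(Affine(4, 4), 3)
Y, backprop = tripled.begin_update(numpy.zeros((2, 4), dtype="f"))
dX = backprop(numpy.ones_like(Y))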
def foreach_sentence(layer: Model, drop_factor: float = 1.0) -> Model:
    """Map a layer across sentences (assumes spaCy-esque .sents interface)"""
    ops = layer.ops
    Output = List[Activations]
    Backprop = Callable[[Output, Optional[Optimizer]], None]

    def sentence_fwd(docs: List[Doc], drop: Dropout = 0.0) -> Tuple[Output, Backprop]:
        sents: List[Span]
        sent_acts: List[Activations]
        bp_sent_acts: Callable[..., Optional[List[None]]]
        nested: List[List[Activations]]
        doc_sent_lengths: List[List[int]]
        doc_acts: List[Activations]

        sents = flatten_list([list(doc.sents) for doc in docs])
        sent_acts, bp_sent_acts = layer.begin_update(sents, drop=drop)
        nested = unflatten_list(sent_acts, [len(list(doc.sents)) for doc in docs])
        doc_sent_lengths = [[len(sa) for sa in doc_sa] for doc_sa in nested]
        doc_acts = [Activations.join(doc_sa) for doc_sa in nested]
        assert len(docs) == len(doc_acts)

        def sentence_bwd(d_doc_acts: Output, sgd: Optional[Optimizer] = None) -> None:
            d_nested = [
                d_doc_acts[i].split(ops, doc_sent_lengths[i])
                for i in range(len(d_doc_acts))
            ]
            d_sent_acts = flatten_list(d_nested)
            d_ids = bp_sent_acts(d_sent_acts, sgd=sgd)
            if not (d_ids is None or all(ds is None for ds in d_ids)):
                raise ValueError("Expected gradient of sentence to be None")

        return doc_acts, sentence_bwd

    model = wrap(sentence_fwd, layer)
    return model
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
def with_length_batching(model: TransformersWrapper, max_words: int) -> TransformersWrapper:
    """Split a ragged batch into sub-batches of at most max_words word pieces,
    so sequences of similar length are padded and processed together."""
    ops = model.ops

    def apply_model_to_batches(
        inputs: RaggedArray, drop: Dropout = 0.0
    ) -> Tuple[Acts, Callable]:
        if max_words == 0 or inputs.data.shape[0] < max_words:
            return model.begin_update(inputs, drop=drop)
        Xs: List[Array] = ops.unflatten(inputs.data, inputs.lengths)
        outputs = None
        backprops = []
        index2rows = {}
        start = 0
        # Map each index to the slice of rows in the flattened data it refers to.
        for i, length in enumerate(inputs.lengths):
            index2rows[i] = [start + j for j in range(length)]
            start += length
        total_rows = sum(inputs.lengths)
        for indices in batch_by_length(Xs, max_words):
            X: Array = inputs.xp.concatenate([Xs[i] for i in indices])
            lengths = [inputs.lengths[i] for i in indices]
            Y, get_dX = model.begin_update(RaggedArray(X, lengths), drop=drop)
            if outputs is None:
                lh_shape = (total_rows, Y.lh.data.shape[-1])
                po_shape = (len(inputs.lengths), Y.po.data.shape[-1])
                outputs = Acts(
                    RaggedArray(Y.lh.xp.zeros(lh_shape, dtype="f"), inputs.lengths),
                    RaggedArray(Y.po.xp.zeros(po_shape, dtype="f"), [1 for _ in inputs.lengths]),
                )
            lh_rows = []
            po_rows = []
            for index in indices:
                lh_rows.extend(index2rows[index])
                po_rows.append(index)
            lh_rows = outputs.xp.array(lh_rows, dtype="i")
            po_rows = outputs.xp.array(po_rows, dtype="i")
            outputs.lh.data[lh_rows] = Y.lh.data
            if outputs.has_po and po_rows.size:
                outputs.po.data[po_rows] = Y.po.data
            backprops.append((get_dX, lh_rows, po_rows, lengths))

        def backprop_batched(d_outputs: Acts, sgd: Optimizer = None):
            for get_dX, lh_rows, po_rows, lengths in backprops:
                if d_outputs.has_lh:
                    d_lh = d_outputs.lh.data[lh_rows]
                    lh_lengths = lengths
                else:
                    d_lh = d_outputs.lh.data
                    lh_lengths = []
                if d_outputs.has_po:
                    d_po = d_outputs.po.data[po_rows]
                    po_lengths = [1 for _ in lengths]
                else:
                    d_po = d_outputs.po.data
                    po_lengths = []
                dY = Acts(RaggedArray(d_lh, lh_lengths), RaggedArray(d_po, po_lengths))
                dX = get_dX(dY, sgd=sgd)
                assert dX is None
            return None

        return outputs, backprop_batched

    return wrap(apply_model_to_batches, model)