def backprop_pytt_pooler_output(d_outputs, sgd=None):
    total_grads = []
    for doc, dY in zip(docs, d_outputs):
        if doc._.pytt_d_pooler_output.size == 0:
            # Lazily allocate the gradient buffer on first backprop.
            xp = get_array_module(doc._.pytt_pooler_output)
            grads = xp.zeros(doc._.pytt_pooler_output.shape, dtype="f")
            doc._.pytt_d_pooler_output = grads
        doc._.pytt_d_pooler_output += dY
        # Track the summed absolute gradient per doc.
        xp = get_array_module(dY)
        total_grads.append(float(xp.abs(dY).sum()))
    return None
@classmethod
def from_truncated(cls, square: Array, lengths: List[int]) -> "RaggedArray":
    if len(lengths) != square.shape[0]:
        raise ValueError(
            "Truncated array must have shape[0] == len(lengths)")
    width = square.shape[1]
    max_len = max(lengths, default=0)
    extra_dims = square.shape[2:]
    if width == max_len:
        return RaggedArray(square, lengths)
    elif width > max_len:
        raise ValueError(
            f"Expected width < max_len. Got {width} > {max_len}")
    xp = get_array_module(square)
    expanded = xp.zeros((sum(lengths),) + extra_dims, dtype=square.dtype)
    # TODO: I know there's a way to do this without the loop :(. Escapes
    # me currently.
    start = 0
    for i, length in enumerate(lengths):
        # We could have a row that's actually shorter than the width,
        # if the array was padded. Make sure we don't get junk values.
        row_width = min(width, length)
        expanded[start:start + row_width] = square[i, :row_width]
        start += length
    return cls(expanded, lengths)
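# A minimal sketch of the truncated path, assuming numpy inputs and that
# these classmethods live on the RaggedArray class named in the annotations
# (the underscore-prefixed names below are illustrative): the stored square
# array is narrower than the longest sequence, so missing rows stay zero.
import numpy as np

_square = np.ones((2, 3, 4), dtype="f")  # width 3 < max length 5
_ragged = RaggedArray.from_truncated(_square, [2, 5])
assert _ragged.data.shape == (7, 4)  # 2 + 5 rows after expansion
assert not _ragged.data[5:].any()    # rows beyond the stored width are zeros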
def get_pytt_class_tokens(docs, drop=0.0):
    """Output a List[array], where the array is the class vector for each
    sentence in the document. To backprop, we increment the values in the
    doc._.pytt_d_last_hidden_state array.
    """
    xp = get_array_module(docs[0]._.pytt_last_hidden_state)
    outputs = []
    for doc in docs:
        wp_tensor = doc._.pytt_last_hidden_state
        class_vectors = []
        for sent in doc.sents:
            if sent._.pytt_start is not None:
                class_vectors.append(wp_tensor[sent._.pytt_start])
            else:
                class_vectors.append(
                    xp.zeros((wp_tensor.shape[-1],), dtype="f"))
        Y = xp.vstack(class_vectors)
        outputs.append(Y)

    def backprop_pytt_class_tokens(d_outputs, sgd=None):
        for doc, dY in zip(docs, d_outputs):
            if doc._.pytt_d_last_hidden_state.size == 0:
                xp = get_array_module(doc._.pytt_last_hidden_state)
                grads = xp.zeros(doc._.pytt_last_hidden_state.shape, dtype="f")
                doc._.pytt_d_last_hidden_state = grads
            for i, sent in enumerate(doc.sents):
                if sent._.pytt_start is not None:
                    doc._.pytt_d_last_hidden_state[sent._.pytt_start] += dY[i]
        return None

    return outputs, backprop_pytt_class_tokens
def get_class_tokens(docs, drop=0.0):
    """Output a List[array], where the array is the class vector for each
    sentence in the document. To backprop, we increment the values in the
    Doc's d_last_hidden_state array.
    """
    xp = get_array_module(docs[0]._.get(ATTRS.last_hidden_state))
    outputs = []
    doc_class_tokens = []
    for doc in docs:
        class_tokens = []
        for i, wp in enumerate(doc._.get(ATTRS.word_pieces_)):
            if is_class_token(wp):
                class_tokens.append(i)
        doc_class_tokens.append(xp.array(class_tokens, dtype="i"))
        wp_tensor = doc._.get(ATTRS.last_hidden_state)
        outputs.append(wp_tensor[doc_class_tokens[-1]])

    def backprop_class_tokens(d_outputs, sgd=None):
        for doc, class_tokens, dY in zip(docs, doc_class_tokens, d_outputs):
            if doc._.get(ATTRS.d_last_hidden_state).size == 0:
                xp = get_array_module(doc._.get(ATTRS.last_hidden_state))
                grads = xp.zeros(doc._.get(ATTRS.last_hidden_state).shape,
                                 dtype="f")
                doc._.set(ATTRS.d_last_hidden_state, grads)
            doc._.get(ATTRS.d_last_hidden_state)[class_tokens] += dY
        return None

    return outputs, backprop_class_tokens
def cosine(vec1, vec2):
    xp = get_array_module(vec1)
    norm1 = xp.linalg.norm(vec1)
    norm2 = xp.linalg.norm(vec2)
    if norm1 == 0.0 or norm2 == 0.0:
        return 0
    else:
        return vec1.dot(vec2) / (norm1 * norm2)
def backprop_pooler_output(d_outputs, sgd=None):
    for doc, dY in zip(docs, d_outputs):
        if doc._.get(ATTRS.d_pooler_output).size == 0:
            xp = get_array_module(doc._.get(ATTRS.pooler_output))
            grads = xp.zeros(doc._.get(ATTRS.pooler_output).shape, dtype="f")
            doc._.set(ATTRS.d_pooler_output, grads)
        doc._.set(ATTRS.d_pooler_output,
                  doc._.get(ATTRS.d_pooler_output) + dY)
    return None
def backprop_class_tokens(d_outputs, sgd=None):
    for doc, class_tokens, dY in zip(docs, doc_class_tokens, d_outputs):
        if doc._.get(ATTRS.d_last_hidden_state).size == 0:
            xp = get_array_module(doc._.get(ATTRS.last_hidden_state))
            grads = xp.zeros(doc._.get(ATTRS.last_hidden_state).shape,
                             dtype="f")
            doc._.set(ATTRS.d_last_hidden_state, grads)
        doc._.get(ATTRS.d_last_hidden_state)[class_tokens] += dY
    return None
def backprop_pytt_pooler_output(d_outputs, sgd=None):
    for doc, dY in zip(docs, d_outputs):
        if doc._.pytt_d_pooler_output.size == 0:
            xp = get_array_module(doc._.pytt_pooler_output)
            grads = xp.zeros(doc._.pytt_pooler_output.shape, dtype="f")
            doc._.pytt_d_pooler_output = grads
        doc._.pytt_d_pooler_output += dY
    return None
def backprop_pytt_class_tokens(d_outputs, sgd=None):
    for doc, class_tokens, dY in zip(docs, doc_class_tokens, d_outputs):
        if doc._.pytt_d_last_hidden_state.size == 0:
            xp = get_array_module(doc._.pytt_last_hidden_state)
            grads = xp.zeros(doc._.pytt_last_hidden_state.shape, dtype="f")
            doc._.pytt_d_last_hidden_state = grads
        doc._.pytt_d_last_hidden_state[class_tokens] += dY
    return None
def backprop_pytt_last_hidden(d_outputs, sgd=None):
    for doc, d_lh in zip(docs, d_outputs):
        xp = get_array_module(d_lh)
        shape = d_lh.shape
        dtype = d_lh.dtype
        if doc._.pytt_d_last_hidden_state.size == 0:
            doc._.pytt_d_last_hidden_state = xp.zeros(shape, dtype=dtype)
        doc._.pytt_d_last_hidden_state += d_lh
    return None
def cosine_similarity(vec1, vec2) -> float:
    """Compute the cosine similarity of two vectors."""
    xp = get_array_module(vec1)
    norm1 = xp.linalg.norm(vec1)
    norm2 = xp.linalg.norm(vec2)
    # A zero vector has no direction, so define its similarity as 0.
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return xp.dot(vec1, vec2) / (norm1 * norm2)
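# A minimal usage sketch for cosine_similarity, assuming plain numpy inputs
# (get_array_module then resolves to numpy; the underscore-prefixed names
# below are illustrative):
import numpy as np

_a = np.array([1.0, 0.0], dtype="f")
_b = np.array([1.0, 1.0], dtype="f")
assert abs(cosine_similarity(_a, _a) - 1.0) < 1e-6
assert abs(cosine_similarity(_a, _b) - 0.7071) < 1e-3
assert cosine_similarity(_a, np.zeros(2, dtype="f")) == 0.0  # zero-vector guard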
@classmethod
def from_padded(cls, padded: Array, lengths: List[int]) -> "RaggedArray":
    if max(lengths, default=0) > padded.shape[1]:
        return cls.from_truncated(padded, lengths)
    mask = lengths2mask(lengths)
    assert sum(mask) == sum(lengths)
    all_rows = padded.reshape((-1,) + padded.shape[2:])
    xp = get_array_module(all_rows)
    data = xp.ascontiguousarray(all_rows[mask])
    assert data.shape[0] == sum(lengths)
    return cls(data, lengths)
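# A minimal sketch of the from_padded contract, assuming numpy inputs and
# that lengths2mask (not shown here) returns a flat boolean mask marking the
# real, non-padding rows of the (n_docs, max_rows, width) batch:
import numpy as np

_padded = np.zeros((2, 3, 4), dtype="f")  # 2 docs padded to 3 rows of width 4
_padded[0, :2] = 1.0  # first doc contributes 2 real rows
_padded[1, :3] = 2.0  # second doc contributes 3 real rows
_ragged = RaggedArray.from_padded(_padded, [2, 3])
assert _ragged.data.shape == (5, 4)  # 2 + 3 rows, concatenated without padding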
def tanh(X, drop=0.0):
    xp = get_array_module(X)
    Y = xp.tanh(X)

    def backprop_tanh(dY, sgd=None):
        one = Y.dtype.type(1)
        dX = dY * (one - Y * Y)
        return dX

    return Y, backprop_tanh
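# A quick finite-difference check of backprop_tanh, assuming numpy inputs;
# the analytic gradient dY * (1 - Y**2) should match the numeric estimate:
import numpy as np

_X = np.array([0.5, -1.0, 2.0], dtype="f")
_Y, _backprop = tanh(_X)
_dX = _backprop(np.ones_like(_Y))
_eps = 1e-3
_numeric = (np.tanh(_X + _eps) - np.tanh(_X - _eps)) / (2 * _eps)
assert np.allclose(_dX, _numeric, atol=1e-3)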
def backprop_pytt_class_tokens(d_outputs, sgd=None):
    for doc, dY in zip(docs, d_outputs):
        if doc._.pytt_d_last_hidden_state.size == 0:
            xp = get_array_module(doc._.pytt_last_hidden_state)
            grads = xp.zeros(doc._.pytt_last_hidden_state.shape, dtype="f")
            doc._.pytt_d_last_hidden_state = grads
        for i, sent in enumerate(doc.sents):
            if sent._.pytt_start is not None:
                doc._.pytt_d_last_hidden_state[sent._.pytt_start] += dY[i]
    return None
def backprop_last_hidden(d_outputs, sgd=None):
    for doc, d_lh in zip(docs, d_outputs):
        xp = get_array_module(d_lh)
        shape = d_lh.shape
        dtype = d_lh.dtype
        if doc._.get(ATTRS.d_last_hidden_state).size == 0:
            doc._.set(ATTRS.d_last_hidden_state, xp.zeros(shape, dtype=dtype))
        doc._.set(ATTRS.d_last_hidden_state,
                  doc._.get(ATTRS.d_last_hidden_state) + d_lh)
    return None
def backprop_tensors(d_tensors, sgd=None):
    for doc, d_tensor in zip(docs, d_tensors):
        # Count how often each word-piece token is represented. This allows
        # a weighted sum, so that we can make sure doc.tensor.sum()
        # equals wp_tensor.sum(). Do this with sensitivity to boundary tokens.
        wp_rows, align_sizes = _get_boundary_sensitive_alignment(doc)
        d_lh = _get_or_set_d_last_hidden_state(doc)
        for i, word_piece_slice in enumerate(wp_rows):
            for j in word_piece_slice:
                d_lh[j] += d_tensor[i]
        xp = get_array_module(d_lh)
        d_lh /= xp.array(align_sizes, dtype="f").reshape(-1, 1)
    return None
def get_cossim_loss(yh, y):
    # Add a small constant to avoid 0 vectors
    yh = yh + 1e-8
    y = y + 1e-8
    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
    xp = get_array_module(yh)
    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    loss = xp.abs(cosine - 1).sum()
    return loss, -d_yh
def logistic(X, drop=0.0):
    xp = get_array_module(X)
    if not isinstance(X, xp.ndarray):
        X = xp.asarray(X)
    # Clip to range (-10, 10)
    X = xp.minimum(X, 10.0, X)
    X = xp.maximum(X, -10.0, X)
    Y = 1.0 / (1.0 + xp.exp(-X))

    def logistic_bwd(dY, sgd=None):
        dX = dY * (Y * (1 - Y))
        return dX

    return Y, logistic_bwd
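# A small numeric sanity check for logistic, assuming numpy inputs. Note that
# the clipping writes into X in place, so pass a copy if the input is reused:
import numpy as np

_X = np.array([0.0, -50.0], dtype="f")
_Y, _bwd = logistic(_X.copy())
assert abs(_Y[0] - 0.5) < 1e-6  # sigmoid(0) == 0.5
assert _Y[1] > 0.0              # clipped to -10, not underflowed to 0
assert abs(_bwd(np.ones_like(_Y))[0] - 0.25) < 1e-6  # sigmoid'(0) == 0.25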
def s2v_doc_similarity(self, obj1, other):
    """Make a semantic similarity estimate. The default estimate is cosine
    similarity using an average of word vectors.

    other (object): The object to compare with. By default, accepts `Doc`,
        `Span`, `Token` and `Lexeme` objects.
    RETURNS (float): A scalar similarity score. Higher is more similar.

    DOCS: https://spacy.io/api/doc#similarity
    """
    vector1 = self.get_s2v_doc_vector(obj1)
    vector2 = self.get_s2v_doc_vector(other)
    if len(vector1) == 0 or len(vector2) == 0:
        return -1.0
    xp = get_array_module(vector1)
    return xp.dot(vector1, vector2) / (
        self.vector_norm(vector1) * self.vector_norm(vector2)
    )
def tanh(X, drop=0.0):
    xp = get_array_module(X)
    if not isinstance(X, xp.ndarray):
        X = xp.asarray(X)
    # Clip to range (-10, 10)
    X = xp.minimum(X, 10.0, X)
    X = xp.maximum(X, -10.0, X)
    e = xp.exp(2 * X)
    Y = (e - 1.0) / (e + 1.0)

    def tanh_bwd(dY, sgd=None):
        dX = dY * (1 - Y * Y)
        return dX

    return Y, tanh_bwd
def get_cossim_loss(yh, y, ignore_zeros=False):
    xp = get_array_module(yh)
    # Find the zero vectors
    if ignore_zeros:
        zero_indices = xp.abs(y).sum(axis=1) == 0
    # Add a small constant to avoid 0 vectors
    yh = yh + 1e-8
    y = y + 1e-8
    # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
    norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    losses = xp.abs(cosine - 1)
    if ignore_zeros:
        # If the target was a zero vector, don't count it in the loss.
        d_yh[zero_indices] = 0
        losses[zero_indices] = 0
    loss = losses.sum()
    return loss, -d_yh
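# A minimal sketch of get_cossim_loss, assuming numpy inputs. A perfect
# prediction gives ~0 loss and a vanishing gradient, and ignore_zeros drops
# rows whose target is the zero vector:
import numpy as np

_y = np.array([[1.0, 2.0], [3.0, 4.0]], dtype="f")
_loss, _d_yh = get_cossim_loss(_y.copy(), _y)
assert _loss < 1e-4 and np.allclose(_d_yh, 0.0, atol=1e-4)
_loss0, _ = get_cossim_loss(_y.copy(), np.zeros_like(_y), ignore_zeros=True)
assert _loss0 == 0.0  # zero-vector targets excluded from the loss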
@property
def xp(self):
    return get_array_module(self.data)