def _represent_mentions(mention_ids, token_rep):
    """Build a single mention representation from token representations.

    Note: relies on ``self`` and ``F`` from the enclosing scope.

    :param mention_ids: token indices belonging to the mention
    :param token_rep: (n_tokens, dim) matrix of token representations
    :return: the mention representation (averaged over its tokens)
    """
    try:
        if len(mention_ids) == 0:
            final_mention_representation = F.embed_id(
                self.xp.asarray([0]).astype('i'), token_rep)
        else:
            final_mention_representation = F.embed_id(
                self.xp.asarray(mention_ids).astype('i'), token_rep)
        if len(mention_ids) > 1:
            final_mention_representation = F.average(
                final_mention_representation, axis=0)
    except Exception:
        # TODO: handle the case where the word is a substring of another word,
        # or where the word is the NONE token and can be split.
        # TODO: If the word is a substring, use the index of the enclosing word
        # (position is that index). If the word is a NONE word, use the index of
        # the NONE token (position is the last index).
        # xs_embed = self.sequence_embed([self.xp.asarray([3]).astype('i')],
        #                                self.embed_wordtype, TRAIN=False)  # index 3 is the NONE word
        # h, c, word_embedding = self.bilstm(None, None, xs_embed)
        # none_position_embedding = self.embed_positiontype(
        #     self.xp.asarray([self.max_pos - 1]))
        # TODO: the current handling is strict; a better way is to get the
        # position where the word is a subset.
        # final_mention_representation = F.concat(
        #     (word_embedding[0], none_position_embedding), axis=1)
        final_mention_representation = F.embed_id(
            self.xp.asarray([0]).astype('i'), token_rep)
    return final_mention_representation
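A minimal, self-contained sketch (toy shapes and values, not from the repository above): F.embed_id is a differentiable row lookup into a matrix, so gathering a mention's token rows and averaging them reproduces the happy path of _represent_mentions.

# Hedged illustration only; `token_rep` and `mention_ids` here are made-up toy data.
import numpy as np
import chainer.functions as F

token_rep = np.random.randn(6, 4).astype(np.float32)   # (n_tokens, hidden)
mention_ids = [2, 3]                                    # token indices of one mention

rows = F.embed_id(np.asarray(mention_ids, dtype=np.int32), token_rep)  # (2, hidden)
mention_vec = F.average(rows, axis=0)                                   # (hidden,)
print(mention_vec.shape)  # (4,)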
def gnn(self, vertex, edge, adjacency, vertex_):
    x_vertex = self.embed_vertex(vertex)
    x_edge = self.embed_edge(edge)
    V, degree = edge.shape
    for _ in range(layer_gnn):
        x_adja = F.embed_id(adjacency, x_vertex, ignore_label=-1)
        h_adja = F.relu(
            self.W_vertex(F.sum(x_adja, 1)) + self.W_edge(F.sum(x_edge, 1)))
        x_vertex_ = F.embed_id(vertex_, x_vertex, ignore_label=-1)
        x_vertex_ = F.reshape(x_vertex_, (V * degree, dim))
        x_adja = F.reshape(x_adja, (V * degree, dim))
        h_side = F.relu(self.W_vertex(x_vertex_ + x_adja))
        # Update x_vertex.
        x_vertex = F.sigmoid(F.relu(self.W_vertex(x_vertex)) + h_adja)
        # Update x_edge.
        x_edge = F.reshape(x_edge, (V * degree, dim))
        x_edge = F.sigmoid(F.relu(self.W_edge(x_edge)) + h_side)
        x_edge = F.reshape(x_edge, (V, degree, dim))
    y = F.expand_dims(F.sum(x_vertex, 0), 0)
    return y
def forward(self, hs_flatten, pairs, ckeys, lengths):
    xp = chainer.cuda.get_array_module(hs_flatten)
    p1, p2 = xp.asarray(pairs.T)
    ckeys = xp.asarray(ckeys)
    h_p1 = F.embed_id(p1, hs_flatten)
    h_p2 = F.embed_id(p2, hs_flatten)
    h_cnext = F.embed_id(ckeys + 1, hs_flatten)
    h_cprev = F.embed_id(ckeys - 1, hs_flatten)
    fs = F.concat((h_p1 - h_cnext, h_p2 - h_cprev), axis=1)
    return fs
def train_forward(self, input_ids, output_ids, input_masks=None,
                  output_masks=None):
    input_embeddings = F.embed_id(input_ids, self.source_vocab.embeddings)
    output_embeddings = F.embed_id(output_ids, self.target_vocab.embeddings)
    encodings = self.encode(input_embeddings, input_masks=input_masks)
    token_probs = self.decode(encodings, output_embeddings,
                              input_masks=input_masks,
                              output_masks=output_masks)
    return token_probs
def forward(self, input_ids, input_masks=None, length=None):
    batch_size, input_length = input_ids.shape[0], input_ids.shape[1]
    input_embeddings = F.embed_id(input_ids, self.source_vocab.embeddings)
    encodings = self.encode(input_embeddings, input_masks=input_masks)
    output_probs = None
    output_embeddings = self.target_vocab.embed([self.target_vocab.start_id])
    output_embeddings = F.expand_dims(output_embeddings, 0)
    output_embeddings = F.tile(output_embeddings, (batch_size, 1, 1))
    end_predicted = F.tile(F.reshape(xp.array([False]), (1, 1)),
                           (batch_size, 1))
    all_done = False
    current_length = 0
    while ((length is None and not all_done)
           or (length is not None and current_length < length)):
        token_probs = self.decode(encodings, output_embeddings,
                                  input_masks=input_masks)
        next_token_probs = token_probs[:, -1, :]
        next_token_ids = F.argmax(next_token_probs, axis=-1)
        next_token_embeddings = F.embed_id(next_token_ids,
                                           self.target_vocab.embeddings)
        next_token_embeddings = F.expand_dims(next_token_embeddings, axis=1)
        output_embeddings = F.concat(
            [output_embeddings, next_token_embeddings], axis=1)
        next_output_probs = F.expand_dims(next_token_probs, axis=1)
        if output_probs is None:
            output_probs = next_output_probs
        else:
            output_probs = F.concat([output_probs, next_output_probs], axis=1)
        next_token_end = (next_token_ids.array == self.target_vocab.end_id)
        next_end_predicted = F.expand_dims(
            end_predicted[:, -1].array | next_token_end, -1)
        end_predicted = F.concat([end_predicted, next_end_predicted], axis=-1)
        all_done = xp.all(next_end_predicted.array)
        current_length += 1
    return output_probs
def fit_partial(self, rdoc_ids, rword_indices, window=5):
    doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
    pivot_idx = next(move(self.xp, rword_indices[window:-window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    doc_at_pivot = rdoc_ids[window:-window]
    doc = self.mixture(next(move(self.xp, doc_at_pivot)))
    loss = 0.0
    start, end = window, rword_indices.shape[0] - window
    context = (F.dropout(doc, self.dropout_ratio) +
               F.dropout(pivot, self.dropout_ratio))
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rword_indices[start + frame:end + frame]
        doc_at_target = rdoc_ids[start + frame:end + frame]
        doc_is_same = doc_at_target == doc_at_pivot
        rand = np.random.uniform(0, 1, doc_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        weight = np.logical_and(doc_is_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data
def fit_partial(self, rdoc_ids, rword_indices, window=5):
    doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices)
    pivot_idx = next(move(self.xp, rword_indices[window:-window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    doc_at_pivot = rdoc_ids[window:-window]
    doc = self.mixture(next(move(self.xp, doc_at_pivot)))
    loss = 0.0
    start, end = window, rword_indices.shape[0] - window
    context = (F.dropout(doc, self.dropout_ratio) +
               F.dropout(pivot, self.dropout_ratio))
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rword_indices[start + frame:end + frame]
        doc_at_target = rdoc_ids[start + frame:end + frame]
        doc_is_same = doc_at_target == doc_at_pivot
        rand = np.random.uniform(0, 1, doc_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        weight = np.logical_and(doc_is_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data
def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5):
    sty_ids, aut_ids, wrd_ids = move(self.xp, rsty_ids, raut_ids, rwrd_ids)
    pivot_idx = next(move(self.xp, rwrd_ids[window:-window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    sty_at_pivot = rsty_ids[window:-window]
    aut_at_pivot = raut_ids[window:-window]
    sty = self.mixture_sty(next(move(self.xp, sty_at_pivot)))
    # aut = self.mixture_aut(next(move(self.xp, aut_at_pivot)))
    loss = 0.0
    start, end = window, rwrd_ids.shape[0] - window
    context = F.dropout(pivot, self.dropout_ratio)  # + aut + sty
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rwrd_ids[start + frame:end + frame]
        sty_at_target = rsty_ids[start + frame:end + frame]
        # aut_at_target = raut_ids[start + frame:end + frame]
        sty_is_same = sty_at_target == sty_at_pivot
        # aut_is_same = aut_at_target == aut_at_pivot
        # Randomly dropout words (default is to never do this)
        rand = np.random.uniform(0, 1, sty_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        # sty_and_aut_are_same = np.logical_and(sty_is_same, aut_is_same)
        # weight = np.logical_and(sty_and_aut_are_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx  # * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data
def fit_partial(self, rsty_ids, raut_ids, rwrd_ids, window=5):
    sty_ids, aut_ids, wrd_ids = move(self.xp, rsty_ids, raut_ids, rwrd_ids)
    pivot_idx = next(move(self.xp, rwrd_ids[window:-window]))
    pivot = F.embed_id(pivot_idx, self.sampler.W)
    sty_at_pivot = rsty_ids[window:-window]
    aut_at_pivot = raut_ids[window:-window]
    sty = self.mixture_sty(next(move(self.xp, sty_at_pivot)))
    aut = self.mixture_aut(next(move(self.xp, aut_at_pivot)))
    loss = 0.0
    start, end = window, rwrd_ids.shape[0] - window
    context = sty + aut + F.dropout(pivot, self.dropout_ratio)
    for frame in range(-window, window + 1):
        # Skip predicting the current pivot
        if frame == 0:
            continue
        # Predict word given context and pivot word
        # The target starts before the pivot
        targetidx = rwrd_ids[start + frame:end + frame]
        sty_at_target = rsty_ids[start + frame:end + frame]
        aut_at_target = raut_ids[start + frame:end + frame]
        sty_is_same = sty_at_target == sty_at_pivot
        aut_is_same = aut_at_target == aut_at_pivot
        # Randomly dropout words (default is to never do this)
        rand = np.random.uniform(0, 1, sty_is_same.shape[0])
        mask = (rand > self.word_dropout_ratio).astype('bool')
        sty_and_aut_are_same = np.logical_and(sty_is_same, aut_is_same)
        weight = np.logical_and(sty_and_aut_are_same, mask).astype('int32')
        # If weight is 1.0 then targetidx
        # If weight is 0.0 then -1
        targetidx = targetidx * weight + -1 * (1 - weight)
        target, = move(self.xp, targetidx)
        loss = self.sampler(context, target)
        loss.backward()
    return loss.data
def loop_function(self, prev, h, output_projection=False):
    if output_projection:
        prev = prev * self.W + self.b
    prev_symbol = F.argmax(prev, 1)
    emb_prev = F.embed_id(prev_symbol, normalizing(self.embed.W, 1))
    emb_prev = F.concat([emb_prev, h], 1)
    return emb_prev
def __call__(self, x_list):
    xs_f = F.embed_id(xp.array(x_list, dtype=xp.int32), self.identity,
                      ignore_label=-1)
    xs_f = xp.reshape(xs_f,
                      (self.batch_size, 1, self.max_len, self.vocab_size))
    conv1 = self.conv_1(xs_f)  # (batch, max(200, width*50), len(word))
    # pool -> (batch, max(200, width*50), len(word)/3)
    pooled1 = F.sum(F.max_pooling_2d(F.tanh(conv1), 3, 3), axis=2)
    conv2 = self.conv_2(
        F.pad(xs_f, [(0, 0), (0, 0), (1, 0), (0, 0)], 'constant'))
    pooled2 = F.sum(F.max_pooling_2d(F.tanh(conv2), 3, 3), axis=2)
    conv3 = self.conv_3(
        F.pad(xs_f, [(0, 0), (0, 0), (1, 1), (0, 0)], 'constant'))
    pooled3 = F.sum(F.max_pooling_2d(F.tanh(conv3), 3, 3), axis=2)
    conv4 = self.conv_4(
        F.pad(xs_f, [(0, 0), (0, 0), (2, 1), (0, 0)], 'constant'))
    pooled4 = F.sum(F.max_pooling_2d(F.tanh(conv4), 3, 3), axis=2)
    conv5 = self.conv_5(
        F.pad(xs_f, [(0, 0), (0, 0), (2, 2), (0, 0)], 'constant'))
    pooled5 = F.sum(F.max_pooling_2d(F.tanh(conv5), 3, 3), axis=2)
    conv6 = self.conv_6(
        F.pad(xs_f, [(0, 0), (0, 0), (3, 2), (0, 0)], 'constant'))
    pooled6 = F.sum(F.max_pooling_2d(F.tanh(conv6), 3, 3), axis=2)
    conv7 = self.conv_7(
        F.pad(xs_f, [(0, 0), (0, 0), (3, 3), (0, 0)], 'constant'))
    pooled7 = F.sum(F.max_pooling_2d(F.tanh(conv7), 3, 3), axis=2)
    e = F.concat(
        (pooled1, pooled2, pooled3, pooled4, pooled5, pooled6, pooled7),
        axis=1)  # (batch, max(200, width*50)*7)
    return self.linear(
        self.highway_2(
            self.highway_1(xp.reshape(e, (self.batch_size, 1700)))))
def position2onehot(self, inds, dim):
    inds = chaFunc.flatten(inds)
    inds = inds.data.astype('float32') % self.max_n_spans
    inds = inds.astype('int32')
    eye = self.xp.identity(dim).astype(self.xp.float32)
    onehot = chaFunc.embed_id(inds, eye)
    return onehot
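A small hedged illustration (toy values, plain NumPy/Chainer): looking indices up in an identity matrix with embed_id yields one-hot vectors, which is the trick position2onehot relies on.

# Toy sketch only; `dim` and `inds` are made up for demonstration.
import numpy as np
import chainer.functions as F

dim = 5
eye = np.identity(dim, dtype=np.float32)
inds = np.array([0, 3, 4], dtype=np.int32)
onehot = F.embed_id(inds, eye)   # shape (3, 5); each row is a one-hot vector
print(onehot.array)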
def _populate_features(self, features, batch_index):
    _feats = self.xp.array(features[:, :4].flatten())
    mask = _feats == -1
    fs = F.embed_id(_feats, self.hs[batch_index], ignore_label=-1)
    fs += F.tile(self.pads, (len(features), 1)) \
        * self.xp.expand_dims(mask, axis=1)
    return fs
def forward(self, inputs):
    """
    Compute context insensitive token embeddings for ELMo representations.

    Parameters
    ----------
    inputs: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length)`` of token ids representing the
        current batch.

    Returns
    -------
    Dict with keys:
    ``'token_embedding'``: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with
        context insensitive token representations.
    ``'mask'``: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence
        mask.
    """
    # Add BOS/EOS
    # mask = ((inputs > 0).sum(axis=-1) > 0)
    mask = (inputs > 0)
    token_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        inputs, mask,
        self._beginning_of_sentence_token, self._end_of_sentence_token)
    # (batch_size, sequence_length, embedding_dim)
    token_embedding = F.embed_id(token_ids_with_bos_eos,
                                 self._token_embedding_weights)
    return {'mask': mask_with_bos_eos, 'token_embedding': token_embedding}
def forward(self, ws, cs, ls, dep_ts=None):
    ws = map(self.emb_word, ws)
    cs = [F.squeeze(
        F.max_pooling_2d(
            self.conv_char(
                F.expand_dims(
                    self.emb_char(c), 1)), (int(l[0]), 1)))
          for c, l in zip(cs, ls)]
    xs_f = [F.dropout(F.concat([w, c]), 0.5) for w, c in zip(ws, cs)]
    xs_b = [x[::-1] for x in xs_f]
    _, _, hs_f = self.lstm_f(None, None, xs_f)
    _, _, hs_b = self.lstm_b(None, None, xs_b)
    hs_b = [x[::-1] for x in hs_b]
    hs = [F.concat([h_f, h_b]) for h_f, h_b in zip(hs_f, hs_b)]
    dep_ys = [self.biaffine_arc(
        F.elu(F.dropout(self.arc_dep(h), 0.32)),
        F.elu(F.dropout(self.arc_head(h), 0.32))) for h in hs]
    if dep_ts is not None:
        heads = dep_ts
    else:
        heads = [F.argmax(y, axis=1) for y in dep_ys]
    cat_ys = [self.biaffine_tag(
        F.elu(F.dropout(self.rel_dep(h), 0.32)),
        F.elu(F.dropout(self.rel_head(
            F.embed_id(t, h, ignore_label=IGNORE)), 0.32)))
        for h, t in zip(hs, heads)]
    return cat_ys, dep_ys
def forward(self, indexs):
    # print("self.edge2vec:", self.edge2vec[0][0])
    mask = np.random.rand(len(indexs)) >= self.dropout_ratio
    mask = mask * 1
    vecs = F.embed_id(indexs, self.edge2vec).reshape(-1, self.vecDims)
    vecs = vecs.T * mask
    vecs = vecs.T
    return vecs
def forward(self, indexs):
    mask = np.random.rand(len(indexs)) >= self.dropout_ratio
    mask = mask * 1
    vecs = F.embed_id(indexs, self.nodeVecs).reshape(-1, self.vecDims)
    # vecs = F.einsum('ij,i->ij', vecs, mask)
    vecs = vecs.T * mask
    vecs = vecs.T
    return vecs
def forward(self, ckeys, hs_flatten, lengths):
    n_ckeys = np.array([len(ckeys_i) for ckeys_i in ckeys], np.int32)
    ckeys = [ckeys_i + offset for ckeys_i, offset
             in zip(ckeys, np.insert(lengths, 0, 0)[:-1].cumsum())]
    ckeys = np.concatenate(ckeys).astype(np.int32)
    hs_ckeys = F.embed_id(self.xp.asarray(ckeys), hs_flatten)
    scores = self.linear(hs_ckeys)
    return scores, n_ckeys.cumsum().astype(np.int32)
def __init__(self, w):
    super(Encoder, self).__init__()
    self.out_units = 300
    with self.init_scope():
        self.embed = lambda x: F.embed_id(x, w)
        self.encoder = L.NStepLSTM(n_layers=1, in_size=300,
                                   out_size=self.out_units, dropout=0.5)
def embed_predict(self, examples):
    """Just a forward prediction of given example."""
    # examples (..., 1+L)
    ex = self.embed(examples[..., 1:])  # (..., L, E)
    task_id = F.embed_id(examples[..., 0] - 1,
                         np.eye(TASKS, dtype=np.float32))  # (..., T)
    flat_ex = F.reshape(ex, ex.shape[:-2] + (-1,))  # (..., L*E)
    combined_ex = F.concat((flat_ex, task_id), axis=-1)  # (..., L*E+T)
    return self.predict(combined_ex)  # (..., V)
def embed_predict(self, examples):
    """Just a forward prediction of given example."""
    # examples (..., 1+W*H)
    ex = F.reshape(examples[..., 1:],
                   examples.shape[:-1] + tuple(GRID))  # (..., W, H)
    ex = self.embed(ex)  # (..., W, H, E)
    task_id = F.embed_id(examples[..., 0] - 1,
                         np.eye(TASKS, dtype=np.float32))  # (..., T)
    task_id = F.tile(task_id[..., None, None, :],
                     ex.shape[-3:-1] + (1,))  # (..., W, H, T)
    combined_ex = F.concat((ex, task_id), axis=-1)  # (..., W, H, E+T)
    return self.predict(combined_ex)  # (..., V)
def predict_embed(self, xs, embedW, labels=None, dropout=0., mode='sampling',
                  temp=1., word_lower_bound=0., gold_lower_bound=0.,
                  gumbel=True, residual=0., wordwise=True, add_original=0.,
                  augment_ratio=0.25):
    x_len = [len(x) for x in xs]
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        t_out_concat = self.encode(xs, labels=labels)
        prob_concat = self.output.output(t_out_concat).data
        prob_concat /= temp
        prob_concat += self.xp.random.gumbel(
            size=prob_concat.shape).astype('f')
        prob_concat = F.softmax(prob_concat).data

    out_concat = F.embed_id(
        self.xp.argmax(prob_concat, axis=1).astype(np.int32), embedW)

    # insert eos at the boundaries of each sequence
    eos = embedW[0][None]
    new_out = []
    count = 0
    for i, x in enumerate(xs):
        new_out.append(eos)
        new_out.append(out_concat[count:count + len(x) - 2])
        new_out.append(eos)
        count += len(x) - 2
    out_concat = F.concat(new_out, axis=0)

    def embed_func(x):
        return F.embed_id(x, embedW, ignore_label=-1)

    raw_concat = F.concat(
        sequence_embed(embed_func, xs, self.dropout), axis=0)
    b, u = raw_concat.shape

    # mix predicted embeddings with the original ones at augment_ratio
    mask = self.xp.broadcast_to(
        (self.xp.random.rand(b, 1) < augment_ratio), raw_concat.shape)
    out_concat = F.where(mask, out_concat, raw_concat)

    x_len = [len(x) for x in xs]
    x_section = np.cumsum(x_len[:-1])
    out_concat = F.dropout(out_concat, dropout)
    exs = F.split_axis(out_concat, x_section, 0)
    return exs
def _extract(start, end):
    spans = []
    start, end, offset = _uniq(start, end)
    ofs, lb, ub = 0, 0, 0
    for k in range(len(start)):
        lb, ub = min(lb, start[k]), max(ub, end[k])
        if ub - lb > block_size and k > 0:
            spans.append(_sum(start[ofs:k], end[ofs:k]))
            ofs, lb, ub = k, start[k], end[k]
    spans.append(_sum(start[ofs:], end[ofs:]))
    spans = F.vstack(spans) / xp.asarray(end - start)[:, None]
    return F.embed_id(xp.asarray(offset), spans)
def _feature_repl(hs_flatten, pairs, ckeys, lengths):
    xp = chainer.cuda.get_array_module(hs_flatten)
    begins, ends = pairs.T
    begins_ = xp.asarray(begins)
    ends_ = xp.asarray(ends)
    ckeys_ = xp.asarray(ckeys)

    h_b = F.embed_id(begins_, hs_flatten)
    h_b_pre = F.embed_id(begins_ - 1, hs_flatten, ignore_label=-1)
    out_of_span = np.insert(lengths[:-1].cumsum(), 0, 0) - 1
    is_out_of_span = np.isin(begins - 1, out_of_span)
    h_b_pre = F.where(
        xp.asarray(is_out_of_span)[:, None],
        xp.zeros_like(h_b_pre.data), h_b_pre)

    h_e = F.embed_id(ends_, hs_flatten)
    h_e_post = F.embed_id(ends_ + 1, hs_flatten, hs_flatten.shape[0])
    out_of_span = lengths.cumsum()
    is_out_of_span = np.isin(ends + 1, out_of_span)
    h_e_post = F.where(
        xp.asarray(is_out_of_span)[:, None],
        xp.zeros_like(h_e_post.data), h_e_post)

    h_k_pre = F.embed_id(ckeys_ - 1, hs_flatten)
    h_k_post = F.embed_id(ckeys_ + 1, hs_flatten)

    repl1 = F.absolute(h_b_pre * (h_b - h_k_post))
    repl2 = F.absolute(h_e_post * (h_e - h_k_pre))
    return repl1, repl2
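A brief hedged sketch (made-up data): the ignore_label argument used above makes F.embed_id return an all-zero row for the given label instead of indexing out of bounds, which is how "missing neighbour" positions such as begins - 1 == -1 are handled.

# Toy demonstration only; `hs` and `idx` are invented for illustration.
import numpy as np
import chainer.functions as F

hs = np.arange(12, dtype=np.float32).reshape(4, 3)   # (n_positions, hidden)
idx = np.array([1, -1, 3], dtype=np.int32)           # -1 marks a missing position
rows = F.embed_id(idx, hs, ignore_label=-1)
print(rows.array)   # the row for index -1 is all zeros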
def _compute_metrics(parsed, gold_batch, lengths,
                     use_predicted_arcs_for_rels=True):
    logits_arc, logits_rel, *_ = parsed
    true_arcs, true_rels, *_ = zip(*gold_batch)

    # exclude attachment from the root
    logits_arc, logits_rel = logits_arc[:, 1:], logits_rel[:, 1:]
    true_arcs = F.pad_sequence(true_arcs, padding=-1)[:, 1:]
    true_rels = F.pad_sequence(true_rels, padding=-1)[:, 1:]
    lengths = np.array(lengths, dtype=np.int32) - 1

    xp = chainer.cuda.get_array_module(logits_arc)
    if xp is not np:
        true_arcs.to_gpu()
        true_rels.to_gpu()

    b, n_deps, n_heads = logits_arc.shape
    logits_arc_flatten = F.reshape(logits_arc, (b * n_deps, n_heads))
    true_arcs_flatten = F.reshape(true_arcs, (b * n_deps,))
    arc_loss = F.softmax_cross_entropy(
        logits_arc_flatten, true_arcs_flatten, ignore_label=-1)
    arc_accuracy = _accuracy(
        logits_arc_flatten, true_arcs_flatten, ignore_label=-1)

    if use_predicted_arcs_for_rels:
        parsed_arcs = xp.argmax(logits_arc.data, axis=2)
    else:
        parsed_arcs = true_arcs.data
    parsed_arcs = chainer.cuda.to_cpu(parsed_arcs)

    b, n_deps, n_heads, n_rels = logits_rel.shape
    base1, base2 = n_deps * n_heads, np.arange(n_deps) * n_heads
    parsed_arcs_flatten = np.concatenate(
        [base1 * i + base2 + arcs for i, arcs in enumerate(parsed_arcs)])
    logits_rel_flatten = F.embed_id(
        xp.asarray(parsed_arcs_flatten),
        F.reshape(logits_rel, (b * base1, n_rels)))
    true_rels_flatten = F.reshape(true_rels, (b * n_deps,))
    rel_loss = F.softmax_cross_entropy(
        logits_rel_flatten, true_rels_flatten, ignore_label=-1)
    rel_accuracy = _accuracy(
        logits_rel_flatten, true_rels_flatten, ignore_label=-1)

    return {
        'arc_loss': arc_loss,
        'arc_accuracy': arc_accuracy,
        'rel_loss': rel_loss,
        'rel_accuracy': rel_accuracy,
    }
def __call__(self, x):
    x = F.embed_id(x, self.embed_weights)
    conved = []
    for conv in self.convs:
        h = F.relu(conv(x))
        h = F.max_pooling_2d(h, (2, self.embed_dim))
        conved.append(h)
    # concatenate along the conved dimension (axis=2)
    x = F.concat(conved, axis=2)
    x = F.dropout(F.relu(self.fc4(x)), self.dropout)
    if chainer.config.train:
        return self.fc5(x)
    return F.softmax(self.fc5(x))
def embed_seq_batch(embed, seq_batch, dropout=0., context=None):
    x_len = [len(seq) for seq in seq_batch]
    x_section = np.cumsum(x_len[:-1])
    ex = embed(F.concat(seq_batch, axis=0))
    ex = F.dropout(ex, dropout)
    if context is not None:
        ids = [embed.xp.full((l,), i).astype('i')
               for i, l in enumerate(x_len)]
        ids = embed.xp.concatenate(ids, axis=0)
        cx = F.embed_id(ids, context)
        ex = F.concat([ex, cx], axis=1)
    exs = F.split_axis(ex, x_section, 0)
    return exs
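A minimal sketch (toy data, assumed names) of the concat -> embed -> split_axis pattern that embed_seq_batch and several snippets above use: embed the whole minibatch with a single lookup, then split the result back into per-sequence chunks at the cumulative-length boundaries.

# Hedged toy example; `W` and `seqs` are invented for demonstration.
import numpy as np
import chainer.functions as F

W = np.random.randn(10, 4).astype(np.float32)           # toy embedding table
seqs = [np.array([1, 2, 3], np.int32), np.array([4, 5], np.int32)]

x_len = [len(s) for s in seqs]
x_section = np.cumsum(x_len[:-1])                        # split points: [3]
ex = F.embed_id(F.concat(seqs, axis=0), W)               # (5, 4)
exs = F.split_axis(ex, x_section, 0)                     # [(3, 4), (2, 4)]
print([e.shape for e in exs])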
def __call__(self, x_data, x_char_data=None, x_additional=None):
    hx = None
    cx = None
    self.n_length = [len(_x) for _x in x_data]
    self.inds = np.argsort([-len(_x) for _x in x_data]).astype('i')
    if self.use_char:
        # CharCNN
        x_char_data_flat = []
        for _ in x_char_data:
            x_char_data_flat.extend(_)
        char_vecs = self.char_cnn(x_char_data_flat)
        char_index = self.char_cnn.char_index(self.n_length)
    xs = []
    for i, x in enumerate(x_data):
        x = my_variable(x, volatile=not self.train)
        x = self.word_embed(x)
        if self.use_char:
            x_char = F.embed_id(char_index[i], char_vecs, ignore_label=-1)
            x = F.concat([x, x_char], axis=1)
        if x_additional:
            for add_i in six.moves.range(self.n_add_feature):
                x_add = x_additional[add_i][i]
                x_add = my_variable(x_add, volatile=not self.train)
                add_emb_layer = self.get_layer('add_embed_' + str(add_i))
                x_add = add_emb_layer(x_add)
                x = F.concat([x, x_add], axis=1)
        x = my_dropout(x, ratio=self.use_dropout, train=self.train)
        xs.append(x)

    _hy_f, _cy_f, h_vecs = self.rnn(hx=hx, cx=cx, xs=xs)

    h_vecs = F.concat(h_vecs, axis=0)
    if self.use_dropout:
        h_vecs = my_dropout(h_vecs, ratio=self.use_dropout, train=self.train)

    # Label Predict
    output = self.output_layer(h_vecs)
    output_list = F.split_axis(output, output.data.shape[0], axis=0)
    return output_list
def __init__(self, w):
    # super(Encoder, self).__init__() calls the constructor of the
    # superclass (chainer.Chain) defined in another file.
    super(Encoder, self).__init__()
    # 300 is the dimensionality of the Word2Vec vectors
    self.out_units = 300
    # Only link objects registered inside self.init_scope() have their
    # weights updated by the Chain class.
    with self.init_scope():
        self.embed = lambda x: F.embed_id(x, w)
        # Configure the LSTM to be trained
        self.encoder = L.NStepLSTM(n_layers=1, in_size=300,
                                   out_size=self.out_units, dropout=0.5)
def wsd_with_tc(self, sent, trf_encoded_matrix, labels):
    ### WSD ###
    if self.model_type == "TRF-Multi" or self.model_type == "TRF-Delay-Multi":
        y_wsd = self.wsd_only(trf_encoded_matrix, labels)
    elif self.model_type == "TRF-Sequential":
        y_wsd, task_type = self.wsd_model(sent, None, None, True)  # load (sequential setting)

    y_wsd_soft = F.softmax(y_wsd)  # apply softmax to the WSD predictions
    argmax_wsd = F.argmax(y_wsd_soft, axis=1)  # take the index with the highest score

    # condition for ignoring words that have no sense label
    cond = chainer.Variable(
        self.xp.array([True if i != "<PAD>" else False
                       for i in list(chain(*labels))]))
    pad_array = chainer.Variable(
        -1 * self.xp.ones(argmax_wsd.shape, dtype=argmax_wsd.dtype))
    pad_array_argmax_wsd = F.where(cond, argmax_wsd, pad_array)

    # look up the predicted senses in the fixed sense embedding table
    sense_label_embed = F.embed_id(
        x=pad_array_argmax_wsd,
        W=self.xp.array(self.lookup_table_sense_fixed),
        ignore_label=-1)

    sense_label_embed = sense_label_embed.reshape(
        trf_encoded_matrix.shape[0], trf_encoded_matrix.shape[-1], -1)
    origin_shape = sense_label_embed.shape
    sense_label_embed = F.moveaxis(sense_label_embed, 1, 2)

    ## replacement ##
    cond_reshape = cond.reshape(cond.shape[0], -1)
    cond_reshape = F.broadcast_to(
        cond_reshape, (cond_reshape.shape[0], trf_encoded_matrix.shape[1]))
    cond_reshape = cond_reshape.reshape(origin_shape)
    cond_reshape = F.swapaxes(cond_reshape, 1, 2)
    replaced_trf_matrix = F.where(cond_reshape, sense_label_embed,
                                  trf_encoded_matrix)

    ### Feed the WSD predictions into TC ###
    tc = replaced_trf_matrix  # document matrix after replacement

    ### TC ###
    tc_features = F.sum(tc, axis=2)  # TC features
    y_tc = self.fc2(tc_features)  # TC predictions

    return (y_tc, y_wsd) if (self.model_type == "TRF-Multi") or (
        self.model_type == "TRF-Delay-Multi") else y_tc
def forward(self, ws, ss, ps, dep_ts=None):
    batchsize = len(ws)
    xp = chainer.cuda.get_array_module(ws[0])
    split = scanl(lambda x, y: x + y, 0, [w.shape[0] for w in ws])[1:-1]

    wss = self.emb_word(F.hstack(ws))
    sss = F.reshape(self.emb_suf(F.vstack(ss)), (-1, 4 * self.afix_dim))
    pss = F.reshape(self.emb_prf(F.vstack(ps)), (-1, 4 * self.afix_dim))
    ins = F.dropout(F.concat([wss, sss, pss]),
                    self.dropout_ratio, train=self.train)

    xs_f = list(F.split_axis(ins, split, 0))
    xs_b = [x[::-1] for x in xs_f]
    cx_f, hx_f, cx_b, hx_b = self._init_state(xp, batchsize)
    _, _, hs_f = self.lstm_f(hx_f, cx_f, xs_f, train=self.train)
    _, _, hs_b = self.lstm_b(hx_b, cx_b, xs_b, train=self.train)
    hs_b = [x[::-1] for x in hs_b]
    # ys: [(sentence length, number of category)]
    hs = [F.concat([h_f, h_b]) for h_f, h_b in zip(hs_f, hs_b)]

    dep_ys = [
        self.biaffine_arc(
            F.elu(F.dropout(self.arc_dep(h), 0.32, train=self.train)),
            F.elu(F.dropout(self.arc_head(h), 0.32, train=self.train)))
        for h in hs
    ]

    # if dep_ts is not None and random.random >= 0.5:
    if dep_ts is not None:
        heads = dep_ts
    else:
        heads = [F.argmax(y, axis=1) for y in dep_ys]

    heads = F.elu(F.dropout(
        self.rel_head(
            F.vstack([F.embed_id(t, h, ignore_label=IGNORE)
                      for h, t in zip(hs, heads)])),
        0.32, train=self.train))
    childs = F.elu(
        F.dropout(self.rel_dep(F.vstack(hs)), 0.32, train=self.train))
    cat_ys = self.biaffine_tag(childs, heads)
    cat_ys = list(F.split_axis(cat_ys, split, 0))
    return cat_ys, dep_ys
def sequence_embed(xs):
    """Embed sequences of integers."""
    # xs [(L1,), (L2,), ...]
    xs = list(xs)  # Chainer quirk: expects lists
    x_len = [len(x) for x in xs]
    x_section = np.cumsum(x_len[:-1])
    x_concat = F.concat(xs, axis=0)  # (L1+L2...,)
    # ex = self.embed(x_concat)  # (..., E)
    ex = F.embed_id(x_concat, wordembeds, ignore_label=0)
    ex = F.tanh(self.embed(ex))  # (..., E)
    uex = self.uni_embed(ex)  # (..., E)
    uvx = self.var_linear(ex)  # (..., 1)
    uvx = F.sigmoid(F.squeeze(uvx, -1))  # (...,)
    # evx = F.concat([ex, uvx[:, None]], -1)  # (..., E+1)
    evxs = F.split_axis(ex, x_section, 0)
    uexs = F.split_axis(uex, x_section, 0)
    uvs = F.split_axis(uvx, x_section, 0)
    return evxs, uexs, uvs
def forward(self, inputs):
    """
    Compute context insensitive token embeddings for ELMo representations.

    Parameters
    ----------
    inputs: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length)`` of token ids representing the
        current batch.

    Returns
    -------
    Dict with keys:
    ``'token_embedding'``: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with
        context insensitive token representations.
    ``'mask'``: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence
        mask.
    """
    # Add BOS/EOS
    # mask = ((inputs > 0).sum(axis=-1) > 0)
    mask = (inputs > 0)
    token_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        inputs, mask,
        self._beginning_of_sentence_token, self._end_of_sentence_token)
    # (batch_size, sequence_length, embedding_dim)
    token_embedding = F.embed_id(token_ids_with_bos_eos,
                                 self._token_embedding_weights)
    return {
        'mask': mask_with_bos_eos,
        'token_embedding': token_embedding
    }
def forward(self, inputs):
    """
    Compute context insensitive token embeddings for ELMo representations.

    Parameters
    ----------
    inputs: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length, 50)`` of character ids
        representing the current batch.

    Returns
    -------
    Dict with keys:
    ``'token_embedding'``: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with
        context insensitive token representations.
    ``'mask'``: ``torch.autograd.Variable``
        Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence
        mask.
    """
    # Add BOS/EOS
    mask = ((inputs > 0).sum(axis=-1) > 0)
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        inputs, mask,
        self._beginning_of_sentence_characters, self._end_of_sentence_characters)

    # the character id embedding
    max_chars_per_token = self._options['char_cnn']['max_characters_per_token']
    # (batch_size * sequence_length, max_chars_per_token, embed_dim)
    character_embedding = F.embed_id(
        character_ids_with_bos_eos.reshape((-1, max_chars_per_token)),
        self._char_embedding_weights)

    # run convolutions
    cnn_options = self._options['char_cnn']
    if cnn_options['activation'] == 'tanh':
        activation = F.tanh
    elif cnn_options['activation'] == 'relu':
        activation = F.relu
    else:
        raise ConfigurationError("Unknown activation")

    # (batch_size * sequence_length, embed_dim, max_chars_per_token)
    character_embedding = F.transpose(character_embedding, (0, 2, 1))
    character_embedding = character_embedding[:, :, :, None]
    convs = []
    for i in range(len(self._convolutions)):
        conv = getattr(self, 'char_conv_{}'.format(i))
        convolved = conv(character_embedding)
        # (batch_size * sequence_length, n_filters for this width)
        convolved = F.max(convolved, axis=(2, 3))
        convolved = activation(convolved)
        convs.append(convolved)

    # (batch_size * sequence_length, n_filters)
    token_embedding = F.concat(convs, axis=-1)

    # apply the highway layers (batch_size * sequence_length, n_filters)
    token_embedding = self._highways.forward(token_embedding)

    # final projection (batch_size * sequence_length, embedding_dim)
    token_embedding = self._projection(token_embedding)

    # reshape to (batch_size, sequence_length, embedding_dim)
    batch_size, sequence_length, _ = character_ids_with_bos_eos.shape

    return {
        'mask': mask_with_bos_eos,
        'token_embedding': token_embedding.reshape(
            (batch_size, sequence_length, -1))
    }