def __call__(self, query, options, gold, lengths, query_no):
    if len(options) == 1:
        return None, 0

    final = []
    if args.word_vectors:
        qvecs = [dy.lookup(self.pEmbedding, w) for w in query]
        qvec_max = dy.emax(qvecs)
        qvec_mean = dy.average(qvecs)
    # score each option with an MLP over its features (plus query/option embeddings)
    for otext, features in options:
        inputs = dy.inputTensor(features)
        if args.word_vectors:
            ovecs = [dy.lookup(self.pEmbedding, w) for w in otext]
            ovec_max = dy.emax(ovecs)
            ovec_mean = dy.average(ovecs)
            inputs = dy.concatenate([inputs, qvec_max, qvec_mean, ovec_max, ovec_mean])
        if args.drop > 0:
            inputs = dy.dropout(inputs, args.drop)
        h = inputs
        for pH, pB in zip(self.hidden, self.bias):
            h = dy.affine_transform([pB, pH, h])
            if args.nonlin == "linear":
                pass
            elif args.nonlin == "tanh":
                h = dy.tanh(h)
            elif args.nonlin == "cube":
                h = dy.cube(h)
            elif args.nonlin == "logistic":
                h = dy.logistic(h)
            elif args.nonlin == "relu":
                h = dy.rectify(h)
            elif args.nonlin == "elu":
                h = dy.elu(h)
            elif args.nonlin == "selu":
                h = dy.selu(h)
            elif args.nonlin == "softsign":
                h = dy.softsign(h)
            elif args.nonlin == "swish":
                h = dy.cmult(h, dy.logistic(h))
        final.append(dy.sum_dim(h, [0]))

    final = dy.concatenate(final)
    nll = -dy.log_softmax(final)
    # spread the gold probability mass uniformly over all gold options
    dense_gold = []
    for i in range(len(options)):
        dense_gold.append(1.0 / len(gold) if i in gold else 0.0)
    answer = dy.inputTensor(dense_gold)
    loss = dy.transpose(answer) * nll
    predicted_link = np.argmax(final.npvalue())
    return loss, predicted_link
def one_pass(self, datum):
    datum = dynet.inputTensor(datum)
    w1 = dynet.parameter(self.layers[0]['W'])
    b1 = dynet.parameter(self.layers[0]['b'])
    w2 = dynet.parameter(self.layers[1]['W'])
    b2 = dynet.parameter(self.layers[1]['b'])
    hidden = (datum * w1) + b1
    hidden_activation = dynet.logistic(hidden)
    output = (hidden_activation * w2) + b2
    output_activation = dynet.logistic(output)
    return output_activation
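# Usage sketch (assumption, not from the original code): one_pass above expects
# self.layers to be a list of {'W', 'b'} Parameters, and a row-vector input so
# that (datum * W) is a valid matrix product. Dimensions here are illustrative.
import dynet

m = dynet.ParameterCollection()
layers = [
    {'W': m.add_parameters((4, 8)), 'b': m.add_parameters((1, 8))},  # input -> hidden
    {'W': m.add_parameters((8, 1)), 'b': m.add_parameters((1, 1))},  # hidden -> output
]
dynet.renew_cg()
datum = [[0.1, 0.2, 0.3, 0.4]]  # one row vector of four features
# forward pass mirroring the body of one_pass:
hidden = dynet.logistic(dynet.inputTensor(datum) * layers[0]['W'].expr() + layers[0]['b'].expr())
output = dynet.logistic(hidden * layers[1]['W'].expr() + layers[1]['b'].expr())
print(output.npvalue())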
def transduce(self, inputs, masks, predict=False):
    if not self.init:
        print("No initial state provided")
        return
    outputs = []
    batch_size = inputs[0].dim()[1]
    for idx, input_tensor in enumerate(inputs):
        recur_s = []
        cell_s = []
        out = []
        hidden = self.hidden_previous
        cell = self.cell_previous
        if not predict:
            # apply the sampled dropout masks during training only
            input_tensor = dy.cmult(input_tensor, self.input_drop_mask)
            hidden = dy.cmult(hidden, self.recur_drop_mask)
        # compute all four LSTM gates with a single affine transform
        gates = dy.affine_transform([self.b.expr(), self.WXH.expr(),
                                     dy.concatenate([input_tensor, hidden])])
        iga = dy.pickrange(gates, 0, self.recur_size)
        fga = dy.pickrange(gates, self.recur_size, 2 * self.recur_size)
        oga = dy.pickrange(gates, 2 * self.recur_size, 3 * self.recur_size)
        cga = dy.pickrange(gates, 3 * self.recur_size, 4 * self.recur_size)
        ig = dy.logistic(iga)
        fg = dy.logistic(fga)  # +self.forget_bias
        og = dy.logistic(oga)
        c_tilda = dy.tanh(cga)
        new_cell = dy.cmult(cell, fg) + dy.cmult(c_tilda, ig)
        new_hidden = dy.cmult(dy.tanh(new_cell), og)
        # carry the previous state through for masked-out (padded) batch elements
        for jdx in range(batch_size):
            if masks[idx][jdx] == 1:
                h_t = dy.pick_batch_elem(new_hidden, jdx)
                recur_s.append(h_t)
                cell_s.append(dy.pick_batch_elem(new_cell, jdx))
                out.append(h_t)
            else:
                recur_s.append(dy.pick_batch_elem(hidden, jdx))
                cell_s.append(dy.pick_batch_elem(cell, jdx))
                out.append(dy.zeros(self.recur_size))
        new_cell = dy.concatenate_to_batch(cell_s)
        new_hidden = dy.concatenate_to_batch(recur_s)
        self.cell_previous = new_cell
        self.hidden_previous = new_hidden
        outputs.append(dy.concatenate_to_batch(out))
    return outputs
def _calc_scores_embedded_mlp(self, sentences, W_emb, W_mlp, b_mlp, V_mlp, a_mlp, meta_data=None):
    """
    calculating the score for an NN network (in a specific state along the learning phase)
    :param sentences: list
        list of lists of sentences (already represented as numbers and not letters)
    :param W_emb: lookup parameter (dynet obj). size: (emb_size x nwords)
        matrix holding the word embedding values
    :param W_mlp: model parameter (dynet obj). size: (hid_size, emb_size + meta_data_dim)
        matrix holding the weights of the MLP phase
    :param b_mlp: model parameter (dynet obj). size: (hid_size,)
        vector holding the intercept weights for each hidden state
    :param V_mlp: model parameter (dynet obj). size: (2, hid_size)
        matrix holding the weights of the logistic regression phase.
        2 is there because we are in a binary classification
    :param a_mlp: model parameter (dynet obj). size: (1,)
        intercept value for the logistic regression phase
    :param meta_data: dict or None
        meta data features for the model. If None - meta data is not used
    :return: dynet expression. size: (2,)
        prediction of the instance to be a drawing one according to the model
        (vector of 2, first place is the probability to be a drawing team)
    """
    dy.renew_cg()
    # sentences_len = len(sentences)
    word_embs = [[dy.lookup(W_emb, w) for w in words] for words in sentences]
    # taking the average over all words
    first_layer_avg = dy.average([dy.average(w_em) for w_em in word_embs])
    # case we don't wish to use meta features for the model
    if meta_data is None:
        h = dy.tanh((W_mlp * first_layer_avg) + b_mlp)
        prediction = dy.logistic((V_mlp * h) + a_mlp)
    else:
        meta_data_ordered = [value for key, value in sorted(meta_data.items())]
        meta_data_vector = dy.inputVector(meta_data_ordered)
        first_layer_avg_and_meta_data = dy.concatenate([first_layer_avg, meta_data_vector])
        h = dy.tanh((W_mlp * first_layer_avg_and_meta_data) + b_mlp)
        prediction = dy.logistic((V_mlp * h) + a_mlp)
    return prediction
def on_calc_additional_loss(self, reward):
    if not self.learn_segmentation:
        return None
    ret = LossBuilder()
    if self.length_prior_alpha > 0:
        reward += self.segment_length_prior * self.length_prior_alpha
    reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward))
    # Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            baseline_loss.append(dy.squared_distance(reward, baseline))
        ret.add_loss("Baseline", dy.esum(baseline_loss))
    # Reinforce Loss
    lmbd = self.lmbd.get_value(self.warmup_counter)
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculating the loss of the baseline and reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
            if self.use_baseline:
                r_i = reward - self.bs[i]
            else:
                r_i = reward
            reinforce_loss.append(dy.logistic(r_i) * ll)
        ret.add_loss("Reinforce", -dy.esum(reinforce_loss) * lmbd)
    # Total Loss
    return ret
def train(self, training_set):
    for sentence, eid, entity, trigger, label, pos, chars, rule in training_set:
        features = self.encode_sentence(sentence, pos, chars)
        loss = []
        entity_embeds = features[entity]
        attention, context = self.self_attend(features)
        # supervise the attention to point at the trigger token
        ty = dy.vecInput(len(sentence))
        ty.set([0 if i != trigger else 1 for i in range(len(sentence))])
        loss.append(dy.binary_log_loss(dy.reshape(attention, (len(sentence),)), ty))
        h_t = dy.concatenate([context, entity_embeds])
        hidden = dy.tanh(self.lb.expr() * h_t + self.lb_bias.expr())
        out_vector = dy.reshape(dy.logistic(self.lb2.expr() * hidden + self.lb2_bias.expr()), (1,))
        label = dy.scalarInput(label)
        loss.append(dy.binary_log_loss(out_vector, label))
        # sequence loss for decoding the rule patterns
        pres = [0]
        for pattern in rule:
            probs = self.decoder(features, pres)
            loss.append(-dy.log(dy.pick(probs, pattern)))
            pres.append(pattern)
        loss = dy.esum(loss)
        loss.backward()
        self.trainer.update()
        dy.renew_cg()
def test_item(model, document):
    word_lookups = []
    for preprocessed_sentence in document.preprocessed_sentences:
        seq = [model.wlookup[int(model.w2i.get(entry, 0))]
               for entry in preprocessed_sentence]
        if len(seq) > 0:
            word_lookups.append(seq)
    sentences_lookups = []
    for seq in word_lookups:
        sentence_encode = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(sentence_encode)
        global_min = average_pooling(sentence_encode)
        if len(sentence_encode) > 0:
            last_out = sentence_encode[-1]
            context = dy.concatenate([last_out, global_max, global_min])
            sentences_lookups.append(context)
    document_encode = encode_sequence(model, sentences_lookups, model.document_rnn)
    global_max = max_pooling(document_encode)
    global_min = average_pooling(document_encode)
    if len(document_encode) > 0:
        # use the last document-level state, not the leftover sentence-level one
        last_out = document_encode[-1]
        context = dy.concatenate([last_out, global_max, global_min])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
        document.prediction_result = y_pred.scalar_value()
        dy.renew_cg()
        return document.prediction_result
    return 0
def test_network(pWeight, input_dy):
    # add parameters to graph as expressions
    Weight = dy.parameter(pWeight)
    # return what the network returns
    output = dy.logistic(dy.tanh(Weight * input_dy))
    return output
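# Usage sketch (assumption): driving test_network above with a freshly created
# parameter collection; the 3x3 shape is illustrative.
import dynet as dy

m = dy.ParameterCollection()
pWeight = m.add_parameters((3, 3))
dy.renew_cg()
input_dy = dy.inputVector([1.0, 2.0, 3.0])
out = test_network(pWeight, input_dy)  # a (3,) vector with entries in (0, 1)
print(out.npvalue())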
def word_repr(self, char_seq, cembs):
    """
    obtain the word representation when given its character sequence
    :param char_seq: character index sequence
    :param cembs: character embedding sequence
    :return:
    """
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
    chars = dy.concatenate(cembs)  # [c1;c2...]
    # reset_gate = sigmoid(W_r_l * chars + b_r_l), shape: (m, char_dim)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars + self.param_exprs['rgb%d' % wlen])
    # word = tanh(W_c_l * (reset_gate .* chars) + b_c_l)
    word = dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) + self.param_exprs['cb%d' % wlen])
    if self.known_words is not None and tuple(char_seq) in self.known_words:
        # frequent word: word = (word + word_embed) / 2
        return (word + dy.lookup(self.params['word_embed'], self.known_words[tuple(char_seq)])) / 2.
    return word
def word_repr(self, char_seq, cembs):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
    chars = dy.concatenate(cembs)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars + self.param_exprs['rgb%d' % wlen])
    word = dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) + self.param_exprs['cb%d' % wlen])
    if self.known_words is not None and tuple(char_seq) in self.known_words:
        return (word + dy.lookup(self.params['word_embed'], self.known_words[tuple(char_seq)])) / 2.
    return word
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])
    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars + self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) +
                                   self.param_exprs['cb%d' % wlen]), chars])
    update_logits = self.param_exprs['ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]
    update_gate = dy.transpose(dy.concatenate_cols(
        [dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
         for i in range(self.options['ndims'])]))
    # The following implementation of the softmax function is not safe, but faster:
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())
    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def compute_loss(self, state, word):
    top_state = self.top_lstm.initial_state()
    top_state = top_state.set_s(self.top_initial_state)
    assert len(state.open_constits) == len(state.spine)
    for open_constit, spine_word in zip(state.open_constits, state.spine):
        constit_emb = open_constit.output()
        if self.residual and spine_word != -1:
            # gated residual connection between the constituent and its head word
            # (simple additive alternative, disabled: constit_emb += spine_word_emb)
            spine_word_emb = self.embed_word(spine_word)
            inp = dy.concatenate([constit_emb, spine_word_emb])
            mask = self.gate_mlp(inp)
            mask = dy.logistic(mask)
            constit_emb = dy.cmult(1 - mask, constit_emb)
            constit_emb = constit_emb + dy.cmult(mask, spine_word_emb)
        top_state = top_state.add_input(constit_emb)
    # debug check (disabled):
    # debug_top_state = self.debug_embed()
    # assert np.isclose(top_state.output().npvalue(), debug_top_state.output().npvalue()).all()
    logits = self.final_mlp(top_state.output())
    loss = dy.pickneglogsoftmax(logits, word)
    # disabled variant that excludes actual terminals from the loss:
    # if word != 0 and word != 1:
    #     probs = -dy.softmax(logits)
    #     left_prob = dy.pick(probs, 0)
    #     right_prob = dy.pick(probs, 1)
    #     loss = dy.log(1 - left_prob - right_prob)
    # else:
    #     loss = dy.pickneglogsoftmax(logits, word)
    return loss
def __train(self, data):
    def encode_sequence(seq):
        rnn_forward = self.phrase_rnn[0].initial_state()
        for entry in seq:
            vec = self.wlookup[int(self.w2i.get(entry, 0))]
            rnn_forward = rnn_forward.add_input(vec)
        return rnn_forward.output()

    tagged_loss = 0
    untagged_loss = 0
    for index, sentence_report in enumerate(data):
        for phrase in sentence_report.all_phrases:
            loss = None
            encoded_phrase = encode_sequence(phrase)
            y_pred = dy.logistic((self.mlp_w * encoded_phrase) + self.mlp_b)
            if sentence_report.mark:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
            if index % 1000 == 0:
                print("Description : {}".format(index + 1))
                print("Marked {} Prediction Result {} : ".format(sentence_report.mark, y_pred.scalar_value()))
                print("Tagged loss {} Untagged Loss {} Total loss {}".format(
                    tagged_loss, untagged_loss, tagged_loss + untagged_loss))
            if sentence_report.mark:
                tagged_loss += loss.scalar_value() / (index + 1)
            else:
                untagged_loss += loss.scalar_value() / (index + 1)
            loss.backward()
            self.trainer.update()
            dy.renew_cg()
def _predict(self, batch, train=True):
    # load the network parameters
    W_hid = dy.parameter(self.W_hid)
    b_hid = dy.parameter(self.b_hid)
    w_clf = dy.parameter(self.w_clf)
    b_clf = dy.parameter(self.b_clf)
    probas = []
    # predict the probability of positive sentiment for each sentence
    for _, sent in batch:
        sent_embed = [dy.lookup(self.embed, w) for w in sent]
        # Task 3: apply dropout to the embeddings for regularization during training
        if train:
            dropout_embed = [dy.dropout(embed, 0.5) for embed in sent_embed]
            sent_embed = dy.average(dropout_embed)
        else:
            sent_embed = dy.average(sent_embed)
        # hid = tanh(b + W * sent_embed)
        # but it's faster to use affine_transform in dynet
        hid = dy.affine_transform([b_hid, W_hid, sent_embed])
        hid = dy.tanh(hid)
        y_score = dy.affine_transform([b_clf, w_clf, hid])
        y_proba = dy.logistic(y_score)
        probas.append(y_proba)
    return probas
def __call__(self, word_embeddings):
    highway_memories = word_embeddings
    for birnn_layer, highway_i_factor, highway_o_factor, highway_bias in zip(
            self.birnn_layers, self.highway_i_factors,
            self.highway_o_factors, self.highway_biases):
        output_tensors = birnn_layer(highway_memories)
        if highway_memories is word_embeddings:
            # first layer: nothing to mix in yet
            highway_memories = output_tensors
        else:
            new_highway_memories = []
            for memory_vector, output_vector in zip(highway_memories, output_tensors):
                highway_bias_expr = highway_bias.expr()
                highway_i_factor_expr = highway_i_factor.expr()
                highway_o_factor_expr = highway_o_factor.expr()
                transform_rate = dn.logistic(dn.affine_transform([
                    highway_bias_expr,
                    highway_i_factor_expr, memory_vector,
                    highway_o_factor_expr, output_vector]))
                keep_rate = 1 - transform_rate
                new_highway_memories.append(
                    dn.cmult(keep_rate, memory_vector) +
                    dn.cmult(transform_rate, output_vector))
            highway_memories = new_highway_memories
    return highway_memories
def run_instance(tokens, polarity, model_elems, embeddings):
    # Renew the computational graph
    dy.renew_cg()
    builder = model_elems.builder
    V = model_elems.V
    W = model_elems.W
    b = model_elems.b
    # Fetch the embeddings for the current sentence
    words = tokens
    inputs = [embeddings[w] for w in words]
    # Run FF over the LSTM
    lstm = builder.initial_state()
    outputs = lstm.transduce(inputs)
    # Get the last embedding
    selected = outputs[-1]
    # Concatenate the polarity bit to the selected vector
    prediction_input = dy.concatenate([selected, dy.scalarInput(1 if polarity else 0)])
    # Run the FF network for classification
    prediction = dy.logistic(V * (W * prediction_input + b))
    return prediction
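# Usage sketch (assumption): run_instance only needs model_elems to expose
# builder/V/W/b, so a namedtuple works; sizes below are illustrative, and
# embeddings is assumed to be a word -> expression mapping.
from collections import namedtuple
import dynet as dy

ModelElements = namedtuple("ModelElements", "builder V W b")
m = dy.ParameterCollection()
elems = ModelElements(
    builder=dy.LSTMBuilder(1, 50, 30, m),  # 1 layer, 50-dim input, 30-dim state
    V=m.add_parameters((1, 20)),
    W=m.add_parameters((20, 31)),          # 30 LSTM dims + 1 polarity bit
    b=m.add_parameters(20),
)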
def __train(model, data):
    tagged_loss = 0
    untagged_loss = 0
    for index, sentence_report in enumerate(data):
        for phrase in sentence_report.all_phrases:
            loss = None
            encoded_phrase = __encode_sequence(model, phrase)
            if model.options.external_info != "no_info":
                encoded_phrase = dy.concatenate([encoded_phrase, model.doclookup[sentence_report.app_id]])
            y_pred = dy.logistic((model.mlp_w * encoded_phrase) + model.mlp_b)
            if sentence_report.mark:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
            if sentence_report.mark:
                tagged_loss += loss.scalar_value() / (index + 1)
            else:
                untagged_loss += loss.scalar_value() / (index + 1)
            loss.backward()
            model.trainer.update()
            dy.renew_cg()
def expr_for_tree(self, xt, tree, node, is_train):
    if is_train:
        # in the training phase, apply dropout to the weight matrices
        WP_dropout = dy.dropout(self.WP, self.dropout_rate)
        WR_dropout = dy.dropout(self.WR, self.dropout_rate)
        WC_dropout = dy.dropout(self.WC, self.dropout_rate)
    else:
        WP_dropout = self.WP
        WR_dropout = self.WR
        WC_dropout = self.WC
    if node is None or node.is_leaf():
        # leaf: h = tanh(WC * x + bc)
        h = dy.tanh(dy.affine_transform([self.bc, WC_dropout, xt]))
        return h
    # internal node: combine child representations through reset gates
    children = tree.children(node.identifier)
    children_sum = dy.zeros(self.n_out)
    for i in range(len(children)):
        hc = self.expr_for_tree(xt=xt, tree=tree, node=children[i], is_train=is_train)
        rt = dy.logistic(WR_dropout * xt + self.UR * hc + self.br)
        children_sum = children_sum + dy.cmult(rt, hc)
    h = dy.tanh(WP_dropout * xt + self.bp + self.UP * children_sum)
    return h
def train_item(args, model, sentence):
    loss = None
    seq = [model.wlookup[int(model.w2i.get(entry, 0))]
           for entry in sentence.preprocessed_sentence]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        last_output = encoded_sequence[-1]
        global_max = max_pooling(encoded_sequence)
        global_min = average_pooling(encoded_sequence)
        context = dy.concatenate([last_output, global_max, global_min])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
        if sentence.permissions[args.permission_type]:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
        else:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
        loss.backward()
        model.trainer.update()
        loss_val = loss.scalar_value()
        dy.renew_cg()
        return loss_val
    return 0
def _predict(self, batch, train=True):
    # load the network parameters
    W_hid = dy.parameter(self.W_hid)
    b_hid = dy.parameter(self.b_hid)
    w_clf = dy.parameter(self.w_clf)
    b_clf = dy.parameter(self.b_clf)
    probas = []
    # predict the probability of positive sentiment for each sentence
    for _, sent in batch:
        sent_embed = [dy.lookup(self.embed, w) for w in sent]
        sent_embed = dy.average(sent_embed)
        # hid = tanh(b + W * sent_embed)
        # but it's faster to use affine_transform in dynet
        hid = dy.affine_transform([b_hid, W_hid, sent_embed])
        hid = dy.tanh(hid)
        y_score = dy.affine_transform([b_clf, w_clf, hid])
        y_proba = dy.logistic(y_score)
        probas.append(y_proba)
    return probas
def add_input(self, cur, x):
    h = cur.hidden_state
    c = cur.memory_cell
    # standard LSTM gates, each computed as a biaffine combination of x and h
    i = dy.logistic(self._biaffine(x, self.Wi, h))
    f = dy.logistic(self._biaffine(x, self.Wf, h))
    o = dy.logistic(self._biaffine(x, self.Wo, h))
    u = dy.tanh(self._biaffine(x, self.Wu, h))
    c_out = dy.cmult(i, u) + dy.cmult(f, c)
    h_out = dy.cmult(o, dy.tanh(c_out))
    _cur = LSTMState(self, cur.state_idx + 1, prev_state=cur,
                     out=h_out, hidden=h_out, memory=c_out)
    return _cur
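# Sketch (assumption, not the original helper): _biaffine is not shown in the
# snippet above. One common formulation is a single affine map over the
# concatenation [x; h; 1], folding the bias into the weight matrix W:
import dynet as dy

def _biaffine_sketch(x, W, h):
    # W: (hidden_dim, x_dim + hidden_dim + 1)
    return W * dy.concatenate([x, h, dy.inputVector([1.0])])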
def _upsample(self, mgc, start, stop):
    mgc_index = start // len(self.upsample_w_s)  # integer division: these are list indices
    ups_index = start % len(self.upsample_w_s)
    upsampled = []
    mgc_vect = dy.inputVector(mgc[mgc_index])
    for x in range(stop - start):
        # gated tanh unit over the current mgc frame
        sigm = dy.logistic(self.upsample_w_s[ups_index].expr(update=True) * mgc_vect +
                           self.upsample_b_s[ups_index].expr(update=True))
        tnh = dy.tanh(self.upsample_w_t[ups_index].expr(update=True) * mgc_vect +
                      self.upsample_b_t[ups_index].expr(update=True))
        r = dy.cmult(sigm, tnh)
        upsampled.append(r)
        ups_index += 1
        if ups_index == len(self.upsample_w_s):
            ups_index = 0
            mgc_index += 1
            if mgc_index == len(mgc):
                # the last frame is sometimes not processed, but it should have similar parameters
                mgc_index -= 1
            else:
                mgc_vect = dy.inputVector(mgc[mgc_index])
    return upsampled
def test_item(model, sentence):
    seq = [model.wlookup[int(model.w2i.get(entry, 0))]
           for entry in sentence.preprocessed_sentence]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(encoded_sequence)
        global_min = average_pooling(encoded_sequence)
        if len(encoded_sequence) > 0:
            # additive attention over the encoded sequence
            att_mlp_outputs = []
            for e in encoded_sequence:
                mlp_out = (model.attention_w * e) + model.attention_b
                att_mlp_outputs.append(mlp_out)
            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.att_context))))
            sum_all = dy.esum(lst)
            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum([dy.cmult(p, h) for p, h in zip(probs, encoded_sequence)])
            context = dy.concatenate([att_context, global_max, global_min])
            y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
            sentence.prediction_result = y_pred.scalar_value()
            dy.renew_cg()
            return sentence.prediction_result
    return 0
def add_input(self, input_vec):
    x = dynet.concatenate([input_vec, self.h])
    i = dynet.logistic(self.W_i * x + self.b_i)
    f = dynet.logistic(self.W_f * x + self.b_f)
    g = dynet.tanh(self.W_c * x + self.b_c)
    o = dynet.logistic(self.W_o * x + self.b_o)
    # elementwise products use cmult (dynet has no cwise_multiply)
    c = dynet.cmult(f, self.c) + dynet.cmult(i, g)
    h = dynet.cmult(o, dynet.tanh(c))
    self.c = c
    self.h = h
    self.outputs.append(h)
    return self
def highway(input_, train):
    for func, weight, bias in zip(funcs, weights, biases):
        proj = dy.rectify(func(input_, train))
        transform = dy.logistic(dy.affine_transform([bias, weight, input_]))
        input_ = dy.cmult(transform, proj) + dy.cmult(input_, 1 - transform)
    return input_
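# Self-contained sketch (assumption) of the highway combination used above,
# y = t * g(x) + (1 - t) * x, with a single learned transform gate t:
import dynet as dy

m = dy.ParameterCollection()
W_p, b_p = m.add_parameters((16, 16)), m.add_parameters(16)  # projection
W_t, b_t = m.add_parameters((16, 16)), m.add_parameters(16)  # transform gate
dy.renew_cg()
x = dy.inputVector([0.1] * 16)
proj = dy.rectify(dy.affine_transform([b_p.expr(), W_p.expr(), x]))
gate = dy.logistic(dy.affine_transform([b_t.expr(), W_t.expr(), x]))
y = dy.cmult(gate, proj) + dy.cmult(x, 1 - gate)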
def step(self, x, hx, cx):
    if not self.test:
        if self.dropout_x > 0:
            x = dy.cmult(self.dropout_mask_x, x)
        if self.dropout_h > 0:
            hx = dy.cmult(self.dropout_mask_h, hx)
    gates = dy.affine_transform([self.bias, self.weight_ih, x, self.weight_hh, hx])
    i = dy.pickrange(gates, 0, self.n_hidden)
    f = dy.pickrange(gates, self.n_hidden, self.n_hidden * 2)
    g = dy.pickrange(gates, self.n_hidden * 2, self.n_hidden * 3)
    o = dy.pickrange(gates, self.n_hidden * 3, self.n_hidden * 4)
    i, f, g, o = dy.logistic(i), dy.logistic(f), dy.tanh(g), dy.logistic(o)
    cy = dy.cmult(f, cx) + dy.cmult(i, g)
    hy = dy.cmult(o, dy.tanh(cy))
    return hy, cy
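# Sketch (assumption): the dropout_mask_x / dropout_mask_h used by step are
# typically sampled once per sequence (variational dropout), with
# inverted-dropout scaling so no rescaling is needed at test time:
import dynet as dy

def sample_dropout_masks(n_input, n_hidden, p_x, p_h):
    mask_x = dy.random_bernoulli((n_input,), 1 - p_x, scale=1.0 / (1 - p_x))
    mask_h = dy.random_bernoulli((n_hidden,), 1 - p_h, scale=1.0 / (1 - p_h))
    return mask_x, mask_h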
def transduce(self, embed_sent):
    src = embed_sent.as_tensor()
    sent_len = src.dim()[0][1]
    src_width = 1
    batch_size = src.dim()[1]
    pad_size = (self.window_receptor - 1) // 2  # TODO adapt it also for even window size
    src = dy.concatenate([dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
                          src,
                          dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)], d=1)
    padded_sent_len = sent_len + 2 * pad_size
    conv1 = dy.parameter(self.pConv1)
    bias1 = dy.parameter(self.pBias1)
    src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
    cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])
    hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1), batch_size=batch_size)
    if self.non_linearity == 'linear':
        hidden_layer = hidden_layer
    elif self.non_linearity == 'tanh':
        hidden_layer = dy.tanh(hidden_layer)
    elif self.non_linearity == 'relu':
        hidden_layer = dy.rectify(hidden_layer)
    elif self.non_linearity == 'sigmoid':
        hidden_layer = dy.logistic(hidden_layer)
    for conv_hid, bias_hid in self.builder_layers:
        hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid),
                                      dy.parameter(bias_hid), stride=[1, 1])
        hidden_layer = dy.reshape(hidden_layer, (self.internal_dim, sent_len, 1), batch_size=batch_size)
        if self.non_linearity == 'linear':
            hidden_layer = hidden_layer
        elif self.non_linearity == 'tanh':
            hidden_layer = dy.tanh(hidden_layer)
        elif self.non_linearity == 'relu':
            hidden_layer = dy.rectify(hidden_layer)
        elif self.non_linearity == 'sigmoid':
            hidden_layer = dy.logistic(hidden_layer)
    last_conv = dy.parameter(self.last_conv)
    last_bias = dy.parameter(self.last_bias)
    output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
    output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)
    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def _calc_scores_two_layers(self, sentences, W_emb, first_lstm, W_mlp, b_mlp, V_mlp, a_mlp, meta_data=None):
    """
    calculating the score for a parallel LSTM network (in a specific state along the learning phase)
    :param sentences: list
        list of lists of sentences (already represented as numbers and not letters)
    :param first_lstm:
    :param W_mlp: model parameter (dynet obj). size: (hid_size, emb_size + meta_data_dim)
        matrix holding the weights of the MLP phase
    :param b_mlp: model parameter (dynet obj). size: (hid_size,)
        vector holding the intercept weights for each hidden state
    :param V_mlp: model parameter (dynet obj). size: (2, hid_size)
        matrix holding the weights of the logistic regression phase.
        2 is there because we are in a binary classification
    :param a_mlp: model parameter (dynet obj). size: (1,)
        intercept value for the logistic regression phase
    :param meta_data: dict or None
        meta data features for the model. If None - meta data is not used
    :return: dynet expression. size: (2,)
        prediction of the instance to be a drawing one according to the model
        (vector of 2, first place is the probability to be a drawing team)
    """
    dy.renew_cg()
    sentences_len = len(sentences)
    word_embs = [[dy.lookup(W_emb, w) for w in words] for words in sentences]
    first_init = first_lstm.initial_state()
    first_embs = []
    for wb in word_embs:
        first_embs.append(first_init.transduce(wb))
    last_comp_in_first_layer = [i[-1] for i in first_embs]
    # calculating the avg over all last components of the LSTMs
    # (to take the maximum instead, one can use dy.emax rather than dy.average,
    # but it is not really recommended)
    first_layer_avg = dy.average(last_comp_in_first_layer)
    if meta_data is None:
        h = dy.tanh((W_mlp * first_layer_avg) + b_mlp)
        prediction = dy.logistic((V_mlp * h) + a_mlp)
    else:
        meta_data_ordered = [value for key, value in sorted(meta_data.items())]
        meta_data_vector = dy.inputVector(meta_data_ordered)
        first_layer_avg_and_meta_data = dy.concatenate([first_layer_avg, meta_data_vector])
        h = dy.tanh((W_mlp * first_layer_avg_and_meta_data) + b_mlp)
        prediction = dy.logistic((V_mlp * h) + a_mlp)
    return prediction
def transduce(self, embed_sent):
    src = embed_sent.as_tensor()
    W = dy.parameter(self.pW)
    b = dy.parameter(self.pb)
    l1 = dy.affine_transform([b, W, src])
    output = l1
    if self.nonlinearity == 'linear':
        output = l1
    elif self.nonlinearity == 'sigmoid':
        output = dy.logistic(l1)
    elif self.nonlinearity == 'tanh':
        output = dy.tanh(l1)
    elif self.nonlinearity == 'relu':
        output = dy.rectify(l1)
    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def add_input(self, input_vec):
    """
    Note that this function updates the existing State object!
    """
    x = dynet.concatenate([input_vec, self.h])
    i = dynet.logistic(self.W_i * x + self.b_i)
    f = dynet.logistic(self.W_f * x + self.b_f)
    g = dynet.tanh(self.W_c * x + self.b_c)
    o = dynet.logistic(self.W_o * x + self.b_o)
    c = dynet.cmult(f, self.c) + dynet.cmult(i, g)
    h = dynet.cmult(o, dynet.tanh(c))
    self.c = c
    self.h = h
    self.outputs.append(h)
    return self
def __call__(self, src):
    src = src.as_tensor()
    # convolutional layer
    src = padding(src, src.dim()[0][0], src.dim()[0][1],
                  self.filter_width, self.stride, src.dim()[1])
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filter_conv),
                              stride=[self.stride, self.stride], is_valid=True))
    timestep = l1.dim()[0][1]
    features = l1.dim()[0][2]
    batch_size = l1.dim()[1]
    # transpose l1 to be (timestep, dim), but keep the batch_size
    rhn_in = dy.reshape(l1, (timestep, features), batch_size=batch_size)
    rhn_in = [dy.pick(rhn_in, i) for i in range(timestep)]
    for l in range(self.rhn_num_hidden_layers):
        rhn_out = []
        # initialize a random vector for the first state vector, keep the same batch size
        prev_state = dy.parameter(self.init[l])
        # begin recurrent highway network
        for t in range(timestep):
            for m in range(0, self.rhn_microsteps):
                H = dy.affine_transform([dy.parameter(self.recur[l][m][1]),
                                         dy.parameter(self.recur[l][m][0]),
                                         prev_state])
                T = dy.affine_transform([dy.parameter(self.recur[l][m][3]),
                                         dy.parameter(self.recur[l][m][2]),
                                         prev_state])
                if m == 0:
                    H += dy.parameter(self.linear[l][0]) * rhn_in[t]
                    T += dy.parameter(self.linear[l][1]) * rhn_in[t]
                H = dy.tanh(H)
                T = dy.logistic(T)
                prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(T, H)  # ((1024,), batch_size)
            rhn_out.append(prev_state)
        if self.residual and l > 0:
            rhn_out = [sum(x) for x in zip(rhn_out, rhn_in)]
        rhn_in = rhn_out
    # Compute the attention-weighted average of the activations
    rhn_in = dy.concatenate_cols(rhn_in)
    scores = dy.transpose(dy.parameter(self.attention[0][1])) * \
        dy.tanh(dy.parameter(self.attention[0][0]) * rhn_in)  # ((1, 510), batch_size)
    scores = dy.reshape(scores, (scores.dim()[0][1],), batch_size=scores.dim()[1])
    # rhn_in is ((1024, 510), batch_size); softmax(scores) is ((510,), batch_size)
    attn_out = rhn_in * dy.softmax(scores)
    return ExpressionSequence(expr_tensor=attn_out)
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()
    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]
    # Sample K negative words for each predicted word at each position
    all_neg_words = np.random.choice(nwords, size=2 * N * K * len(emb), replace=True,
                                     p=word_probabilities)
    # W_w = dy.parameter(W_w_p)
    # Step through the sentence and calculate the negative and positive losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        neg_words = all_neg_words[i * K * 2 * N:(i + 1) * K * 2 * N]
        pos_words = ([sent[x] if x >= 0 else S for x in range(i - N, i)] +
                     [sent[x] if x < len(sent) else S for x in range(i + 1, i + N + 1)])
        neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
        pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
        all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))
    return dy.esum(all_losses)
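# Usage sketch (assumption): a standard loop around calc_sent_loss, assuming
# `train` is a list of word-id sentences and `trainer` a dynet trainer.
for epoch in range(10):
    total_loss = 0.0
    for sent in train:
        loss = calc_sent_loss(sent)
        total_loss += loss.value()
        loss.backward()
        trainer.update()
    print("epoch %d: loss/sent = %.4f" % (epoch, total_loss / len(train)))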
def expr_for_tree(self, tree):
    if tree.isleaf():
        return self.E[self.w2i.get(tree.label, 0)]
    if len(tree.children) == 1:
        # unary case: i/o/u gates over the single child embedding
        assert tree.children[0].isleaf()
        emb = self.expr_for_tree(tree.children[0])
        Wi, Wo, Wu = [dy.parameter(w) for w in self.WS]
        bi, bo, bu, _ = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(Wi * emb + bi)
        o = dy.logistic(Wo * emb + bo)
        u = dy.tanh(Wu * emb + bu)
        c = dy.cmult(i, u)
        expr = dy.cmult(o, dy.tanh(c))
        return expr
    # binary case: Tree-LSTM composition with per-child forget gates
    assert len(tree.children) == 2, tree.children[0]
    e1 = self.expr_for_tree(tree.children[0])
    e2 = self.expr_for_tree(tree.children[1])
    Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
    Uf1, Uf2 = [dy.parameter(u) for u in self.UFS]
    bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
    e = dy.concatenate([e1, e2])
    i = dy.logistic(Ui * e + bi)
    o = dy.logistic(Uo * e + bo)
    f1 = dy.logistic(Uf1 * e1 + bf)
    f2 = dy.logistic(Uf2 * e2 + bf)
    u = dy.tanh(Uu * e + bu)
    c = dy.cmult(i, u) + dy.cmult(f1, e1) + dy.cmult(f2, e2)
    h = dy.cmult(o, dy.tanh(c))
    expr = h
    return expr
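# Usage sketch (assumption): a minimal tree node compatible with expr_for_tree
# above; it only needs label, children and isleaf().
class Tree:
    def __init__(self, label, children=None):
        self.label = label
        self.children = children or []

    def isleaf(self):
        return not self.children

# a binary node over two unary pre-terminals, as expr_for_tree expects:
tree = Tree("S", [Tree("NP", [Tree("movies")]), Tree("VP", [Tree("rock")])])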
def transduce(self, inputs, train):
    xs = inputs[:self.max_length]
    if not xs:
        return []
    for i in range(self.lstm_layers):
        for n, d in ("f", 1), ("b", -1):  # forward and backward directions
            Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
            hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
            hs = [hs_[0]]
            for t in range(1, len(hs_)):
                # gated mix of the RNN state and a linear projection of the input
                r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
            xs = hs
    if train:
        x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
        xs = [dy.pick(x, i, 1) for i in range(len(xs))]
    return xs
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()
    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]
    # Step through the sentence and calculate binary prediction losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        scores = dy.logistic(W_c * my_emb)
        pos_words = ([sent[x] if x >= 0 else S for x in range(i - N, i)] +
                     [sent[x] if x < len(sent) else S for x in range(i + 1, i + N + 1)])
        word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words]
        word_repr = [dy.inputVector(x) for x in word_repr]
        all_losses.extend([dy.binary_log_loss(scores, x) for x in word_repr])
    return dy.esum(all_losses)
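# Worked note (assumption): with nbits = 5, each context word id is encoded as
# its zero-padded binary representation, and every bit is scored by an
# independent logistic output. For example, id 6 -> '00110':
import numpy as np
assert [float(y) for y in np.binary_repr(6).zfill(5)] == [0.0, 0.0, 1.0, 1.0, 0.0]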
pa = m.add_parameters(1, device="CPU")
if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])
dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)
x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
# multi-device forward pass: move activations between GPUs and the CPU
h1 = dy.tanh((W1 * x) + b1)
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2 * h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    y_pred = dy.logistic((V * h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h2_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

for iter in range(ITERATIONS):
    mloss = 0.0
    for mi in range(4):
        x1 = mi % 2
        x2 = (mi // 2) % 2
m = dy.Model()
trainer = dy.SimpleSGDTrainer(m)

W = m.add_parameters((HIDDEN_SIZE, 2))
b = m.add_parameters(HIDDEN_SIZE)
V = m.add_parameters((1, HIDDEN_SIZE))
a = m.add_parameters(1)

if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])

x = dy.vecInput(2)
y = dy.scalarInput(0)
h = dy.tanh((W * x) + b)
if xsent:
    y_pred = dy.logistic((V * h) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

for iter in range(ITERATIONS):
    mloss = 0.0
    for mi in range(4):
        x1 = mi % 2
        x2 = (mi // 2) % 2
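# Sketch (assumption): the loop above is truncated; dynet's xor example usually
# continues by setting the inputs and target, then accumulating and updating:
#         x.set([T if x1 else F, T if x2 else F])
#         y.set(T if x1 != x2 else F)
#         mloss += loss.scalar_value()
#         loss.backward()
#         trainer.update()
#     mloss /= 4.
#     print("loss: %0.9f" % mloss)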