def learn(self, wave, mgc, batch_size):
    # disc, wave = self.dio.ulaw_encode(wave)
    # from ipdb import set_trace
    # set_trace()
    last_proc = 0
    dy.renew_cg()
    total_loss = 0
    losses = []
    cnt = 0
    noise = np.random.normal(0, 1.0, (len(wave) + self.UPSAMPLE_COUNT))
    for mgc_index in range(len(mgc)):
        curr_proc = int((mgc_index + 1) * 100 / len(mgc))
        if curr_proc % 5 == 0 and curr_proc != last_proc:
            while last_proc < curr_proc:
                last_proc += 5
                sys.stdout.write(' ' + str(last_proc))
                sys.stdout.flush()
        if mgc_index < len(mgc) - 1:
            output, excitation, filter, vuv = self._predict_one(
                mgc[mgc_index],
                noise[self.UPSAMPLE_COUNT * mgc_index:self.UPSAMPLE_COUNT * mgc_index + 2 * self.UPSAMPLE_COUNT])
            # reconstruction error
            t_vect = wave[self.UPSAMPLE_COUNT * mgc_index:self.UPSAMPLE_COUNT * mgc_index + self.UPSAMPLE_COUNT]
            loss = dy.squared_distance(output, dy.inputVector(t_vect))
            # dynamic error
            o1 = dy.pickrange(output, 0, self.UPSAMPLE_COUNT - 1)
            o2 = dy.pickrange(output, 1, self.UPSAMPLE_COUNT)
            delta = o2 - o1
            real_delta = t_vect[1:self.UPSAMPLE_COUNT] - t_vect[0:self.UPSAMPLE_COUNT - 1]
            loss += dy.squared_distance(delta, dy.inputVector(real_delta))
            # excitation error
            # loss += dy.sum_elems(excitation)
            # o1 = dy.pickrange(excitation, 0, self.UPSAMPLE_COUNT - 1)
            # o2 = dy.pickrange(excitation, 1, self.UPSAMPLE_COUNT)
            # loss += dy.sum_elems(dy.abs(o2 - o1))
            losses.append(loss)
            cnt += self.UPSAMPLE_COUNT
            if len(losses) >= batch_size:
                loss = dy.esum(losses)
                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                losses = []
                dy.renew_cg()
    if len(losses) > 0:
        loss = dy.esum(losses)
        total_loss += loss.value()
        loss.backward()
        self.trainer.update()
        dy.renew_cg()
    return total_loss / cnt
def train(self, mini_batch, num_train, k):
    words, pos_tags, chars, langs, signs, masks = mini_batch
    # Getting the last hidden layer from BiLSTM.
    rnn_out = self.rnn_mlp(mini_batch, True)
    h_out = rnn_out[-1]
    t_out_d = dy.reshape(h_out, (h_out.dim()[0][0], h_out.dim()[1]))
    t_out = dy.transpose(t_out_d)

    # Calculating the kq values for NCE.
    kq = dy.scalarInput(float(k) / num_train)
    lkq = dy.log(kq)

    loss_values = []
    for i in range(len(langs)):
        for j in range(i + 1, len(langs)):
            if (langs[i] != langs[j]) and (signs[i] == 1 or signs[j] == 1):
                lu = -dy.squared_distance(t_out[i], t_out[j])
                denom = dy.log(dy.exp(lu) + kq)
                if signs[i] == signs[j]:  # both one
                    nom = lu
                else:
                    nom = lkq
                loss_values.append(denom - nom)

    err_value = 0
    if len(loss_values) > 0:
        err = dy.esum(loss_values) / len(loss_values)
        err.forward()
        err_value = err.value()
        err.backward()
        self.trainer.update()
    dy.renew_cg()
    return err_value
def calc_loss(self,
              model: 'model_base.ConditionedModel',
              src: Union[sent.Sentence, 'batchers.Batch'],
              trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
    search_outputs = model.generate_search_output(src, self.search_strategy)
    sign = -1 if self.inv_eval else 1
    total_loss = losses.FactoredLossExpr()
    for search_output in search_outputs:
        # Calculate rewards
        eval_score = []
        for trg_i, sample_i in zip(trg, search_output.word_ids):
            # Removing EOS
            sample_i = self.remove_eos(sample_i.tolist())
            ref_i = trg_i.words[:trg_i.len_unpadded()]
            score = self.evaluation_metric.evaluate_one_sent(ref_i, sample_i)
            eval_score.append(sign * score)
        reward = dy.inputTensor(eval_score, batched=True)
        # Composing losses
        loss = losses.FactoredLossExpr()
        baseline_loss = []
        cur_losses = []
        for state, mask in zip(search_output.state, search_output.mask):
            bs_score = self.baseline.transform(dy.nobackprop(state.as_vector()))
            baseline_loss.append(dy.squared_distance(reward, bs_score))
            logsoft = model.decoder.scorer.calc_log_probs(state.as_vector())
            loss_i = dy.cmult(logsoft, reward - bs_score)
            cur_losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
        loss.add_loss("reinforce", dy.sum_elems(dy.esum(cur_losses)))
        loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
        # Total losses
        total_loss.add_factored_loss_expr(loss)
    return loss
def calc_loss(self, translator, src, trg):
    search_outputs = translator.generate_search_output(src, self.search_strategy)
    sign = -1 if self.inv_eval else 1
    total_loss = FactoredLossExpr()
    for search_output in search_outputs:
        self.eval_score = []
        for trg_i, sample_i in zip(trg, search_output.word_ids):
            # Removing EOS
            sample_i = self.remove_eos(sample_i.tolist())
            ref_i = trg_i.words[:trg_i.len_unpadded()]
            score = self.evaluation_metric.evaluate_one_sent(ref_i, sample_i)
            self.eval_score.append(sign * score)
        self.reward = dy.inputTensor(self.eval_score, batched=True)
        # Composing losses
        loss = FactoredLossExpr()
        if self.baseline is not None:
            baseline_loss = []
            losses = []
            for state, logsoft, mask in zip(search_output.state,
                                            search_output.logsoftmaxes,
                                            search_output.mask):
                bs_score = self.baseline.transform(state)
                baseline_loss.append(dy.squared_distance(self.reward, bs_score))
                loss_i = dy.cmult(logsoft, self.reward - bs_score)
                valid = list(np.nonzero(mask)[0])
                losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
            loss.add_loss("reinforce", dy.sum_elems(dy.esum(losses)))
            loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
        else:
            loss.add_loss("reinforce", dy.sum_elems(dy.cmult(self.true_score, dy.esum(logsofts))))
        total_loss.add_factored_loss_expr(loss)
    return loss
def on_calc_additional_loss(self, reward):
    if not self.learn_segmentation:
        return None
    ret = LossBuilder()
    if self.length_prior_alpha > 0:
        reward += self.segment_length_prior * self.length_prior_alpha
    reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward))
    # Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            baseline_loss.append(dy.squared_distance(reward, baseline))
        ret.add_loss("Baseline", dy.esum(baseline_loss))
    # Reinforce Loss
    lmbd = self.lmbd.get_value(self.warmup_counter)
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculating the loss of the baseline and reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
            if self.use_baseline:
                r_i = reward - self.bs[i]
            else:
                r_i = reward
            reinforce_loss.append(dy.logistic(r_i) * ll)
        ret.add_loss("Reinforce", -dy.esum(reinforce_loss) * lmbd)
    # Total Loss
    return ret
def loss(self, instance):
    trans = instance.transformation
    # trans = 'lol'
    if trans not in self.known_transformations:
        newtrans = list(self.param_dict.keys())[0][0]  ### SUPER ARBITRARY
        tqdm.write(
            "WARNING: unknown transformation picked for instance {}; using transformation {}"
            .format(trans, newtrans))
        trans = newtrans
    b1 = dy.parameter(self.param_dict[(trans, 'b1')])
    W1 = dy.parameter(self.param_dict[(trans, 'W1')])
    b2 = dy.parameter(self.param_dict[(trans, 'b2')])
    W2 = dy.parameter(self.param_dict[(trans, 'W2')])
    # b3 = dy.parameter(self.param_dict[(trans, 'b3')])
    # W3 = dy.parameter(self.param_dict[(trans, 'W3')])
    # b = dy.parameter(self.param_dict[(trans, 'b')])
    # W = dy.parameter(self.param_dict[(trans, 'W')])
    x = dy.inputVector(instance.xs_distr_vec)
    y = dy.inputVector(instance.ys_distr_vec)
    # prediction = dy.affine_transform([b, W, x])
    prediction = dy.affine_transform(
        [b2, W2, dy.tanh(dy.affine_transform([b1, W1, x]))])
    # prediction = dy.affine_transform(
    #     [b3, W3, dy.tanh(dy.affine_transform(
    #         [b2, W2, dy.tanh(dy.affine_transform([b1, W1, x]))]))])
    loss = dy.squared_distance(prediction, y)
    return prediction, loss
def __call__(self, translator, dec_state, src, trg):
    # TODO: apply trg.mask ?
    samples = []
    logsofts = []
    self.bs = []
    done = [False for _ in range(len(trg))]
    for _ in range(self.sample_length):
        dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
        if self.use_baseline:
            h_t = dy.tanh(translator.decoder.context_projector(
                dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
            self.bs.append(self.baseline(dy.nobackprop(h_t)))
        logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
        sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
        # Keep track of previously sampled EOS
        sample = [sample_i if not done_i else Vocab.ES
                  for sample_i, done_i in zip(sample, done)]
        # Appending and feeding in the decoder
        logsoft = dy.pick_batch(logsoft, sample)
        logsofts.append(logsoft)
        samples.append(sample)
        dec_state = translator.decoder.add_input(
            dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
        # Check if we are done.
        done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
        if all(done):
            break
    samples = np.stack(samples, axis=1).tolist()
    self.eval_score = []
    for trg_i, sample_i in zip(trg, samples):
        # Removing EOS
        try:
            idx = sample_i.index(Vocab.ES)
            sample_i = sample_i[:idx]
        except ValueError:
            pass
        try:
            idx = trg_i.words.index(Vocab.ES)
            trg_i.words = trg_i.words[:idx]
        except ValueError:
            pass
        # Calculate the evaluation score
        score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    loss = LossBuilder()
    if self.use_baseline:
        for i, (score, _) in enumerate(zip(self.bs, logsofts)):
            logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
        loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
    else:
        loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
    if self.use_baseline:
        baseline_loss = []
        for bs in self.bs:
            baseline_loss.append(dy.squared_distance(self.true_score, bs))
        loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
    return loss
def on_calc_additional_loss(self, translator_loss):
    if not self.learn_segmentation or self.segment_decisions is None:
        return None
    reward = -translator_loss["mle"]
    if not self.log_reward:
        reward = dy.exp(reward)
    reward = dy.nobackprop(reward)
    # Make sure that reward is not scalar, but rather based on each batch item
    assert reward.dim()[1] == len(self.src_sent)
    # Mask
    enc_mask = self.enc_mask.get_active_one_mask().transpose() if self.enc_mask is not None else None
    # Compose the loss
    ret = LossBuilder()
    ## Length prior
    alpha = self.length_prior_alpha.value() if self.length_prior_alpha is not None else 0
    if alpha > 0:
        reward += self.segment_length_prior * alpha
    # reward z-score normalization
    if self.z_normalization:
        reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward) + EPS)
    ## Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            loss = dy.squared_distance(reward, baseline)
            if enc_mask is not None:
                loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), loss)
            baseline_loss.append(loss)
        ret.add_loss("Baseline", dy.esum(baseline_loss))
    if self.print_sample:
        print(dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])
    ## Reinforce Loss
    lmbd = self.lmbd.value()
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculating the loss of the baseline and reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
            if self.use_baseline:
                r_i = reward - dy.nobackprop(self.bs[i])
            else:
                r_i = reward
            if enc_mask is not None:
                ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), ll)
            reinforce_loss.append(r_i * -ll)
        loss = dy.esum(reinforce_loss) * lmbd
        ret.add_loss("Reinforce", loss)
    if self.confidence_penalty:
        ls_loss = self.confidence_penalty(self.segment_logsoftmaxes, enc_mask)
        ret.add_loss("Confidence Penalty", ls_loss)
    # Total Loss
    return ret
def cost_sensitive_reg_loss(indices, logits, logit_len, costs=None,
                            valid_actions=None, verbose=False):
    """Computes MSE loss over actions thereby solving cost-sensitive multiclass
    classification. Takes the same arguments as other losses for dynamic oracles
    for compatibility. The lower the cost the better, therefore all invalid
    actions (-np.inf) are remapped to large cost numbers."""
    # map costs between 0 and 1, with a margin for invalid actions
    costs[np.isinf(costs)] = np.max(costs) + 1
    costs = costs / np.max(costs)
    # given that 0. costs are in there, we end up perfectly in [0, 1]
    # NB! Note the minus to cancel out the minus added in the batch loss
    # if verbose and np.random.rand() > 0.99: print 'Costs, logits: ', costs, logits.npvalue()  # [valid_actions]
    return -dy.squared_distance(logits, dy.inputVector(costs))
def print_outputs(self, samples):
    for sample in samples:
        dy.renew_cg(immediate_compute=False, check_validity=False)
        graph_output = self.deserializer.buildSample(sample.neurons)
        label = dy.scalarInput(sample.target)
        loss = dy.squared_distance(graph_output, label)
        # dy.print_text_graphviz()
        print(f'label: {label.value()}, output: {graph_output.value()}')
def calc_baseline_loss(self, reward, only_final_reward):
    pred_rewards = []
    cur_losses = []
    for i, state in enumerate(self.states):
        pred_reward = self.baseline.transform(dy.nobackprop(state))
        pred_rewards.append(dy.nobackprop(pred_reward))
        seq_reward = reward if only_final_reward else reward[i]
        if self.valid_pos is not None:
            pred_reward = dy.pick_batch_elems(pred_reward, self.valid_pos[i])
            act_reward = dy.pick_batch_elems(seq_reward, self.valid_pos[i])
        else:
            act_reward = seq_reward
        cur_losses.append(dy.sum_batches(dy.squared_distance(pred_reward, dy.nobackprop(act_reward))))
    return pred_rewards, dy.esum(cur_losses)
def eval(self, mini_batch):
    words, pos_tags, chars, langs, signs, masks = mini_batch
    h_out = self.rnn_mlp(mini_batch, False)[-1]
    t_out = dy.transpose(dy.reshape(h_out, (h_out.dim()[0][0], h_out.dim()[1])))
    sims = []
    for i in range(len(langs)):
        for j in range(i + 1, len(langs)):
            sims.append(dy.sqrt(dy.squared_distance(t_out[i], t_out[j])))
    sim = dy.esum(sims)
    sim.forward()
    sim_value = sim.value() / len(sims)
    dy.renew_cg()
    return sim_value
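# Illustrative sketch (not from any of the snippets above): dy.sqrt(dy.squared_distance(u, v)),
# as used in eval() above, is just the Euclidean (L2) distance. A quick sanity check against
# numpy with arbitrary vectors; it assumes only that DyNet and numpy are installed.
import dynet as dy
import numpy as np

dy.renew_cg()
u = [1.0, 2.0, 3.0]
v = [4.0, 6.0, 8.0]
d = dy.sqrt(dy.squared_distance(dy.inputVector(u), dy.inputVector(v)))
assert abs(d.value() - np.linalg.norm(np.array(u) - np.array(v))) < 1e-6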
def calc_baseline_loss(self, rewards):
    # Taking average of the rewards across multiple samples
    avg_rewards = dy.average(rewards)
    pred_rewards = []
    loss = []
    for i, state in enumerate(self.states):
        pred_reward = self.baseline(dy.nobackprop(state))
        pred_rewards.append(dy.nobackprop(pred_reward))
        if self.valid_pos is not None:
            pred_reward = dy.pick_batch_elems(pred_reward, self.valid_pos[i])
            avg_reward = dy.pick_batch_elems(avg_rewards, self.valid_pos[i])
        else:
            avg_reward = avg_rewards
        loss.append(dy.sum_batches(dy.squared_distance(pred_reward, avg_reward)))
    return pred_rewards, dy.esum(loss)
def _perform_calc_loss(self,
                       model: 'model_base.ConditionedModel',
                       src: Union[sent.Sentence, 'batchers.Batch'],
                       trg: Union[sent.Sentence, 'batchers.Batch']) -> losses.FactoredLossExpr:
    search_outputs = model.generate_search_output(src, self.search_strategy)
    sign = -1 if self.inv_eval else 1
    # TODO: Fix units
    total_loss = collections.defaultdict(int)
    for search_output in search_outputs:
        # Calculate rewards
        eval_score = []
        for trg_i, sample_i in zip(trg, search_output.word_ids):
            # Removing EOS
            sample_i = utils.remove_eos(sample_i.tolist(), vocabs.Vocab.ES)
            ref_i = trg_i.words[:trg_i.len_unpadded()]
            score = self.evaluation_metric.evaluate_one_sent(ref_i, sample_i)
            eval_score.append(sign * score)
        reward = dy.inputTensor(eval_score, batched=True)
        # Composing losses
        baseline_loss = []
        cur_losses = []
        for state, mask in zip(search_output.state, search_output.mask):
            bs_score = self.baseline.transform(dy.nobackprop(state.as_vector()))
            baseline_loss.append(dy.squared_distance(reward, bs_score))
            logsoft = model.decoder.scorer.calc_log_probs(state.as_vector())
            loss_i = dy.cmult(logsoft, reward - bs_score)
            cur_losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
        total_loss["polc_loss"] += dy.sum_elems(dy.esum(cur_losses))
        total_loss["base_loss"] += dy.sum_elems(dy.esum(baseline_loss))
    units = [t.len_unpadded() for t in trg]
    total_loss = losses.FactoredLossExpr(
        {k: losses.LossExpr(v, units) for k, v in total_loss.items()})
    return losses.FactoredLossExpr({"risk": total_loss})
def learn(self, src, dst):
    softmax_list, aux_list = self._predict(src, dst=dst, num_predictions=len(dst) + 1, runtime=False)
    for softmax, aux, entry in zip(softmax_list, aux_list, dst):
        word = entry.word.decode('utf-8').lower()
        if word in self.output_encodings.word2int:
            w_index = self.output_encodings.word2int[word]
        else:
            w_index = self.output_encodings.word2int["<UNK>"]

        w_emb, found = self.dst_we.get_word_embeddings(entry.word.decode('utf-8'))
        self.losses.append(-dy.log(dy.pick(softmax, w_index)))
        if found:
            vec1 = aux
            vec2 = dy.inputVector(w_emb)
            cosine = dy.dot_product(vec1, vec2) * dy.pow(dy.l2_norm(vec1) * dy.l2_norm(vec2), dy.scalarInput(-1))
            self.losses.append(dy.squared_distance(cosine, dy.scalarInput(1.0)))

    self.losses.append(-dy.log(dy.pick(softmax_list[-1], self.EOS)))
def __call__(self, translator, initial_state, src, trg):
    # TODO(philip30): currently only using the best hypothesis / first sample for reinforce loss
    # A small further implementation is needed if we want to do reinforce with multiple samples.
    search_output = translator.search_strategy.generate_output(translator, initial_state)[0]
    # Calculate evaluation scores
    self.eval_score = []
    for trg_i, sample_i in zip(trg, search_output.word_ids):
        # Removing EOS
        sample_i = self.remove_eos(sample_i.tolist())
        ref_i = self.remove_eos(trg_i.words)
        # Evaluating
        if len(sample_i) == 0:
            score = 0
        else:
            score = self.evaluation_metric.evaluate(ref_i, sample_i) * \
                    (-1 if self.inv_eval else 1)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    # Composing losses
    loss = LossBuilder()
    if self.use_baseline:
        baseline_loss = []
        losses = []
        for state, logsoft, mask in zip(search_output.state,
                                        search_output.logsoftmaxes,
                                        search_output.mask):
            bs_score = self.baseline(state)
            baseline_loss.append(dy.squared_distance(self.true_score, bs_score))
            loss_i = dy.cmult(logsoft, self.true_score - bs_score)
            losses.append(dy.cmult(loss_i, dy.inputTensor(mask, batched=True)))
        loss.add_loss("reinforce", dy.sum_elems(dy.esum(losses)))
        loss.add_loss("reinf_baseline", dy.sum_elems(dy.esum(baseline_loss)))
    else:
        loss.add_loss("reinforce", dy.sum_elems(dy.cmult(self.true_score, dy.esum(logsofts))))
    return loss
def learn(self, samples):
    for iter in range(self.epochae):
        if iter > 0 and iter % self.printouts == 0:
            print(iter, " average loss is:", total_loss / seen_instances)
        seen_instances = 0
        total_loss = 0
        for sample in samples:
            dy.renew_cg(immediate_compute=False, check_validity=False)
            label = dy.scalarInput(sample.target)
            graph_output = self.deserializer.buildSample(sample.neurons)
            loss = dy.squared_distance(graph_output, label)
            seen_instances += 1
            total_loss += loss.value()
            loss.backward()
            self.trainer.update()
    self.print_outputs(samples)
def model(self):
    print("Model Creating...")
    hidden_size = 64
    vocabulary_size = len(self.vocab_to_index)
    input_size = output_size = vocabulary_size  # Input and Output size are equal to V.

    m = dy.Model()
    W = m.add_parameters((hidden_size, input_size))
    b = m.add_parameters(hidden_size)
    V = m.add_parameters((output_size, hidden_size))  # Softmax weights
    a = m.add_parameters(output_size)  # Softmax bias

    x = dy.vecInput(input_size)
    y = dy.vecInput(output_size)
    h = dy.tanh((W * x) + b)
    output = dy.softmax(V * h)  # Softmax
    loss = dy.squared_distance(output, y)
    trainer = dy.SimpleSGDTrainer(m)

    epoch = 1
    for iter in range(epoch):
        my_loss = 0.0
        seen_instances = 0
        for binary_word in self.bigram_poem:
            x.set(self.word_vectors[binary_word[0]])
            y.set(self.word_vectors[binary_word[1]])
            seen_instances += 1
            my_loss += loss.scalar_value()
            loss.forward()
            loss.backward()
            trainer.update()
            if seen_instances > 1 and seen_instances % 100 == 0:
                print(seen_instances, "/", len(self.bigram_poem), "***average loss is:", my_loss / seen_instances)
        print(my_loss / seen_instances)
def create_graph(init_weight, init_con, init_bias, weight, bias, con, input, target, number, nodes):
    input_layer, layer_nodes, sizes, matrices, mapping_numbers, inputs, layer, maximum = graphs[number]
    values = [0] * nodes
    dy.renew_cg()
    shared_nodes = graph_shared[number]
    # node is the serial number and layer is the layer index
    for node, serial_node, layer in shared_nodes.keys():
        # iterate over all shared vertices
        serial, serial_number, vrstva, graf = shared_nodes[(node, serial_node, layer)][0]
        we = shared_models[graf][3][vrstva].value()
        wt = weight[layer].value()
        wt[node][serial_node] = we[serial_number][serial]
        weight[layer].set_value(wt)
    x = dy.vecInput(input.size)
    x.set(input)
    y = dy.vecInput(target.size)
    y.set(target)
    result = init_weight * x
    hgt = dy.cmult(init_weight, init_con)
    init_weight.set_value(hgt.value())
    result = dy.logistic(result + init_bias)
    for j in range(layer_nodes[0].__len__()):
        values[layer_nodes[0][j]] = result[j]
    for i in range(maximum):
        inp = []
        for node in input_layer[i]:
            inp.extend([values[node]])
        inp = dy.concatenate(inp)
        weight[i].set_value(dy.cmult(weight[i], con[i]).value())
        result = weight[i] * inp
        result = dy.logistic(result + bias[i])
        for j in range(layer_nodes[i + 1].__len__()):
            values[layer_nodes[i + 1][j]] = result[j]
    loss = dy.squared_distance(y, result)
    return loss
def train(self, train_word_to_type, test_word_to_type=None, iterations=50):
    training_examples = self.build_example_vectors(train_word_to_type)
    for iteration_idx in xrange(1, iterations + 1):
        print "Starting training iteration %d/%d" % (iteration_idx, iterations)
        random.shuffle(training_examples)
        loss = 0
        for example_index, (word_index, expected_output) in enumerate(training_examples, 1):
            out_expression = self.build_expression(word_index)
            expected_output_expr = dy.vecInput(len(self.type_indexer))
            expected_output_expr.set(expected_output)
            sentence_error = dy.squared_distance(out_expression, expected_output_expr)
            # sentence_error = dy.pickneglogsoftmax(out_expression, np.argmax(expected_output))
            loss += sentence_error.scalar_value()
            sentence_error.backward()
            self.trainer.update()
        # Trainer Status
        self.trainer.status()
        print loss / float(len(training_examples))
def create_graph(init_weight, init_con, init_bias, weight, bias, con, input, target):
    x = dy.vecInput(input.size)
    x.set(input)
    y = dy.vecInput(target.size)
    y.set(target)
    result = init_weight * x
    hgt = dy.cmult(init_weight, init_con)
    init_weight.set_value(hgt.value())
    result = dy.logistic(result + init_bias)
    for j in range(layer_nodes[0].__len__()):
        values[layer_nodes[0][j]] = result[j]
    for i in range(maximum):
        inp = []
        for node in input_layer[i]:
            inp.extend([values[node]])
        inp = dy.concatenate(inp)
        weight[i].set_value(dy.cmult(weight[i], con[i]).value())
        result = weight[i] * inp
        result = dy.logistic(result + bias[i])
        for j in range(layer_nodes[i + 1].__len__()):
            values[layer_nodes[i + 1][j]] = result[j]
    loss = dy.squared_distance(y, result)
    return loss
def fit(self, train_X, train_Y, num_epochs, train_algo, val_X=None, val_Y=None,
        patience=2, model_path=None, seed=None, word_dropout_rate=0.25,
        learning_rate=0, trg_vectors=None, unsup_weight=1.0):
    """
    train the tagger
    :param trg_vectors: the prediction targets used for the unsupervised loss
                        in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss used in
                         temporal ensembling
    """
    print("read training data", file=sys.stderr)

    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    training_algo = TRAINER_MAP[train_algo]
    if learning_rate > 0:
        trainer = training_algo(self.model, learning_rate=learning_rate)
    else:
        trainer = training_algo(self.model)

    # if we use word dropout keep track of counts
    if word_dropout_rate > 0.0:
        widCount = Counter()
        for sentence, _ in train_X:
            widCount.update([w for w in sentence])

    assert (len(train_X) == len(train_Y))
    train_data = list(zip(train_X, train_Y))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        for i, (example, y) in enumerate(train_data):
            sentence_trg_vectors.append(trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors),\
            'Error: Idx %d is not at %d.' % (trg_start_id, len(trg_vectors))

    print('Starting training for %d epochs...' % num_epochs)
    best_val_acc, epochs_no_improvement = 0., 0
    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of %d...' % patience)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss = 0.0
        total_tagged = 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y = train_data[idx]

            if word_dropout_rate > 0.0:
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() > (widCount.get(w) / (word_dropout_rate + widCount.get(w))))
                    else w for w in word_indices
                ]
            output = self.predict(word_indices, char_indices, train=True)

            if len(y) == 1 and y[0] == 0:
                # in temporal ensembling, we assign a dummy label of [0] for
                # unlabeled sequences; we skip the supervised loss for these
                loss = dynet.scalarInput(0)
            else:
                if trg_vectors:
                    # use average instead of sum here so long sequences are not
                    # preferred and so it can be combined with aux loss
                    loss = dynet.average([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

            if trg_vectors is not None:
                # the consistency loss in temporal ensembling is used for
                # both supervised and unsupervised input
                targets = sentence_trg_vectors[idx]
                assert len(output) == len(targets)
                other_loss = unsup_weight * dynet.average([
                    dynet.squared_distance(o, dynet.inputVector(t))
                    for o, t in zip(output, targets)
                ])
                loss += other_loss

            total_loss += loss.value()
            total_tagged += len(word_indices)

            loss.backward()
            trainer.update()
            bar.next()

        print("iter {2} {0:>12}: {1:.2f}".format("total loss", total_loss / total_tagged, cur_iter),
              file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy %.4f is better than best val accuracy %.4f.'
                      % (val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy %.4f is worse than best val loss %.4f.'
                      % (val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for %d epochs. Early stopping...' % epochs_no_improvement)
                break
def loss(self, observation, target_rep):
    return dy.squared_distance(observation, dy.inputVector(target_rep))
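# Illustrative sketch (not from the original source): a minimal DyNet training step driving a
# squared-distance loss like the `loss` method above. The parameter collection `pc`, the single
# weight matrix `W`, and the SGD trainer are assumptions made only to keep the snippet
# self-contained and runnable.
import dynet as dy
import numpy as np

pc = dy.ParameterCollection()
W = pc.add_parameters((4, 4))
trainer = dy.SimpleSGDTrainer(pc)

dy.renew_cg()
x = dy.inputVector(np.random.rand(4).tolist())
target_rep = np.random.rand(4).tolist()
observation = dy.parameter(W) * x  # toy "observation" expression
loss = dy.squared_distance(observation, dy.inputVector(target_rep))
loss.value()      # forward pass
loss.backward()   # backward pass
trainer.update()  # gradient step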
inp = []
for node in input_layer[i]:
    inp.extend([values[node]])
a = matrices[i]
weight[i] = m.add_parameters(a.shape, init=a)
inp = dy.concatenate(inp)
bias = m.add_parameters(layer_nodes[i + 1].__len__())
con = dy.const_parameter(m.add_parameters(a.shape, init=a))
weight[i] = dy.cmult(weight[i], con)
result = weight[i] * inp
result = dy.logistic(result + bias)
for j in range(layer_nodes[i + 1].__len__()):
    values[layer_nodes[i + 1][j]] = result[j]

y_pred = result
y = dy.vecInput(sizes[maximum])
loss = dy.squared_distance(y_pred, y)

mloss = 0.0
for learning_cycle in range(number_of_learning_cycles):
    for number in range(numberOfGraphs):
        inp = np.zeros(sizes[0])
        for i in range(sizes[0]):
            if layer_nodes[0][i] in graph_inputs[number][0]:
                inp[i] = 1
        input.set(np.random.uniform(0, 2, sizes[0]))
        target = y_pred.value()
        for i in range(sizes[maximum]):
            if layer_nodes[maximum][i] in graph_inputs[number][1]:
                target[i] = 1
        y.set(target)
e = dy.noise(e1, stddev)  # add a noise to each element from a gaussian with standard-dev = stddev
dy.dropout(e1, p)         # apply dropout with probability p

# functions over lists of expressions
e = dy.esum([e1, e2, ...])               # sum
e = dy.average([e1, e2, ...])            # average
e = dy.concatenate_cols([e1, e2, ...])   # e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...]))
e = dy.concatenate([e1, e2, ...])        # concatenate
e = dy.affine_transform([e0, e1, e2, ...])  # e = e0 + ((e1*e2) + (e3*e4) ...)

## Loss functions
e = dy.squared_distance(e1, e2)
e = dy.l1_distance(e1, e2)
e = dy.huber_distance(e1, e2, c=1.345)

# e1 must be a scalar that is a value between 0 and 1
# e2 (ty) must be a scalar that is a value between 0 and 1
# e = ty * log(e1) + (1 - ty) * log(1 - e1)
e = dy.binary_log_loss(e1, e2)

# e1 is row vector or scalar
# e2 is row vector or scalar
# m is number
# e = max(0, m - (e1 - e2))
e = dy.pairwise_rank_loss(e1, e2, m=1.0)

# Convolutions
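# Illustrative sketch (an assumption, not part of the cheat sheet above): a tiny end-to-end check
# of the loss expressions listed there, with arbitrary input values.
import dynet as dy

dy.renew_cg()
e1 = dy.inputVector([0.1, 0.4, 0.5])
e2 = dy.inputVector([0.2, 0.3, 0.5])
sq = dy.squared_distance(e1, e2)            # sum of squared element-wise differences
l1 = dy.l1_distance(e1, e2)                 # sum of absolute element-wise differences
hub = dy.huber_distance(e1, e2, c=1.345)    # Huber loss with threshold c
bll = dy.binary_log_loss(dy.scalarInput(0.9), dy.scalarInput(1.0))             # binary cross-entropy
prl = dy.pairwise_rank_loss(dy.scalarInput(0.3), dy.scalarInput(0.6), m=1.0)   # max(0, m - (e1 - e2))
print(sq.value(), l1.value(), hub.value(), bll.value(), prl.value())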
vectorSize = len(wordVector)
hiddenNeuronNumber = int(vectorSize / 100)

m = dy.ParameterCollection()
W = m.add_parameters((hiddenNeuronNumber, vectorSize * 2))
V = m.add_parameters((vectorSize, hiddenNeuronNumber))
b = m.add_parameters((hiddenNeuronNumber))

dy.renew_cg()

x = dy.vecInput(vectorSize * 2)
output = dy.logistic(V * (dy.tanh((W * x) + b)))
y = dy.vecInput(vectorSize)
loss = dy.squared_distance(output, y)

trainer = dy.SimpleSGDTrainer(m)

epoch = 0
while epoch < 10:
    epochLoss = 0
    for i in range(0, len(wordList) - 2):
        # sliding window over consecutive words: two context words -> next word
        x.set(defineWordVector(wordList[i], wordVector) + defineWordVector(wordList[i + 1], wordVector))
        y.set(defineWordVector(wordList[i + 2], wordVector))
        epochLoss += loss.value()  # forward pass; accumulate epoch loss
        loss.backward()
        trainer.update()
def test_wsj():
    print
    print '# test on wsj subset'

    data, n_types, n_labels = pickle.load(open('wsj.pkl', 'r'))

    d_emb = 50
    d_rnn = 51
    d_hid = 52
    d_actemb = 5

    minibatch_size = 5
    n_epochs = 10

    preprocess_minibatch = True

    model = dy.ParameterCollection()

    embed_word = model.add_lookup_parameters((n_types, d_emb))
    f_gru = dy.GRUBuilder(1, d_emb, d_rnn, model)
    b_gru = dy.GRUBuilder(1, d_emb, d_rnn, model)
    embed_action = model.add_lookup_parameters((n_labels, d_actemb))
    combine_arh_W = model.add_parameters((d_hid, d_actemb + d_rnn * 2 + d_hid))
    combine_arh_b = model.add_parameters(d_hid)

    initial_h = model.add_parameters(d_hid, dy.ConstInitializer(0))
    initial_actemb = model.add_parameters(d_actemb, dy.ConstInitializer(0))

    policy_W = model.add_parameters((n_labels, d_hid))
    policy_b = model.add_parameters(n_labels)

    optimizer = dy.AdamTrainer(model, alpha=0.01)

    for _ in xrange(n_epochs):
        total_loss = 0
        for batch in minibatch(data, minibatch_size, True):
            dy.renew_cg()

            combine_arh_We = dy.parameter(combine_arh_W)
            combine_arh_be = dy.parameter(combine_arh_b)

            policy_We = dy.parameter(policy_W)
            policy_be = dy.parameter(policy_b)

            loss = 0

            if preprocess_minibatch:
                # for efficiency, combine RNN outputs on entire
                # minibatch in one go (requires padding with zeros,
                # should be masked but isn't right now)
                all_tokens = [ex.tokens for ex in batch]
                max_length = max(map(len, all_tokens))
                all_tokens = [[x[i] if len(x) > i else 0 for x in all_tokens]
                              for i in range(max_length)]
                all_e = [dy.lookup_batch(embed_word, x) for x in all_tokens]
                all_rnn_out = bi_gru(f_gru, b_gru, all_e)

            losses = []
            for batch_id, ex in enumerate(batch):
                N = len(ex.tokens)
                if preprocess_minibatch:
                    rnn_out = [dy.pick_batch_elem(x, batch_id) for x in all_rnn_out[:N]]
                else:
                    e = [embed_word[x] for x in ex.tokens]
                    rnn_out = bi_gru(f_gru, b_gru, e)
                prev_h = dy.parameter(initial_h)  # previous hidden state
                actemb = dy.parameter(initial_actemb)  # embedding of previous action
                output = []
                for t in xrange(N):
                    # update hidden state based on most recent
                    # *predicted* action (not ground truth)
                    inputs = [actemb, prev_h, rnn_out[t]]
                    h = dy.rectify(dy.affine_transform([
                        combine_arh_be, combine_arh_We, dy.concatenate(inputs)]))

                    # make prediction
                    pred_vec = dy.affine_transform([policy_be, policy_We, h])
                    pred = pred_vec.npvalue().argmin()
                    output.append(pred)

                    # accumulate loss (squared error against costs)
                    truth = np.ones(n_labels)
                    truth[ex.labels[t]] = 0
                    losses.append(dy.squared_distance(pred_vec, dy.inputTensor(truth)))

                    # cache hidden state, previous action embedding
                    prev_h = h
                    actemb = embed_action[pred]

                # print 'output=%s, truth=%s' % (output, ex.labels)

            loss = dy.esum(losses)
            loss.backward()
            total_loss += loss.value()
            optimizer.update()
        print total_loss
def predict_loss(self, encodings, usr):
    avg_enc = dy.mean_dim(encodings, 1)
    h = dy.rectify(dy.affine_transform([self.bh, self.Wh, avg_enc]))
    s = dy.affine_transform([self.bu, self.Wu, h])
    return dy.mean_batches(dy.squared_distance(s, self.usr_vec))
def calc_loss(self, policy_reward, results={}):
    """
    Calc policy networks loss.
    """
    assert len(policy_reward) == len(self.states), "There should be a reward for every action taken"
    batch_size = self.states[0].dim()[1]
    loss = {}

    # Calculate the baseline loss of the reinforce loss for each timestep:
    # b = W_b * s + b_b
    # R = r - b
    # Also calculate the baseline loss
    # b = r_p (predicted)
    # loss_b = squared_distance(r_p - r_r)
    rewards = []
    baseline_loss = []
    units = np.zeros(batch_size)
    for i, state in enumerate(self.states):
        r_p = self.baseline.transform(dy.nobackprop(state))
        rewards.append(policy_reward[i] - r_p)
        if self.valid_pos[i] is not None:
            r_p = dy.pick_batch_elems(r_p, self.valid_pos[i])
            r_r = dy.pick_batch_elems(policy_reward[i], self.valid_pos[i])
            units[self.valid_pos[i]] += 1
        else:
            r_r = policy_reward[i]
            units += 1
        baseline_loss.append(dy.sum_batches(dy.squared_distance(r_p, r_r)))
    loss["rl_baseline"] = losses.LossExpr(dy.esum(baseline_loss), units)

    # Z Normalization
    # R = R - mean(R) / std(R)
    rewards = dy.concatenate(rewards, d=0)
    r_dim = rewards.dim()
    if self.z_normalization:
        rewards_shape = dy.reshape(rewards, (r_dim[0][0], r_dim[1]))
        rewards_mean = dy.mean_elems(rewards_shape)
        rewards_std = dy.std_elems(rewards_shape) + 1e-20
        rewards = (rewards - rewards_mean.value()) / rewards_std.value()
    rewards = dy.nobackprop(rewards)

    # Calculate Confidence Penalty
    if self.confidence_penalty:
        loss["rl_confpen"] = self.confidence_penalty.calc_loss(self.policy_lls)

    # Calculate Reinforce Loss
    # L = - sum([R-b] * pi_ll)
    reinf_loss = []
    units = np.zeros(batch_size)
    for i, (policy, action) in enumerate(zip(self.policy_lls, self.actions)):
        reward = dy.pick(rewards, i)
        ll = dy.pick_batch(policy, action)
        if self.valid_pos[i] is not None:
            ll = dy.pick_batch_elems(ll, self.valid_pos[i])
            reward = dy.pick_batch_elems(reward, self.valid_pos[i])
            units[self.valid_pos[i]] += 1
        else:
            units += 1
        reinf_loss.append(dy.sum_batches(dy.cmult(ll, reward)))
    loss["rl_reinf"] = losses.LossExpr(-dy.esum(reinf_loss), units)

    # Pack up + return
    return losses.FactoredLossExpr(loss)
a = m.add_parameters(1)

if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])

x = dy.vecInput(2)
y = dy.scalarInput(0)
h = dy.tanh((W * x) + b)
if xsent:
    y_pred = dy.logistic((V * h) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

for iter in range(ITERATIONS):
    mloss = 0.0
    for mi in range(4):
        x1 = mi % 2
        x2 = (mi // 2) % 2
        x.set([T if x1 else F, T if x2 else F])
        y.set(T if x1 != x2 else F)
        mloss += loss.scalar_value()
        loss.backward()
        trainer.update()
    mloss /= 4.
def unsupervised_with_baseline(self):
    decoder = self.create_decoder()
    assert (os.path.exists(self.options.result_dir + 'model_dec'))
    self.load_decoder(decoder)

    encoder = self.create_encoder()
    assert (os.path.exists(self.options.result_dir + 'model_enc'))
    self.load_encoder(encoder)

    baseline = self.create_baseline()
    if os.path.exists(self.options.result_dir + 'baseline'):
        self.load_baseline(baseline)

    enc_trainer = optimizers[self.options.optimizer](encoder.model)
    dec_trainer = optimizers[self.options.optimizer](decoder.model)
    baseline_trainer = optimizers[self.options.optimizer](baseline.model)
    lr = self.options.lr  # used only for sgd

    i = 0
    lowest_valid_loss = 9999
    print('unsupervised training...')
    for epoch in range(self.options.epochs):
        sents = 0
        total_loss = 0.0
        train = self.reader.next_example(0)
        train_size = len(self.reader.data[0])
        for data in train:
            s1, s2, s3, pos, act = data[0], data[1], data[2], data[3], data[4]
            sents += 1

            # random sample
            enc_loss_act, _, act = encoder.parse(s1, s2, s3, pos, sample=True)
            _, dec_loss_act, dec_loss_word = decoder.compute_loss(s3, act)

            # save reward
            logpx = -dec_loss_word.scalar_value()
            total_loss -= logpx

            # reconstruction and regularization loss backprop to theta_d
            dec_loss_total = dec_loss_word + dec_loss_act * dy.scalarInput(self.options.dec_reg)
            dec_loss_total = dec_loss_total * dy.scalarInput(1.0 / self.options.mcsamples)
            dec_loss_total.scalar_value()
            dec_loss_total.backward()

            # update decoder
            if self.options.optimizer == 'sgd':
                dec_trainer.update(lr)
            else:
                dec_trainer.update()

            if self.options.enc_update > 0:
                # compute baseline and backprop to theta_b
                b = baseline(s3)
                logpxb = b.scalar_value()
                b_loss = dy.squared_distance(b, dy.scalarInput(logpx))
                b_loss.value()
                b_loss.backward()

                # update baseline
                if self.options.optimizer == 'sgd':
                    baseline_trainer.update(lr)
                else:
                    baseline_trainer.update()

                # policy and regularization loss backprop to theta_e
                enc_loss_act = encoder.train(s1, s2, s3, pos, act)
                enc_loss_policy = enc_loss_act * dy.scalarInput((logpx - logpxb) / len(s1))
                enc_loss_total = enc_loss_policy * dy.scalarInput(self.options.enc_update) - \
                    enc_loss_act * dy.scalarInput(self.options.enc_reg)
                enc_loss_total = enc_loss_total * dy.scalarInput(1.0 / self.options.mcsamples)
                enc_loss_total.value()
                enc_loss_total.backward()

                # update encoder
                if self.options.optimizer == 'sgd':
                    enc_trainer.update(lr)
                else:
                    enc_trainer.update()

            e = float(i) / train_size
            if i % self.options.print_every == 0:
                print('epoch {}: loss per sentence: {}'.format(e, total_loss / sents))
                sents = 0
                total_loss = 0.0

            if i != 0 and i % self.options.save_every == 0:
                print('computing loss on validation set...')
                total_valid_loss = 0
                valid = self.reader.next_example(1)
                valid_size = len(self.reader.data[1])
                for vdata in valid:
                    s1, s2, s3, pos, act = vdata[0], vdata[1], vdata[2], vdata[3], vdata[4]
                    _, _, valid_word_loss = decoder.compute_loss(s3, act)
                    if valid_word_loss is not None:
                        total_valid_loss += valid_word_loss.scalar_value()
                total_valid_loss = total_valid_loss * 1.0 / valid_size
                if total_valid_loss < lowest_valid_loss:
                    lowest_valid_loss = total_valid_loss
                    print('saving model...')
                    encoder.Save(self.options.result_dir + 'model_enc')
                    decoder.Save(self.options.result_dir + 'model_dec')
                    baseline.Save(self.options.result_dir + 'baseline')
                else:
                    lr = lr * self.options.decay
            i += 1