def evaluate(self, batches, mode, output_path=None):
    results = []
    used_batches = []
    for i, batch in enumerate(batches):
        input_feed = self.get_input_feed(batch, False)
        # outputs = np.array([random.randint(0, self.vocab.category.size-1)
        #                     for _ in range(batch.contexts.word.shape[0])])
        outputs = self.sess.run(self.predictions, input_feed)
        try:
            used_batches += flatten_batch(batch)
        except Exception as e:
            pprint(batch)
            print(e)
            exit(1)
        results.append(outputs)
    results = np.concatenate(results, axis=0)

    # Redirect stdout to the output file (if given) while printing the results.
    sys.stdout = open(output_path, 'w') if output_path else sys.stdout
    accuracy = evaluate_and_print(used_batches, results,
                                  vocab=self.encoder.vocab)
    sys.stdout = sys.__stdout__
    if output_path:
        sys.stderr.write(
            "Output the testing results to '{}'.\n".format(output_path))

    summary_dict = {}
    summary_dict['category/%s/Accuracy' % mode] = accuracy
    summary = make_summary(summary_dict)
    return accuracy, summary
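# The evaluate()/run_epoch() methods in this file hand a {tag: scalar} dict to
# make_summary() / tf_utils.make_summary(). A minimal sketch of such a helper,
# assuming the TF1-style tf.Summary protobuf API; the project's actual helper may
# differ, so treat this as illustrative only.
import tensorflow as tf

def make_scalar_summary(summary_dict):
    # Pack every scalar into a single tf.Summary protobuf, ready for
    # summary_writer.add_summary(summary, global_step).
    return tf.Summary(value=[
        tf.Summary.Value(tag=tag, simple_value=float(value))
        for tag, value in summary_dict.items()
    ])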
def run_epoch_one_task(self, task_name, batches, is_training):
    task_model = self.tasks[task_name]
    loss = 0.0
    start_time = time.time()
    for i, raw_batch in enumerate(batches[task_name]):
        batch = {task_name: raw_batch}
        input_feed = self.get_input_feed(batch, is_training)
        output_feed = [self.tasks[task_name].loss]
        if is_training:
            output_feed.append(self.updates[task_name])
        t = time.time()
        outputs = self.sess.run(output_feed, input_feed)
        t = time.time() - t
        step_loss = outputs[0]
        loss += step_loss

        print('epoch: %d,' % self.epoch.eval(),
              'step: %d,' % i,
              'task: %s,' % task_name,
              'step_loss: %.3f,' % step_loss,
              'step_time: %f,' % t)
        sys.stdout.flush()
        # break # DEBUG
        if math.isnan(step_loss):
            raise ValueError(
                "Nan loss detection ... (%s: step %d)" % (task_name, i))

    # i is the index of the last batch, so the number of steps is i + 1.
    loss /= (i + 1)
    mode = 'train' if is_training else 'valid'
    summary_dict = {'%s/%s/loss' % (task_name, mode): loss}
    summary = tf_utils.make_summary(summary_dict)
    epoch_time = (time.time() - start_time)
    return epoch_time, loss, summary
def run_epoch(self, batches, is_training):
    start_time = time.time()
    num_steps_in_epoch = [0 for _ in self.tasks]
    loss = [0.0 for _ in self.tasks]

    # Keep cycling over the tasks until every task's iterator raises
    # StopIteration in the same pass.
    is_uncomplete = True
    while is_uncomplete:
        is_uncomplete = False
        t = time.time()
        for i, (task_name, task_model) in enumerate(self.tasks.items()):
            try:
                raw_batch = batches[task_name].__next__()
                batch = {task_name: raw_batch}
                input_feed = self.get_input_feed(batch, is_training)
                if task_model.debug_ops:
                    for ops, res in zip(task_model.debug_ops,
                                        self.sess.run(task_model.debug_ops,
                                                      input_feed)):
                        # print(ops, res.shape)
                        print(ops)
                        print(res)
                    # exit(1)
                output_feed = [task_model.loss]
                if is_training:
                    output_feed.append(self.updates[task_name])
                t = time.time()
                outputs = self.sess.run(output_feed, input_feed)
                t = time.time() - t
                step_loss = outputs[0]

                print('epoch: %d,' % self.epoch.eval(),
                      'step: %d,' % num_steps_in_epoch[i],
                      'task: %s,' % task_name,
                      'step_loss: %.3f,' % step_loss,
                      'step_time: %f,' % t)
                sys.stdout.flush()
                if math.isnan(step_loss):
                    raise ValueError(
                        "Nan loss detection ... (%s: step %d)"
                        % (task_name, num_steps_in_epoch[i]))
                num_steps_in_epoch[i] += 1
                loss[i] += step_loss
                is_uncomplete = True
            except StopIteration as e:
                pass
            except ValueError as e:
                print(e)
                # print('subj.position\n', raw_batch.subj.position)
                # print('obj.position\n', raw_batch.obj.position)
                # print('text.raw\n', raw_batch.text.raw)
                # print('text.word\n', raw_batch.text.word)
                # print('text.char\n', raw_batch.text.char)
                # print('rel.word\n', raw_batch.rel.word)
                # print('rel.char\n', raw_batch.rel.char)
                exit(1)

    epoch_time = (time.time() - start_time)
    loss = [l / num_steps for l, num_steps in zip(loss, num_steps_in_epoch)]
    mode = 'train' if is_training else 'valid'
    summary_dict = {'%s/%s/loss' % (task_name, mode): l
                    for task_name, l in zip(self.tasks, loss)}
    summary = tf_utils.make_summary(summary_dict)
    return epoch_time, loss, summary
def train(self):
    model = self.create_model(self.config)
    for epoch in range(model.epoch.eval(), self.config.max_epoch):
        sys.stdout.write(
            'Save the model at the beginning of epoch %02d as model.ckpt-%d\n'
            % (epoch, epoch))
        self.save_model(model)
        self.output_variables_as_text(model)

        sente_win_rate, gote_win_rate = self.evaluate(model)
        self.logger.info(
            'Epoch %d, Win Rate (sente, gote) = (%.3f, %.3f)'
            % (epoch, sente_win_rate, gote_win_rate))

        batches = self.dataset.get_batches(self.config.batch_size,
                                           model.epoch.eval(),
                                           is_training=True)
        average_loss = 0.0
        for i, batch in enumerate(batches):
            q_values, loss, _ = model.step(batch, i)
            average_loss += loss
            print('step = %d, loss = %f' % (i, loss))
        average_loss /= (i + 1)
        self.logger.info('Epoch %d, Average loss = %f' % (epoch, average_loss))

        summary_dict = {}
        summary_dict["train/loss"] = average_loss
        summary = tf_utils.make_summary(summary_dict)
        self.summary_writer.add_summary(summary, model.epoch.eval())
        model.add_epoch()

    # Evaluate the final model once more after the last epoch.
    self.evaluate(model)
def evaluate(self, batches, mode, output_path=None):
    start_time = time.time()
    results = []
    used_batches = []
    for i, batch in enumerate(batches):
        input_feed = self.get_input_feed(batch, False)
        ce = self.sess.run(self.loss, input_feed)
        outputs = self.sess.run(self.outputs, input_feed)
        used_batches += flatten_batch(batch)
        results.append(outputs)
    results = np.concatenate(results, axis=0)
    epoch_time = time.time() - start_time

    sys.stdout = open(output_path, 'w') if output_path else sys.stdout
    acc, prec, recall = evaluate_and_print(used_batches, results,
                                           vocab=self.encoder.vocab)
    # Note: (prec + recall) / 2 is the arithmetic mean, not the harmonic-mean F1.
    print('acc, p, r, f = %.2f %.2f %.2f %.2f' %
          (100.0 * acc, 100.0 * prec, 100.0 * recall,
           100.0 * (prec + recall) / 2))
    sys.stdout = sys.__stdout__

    summary_dict = {}
    summary_dict['graph/%s/Accuracy' % mode] = acc
    summary_dict['graph/%s/Precision' % mode] = prec
    summary_dict['graph/%s/Recall' % mode] = recall
    summary_dict['graph/%s/F1' % mode] = (prec + recall) / 2
    summary = make_summary(summary_dict)
    return (acc, prec, recall), summary
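# The evaluate() above reports (prec + recall) / 2 under the 'F1' tag, which is the
# arithmetic mean of precision and recall rather than the harmonic mean. A small
# sketch of the standard F1 computation, in case the harmonic mean is what is
# actually intended:
def harmonic_f1(precision, recall):
    # F1 = 2PR / (P + R); defined as 0.0 when both precision and recall are zero.
    if precision + recall == 0.0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)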
def run_epoch(self, batches, is_training):
    start_time = time.time()
    num_steps = 0
    loss = np.array(
        [0.0 for t in self.tasks.values() if t.loss is not None])
    mode = 'train' if is_training else 'valid'
    sys.stdout.write('<%s>\n' % mode)

    while True:
        t = time.time()
        # Once one of the batches of a task stops iteration in an epoch,
        # go to the next epoch.
        batch = self.yield_examples(batches, self.tasks)
        if batch is None:
            break
        num_steps += 1
        input_feed = self.get_input_feed(batch, is_training)
        output_feed = []
        output_feed.extend(self.losses)
        if is_training:
            output_feed.append(self.updates)

        # for task_model in self.tasks.values():
        #     if task_model.debug_ops:
        #         print(task_model)
        #         print(task_model.debug_ops)
        #         for ops, res in zip(task_model.debug_ops,
        #                             self.sess.run(task_model.debug_ops, input_feed)):
        #             print(ops, res.shape)
        #             print(res)

        t = time.time()
        outputs = self.sess.run(output_feed, input_feed)
        t = time.time() - t
        step_loss = outputs[:len(loss)]
        loss += np.array(step_loss)

        print('epoch: %d,' % self.epoch.eval(),
              'step: %d,' % self.global_step.eval(),
              'task: %s,' % ' '.join(self.trainable_tasks.keys()),
              'step_loss: %s,' % ' '.join(["%.3f" % l for l in step_loss]),
              'step_time: %f' % t)
        sys.stdout.flush()

    epoch_time = (time.time() - start_time)
    step_time = epoch_time / num_steps
    loss = [l / num_steps for l in loss]
    mode = 'train' if is_training else 'valid'
    summary_dict = {
        '%s/%s/loss' % (task_model.scopename, mode): l
        for task_model, l in zip(self.trainable_tasks.values(), loss)
    }
    summary = make_summary(summary_dict)
    return epoch_time, step_time, loss, summary
def run_epoch(self, batches, is_training):
    start_time = time.time()
    loss = np.array([0.0 for _ in self.tasks])
    num_steps = 0

    while True:
        t = time.time()
        batch = {}
        for i, (task_name, task_model) in enumerate(self.tasks.items()):
            try:
                if task_name in batches:
                    raw_batch = batches[task_name].__next__()
                    batch.update({task_name: raw_batch})
                else:
                    batch.update({task_name: {}})
            except StopIteration as e:
                pass
            except ValueError as e:
                print(e)
                exit(1)

        # Once one of the batches of a task stops iteration in an epoch,
        # go to the next epoch.
        if False in [task_name in batch for task_name in self.tasks]:
            break

        input_feed = self.get_input_feed(batch, is_training)
        output_feed = []
        output_feed += self.losses
        if is_training:
            output_feed.append(self.updates)

        t = time.time()
        outputs = self.sess.run(output_feed, input_feed)
        t = time.time() - t
        step_loss = outputs[:len(self.tasks)]
        loss += np.array(step_loss)

        print('epoch: %d,' % self.epoch.eval(),
              'step: %d,' % self.global_step.eval(),
              'task: %s,' % ' '.join(self.tasks.keys()),
              'step_loss: %s,' % ' '.join(["%.3f" % l for l in step_loss]),
              'step_time: %f,' % t)
        num_steps += 1
        sys.stdout.flush()

    epoch_time = (time.time() - start_time)
    loss = [l / num_steps for l in loss]
    mode = 'train' if is_training else 'valid'
    summary_dict = {'%s/%s/loss' % (task_name, mode): l
                    for task_name, l in zip(self.tasks, loss)}
    summary = tf_utils.make_summary(summary_dict)
    return epoch_time, loss, summary
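# Both multi-task run_epoch() variants above draw one batch per task each step and
# end the epoch as soon as any task's iterator is exhausted. A standalone sketch of
# that interleaving logic (the helper name and the plain-iterator assumption are
# illustrative, not part of the original code):
def interleave_task_batches(batch_iters):
    # batch_iters: {task_name: iterator over batches}.
    # Yields {task_name: batch} dicts until the shortest iterator runs out.
    while True:
        step = {}
        for task_name, it in batch_iters.items():
            try:
                step[task_name] = next(it)
            except StopIteration:
                return  # one task is out of data; the epoch ends here
        yield step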
def evaluate(self, model):
    epoch = model.epoch.eval()
    self.output_variables_as_text(model)
    cmd = './simulator/evaluateAgent.sh %s' % self.root_path
    os.system(cmd)

    sente_log_path = self.root_path + '/evaluation/sente/%03d/sente_summary' % epoch
    for i, l in enumerate(open(sente_log_path)):
        if i == 1:
            sente_win_rate = float(l.split('/')[0])

    gote_log_path = self.root_path + '/evaluation/gote/%03d/gote_summary' % epoch
    for i, l in enumerate(open(gote_log_path)):
        if i == 1:
            gote_win_rate = float(l.split('/')[1])

    summary_dict = {}
    summary_dict["test/win_rate/sente"] = sente_win_rate
    summary_dict["test/win_rate/gote"] = gote_win_rate
    summary = tf_utils.make_summary(summary_dict)
    self.summary_writer.add_summary(summary, model.epoch.eval())
    return sente_win_rate, gote_win_rate
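# evaluate() above scrapes the win rate from a fixed line of the simulator's summary
# files, taking one side of a '/'-separated field. A small sketch of that parsing
# step as a reusable helper; the exact summary format is an assumption here, so the
# line and field indices are parameters rather than hard-coded:
def read_win_rate(summary_path, line_index=1, field_index=0):
    # Return the float at `field_index` of the '/'-separated line `line_index`.
    with open(summary_path) as f:
        for i, line in enumerate(f):
            if i == line_index:
                return float(line.split('/')[field_index])
    raise ValueError('%s has no line %d' % (summary_path, line_index))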
def test(self, batches, mode, logger, output_path=None):
    results = []
    used_batches = []
    for i, batch in enumerate(batches):
        input_feed = self.get_input_feed(batch, False)
        relations, mentions = self.sess.run(self.predictions, input_feed)
        try:
            used_batches += flatten_batch(batch)
        except Exception as e:
            pprint(batch)
            print(e)
            exit(1)
        for rel, mention in zip(relations.tolist(), mentions.tolist()):
            results.append((rel, mention))

    sys.stdout = open(output_path, 'w') if output_path else sys.stdout
    triples, mentions = dataset_class.formatize_and_print(
        used_batches, results, vocab=self.encoder.vocab)
    triple_precision, triple_recall, triple_f1 = dataset_class.evaluate_triples(
        triples)
    mention_precision, mention_recall, mention_f1 = dataset_class.evaluate_mentions(
        mentions)
    sys.stdout = sys.__stdout__
    if output_path:
        sys.stderr.write(
            "Output the testing results to '{}'.\n".format(output_path))

    summary_dict = {}
    summary_dict['relex/%s/triple/f1' % mode] = triple_f1
    summary_dict['relex/%s/triple/precision' % mode] = triple_precision
    summary_dict['relex/%s/triple/recall' % mode] = triple_recall
    summary_dict['relex/%s/mention/f1' % mode] = mention_f1
    summary_dict['relex/%s/mention/precision' % mode] = mention_precision
    summary_dict['relex/%s/mention/recall' % mode] = mention_recall
    summary = make_summary(summary_dict)
    return triple_f1, summary
def test(self, batches, mode, logger, output_path):
    results = np.zeros([0, 2])
    used_batches = []
    sys.stderr.write('Start decoding (%s) ...\n' % mode)
    for i, batch in enumerate(batches):
        input_feed = self.get_input_feed(batch, False)
        # output_feed = [
        #     self.predictions,
        # ]
        output_feed = self.predictions
        outputs = self.sess.run(output_feed, input_feed)

        # Flatten the batch and outputs.
        used_batches += flatten_batch(batch)
        results = np.concatenate([results, outputs])

    sys.stdout = open(output_path, 'w') if output_path else sys.stdout
    sys.stderr.write('%d %d\n' % (len(results), len(used_batches)))
    acc = evaluate_and_print(used_batches, results)
    sys.stdout = sys.__stdout__

    summary_dict = {}
    summary_dict['%s/%s/accuracy' % (self.scopename, mode)] = acc
    summary = make_summary(summary_dict)
    return acc, summary
def test(self, batches, mode, logger, output_path):
    results = []
    used_batches = []
    for i, batch in enumerate(batches):
        input_feed = self.get_input_feed(batch, False)
        outputs = self.sess.run(self.predictions, input_feed)
        try:
            used_batches += flatten_batch(batch)
        except Exception as e:
            pprint(batch)
            print(e)
            exit(1)
        results.append(outputs[:, 0, :])
    results = flatten([r.tolist() for r in results])

    sys.stdout = open(output_path, 'w') if output_path else sys.stdout
    bleu = evaluate_and_print(used_batches, results, vocab=self.vocab)
    if output_path:
        sys.stderr.write(
            "Output the testing results to '{}'.\n".format(output_path))
    sys.stdout = sys.__stdout__

    summary_dict = {}
    summary_dict['desc/%s/BLEU' % mode] = bleu
    summary = make_summary(summary_dict)
    return bleu, summary
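# The test()/evaluate() methods above redirect sys.stdout to a file and restore it
# via sys.__stdout__, which leaves the file handle open and skips the restore if an
# exception is raised mid-evaluation. A sketch of the same idea with contextlib,
# offered as an alternative pattern rather than a drop-in change to the code above:
import contextlib

@contextlib.contextmanager
def maybe_redirect_stdout(output_path=None):
    # Redirect stdout to output_path while the block runs; no-op when path is None.
    if output_path:
        with open(output_path, 'w') as f, contextlib.redirect_stdout(f):
            yield
    else:
        yield

# Example use inside one of the test() methods:
#   with maybe_redirect_stdout(output_path):
#       bleu = evaluate_and_print(used_batches, results, vocab=self.vocab)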
def run_epoch(self, batches, is_training):
    start_time = time.time()
    num_steps = 1
    loss = np.array([0.0 for _ in self.tasks])
    mode = 'train' if is_training else 'valid'
    print('<%s>' % mode)

    while True:
        t = time.time()
        try:
            model = self.tasks[self.taskname]
            batch = [
                batches[self.taskname].__next__()
                for _ in range(model.n_models)
            ]
        except StopIteration as e:
            break

        input_feed = self.get_input_feed(batch, is_training)
        task_model = self.tasks[self.taskname]
        if task_model.debug_ops:
            print(task_model)
            for ops, res in zip(
                    task_model.debug_ops,
                    self.sess.run(task_model.debug_ops, input_feed)):
                print(ops, res.shape)
                print(res)
            exit(1)

        output_feed = []
        output_feed.append(self.losses)
        if is_training:
            output_feed.append(self.updates)
        t = time.time()
        outputs = self.sess.run(output_feed, input_feed)
        t = time.time() - t
        step_loss = outputs[:len(self.tasks)]
        loss += np.array(step_loss)

        print('epoch: %d,' % self.epoch.eval(),
              'step: %d,' % self.global_step.eval(),
              'task: %s,' % ' '.join(self.tasks.keys()),
              'step_loss: %s,' % ' '.join(["%.3f" % l for l in step_loss]),
              'step_time: %.3f,' % t)
        sys.stdout.flush()
        num_steps += 1

    epoch_time = (time.time() - start_time)
    step_time = epoch_time / num_steps
    # NOTE: num_steps is initialized to 1, so the averages below divide by
    # (actual steps + 1) rather than the exact step count.
    loss = [l / num_steps for l in loss]
    if loss[0] == 0:
        raise ValueError(
            'Set max_rows of the data more than batch_size * num_gpus.')
    mode = 'train' if is_training else 'valid'
    summary_dict = {
        '%s/%s/loss' % (task_model.scopename, mode): l
        for task_model, l in zip(self.tasks.values(), loss)
    }
    summary = tf_utils.make_summary(summary_dict)
    return epoch_time, step_time, loss, summary
def evaluate(self, batches, gold_path, mode, official_stdout=False):
    def _k_to_tag(k):
        if k == -3:
            return "oracle"     # use only gold spans.
        elif k == -2:
            return "actual"     # use mention_spans as a result of pruning candidate_spans.
        elif k == -1:
            return "exact"      # use the same number of candidate_spans as the gold_spans.
        elif k == 0:
            return "threshold"  # use only candidate_spans with a score greater than 0.
        else:
            return "{}%".format(k)

    # mention_evaluators = {k: util.RetrievalEvaluator()
    #                       for k in [-3, -2, -1, 0, 10, 15, 20, 25, 30, 40, 50]}
    mention_evaluators = {k: coref_util.RetrievalEvaluator()
                          for k in [-3, -2, -1, 0]}
    coref_predictions = {}
    coref_evaluator = metrics.CorefEvaluator()
    results = OrderedDict()

    for example_num, example in enumerate(batches):
        input_feed = self.get_input_feed(example, False)
        gold_starts = input_feed[self.ph.gold_starts]
        gold_ends = input_feed[self.ph.gold_ends]

        ######
        # debug
        # flattened_text_emb, mention_starts, mention_ends, gold_starts, gold_ends = \
        #     self.sess.run(self.debug_ops, input_feed)
        # dbgprint(str(example_num) + ':')
        # print('text_shape', flattened_text_emb.shape)
        # print('pred_mentions', np.concatenate([np.expand_dims(mention_starts, -1),
        #                                        np.expand_dims(mention_ends, -1)],
        #                                       axis=-1))
        # print('gold_mentions', np.concatenate([np.expand_dims(gold_starts, -1),
        #                                        np.expand_dims(gold_ends, -1)],
        #                                       axis=-1))
        # print()
        ######

        outputs = self.sess.run(self.outputs, input_feed)
        (candidate_starts, candidate_ends, candidate_mention_scores,
         mention_starts, mention_ends, antecedents, antecedent_scores) = outputs[:7]

        self.evaluate_mentions(candidate_starts, candidate_ends,
                               mention_starts, mention_ends,
                               candidate_mention_scores,
                               gold_starts, gold_ends,
                               example, mention_evaluators)
        predicted_antecedents = self.get_predicted_antecedents(
            antecedents, antecedent_scores)
        coref_predictions[example.doc_key] = self.evaluate_coref(
            mention_starts, mention_ends, predicted_antecedents,
            example.clusters, coref_evaluator)
        results[example.doc_key] = dotDict({
            'raw_text': example.text.raw,
            'speakers': example.speakers,
            'extracted_mentions': [(begin, end) for begin, end
                                   in zip(mention_starts, mention_ends)],
            'predicted_antecedents': predicted_antecedents
        })

        if len(outputs) > 7:
            mention_descs = {}
            pred_mention_desc = [self.vocab.decoder.word.ids2tokens(s)
                                 for s in outputs[7][:, 0, :]]
            gold_mention_desc = [self.vocab.decoder.word.ids2tokens(s)
                                 for s in outputs[8][:, 0, :]]
            for s, e, desc in zip(mention_starts, mention_ends, pred_mention_desc):
                mention_descs[(s, e)] = desc
            # Gold spans get the decoded gold descriptions (the original code
            # reused pred_mention_desc here, leaving gold_mention_desc unused).
            for s, e, desc in zip(gold_starts, gold_ends, gold_mention_desc):
                mention_descs[(s, e)] = desc
            results[example.doc_key].mention_descs = mention_descs
        else:
            results[example.doc_key].mention_descs = []

    summary_dict = {}
    for k, evaluator in sorted(list(mention_evaluators.items()),
                               key=operator.itemgetter(0)):
        tags = ["mention/{} @ {}".format(t, _k_to_tag(k)) for t in ("R", "P", "F")]
        results_to_print = []
        for t, v in zip(tags, evaluator.metrics()):
            results_to_print.append("{:<10}: {:.2f}".format(t, v))
            summary_dict["coref/%s/" % mode + t] = v
        print(", ".join(results_to_print))

    conll_results = conll.evaluate_conll(gold_path, coref_predictions,
                                         official_stdout)
    val_types = ('p', 'r', 'f')
    for metric in conll_results:
        for val_type in val_types:
            summary_dict["coref/%s/%s/%s" % (mode, metric, val_type)] = \
                conll_results[metric][val_type]
        print("%s (%s) : %s" % (metric, ", ".join(val_types),
                                " ".join(["%.2f" % x
                                          for x in conll_results[metric].values()])))

    average_f1 = sum(conll_res["f"]
                     for conll_res in conll_results.values()) / len(conll_results)
    summary_dict["coref/%s/Average F1 (conll)" % mode] = average_f1
    print("Average F1 (conll): {:.2f}%".format(average_f1))

    p, r, f = coref_evaluator.get_prf()
    summary_dict["coref/%s/Average F1 (py)" % mode] = f
    print("Average F1 (py): {:.2f}%".format(f * 100))
    summary_dict["coref/%s/Average precision (py)" % mode] = p
    print("Average precision (py): {:.2f}%".format(p * 100))
    summary_dict["coref/%s/Average recall (py)" % mode] = r
    print("Average recall (py): {:.2f}%".format(r * 100))

    aligned_results = coref_evaluator.get_aligned_results()
    for doc_key, aligned in zip(results, aligned_results):
        results[doc_key]['aligned_results'] = aligned

    return tf_utils.make_summary(summary_dict), average_f1, results
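# evaluate() above calls self.get_predicted_antecedents(antecedents, antecedent_scores)
# to turn the scored antecedent candidates into one predicted antecedent per mention.
# A sketch of one common implementation, following the e2e-coref convention that
# column 0 of antecedent_scores is the dummy "no antecedent" option; the class's own
# method may differ:
import numpy as np

def get_predicted_antecedents_sketch(antecedents, antecedent_scores):
    # antecedents: [num_mentions, max_antecedents] indices of candidate antecedents.
    # antecedent_scores: [num_mentions, max_antecedents + 1] scores, dummy at column 0.
    predicted = []
    for i, best in enumerate(np.argmax(antecedent_scores, axis=1) - 1):
        if best < 0:
            predicted.append(-1)  # the dummy won: this mention starts a new cluster
        else:
            predicted.append(antecedents[i, best])
    return predicted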