def _run_epoch_dev_part(self, sess, dataset_dev):
    '''
    Runs inference on the dev set for one epoch and returns the predicted
    answers. Could probably be merged with _run_epoch_val_part.
    '''
    si_pred = []
    ei_pred = []
    prog = util.Progbar(
        target=1 + int(len(dataset_dev['.ids.context']) / FLAGS.batch_size))
    for i, batch in enumerate(self.get_mini(dataset_dev, FLAGS.batch_size,
                                            shuffle=False, span=False)):
        si_all, ei_all, mem = self._run_epoch_dev_minibatch(sess, batch)
        mem = int(mem) >> 20  # bytes -> MiB
        prog.update(i + 1, exact=[('mem', mem)])
        si_pred.append(si_all)  # order is preserved since shuffle=False
        ei_pred.append(ei_all)
    si_pred = np.concatenate(si_pred)
    ei_pred = np.concatenate(ei_pred)
    span = self.pick_si_ei(si_pred, ei_pred)
    answers = self.span_to_pred(span, dataset_dev['.ids.context'])
    return answers
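# Illustrative sketch only, not the original pick_si_ei: one common way to turn
# per-token start/end scores into an answer span is to pick the pair
# (start, end) with end >= start that maximises start_score + end_score,
# optionally capping the span length. The function name and max_len are
# assumptions for illustration.
import numpy as np

def pick_span_greedy(si_scores, ei_scores, max_len=15):
    best, best_span = -np.inf, (0, 0)
    for s in range(len(si_scores)):
        for e in range(s, min(s + max_len, len(ei_scores))):
            score = si_scores[s] + ei_scores[e]
            if score > best:
                best, best_span = score, (s, e)
    return best_span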
def _run_epoch_val_part(self, sess, dataset_val):
    '''
    Validates for one epoch and returns the validation metric: the harmonic
    mean of F1 and EM.
    '''
    si_pred = []
    ei_pred = []
    prog = util.Progbar(
        target=1 + int(len(dataset_val['.ids.context']) / FLAGS.batch_size))
    for i, batch in enumerate(self.get_mini(dataset_val, FLAGS.batch_size,
                                            shuffle=False, span=True)):
        si_all, ei_all, loss = self._run_epoch_val_minibatch(sess, batch)
        prog.update(i + 1, exact=[('val loss', loss)])
        si_pred.append(si_all)  # order is preserved since shuffle=False
        ei_pred.append(ei_all)
    si_pred = np.concatenate(si_pred)
    ei_pred = np.concatenate(ei_pred)
    span = self.pick_si_ei(si_pred, ei_pred)
    answers = self.span_to_pred(span, dataset_val['.ids.context'])
    f1, em = self.evaluate(answers, dataset_val['.answer'])
    # harmonic mean of F1 and EM; clamp both to avoid division by zero on
    # small datasets where either score can be exactly 0
    f1f1em = 2 / (1 / max(f1, 1e-10) + 1 / max(em, 1e-10))
    logging.info('last val loss {}'.format(loss))
    logging.info('F1: {}, EM: {}, F1F1EM: {}'.format(f1, em, f1f1em))
    return f1f1em
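# Illustrative helper, not part of the original code: the validation metric
# returned above (F1F1EM) is the harmonic mean of F1 and EM, with both scores
# clamped to at least 1e-10 so a zero score on a very small dataset cannot
# cause a ZeroDivisionError. The helper name is an assumption.
def harmonic_f1_em(f1, em, eps=1e-10):
    return 2.0 / (1.0 / max(f1, eps) + 1.0 / max(em, eps))

# e.g. harmonic_f1_em(0.80, 0.70) -> ~0.747, harmonic_f1_em(0.0, 0.5) -> ~2e-10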
def _run_epoch_train_part(self, sess, dataset_train):
    '''
    Trains for one epoch.
    '''
    prog = util.Progbar(
        target=1 + int(len(dataset_train['.ids.context']) / FLAGS.batch_size))
    for i, batch in enumerate(self.get_mini(dataset_train, FLAGS.batch_size,
                                            shuffle=True, span=True)):
        out = self._run_epoch_train_minibatch(sess, batch)
        _, loss, gnorm, mem = out
        mem = int(mem) >> 20  # bytes -> MiB
        prog.update(i + 1, exact=[('train loss', loss), ('gnorm', gnorm),
                                  ('mem', mem)])
        if np.isnan(gnorm):
            logging.info('gnorm nan')
            raise Exception('gnorm nan')
    logging.info(
        'last train loss {}, gnorm {}, mem {}'.format(loss, gnorm, mem))
def docs(dataset_name):
    p = util.Progbar(target=util.lines_in_file(directories.RAW + dataset_name))
    for i, d in enumerate(util.load_json_lines(directories.RAW + dataset_name)):
        p.update(i + 1)
        yield d
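# Hypothetical usage of the docs() generator above; the dataset name and the
# counting logic are illustrative assumptions, not part of the original code.
def count_mentions(dataset_name):
    total = 0
    for d in docs(dataset_name):
        total += len(d.get("mentions", {}))
    return total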
def __init__(self, trainer, docs, data, message, replay_memory=None, beta=0,
             docs_per_iteration=10000):
    self.trainer = trainer
    self.data = data
    self.model = trainer.model
    self.message = message
    self.replay_memory = replay_memory
    self.beta = beta
    self.loss_aggregator = Aggregator()
    self.evaluators = [
        evaluation.Evaluator(metric=evaluation.muc),
        evaluation.Evaluator(metric=evaluation.b_cubed),
        evaluation.Evaluator(metric=evaluation.ceafe)
    ]
    self.merged_pairs = {}
    self.training = self.replay_memory is not None

    print self.message
    random.shuffle(docs)
    if self.training:
        docs = docs[:docs_per_iteration]
    prog = util.Progbar(len(docs))
    for i, (doc, actionstate) in enumerate(docs):
        self.trainer.doc = doc
        self.trainer.actionstate = actionstate
        if len(actionstate.possible_pairs) != 0:
            actionstate.load(self.data, self.trainer.pair_model,
                             self.trainer.anaphoricity_model)
            s = State(doc, actionstate)
            doc_merged_pairs = self.run_agent(s, beta, i)
            for evaluator in self.evaluators:
                evaluator.update(doc)
            self.merged_pairs[doc.did] = doc_merged_pairs
            doc.reset()
            actionstate.clear()
        muc, b3, ceafe = (self.evaluators[i].get_f1() for i in range(3))
        exact = [('muc', 100 * muc), ('b3', 100 * b3), ('ceafe', 100 * ceafe),
                 ('conll', 100 * (muc + b3 + ceafe) / 3),
                 ('loss', self.loss_aggregator.get_avg())]
        prog.update(i + 1, exact=exact)
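# Worked illustration, not original code: the 'conll' entry reported above is
# the unweighted mean of the MUC, B-cubed, and CEAF-e F1 scores. The helper
# name is an assumption.
def conll_f1(muc, b3, ceafe):
    return (muc + b3 + ceafe) / 3.0

# e.g. conll_f1(0.70, 0.60, 0.55) -> ~0.617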
def train_all(self):
    timer.start("train")
    model_weights = self.model.get_weights()
    prog = util.Progbar(len(self.memory))
    random.shuffle(self.memory)
    for i, X in enumerate(self.memory):
        loss = self.train_on_example(X)
        prog.update(i + 1, [("loss", loss)])
    self.size = 0
    self.memory = []
    timer.stop("train")
    weight_diffs = [
        (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
        for new_weight, old_weight in zip(self.model.get_weights(),
                                          model_weights)]
    summed = np.sum(map(np.array, weight_diffs), axis=0)
    print "weight diffs", weight_diffs, summed
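# Small self-contained illustration (the dummy two-layer weights are
# assumptions) of the weight-diff summary printed above: per layer, the sum of
# absolute weight change and the parameter count, then the element-wise sum
# over layers.
import numpy as np

old = [np.zeros((2, 2)), np.ones(3)]
new = [np.full((2, 2), 0.5), np.array([1.0, 2.0, 3.0])]
weight_diffs = [(np.sum(np.abs(n - o)), n.size) for n, o in zip(new, old)]
summed = np.sum([np.array(d) for d in weight_diffs], axis=0)
# weight_diffs == [(2.0, 4), (3.0, 3)], summed == [5.0, 7.0]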
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
    doc_vectors = util.load_pickle(directories.MISC + name.replace("_reduced", "")
                                   + "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name
    p = util.Progbar(
        target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else \
                (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(d["mentions"][mention_num], vectors,
                                 doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k: (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1],
                           mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
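# Hypothetical invocation of build_dataset() above; the vector source and the
# 0.15 tune fraction are assumptions for illustration only.
# vectors = ...  # word-vector lookup consumed by MentionDataBuilder
# build_dataset(vectors, "train", tune_fraction=0.15)
# build_dataset(vectors, "dev")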