def __init__(self, config, sess, checkpoint_saver, best_model_saver,
             restore_if_possible=True):
  self.config = config
  self.checkpoint_saver = checkpoint_saver
  self.best_model_saver = best_model_saver
  tf.gfile.MakeDirs(config.checkpoints_dir)
  if restore_if_possible and tf.gfile.Exists(config.progress):
    history, current_file, current_line = utils.load_cpickle(
        config.progress, memoized=False)
    self.history = history
    self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(
        config, current_file, current_line)
    utils.log("Continuing from global step", dict(self.history[-1])["step"],
              "(lm1b file {:}, line {:})".format(current_file, current_line))
    self.checkpoint_saver.restore(
        sess, tf.train.latest_checkpoint(self.config.checkpoints_dir))
  else:
    utils.log("No previous checkpoint found - starting from scratch")
    self.history = []
    self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(config)
def main():
  utils.heading('SETUP')
  config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
  config.write()
  with tf.Graph().as_default() as graph:
    model_trainer = trainer.Trainer(config)
    summary_writer = tf.summary.FileWriter(config.summaries_dir)
    checkpoints_saver = tf.train.Saver(max_to_keep=1)
    best_model_saver = tf.train.Saver(max_to_keep=1)
    init_op = tf.global_variables_initializer()
    graph.finalize()
    with tf.Session() as sess:
      sess.run(init_op)
      progress = training_progress.TrainingProgress(
          config, sess, checkpoints_saver, best_model_saver,
          config.mode == 'train')
      utils.log()
      if config.mode == 'train':
        utils.heading('START TRAINING ({:})'.format(config.model_name))
        model_trainer.train(sess, progress, summary_writer)
      elif config.mode == 'eval':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None)
      else:
        raise ValueError('Mode must be "train" or "eval"')
def _evaluate_task(self, sess, task, summary_writer, train_set):
  scorer = task.get_scorer()
  data = task.train_set if train_set else task.val_set
  for mb in data.get_minibatches(self._config.test_batch_size):
    loss, batch_preds = self._model.test(sess, mb)
    scorer.update(mb.examples, batch_preds, loss)
  results = scorer.get_results(
      task.name + ('_train_' if train_set else '_dev_'))
  utils.log(task.name.upper() + ': ' + scorer.results_str())
  write_summary(summary_writer, results,
                global_step=self._model.get_global_step(sess))
  return results
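# Hedged sketch (not part of the codebase) of the minimal scorer interface
# that _evaluate_task relies on; the real objects returned by
# task.get_scorer() live elsewhere and compute task-specific metrics.
class ExampleScorer(object):
  def __init__(self):
    self._loss_total, self._count = 0.0, 0

  def update(self, examples, predictions, loss):
    # accumulate statistics over one minibatch
    self._loss_total += loss
    self._count += 1

  def get_results(self, prefix):
    # returns (metric_name, value) pairs, as consumed by write_summary
    return [(prefix + 'loss', self._loss_total / max(1, self._count))]

  def results_str(self):
    return 'loss: {:.3f}'.format(self._loss_total / max(1, self._count))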
def train(self, sess, progress, summary_writer):
  heading = lambda s: utils.heading(s, '(' + self._config.model_name + ')')
  trained_on_sentences = 0
  start_time = time.time()
  unsupervised_loss_total, unsupervised_loss_count = 0, 0
  supervised_loss_total, supervised_loss_count = 0, 0
  for mb in self._get_training_mbs(progress.unlabeled_data_reader):
    if mb.task_name != 'unlabeled':
      loss = self._model.train_labeled(sess, mb)
      supervised_loss_total += loss
      supervised_loss_count += 1
    else:
      self._model.run_teacher(sess, mb)
      loss = self._model.train_unlabeled(sess, mb)
      unsupervised_loss_total += loss
      unsupervised_loss_count += 1
      mb.teacher_predictions.clear()
    trained_on_sentences += mb.size

    global_step = self._model.get_global_step(sess)
    if global_step % self._config.print_every == 0:
      supervised_loss_reported = supervised_loss_total / max(
          1, supervised_loss_count)
      utils.log(
          'step {:} - '
          'supervised loss: {:.3f} - '
          'unsupervised loss: {:.3f} - '
          '{:.1f} sentences per second'.format(
              global_step, supervised_loss_reported,
              unsupervised_loss_total / max(1, unsupervised_loss_count),
              trained_on_sentences / (time.time() - start_time)))
      unsupervised_loss_total, unsupervised_loss_count = 0, 0
      supervised_loss_total, supervised_loss_count = 0, 0
      summary_writer.add_summary(
          tf.Summary(value=[tf.Summary.Value(
              tag='loss', simple_value=supervised_loss_reported)]),
          global_step)

    if global_step % self._config.eval_dev_every == 0:
      heading('EVAL ON DEV')
      self.evaluate_all_tasks(sess, summary_writer, progress.history)
      progress.save_if_best_dev_model(sess, global_step)
      utils.log()
    if global_step % self._config.eval_train_every == 0:
      heading('EVAL ON TRAIN')
      self.evaluate_all_tasks(sess, summary_writer, progress.history, True)
      utils.log()
    if global_step % self._config.save_model_every == 0:
      heading('CHECKPOINTING MODEL')
      progress.write(sess, global_step)
      utils.log()
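# For reference, a hypothetical stand-in showing the minibatch fields the
# training loop above actually touches; the real minibatch objects come
# from _get_training_mbs.
class ExampleMinibatch(object):
  def __init__(self, task_name, size, examples):
    self.task_name = task_name     # 'unlabeled' or a supervised task name
    self.size = size               # number of sentences in the batch
    self.examples = examples
    self.teacher_predictions = {}  # filled by run_teacher, cleared after
                                   # the unsupervised update to free memory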
def _get_examples(self, split):
  word_vocab = embeddings.get_word_vocab(self._config)
  char_vocab = embeddings.get_char_vocab()
  examples = [
      TaggingExample(self._config, self._is_token_level, words, tags,
                     word_vocab, char_vocab, self.label_mapping,
                     self._task_name)
      for words, tags in self.get_labeled_sentences(split)]
  if self._config.train_set_percent < 100:
    utils.log('using reduced train set ({:}%)'.format(
        self._config.train_set_percent))
    random.shuffle(examples)
    examples = examples[:int(
        len(examples) * self._config.train_set_percent / 100.0)]
  return examples
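# Example of the arithmetic above: with 10,000 labeled sentences and
# train_set_percent = 25, the shuffle-then-slice keeps the first
# int(10000 * 25 / 100.0) = 2500 examples.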
def save_if_best_dev_model(self, sess, global_step):
  best_avg_score = 0
  for i, results in enumerate(self.history):
    if any("train" in metric for metric, value in results):
      continue  # only consider dev evaluations
    total, count = 0, 0
    for metric, value in results:
      if "f1" in metric or "las" in metric or "accuracy" in metric:
        total += value
        count += 1
    if count == 0:
      continue  # guard against dividing by zero when no scored metrics exist
    avg_score = total / count
    if avg_score >= best_avg_score:
      best_avg_score = avg_score
      if i == len(self.history) - 1:
        utils.log("New best model! Saving...")
        self.best_model_saver.save(sess, self.config.best_model_checkpoint,
                                   global_step=global_step)
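# Illustration (with made-up numbers) of the history format this method
# walks: each entry is a list of (metric, value) pairs from one evaluation.
# Entries mentioning "train" are skipped, and a checkpoint is written only
# when the most recent dev entry achieves the best average f1/las/accuracy.
example_history = [
    [('step', 1000), ('chunk_dev_f1', 91.2)],
    [('step', 1000), ('chunk_train_f1', 95.0)],  # ignored: train metrics
    [('step', 2000), ('chunk_dev_f1', 92.4)],    # latest and best -> saved
]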
def _evaluate_translate(self, sess, task, summary_writer, train_set):
  scorer = task.get_scorer()
  data = task.train_set if train_set else task.val_set
  for i, mb in enumerate(data.get_minibatches_without_weight(
      self._config.translate_batch_size)):
    if i == 100:
      break  # cap evaluation at 100 minibatches to keep translation cheap
    tgt = self._model.translate(sess, mb=mb)
    scorer.update(mb.examples, [tgt], 0)
  results = scorer.get_results(
      task.name + ('_train_' if train_set else '_dev_'))
  utils.log(task.name.upper() + ': ' + scorer.results_str())
  write_summary(summary_writer, results,
                global_step=self._model.get_global_step(sess))
  return results
def __init__(self, config):
  self._config = config
  self.tasks = [task_definitions.get_task(self._config, task_name)
                for task_name in self._config.task_names]

  utils.log('Loading Pretrained Embeddings')
  pretrained_embeddings = utils.load_cpickle(self._config.word_embeddings)

  utils.log('Building Model')
  self._model = multitask_model.Model(
      self._config, pretrained_embeddings, self.tasks)
  utils.log()
def main(data_dir='/content/data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.100d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=100)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["senclass"]:
    config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    loader = sentence_level_data.SentenceClassificationDataLoader(
        config, task_name)
    utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
    utils.log(" ", len(loader.label_mapping), "classes")
    utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)
def build(self):
  utils.log('loading pretrained embeddings from',
            self.config.pretrained_embeddings_file)
  for special in SPECIAL_TOKENS:
    self._add_vector(special)
  for extra in _EXTRA_WORDS:
    self._add_vector(extra)
  with tf.gfile.GFile(self.config.pretrained_embeddings_file, 'r') as f:
    for i, line in enumerate(f):
      if i % 10000 == 0:
        utils.log('on line', i)

      # GFile opened in text mode already yields str under Python 3, so no
      # decode step is needed here
      split = line.split()
      w = normalize_word(split[0])
      try:
        # list comprehension instead of map() so this works under Python 3,
        # where map() returns an iterator that np.array does not consume
        vec = np.array([float(x) for x in split[1:]], dtype='float32')
        if vec.size != self.vector_size:
          utils.log('vector for line', i, 'has size', vec.size, 'so skipping')
          utils.log(line[:100] + '...')
          continue
      except ValueError:
        utils.log('can\'t parse line', i, 'so skipping')
        utils.log(line[:100] + '...')
        continue
      if w not in self.vocabulary:
        self.vocabulary[w] = len(self.vectors)
        self.vectors.append(vec)
  utils.log('writing vectors!')
  self._write()
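# Each line of the GloVe file is "<word> <v1> ... <vN>"; a toy example of
# what the parser above extracts (with vector_size assumed to be 3):
#   line = 'the 0.418 0.24968 -0.41242'
#   w    = 'the'
#   vec  = np.array([0.418, 0.24968, -0.41242], dtype='float32')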
def main(data_dir='./data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.300d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the training data
    config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log(" Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)
def main(data_dir='/content/data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.50d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=50)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the training data
    config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log(" Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)
def main():
  utils.heading('SETUP')
  config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
  config.write()

  if config.mode == 'encode':
    word_vocab = embeddings.get_word_vocab(config)
    sentence = ("Squirrels , for example , would show up , look for the "
                "peanut , go away .").split()
    sentence = [word_vocab[embeddings.normalize_word(w)] for w in sentence]
    print(sentence)
    return
  if config.mode == 'decode':
    word_vocab_reversed = embeddings.get_word_vocab_reversed(config)
    sentence = ("25709 33 42 879 33 86 304 92 33 676 42 32 13406 33 273 "
                "445 34").split()
    sentence = [word_vocab_reversed[int(w)] for w in sentence]
    print(sentence)
    return
  if config.mode == 'encode-vi':
    word_vocab_vi = embeddings.get_word_vocab_vi(config)
    print(len(word_vocab_vi))
    sentence = ("Mỗi_một khoa_học_gia đều thuộc một nhóm nghiên_cứu , và "
                "mỗi nhóm đều nghiên_cứu rất nhiều đề_tài đa_dạng .").split()
    sentence = [word_vocab_vi[embeddings.normalize_word(w)] for w in sentence]
    print(sentence)
    return
  if config.mode == 'decode-vi':
    word_vocab_reversed_vi = embeddings.get_word_vocab_reversed_vi(config)
    sentence = ("8976 32085 129 178 17 261 381 5 7 195 261 129 381 60 37 "
                "2474 1903 6").split()
    sentence = [word_vocab_reversed_vi[int(w)] for w in sentence]
    print(sentence)
    return
  if config.mode == 'embed':
    word_embeddings = embeddings.get_word_embeddings(config)
    word = 50
    embed = word_embeddings[word]
    print(' '.join(str(x) for x in embed))
    return
  if config.mode == 'embed-vi':
    word_embeddings_vi = embeddings.get_word_embeddings_vi(config)
    word = 50
    embed = word_embeddings_vi[word]
    print(' '.join(str(x) for x in embed))
    return

  with tf.Graph().as_default() as graph:
    model_trainer = trainer.Trainer(config)
    summary_writer = tf.summary.FileWriter(config.summaries_dir)
    checkpoints_saver = tf.train.Saver(max_to_keep=1)
    best_model_saver = tf.train.Saver(max_to_keep=1)
    init_op = tf.global_variables_initializer()
    graph.finalize()
    with tf.Session() as sess:
      sess.run(init_op)
      progress = training_progress.TrainingProgress(
          config, sess, checkpoints_saver, best_model_saver,
          config.mode == 'train')
      utils.log()
      if config.mode == 'train':
        utils.heading('START TRAINING ({:})'.format(config.model_name))
        model_trainer.train(sess, progress, summary_writer)
      elif config.mode == 'eval-train':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None,
                                         train_set=True)
      elif config.mode == 'eval-dev':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None,
                                         train_set=False)
      elif config.mode == 'infer':
        utils.heading('START INFER ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.infer(sess)
      elif config.mode == 'translate':
        utils.heading('START TRANSLATE ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.translate(sess)
      elif config.mode == 'eval-translate-train':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None,
                                         train_set=True, is_translate=True)
      elif config.mode == 'eval-translate-dev':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(
            sess, tf.train.latest_checkpoint(config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None,
                                         train_set=False, is_translate=True)
      else:
        raise ValueError('Unknown mode: {:}'.format(config.mode))
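# Typical invocations (script and flag names assumed from the FLAGS usage
# above; adjust to the actual entry point):
#   python main.py --mode=train --model_name=my_model
#   python main.py --mode=eval-dev --model_name=my_model
#   python main.py --mode=translate --model_name=my_model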