def __init__(self,
                 config,
                 sess,
                 checkpoint_saver,
                 best_model_saver,
                 restore_if_possible=True):
        self.config = config
        self.checkpoint_saver = checkpoint_saver
        self.best_model_saver = best_model_saver

        tf.gfile.MakeDirs(config.checkpoints_dir)
        if restore_if_possible and tf.gfile.Exists(config.progress):
            history, current_file, current_line = utils.load_cpickle(
                config.progress, memoized=False)
            self.history = history
            self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(
                config, current_file, current_line)
            utils.log(
                "Continuing from global step",
                dict(self.history[-1])["step"],
                "(lm1b file {:}, line {:})".format(current_file, current_line))
            self.checkpoint_saver.restore(
                sess, tf.train.latest_checkpoint(self.config.checkpoints_dir))
        else:
            utils.log("No previous checkpoint found - starting from scratch")
            self.history = []
            self.unlabeled_data_reader = (
                unlabeled_data.UnlabeledDataReader(config))
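The restore branch above unpickles a (history, current_file, current_line) tuple from config.progress. The matching save step is not part of this snippet; a minimal sketch of what it could look like, assuming utils.write_cpickle mirrors utils.load_cpickle, the reader exposes current_file/current_line, and config.checkpoint names the checkpoint prefix (all three are assumptions here):

    def write(self, sess, global_step):
        # Hypothetical counterpart to the restore logic above: checkpoint the
        # graph, then pickle the history and reader position so __init__ can
        # resume from the same lm1b file and line.
        self.checkpoint_saver.save(sess, self.config.checkpoint,
                                   global_step=global_step)
        utils.write_cpickle(
            (self.history, self.unlabeled_data_reader.current_file,
             self.unlabeled_data_reader.current_line),
            self.config.progress)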
Example #2
def main():
    utils.heading('SETUP')
    config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
    config.write()
    with tf.Graph().as_default() as graph:
        model_trainer = trainer.Trainer(config)
        summary_writer = tf.summary.FileWriter(config.summaries_dir)
        checkpoints_saver = tf.train.Saver(max_to_keep=1)
        best_model_saver = tf.train.Saver(max_to_keep=1)
        init_op = tf.global_variables_initializer()
        graph.finalize()
        with tf.Session() as sess:
            sess.run(init_op)
            progress = training_progress.TrainingProgress(
                config, sess, checkpoints_saver, best_model_saver,
                config.mode == 'train')
            utils.log()
            if config.mode == 'train':
                utils.heading('START TRAINING ({:})'.format(config.model_name))
                model_trainer.train(sess, progress, summary_writer)
            elif config.mode == 'eval':
                utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
                progress.best_model_saver.restore(
                    sess, tf.train.latest_checkpoint(config.checkpoints_dir))
                model_trainer.evaluate_all_tasks(sess, summary_writer, None)
            else:
                raise ValueError('Mode must be "train" or "eval"')
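main() reads FLAGS.mode and FLAGS.model_name, but the flag definitions are not part of this snippet. A minimal sketch of how they could be declared with TF1's flags module (the defaults here are assumptions):

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('mode', 'train', 'Either "train" or "eval".')
flags.DEFINE_string('model_name', 'default_model',
                    'Name used for the checkpoints and summaries directories.')
FLAGS = flags.FLAGS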
Example #3
  def _evaluate_task(self, sess, task, summary_writer, train_set):
    scorer = task.get_scorer()
    data = task.train_set if train_set else task.val_set
    for i, mb in enumerate(data.get_minibatches(self._config.test_batch_size)):
      loss, batch_preds = self._model.test(sess, mb)
      scorer.update(mb.examples, batch_preds, loss)

    results = scorer.get_results(task.name +
                                 ('_train_' if train_set else '_dev_'))
    utils.log(task.name.upper() + ': ' + scorer.results_str())
    write_summary(summary_writer, results,
                  global_step=self._model.get_global_step(sess))
    return results
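The write_summary helper used above is not defined in this snippet. Given that results are consumed elsewhere as (metric, value) pairs (see save_if_best_dev_model below) and that train() builds tf.Summary protos by hand, a plausible sketch, not the repository's actual implementation:

def write_summary(summary_writer, results, global_step):
    # Emit one scalar summary per (metric, value) pair from the scorer.
    for metric, value in results:
        summary_writer.add_summary(
            tf.Summary(value=[tf.Summary.Value(tag=metric, simple_value=value)]),
            global_step)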
Example #4
    def train(self, sess, progress, summary_writer):
        heading = lambda s: utils.heading(s, '(' + self._config.model_name +
                                          ')')
        trained_on_sentences = 0
        start_time = time.time()
        unsupervised_loss_total, unsupervised_loss_count = 0, 0
        supervised_loss_total, supervised_loss_count = 0, 0
        for mb in self._get_training_mbs(progress.unlabeled_data_reader):
            if mb.task_name != 'unlabeled':
                loss = self._model.train_labeled(sess, mb)
                print('train loss', loss)
                supervised_loss_total += loss
                supervised_loss_count += 1

            if mb.task_name == 'unlabeled':
                self._model.run_teacher(sess, mb)
                loss = self._model.train_unlabeled(sess, mb)
                unsupervised_loss_total += loss
                unsupervised_loss_count += 1
                mb.teacher_predictions.clear()

            trained_on_sentences += mb.size
            global_step = self._model.get_global_step(sess)

            if global_step % self._config.print_every == 0:
                supervised_loss_reported = supervised_loss_total / max(
                    1, supervised_loss_count)
                utils.log(
                    'step {:} - '
                    'supervised loss: {:.3f} - '
                    'unsupervised loss: {:.3f} - '
                    '{:.1f} sentences per second'.format(
                        global_step, supervised_loss_reported,
                        unsupervised_loss_total /
                        max(1, unsupervised_loss_count),
                        trained_on_sentences / (time.time() - start_time)))
                unsupervised_loss_total, unsupervised_loss_count = 0, 0
                supervised_loss_total, supervised_loss_count = 0, 0
                summary_writer.add_summary(
                    tf.Summary(value=[
                        tf.Summary.Value(tag='loss',
                                         simple_value=supervised_loss_reported)
                    ]), global_step)

            if global_step % self._config.eval_dev_every == 0:
                heading('EVAL ON DEV')
                self.evaluate_all_tasks(sess, summary_writer, progress.history)
                progress.save_if_best_dev_model(sess, global_step)
                utils.log()

            if global_step % self._config.eval_train_every == 0:
                heading('EVAL ON TRAIN')
                self.evaluate_all_tasks(sess, summary_writer, progress.history,
                                        True)
                utils.log()

            if global_step % self._config.save_model_every == 0:
                heading('CHECKPOINTING MODEL')
                progress.write(sess, global_step)
                utils.log()
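train() paces its logging, evaluation, and checkpointing off integer step intervals read from the config. The attribute names below come from the loop above; the values are made up, purely to illustrate the cadence:

class _IntervalConfigSketch(object):
    # Hypothetical stand-in for the interval fields configure.Config provides.
    print_every = 100         # log smoothed losses every 100 global steps
    eval_dev_every = 5000     # evaluate all tasks on their dev sets
    eval_train_every = 10000  # evaluate all tasks on their train sets
    save_model_every = 2000   # checkpoint the model and write the progress pickle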
Example #5
 def _get_examples(self, split):
   word_vocab = embeddings.get_word_vocab(self._config)
   char_vocab = embeddings.get_char_vocab()
   examples = [
       TaggingExample(
           self._config, self._is_token_level, words, tags,
           word_vocab, char_vocab, self.label_mapping, self._task_name)
       for words, tags in self.get_labeled_sentences(split)]
   if self._config.train_set_percent < 100:
     utils.log('using reduced train set ({:}%)'.format(
         self._config.train_set_percent))
     random.shuffle(examples)
     examples = examples[:int(len(examples) *
                              self._config.train_set_percent / 100.0)]
   return examples
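The train_set_percent branch keeps a random prefix of the shuffled examples. A tiny self-contained illustration of the same arithmetic, assuming a 40% setting:

import random

examples = list(range(10))
train_set_percent = 40
random.shuffle(examples)
examples = examples[:int(len(examples) * train_set_percent / 100.0)]
print(len(examples))  # 4 of the original 10 examples remain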
Example #8
 def save_if_best_dev_model(self, sess, global_step):
   best_avg_score = 0
   for i, results in enumerate(self.history):
     if any("train" in metric for metric, value in results):
       continue
     total, count = 0, 0
     for metric, value in results:
       if "f1" in metric or "las" in metric or "accuracy" in metric:
         total += value
         count += 1
      avg_score = total / max(1, count)  # guard against rounds with no scored metrics
     if avg_score >= best_avg_score:
       best_avg_score = avg_score
       if i == len(self.history) - 1:
         utils.log("New best model! Saving...")
         self.best_model_saver.save(sess, self.config.best_model_checkpoint,
                                    global_step=global_step)
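Each entry in self.history holds the (metric, value) pairs from one evaluation round; rounds containing any 'train' metric are skipped, and the dev score is the mean of the f1/las/accuracy values. A small self-contained illustration of that averaging, with made-up metric names and numbers:

results = [("step", 5000), ("chunk_dev_f1", 0.91), ("chunk_dev_accuracy", 0.95)]
total, count = 0, 0
for metric, value in results:
    if "f1" in metric or "las" in metric or "accuracy" in metric:
        total += value
        count += 1
print(total / count)  # 0.93, the average dev score for this round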
Example #9
    def _evaluate_translate(self, sess, task, summary_writer, train_set):
        scorer = task.get_scorer()
        data = task.train_set if train_set else task.val_set
        for i, mb in enumerate(
                data.get_minibatches_without_weight(
                    self._config.translate_batch_size)):
            print(i)  # progress indicator, one line per minibatch
            if i == 100:
                # translation scoring is slow, so only the first 100
                # minibatches are evaluated
                break
            tgt = self._model.translate(sess, mb=mb)
            scorer.update(mb.examples, [tgt], 0)

        results = scorer.get_results(task.name +
                                     ('_train_' if train_set else '_dev_'))
        utils.log(task.name.upper() + ': ' + scorer.results_str())
        write_summary(summary_writer,
                      results,
                      global_step=self._model.get_global_step(sess))
        return results
Example #10
  def __init__(self, config, sess, checkpoint_saver, best_model_saver,
              restore_if_possible=True):
    self.config = config
    self.checkpoint_saver = checkpoint_saver
    self.best_model_saver = best_model_saver

    tf.gfile.MakeDirs(config.checkpoints_dir)
    if restore_if_possible and tf.gfile.Exists(config.progress):
      history, current_file, current_line = utils.load_cpickle(
          config.progress, memoized=False)
      self.history = history
      self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(
          config, current_file, current_line)
      utils.log("Continuing from global step", dict(self.history[-1])["step"],
                "(lm1b file {:}, line {:})".format(current_file, current_line))
      self.checkpoint_saver.restore(sess, tf.train.latest_checkpoint(
          self.config.checkpoints_dir))
    else:
      utils.log("No previous checkpoint found - starting from scratch")
      self.history = []
      self.unlabeled_data_reader = (
          unlabeled_data.UnlabeledDataReader(config))
Example #11
  def __init__(self, config):
    self._config = config
    self.tasks = [task_definitions.get_task(self._config, task_name)
                  for task_name in self._config.task_names]

    utils.log('Loading Pretrained Embeddings')
    pretrained_embeddings = utils.load_cpickle(self._config.word_embeddings)

    utils.log('Building Model')
    self._model = multitask_model.Model(
        self._config, pretrained_embeddings, self.tasks)
    utils.log()
Example #12
def main(data_dir='/content/data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.100d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=100)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["senclass"]:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True)
    loader = sentence_level_data.SentenceClassificationDataLoader(config, task_name)
    utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
    utils.log(" ", len(loader.label_mapping), "classes")
    utils.write_cpickle(loader.label_mapping,
                        loader.label_mapping_path)
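The label mapping written here is a pickled mapping whose length is the number of classes. Reading it back presumably goes through utils.load_cpickle, the same helper the other snippets use for the progress and embedding files; a hedged round-trip sketch:

# Hypothetical read-back of the mapping written just above.
label_mapping = utils.load_cpickle(loader.label_mapping_path)
utils.log("loaded", len(label_mapping), "classes for", task_name.upper())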
Example #14
def main(data_dir='./data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.300d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the data
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log("  Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping,
                          loader.label_mapping_path)
Example #15
def main(data_dir='/content/data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.50d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=50)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the data
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log("  Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log(" ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping,
                          loader.label_mapping_path)
Example #16
  def build(self):
    utils.log('loading pretrained embeddings from',
              self.config.pretrained_embeddings_file)
    for special in SPECIAL_TOKENS:
      self._add_vector(special)
    for extra in _EXTRA_WORDS:
      self._add_vector(extra)
    with tf.gfile.GFile(
        self.config.pretrained_embeddings_file, 'r') as f:
      for i, line in enumerate(f):
        if i % 10000 == 0:
          utils.log('on line', i)

        split = line.decode('utf8').split()
        w = normalize_word(split[0])

        try:
          # Build the vector from a list so this also works under Python 3,
          # where a bare map() would hand np.array a lazy iterator.
          vec = np.array([float(x) for x in split[1:]], dtype='float32')
          if vec.size != self.vector_size:
            utils.log('vector for line', i, 'has size', vec.size, 'so skipping')
            utils.log(line[:100] + '...')
            continue
        except ValueError:
          utils.log('can\'t parse line', i, 'so skipping')
          utils.log(line[:100] + '...')
          continue
        if w not in self.vocabulary:
          self.vocabulary[w] = len(self.vectors)
          self.vectors.append(vec)
    utils.log('writing vectors!')
    self._write()
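Each line of the pretrained-embeddings file is a word followed by vector_size floats. A tiny self-contained illustration of the per-line parsing done above (the word and the numbers are made up, and the vector here has size 3):

import numpy as np

line = u'squirrel 0.12 -0.34 0.56'
split = line.split()
w = split[0]
vec = np.array([float(x) for x in split[1:]], dtype='float32')
print(w, vec.size)  # the word and its vector size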
Example #17
def main():
  utils.heading('SETUP')
  config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
  config.write()
  if config.mode == 'encode':
    word_vocab = embeddings.get_word_vocab(config)
    sentence = "Squirrels , for example , would show up , look for the peanut , go away .".split()
    sentence = ([word_vocab[embeddings.normalize_word(w)] for w in sentence])
    print(sentence)
    return
  if config.mode == 'decode':
    word_vocab_reversed = embeddings.get_word_vocab_reversed(config)
    sentence = "25709 33 42 879 33 86 304 92 33 676 42 32 13406 33 273 445 34".split()
    sentence = ([word_vocab_reversed[int(w)] for w in sentence])
    print(sentence)
    return
  if config.mode == 'encode-vi':
    word_vocab_vi = embeddings.get_word_vocab_vi(config)
    print(len(word_vocab_vi))
    sentence = "Mỗi_một khoa_học_gia đều thuộc một nhóm nghiên_cứu , và mỗi nhóm đều nghiên_cứu rất nhiều đề_tài đa_dạng .".split()
    sentence = ([word_vocab_vi[embeddings.normalize_word(w)] for w in sentence])
    print(sentence)
    return
  if config.mode == 'decode-vi':
    word_vocab_reversed_vi = embeddings.get_word_vocab_reversed_vi(config)
    sentence = "8976 32085 129 178 17 261 381 5 7 195 261 129 381 60 37 2474 1903 6".split()
    sentence = ([word_vocab_reversed_vi[int(w)] for w in sentence])
    print(sentence)
    return
  if config.mode == 'embed':
    word_embeddings = embeddings.get_word_embeddings(config)
    word = 50
    embed = word_embeddings[word]
    print(' '.join(str(x) for x in embed))
    return
  if config.mode == 'embed-vi':
    word_embeddings_vi = embeddings.get_word_embeddings_vi(config)
    word = 50
    embed = word_embeddings_vi[word]
    print(' '.join(str(x) for x in embed))
    return
  with tf.Graph().as_default() as graph:
    model_trainer = trainer.Trainer(config)
    summary_writer = tf.summary.FileWriter(config.summaries_dir)
    checkpoints_saver = tf.train.Saver(max_to_keep=1)
    best_model_saver = tf.train.Saver(max_to_keep=1)
    init_op = tf.global_variables_initializer()
    graph.finalize()
    with tf.Session() as sess:
      sess.run(init_op)
      progress = training_progress.TrainingProgress(
          config, sess, checkpoints_saver, best_model_saver,
          config.mode == 'train')
      utils.log()
      if config.mode == 'train':
        #summary_writer.add_graph(sess.graph)
        utils.heading('START TRAINING ({:})'.format(config.model_name))
        model_trainer.train(sess, progress, summary_writer)
      elif config.mode == 'eval-train':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
            config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None, train_set=True)
      elif config.mode == 'eval-dev':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
            config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None, train_set=False)
      elif config.mode == 'infer':
        utils.heading('START INFER ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
            config.checkpoints_dir))
        model_trainer.infer(sess)
      elif config.mode == 'translate':
        utils.heading('START TRANSLATE ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
          config.checkpoints_dir))
        model_trainer.translate(sess)
      elif config.mode == 'eval-translate-train':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
          config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None, train_set=True, is_translate=True)
      elif config.mode == 'eval-translate-dev':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
          config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None, train_set=False, is_translate=True)
      else:
        raise ValueError('Unknown mode: {:}'.format(config.mode))