def main():
  """Parse command-line flags and generate pre-training tfrecords,
  optionally fanned out across multiple worker processes."""

  def str_to_bool(value):
    # argparse's type=bool is broken: bool("False") is True because any
    # non-empty string is truthy, so "--blanks-separate-docs False" used to
    # silently mean True.  Parse the text explicitly instead.
    if isinstance(value, bool):
      return value
    if value.lower() in ("true", "t", "yes", "y", "1"):
      return True
    if value.lower() in ("false", "f", "no", "n", "0"):
      return False
    raise argparse.ArgumentTypeError("Boolean value expected, got %r" % value)

  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--corpus-dir", default='../data',
                      help="Location of pre-training text files.")
  parser.add_argument("--vocab-file", default='vocab.txt',
                      help="Location of vocabulary file.")
  parser.add_argument("--output-dir", default='tfrecord_data',
                      help="Where to write out the tfrecords.")
  parser.add_argument("--max-seq-length", default=512, type=int,
                      help="Number of tokens per example.")
  parser.add_argument("--num-processes", default=1, type=int,
                      help="Parallelize across multiple processes.")
  parser.add_argument("--blanks-separate-docs", default=False,
                      type=str_to_bool,
                      help="Whether blank lines indicate document boundaries.")
  parser.add_argument("--do-lower-case", dest='do_lower_case',
                      action='store_true', help="Lower case input text.")
  parser.add_argument("--no-lower-case", dest='do_lower_case',
                      action='store_false', help="Don't lower case input text.")
  parser.set_defaults(do_lower_case=False)
  args = parser.parse_args()

  print(args)

  # Recreate the output directory from scratch, then fan out the work.
  utils.rmkdir(args.output_dir)
  if args.num_processes == 1:
    write_examples(0, args)
  else:
    jobs = []
    for i in range(args.num_processes):
      job = multiprocessing.Process(target=write_examples, args=(i, args))
      jobs.append(job)
      job.start()
    for job in jobs:
      job.join()
def main():
  """Command-line entry point: build pre-training tfrecords under the data
  directory, optionally spread over several worker processes."""
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--data-dir", required=True,
                      help="Location of data (vocab file, corpus, etc).")
  parser.add_argument("--max-seq-length", default=128, type=int,
                      help="Number of tokens per example.")
  parser.add_argument("--num-processes", default=1, type=int,
                      help="Parallelize across multiple processes.")
  parser.add_argument("--do-lower-case", dest='do_lower_case',
                      action='store_true', help="Lower case input text.")
  parser.add_argument("--no-lower-case", dest='do_lower_case',
                      action='store_false', help="Don't lower case input text.")
  parser.set_defaults(do_lower_case=True)
  args = parser.parse_args()

  # Start from a clean output directory.
  utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords"))
  if args.num_processes == 1:
    write_examples(0, args)
    return
  workers = [
      multiprocessing.Process(target=write_examples, args=(job_id, args))
      for job_id in range(args.num_processes)
  ]
  for worker in workers:
    worker.start()
  for worker in workers:
    worker.join()
Example #3
0
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-dir",
                        required=True,
                        help="Location of pre-training text files.")
    parser.add_argument("--output-dir",
                        required=True,
                        help="Where to write out the tfrecords.")
    args = parser.parse_args()

    utils.rmkdir(args.output_dir)

    fnames = sorted(os.listdir(args.corpus_dir))

    for file_no, fname in enumerate(fnames):
        input_file = os.path.join(args.corpus_dir, fname)
        output_file = os.path.join(args.output_dir, fname)
        print(f'Writing {fname}...')
        with open(input_file, 'r') as fi:
            with open(output_file, 'w') as fo:
                for line in fi:
                    line = line.strip().replace('\n', '')
                    if line:
                        line = json.loads(line)
                        article_text = ' '.join(line['article_text']).replace(
                            '\n', '')
                        abstract_text = ' '.join(
                            line['abstract_text']).replace('<S>', '').replace(
                                '\n', '')
                        # empty lines to split docs
                        text = f'{abstract_text} {article_text} \n\n'
                        fo.write(text)
Example #4
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Run finetuning.

  Trains and (optionally) evaluates config.num_trials models with
  different random seeds, one model directory per trial, using Horovod
  for multi-worker data parallelism.
  """
  hvd.init()

  # Rank 0 keeps the original model_dir; every other rank writes under a
  # rank-suffixed subdirectory so checkpoints don't collide.  The global
  # batch size is divided evenly across workers.
  config.model_dir = config.model_dir if hvd.rank() == 0 else \
      os.path.join(config.model_dir, str(hvd.rank()))
  config.train_batch_size = config.train_batch_size // hvd.size()

  # Setup for training
  results = []
  trial = 1
  # NOTE(review): heading_info is formatted once with trial=1 and never
  # refreshed inside the loop, so later headings still report "trial 1/N".
  heading_info = "model={:}, trial {:}/{:}".format(
      config.model_name, trial, config.num_trials)
  heading = lambda msg: utils.heading(msg + ": " + heading_info)
  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)

  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    # Each trial gets its own directory: <model_dir>_1, <model_dir>_2, ...
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks, hvd)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                           "sts"]:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            # SQuAD 2.0-style unanswerable handling: blank out answers whose
            # no-answer odds exceed the configured threshold.
            for q, _ in preds.items():
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    # Reclaim disk from finished trials unless configured to keep them.
    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1
Example #5
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model.

    Exactly one of ``config.do_train`` / ``config.do_eval`` must be set.
    Uses Horovod for multi-worker data parallelism: per-rank model
    directories and per-worker batch sizes are derived from the config.

    Returns:
        The evaluation metrics dict when ``do_eval`` is set, else ``None``.

    Raises:
        ValueError: if ``do_train`` and ``do_eval`` are both set or both
            unset.
    """
    # initialize horovod
    hvd.init()
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # Rank 0 keeps the original model_dir; other ranks get a rank-suffixed
    # copy so their checkpoints/summaries don't collide.
    config.model_dir = config.model_dir if hvd.rank() == 0 else \
        os.path.join(config.model_dir, str(hvd.rank()))
    config.train_batch_size = config.train_batch_size // hvd.size()
    config.eval_batch_size = config.eval_batch_size // hvd.size()

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        # Simplified: the original `config.num_tpu_cores if config.do_train
        # else config.num_tpu_cores` had identical branches.
        num_shards=config.num_tpu_cores,
        tpu_job_name=config.tpu_job_name,
        per_host_input_for_training=is_per_host)

    # Pin each Horovod worker to its own local GPU.
    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        session_config=session_config,
        tpu_config=tpu_config)
    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        # Broadcast initial variables from rank 0 so every worker starts
        # from identical weights.
        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True, hvd),
                        max_steps=config.num_train_steps,
                        hooks=hooks)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False, hvd),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #6
0
def main():
    """Split one corpus file into shards, write pre-training tfrecords
    (optionally in parallel worker processes), then clean up the shards."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-path", required=True, help="Location of pre-training text files.")
    parser.add_argument("--corpus-dir", required=True, help="Location of pre-training text files.")
    parser.add_argument("--vocab-file", required=True, help="Location of vocabulary file.")
    parser.add_argument("--output-dir", required=True, help="Where to write out the tfrecords.")
    parser.add_argument("--max-seq-length", default=128, type=int, help="Number of tokens per example.")
    parser.add_argument("--num-processes", default=1, type=int,  help="Parallelize across multiple processes.")
    parser.add_argument("--blanks-separate-docs", action='store_true', help="Whether blank lines indicate document boundaries.")
    parser.add_argument("--do-lower-case", action='store_true', help="Lower case input text.")
    parser.add_argument("--num-out-files", default=2, type=int, help="Number of .tfrecord files")
    args = parser.parse_args()
    print(args)

    # Was a bare `assert`, which is silently stripped under `python -O`;
    # report the misconfiguration through argparse instead.
    if args.num_processes > args.num_out_files:
        parser.error("--num-processes must not exceed --num-out-files")

    # corpus-dir is used as a scratch area for the split shards; it is
    # recreated here and removed again at the end.
    utils.rmkdir(args.corpus_dir)
    utils.rmkdir(args.output_dir)

    split_corpus(corpus_path=args.corpus_path, tmp_dir=args.corpus_dir, num_processes=args.num_processes)

    if args.num_processes == 1:
        write_examples(0, args)
    else:
        jobs = []
        for i in range(args.num_processes):
            job = multiprocessing.Process(target=write_examples, args=(i, args))
            jobs.append(job)
            job.start()
        for job in jobs:
            job.join()

    # Remove the scratch shards once all workers have finished.
    utils.rmrf(args.corpus_dir)
Example #7
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model.

    Exactly one of ``config.do_train`` / ``config.do_eval`` must be set.
    Builds a TPUEstimator when ``config.use_tpu`` is set and a plain
    (replicated) Estimator otherwise.

    Returns:
        The evaluation metrics dict when ``do_eval`` is set, else ``None``.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    if config.use_tpu:
        is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        tpu_cluster_resolver = None
        if config.use_tpu and config.tpu_name:
            tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)
        tpu_config = tf.estimator.tpu.TPUConfig(
            iterations_per_loop=config.iterations_per_loop,
            num_shards=config.num_tpu_cores,
            tpu_job_name=config.tpu_job_name,
            per_host_input_for_training=is_per_host)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            keep_checkpoint_max=config.keep_checkpoint_max,
            tpu_config=tpu_config)
        model_fn = model_fn_builder(config=config)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size)
    else:
        # CPU/GPU path: replicate the model_fn across available devices.
        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            keep_checkpoint_max=config.keep_checkpoint_max)
        model_fn = model_fn_builder(config=config)
        # NOTE(review): this line uses the bare `tensorflow` name and the
        # deprecated tf.contrib API while the rest of the function uses the
        # `tf` alias -- confirm `import tensorflow` exists at module level,
        # otherwise this branch raises NameError at runtime.
        estimator = tf.estimator.Estimator(
            model_fn=tensorflow.contrib.estimator.replicate_model_fn(model_fn),
            config=run_config,
            params={"batch_size": config.train_batch_size})

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #8
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Run finetuning on a single, explicitly selected GPU.

  Trains and (optionally) evaluates `config.num_trials` models with
  different random seeds, one model directory per trial.
  """
  # Restrict TensorFlow to the single GPU requested in the config.
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
  os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu)
  # Setup for training
  trial = 1
  # NOTE(review): heading_info is formatted once with trial=1 and is not
  # refreshed inside the loop, so later headings still report "trial 1/N".
  heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial, config.num_trials)
  heading = lambda msg: utils.heading(msg + ": " + heading_info)
  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)
  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      # The upstream result-collection / test-prediction-writing code that
      # used to sit here (commented out) has been deleted; only the dev-set
      # evaluation runs.  The unused `results` local was removed with it.
      model_runner.evaluate()

    # Reclaim disk from finished trials unless configured to keep them.
    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1
Example #9
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model.

    Exactly one of ``config.do_train`` / ``config.do_eval`` must be set.

    Returns:
        The evaluation metrics dict when ``do_eval`` is set, else ``None``.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # NOTE(review): dead commented-out warm-start code below is kept as-is;
    # consider deleting it if checkpoint warm-starting is no longer planned.
    # warm_start_settings = None
    # if config.init_checkpoint:
    #     from tensorflow.python.estimator.estimator import WarmStartSettings
    #     warm_start_settings = WarmStartSettings(ckpt_to_initialize_from=config.init_checkpoint,
    #                                             vars_to_warm_start=['^(?!.*global_step.*)(?!.*adam.*)(?!.*Adam.*).*$'])

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        num_shards=config.num_tpu_cores,
        # tpu_job_name=config.tpu_job_name,
        per_host_input_for_training=is_per_host)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        tpu_config=tpu_config)
    model_fn = model_fn_builder(config=config)
    # TPUEstimator also runs on CPU/GPU when use_tpu is False.
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
Example #10
0
def train_or_eval(config: configure_pretraining.PretrainingConfig):
  """Run pre-training or evaluate the pre-trained model.

  Exactly one of `config.do_train` / `config.do_eval` must be set.  When
  TPUs are enabled, resolves and initializes the TPU system before
  building the estimator.

  Returns:
    The evaluation metrics dict when `do_eval` is set, else None.
  """
  if config.do_train == config.do_eval:
    raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
  if config.debug and config.do_train:
    utils.rmkdir(config.model_dir)
  utils.heading("Config:")
  utils.log_config(config)

  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        zone=config.tpu_zone, project=config.gcp_project)
    print('Running on TPU ', tpu_cluster_resolver.cluster_spec().as_dict()['worker'])
    # The resolver was assigned just above, so the previous
    # `if tpu_cluster_resolver:` guard was always true and has been removed
    # (the stray 8-space indentation is fixed with it).
    tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)

  tpu_config = tf.estimator.tpu.TPUConfig(
      iterations_per_loop=config.iterations_per_loop,
      num_shards=config.num_tpu_cores,
      tpu_job_name=config.tpu_job_name,
      per_host_input_for_training=is_per_host)
  run_config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.model_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      keep_checkpoint_max=config.keep_checkpoint_max,
      tpu_config=tpu_config)
  model_fn = model_fn_builder(config=config)
  estimator = tf.estimator.tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size)

  if config.do_train:
    utils.heading("Running training")
    estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                    max_steps=config.num_train_steps)
  if config.do_eval:
    utils.heading("Running evaluation")
    result = estimator.evaluate(
        input_fn=pretrain_data.get_input_fn(config, False),
        steps=config.num_eval_steps)
    for key in sorted(result.keys()):
      utils.log("  {:} = {:}".format(key, str(result[key])))
    return result
Example #11
0
def main():
    """Parse flags and build pre-training tfrecords, optionally fanned out
    across multiple worker processes."""

    def str_to_bool(value):
        # argparse's type=bool is broken: bool("False") is True because any
        # non-empty string is truthy, so "--blanks-separate-docs False" used
        # to silently mean True.  Parse the text explicitly instead.
        if isinstance(value, bool):
            return value
        if value.lower() in ("true", "t", "yes", "y", "1"):
            return True
        if value.lower() in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(
            "Boolean value expected, got %r" % value)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-dir",
                        required=True,
                        help="Location of pre-training text files.")
    parser.add_argument("--vocab-file",
                        required=True,
                        help="Location of vocabulary file.")
    parser.add_argument("--output-dir",
                        required=True,
                        help="Where to write out the tfrecords.")
    parser.add_argument("--max-seq-length",
                        default=128,
                        type=int,
                        help="Number of tokens per example.")
    parser.add_argument("--num-processes",
                        default=1,
                        type=int,
                        help="Parallelize across multiple processes.")
    parser.add_argument(
        "--blanks-separate-docs",
        default=True,
        type=str_to_bool,
        help="Whether blank lines indicate document boundaries.")

    # toggle strip-accents and set default to True which is the default behavior
    parser.add_argument("--do-strip-accents",
                        dest='strip_accents',
                        action='store_true',
                        help="Strip accents (default).")
    parser.add_argument("--no-strip-accents",
                        dest='strip_accents',
                        action='store_false',
                        help="Don't strip accents.")
    parser.set_defaults(strip_accents=True)

    args = parser.parse_args()

    # Recreate the output directory, then fan the work out over processes.
    utils.rmkdir(args.output_dir)
    if args.num_processes == 1:
        write_examples(0, args)
    else:
        jobs = []
        for i in range(args.num_processes):
            job = multiprocessing.Process(target=write_examples,
                                          args=(i, args))
            jobs.append(job)
            job.start()
        for job in jobs:
            job.join()
Example #12
0
def write_examples(job_id, args):
    """A single process creating and writing out pre-processed examples.

    Processes every (job_id + k * num_processes)-th tar archive in the
    openwebtext directory, extracting each to a per-job temp dir and
    feeding the extracted text files to an ExampleWriter.
    """
    job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
    owt_dir = os.path.join(args.data_dir, "openwebtext")

    def log(*args):
        # This *args deliberately shadows the outer `args` namespace; it
        # only formats the log message for this job.
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    log("Creating example writer")
    example_writer = build_pretraining_dataset.ExampleWriter(
        job_id=job_id,
        vocab_file=os.path.join(args.data_dir, "vocab.txt"),
        output_dir=os.path.join(args.data_dir, "pretrain_small_tfrecords"),
        max_seq_length=args.max_seq_length,
        num_jobs=args.num_processes,
        blanks_separate_docs=False,
        do_lower_case=args.do_lower_case)
    log("Writing tf examples")
    fnames = sorted(tf.io.gfile.listdir(owt_dir))
    # Deterministically shard the file list across jobs, then shuffle this
    # job's shard so examples are written in random file order.
    fnames = [
        f for (i, f) in enumerate(fnames) if i % args.num_processes == job_id
    ]
    random.shuffle(fnames)
    start_time = time.time()
    count = 0
    for file_no, fname in enumerate(fnames):
        # MAX_DATA_ROW caps how many archives this job processes --
        # presumably a module-level constant defined outside this view.
        if count >= MAX_DATA_ROW:
            break
        count = count + 1
        if file_no > 0 and file_no % 10 == 0:
            elapsed = time.time() - start_time
            log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
                "{:} examples written".format(
                    file_no, len(fnames), 100.0 * file_no / len(fnames),
                    int(elapsed),
                    int((len(fnames) - file_no) / (file_no / elapsed)),
                    example_writer.n_written))
        # Fresh temp dir per archive so stale files never leak across files.
        utils.rmkdir(job_tmp_dir)
        # NOTE(review): extractall without a `filter=` argument is unsafe on
        # untrusted tars (path traversal) -- confirm the corpus archives are
        # trusted.
        with tarfile.open(os.path.join(owt_dir, fname)) as f:
            f.extractall(job_tmp_dir)
        extracted_files = tf.io.gfile.listdir(job_tmp_dir)
        random.shuffle(extracted_files)
        for txt_fname in extracted_files:
            example_writer.write_examples(os.path.join(job_tmp_dir, txt_fname))
    example_writer.finish()
    log("Done!")
Example #13
0
 def objective(params):
     """Hyperopt-style objective: train and evaluate one hyperparameter
     configuration, returning its negated F1 as the loss to minimize.

     Args:
         params: dict with 'num_epochs', 'lr', and 'batch_size'.

     Returns:
         dict with 'loss' (negative F1) and 'status' (STATUS_OK).

     NOTE(review): relies on `config`, `generic_model_dir`, `tasks`,
     `ModelRunner`, and `STATUS_OK` from an enclosing scope not visible in
     this view.
     """
     num_epochs = params['num_epochs']
     lr = params['lr']
     batch_size = params['batch_size']
     # Mutate the shared finetuning config for this trial.
     config.num_train_epochs = num_epochs
     config.learning_rate = lr
     config.train_batch_size = batch_size
     # Unique model dir per setting, e.g. <base>_opt_3_5e-05_32.
     suffix = "{}_{:.6}_{}".format(num_epochs, lr, batch_size)
     config.model_dir = generic_model_dir + "_opt_" + suffix
     utils.rmkdir(config.model_dir)
     model_runner = ModelRunner(config, tasks)
     utils.heading("Start training " + suffix)
     model_runner.train()
     utils.log()
     utils.heading("Run dev set evaluation " + suffix)
     result = list(model_runner.evaluate().values())[0]
     # Hyperopt minimizes the loss, so negate F1 (higher F1 = lower loss).
     return {'loss': -result['f1'], 'status': STATUS_OK}
Example #14
0
def main():
    """CLI entry point: write pre-training tfrecords and save the merged
    bi-LM matrix collected from all worker processes."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data-dir", required=True,
                        help="Location of data (vocab file, corpus, etc).")
    parser.add_argument("--max-seq-length", default=128, type=int,
                        help="Number of tokens per example.")
    parser.add_argument("--num-processes", default=1, type=int,
                        help="Parallelize across multiple processes.")
    parser.add_argument("--do-lower-case", dest='do_lower_case',
                        action='store_true',
                        help="Lower case input text.")
    parser.add_argument("--no-lower-case", dest='do_lower_case',
                        action='store_false',
                        help="Don't lower case input text.")
    parser.set_defaults(do_lower_case=True)
    args = parser.parse_args()

    utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords"))
    # Workers append their per-job bi-LM arrays to this shared list.
    manager = multiprocessing.Manager()
    shared_bilms = manager.list()
    if args.num_processes == 1:
        write_examples(0, args, shared_bilms)
    else:
        workers = [
            multiprocessing.Process(target=write_examples,
                                    args=(job_id, args, shared_bilms))
            for job_id in range(args.num_processes)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    # Fold the per-job arrays into one matrix along axis 1.
    merged = None
    for part in shared_bilms:
        merged = part if merged is None else np.concatenate((merged, part), 1)
    with open(os.path.join(args.data_dir, "bilm.npy"), 'wb') as f:
        np.save(f, merged)
def write_examples(job_id, args):
    """A single process creating and writing out pre-processed examples.

    Shards the wiki directory's subdirectories across jobs and feeds each
    extracted text file (except 'all.txt') to a SentencePiece-based
    ExampleWriter.

    NOTE(review): the main() defined above calls write_examples(i, args,
    bilm_list) with three arguments, but this definition accepts only two
    -- confirm which write_examples that main() is meant to call.
    """
    job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
    owt_dir = os.path.join(args.data_dir, "wiki")

    def log(*args):
        # This *args deliberately shadows the outer `args` namespace; it
        # only formats the log message for this job.
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    log("Creating example writer")
    # NOTE(review): the writer reads from args.model_dir while the temp dir
    # uses args.data_dir -- verify both attributes exist on the parsed args.
    example_writer = build_pretraining_dataset.ExampleWriter(
        job_id=job_id,
        model_file=os.path.join(args.model_dir, "wiki-ja.model"),
        vocab_file=os.path.join(args.model_dir, "wiki-ja.vocab"),
        output_dir=os.path.join(args.model_dir, "pretrain_tfrecords"),
        max_seq_length=args.max_seq_length,
        num_jobs=args.num_processes,
        blanks_separate_docs=False,
        do_lower_case=args.do_lower_case)
    log("Writing tf examples")
    fnames = tf.io.gfile.listdir(owt_dir)
    # Keep only extension-less entries (subdirectories of the wiki dump).
    fnames = [f for f in fnames if '.' not in f]
    fnames = sorted(fnames)

    # Deterministically shard the directory list across jobs, then shuffle
    # this job's shard so files are processed in random order.
    fnames = [
        f for (i, f) in enumerate(fnames) if i % args.num_processes == job_id
    ]
    random.shuffle(fnames)
    for file_no, fname in enumerate(fnames):
        print('file number : {} of job_id: {}'.format(file_no, job_id))
        # Fresh temp dir per source directory.
        utils.rmkdir(job_tmp_dir)
        copy_tree(os.path.join(owt_dir, fname), job_tmp_dir)
        list_files = tf.io.gfile.listdir(job_tmp_dir)
        # 'all.txt' is presumably a concatenation of the other files and is
        # skipped to avoid writing duplicates -- TODO confirm.
        list_files = [fi for fi in list_files if fi != 'all.txt']
        for file_name in list_files:
            example_writer.write_examples(os.path.join(job_tmp_dir, file_name))
    example_writer.finish()
    log("Done!")
Example #16
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Run finetuning and optionally export a SavedModel.

  Trains/evaluates `config.num_trials` models with different random seeds
  (one model directory per trial), then, when `config.export_dir` is set,
  exports the SQuAD serving graph as a SavedModel.
  """
  # Reuse variables so repeated graph construction (trials + export) does
  # not fail with "variable already exists".
  tf.get_variable_scope().reuse_variables()

  # Setup for training
  results = []
  trial = 1
  # NOTE(review): heading_info is formatted once with trial=1 and never
  # refreshed inside the loop, so later headings still report "trial 1/N".
  heading_info = "model={:}, trial {:}/{:}".format(
      config.model_name, trial, config.num_trials)
  heading = lambda msg: utils.heading(msg + ": " + heading_info)
  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)
  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                           "sts"]:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            # SQuAD 2.0-style unanswerable handling: blank out answers whose
            # no-answer odds exceed the configured threshold.
            for q, _ in preds.items():
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    # Reclaim disk from finished trials unless configured to keep them.
    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1

  # exporting the model (the commented-out duplicate of this code that used
  # to sit here has been deleted)
  if config.export_dir:
    tf.get_variable_scope().reuse_variables()
    model_runner = ModelRunner(config, tasks)
    tf.gfile.MakeDirs(config.export_dir)
    squad_serving_input_fn = (
        build_squad_serving_input_fn(config.max_seq_length))
    utils.log("Starting to export model.")
    # The returned export subfolder and the old checkpoint_path computation
    # were unused, so both assignments were dropped.
    model_runner._estimator.export_saved_model(
        export_dir_base=os.path.join(config.export_dir, "saved_model"),
        serving_input_receiver_fn=squad_serving_input_fn)
def main():
    """Parse command-line flags and write pre-training tfrecords.

    Shards the corpus across `--num_processes` worker processes; each worker
    calls `write_examples(worker_index, args)`.
    """

    def _str_to_bool(value):
        # argparse's `type=bool` is a known pitfall: bool("False") is True,
        # because any non-empty string is truthy. Parse the text explicitly
        # so `--blanks-separate-docs False` actually yields False.
        if isinstance(value, bool):
            return value
        if value.lower() in ("true", "t", "yes", "y", "1"):
            return True
        if value.lower() in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(
            "Expected a boolean value, got: {!r}".format(value))

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus_dir",
                        required=True,
                        help="Location of pre-training text files.")
    parser.add_argument("--output_dir",
                        required=True,
                        help="Where to write out the tfrecords.")
    parser.add_argument("--vocab_file",
                        required=True,
                        help="Location of vocabulary file.")
    parser.add_argument("--max_seq_length",
                        default=512,
                        type=int,
                        help="Number of tokens per example.")
    parser.add_argument(
        "--num_processes",
        default=4,
        type=int,
        help="Parallelize across multiple processes.",
    )
    parser.add_argument(
        "--tokenizer_type",
        default="wordpiece",
        type=str,
        help=
        "Select the tokenizer algorithm to use. (wordpiece/mecab_wordpiece)",
    )
    parser.add_argument(
        "--blanks-separate-docs",
        default=False,
        type=_str_to_bool,
        help="Whether blank lines indicate document boundaries.",
    )
    parser.add_argument(
        "--do-lower-case",
        dest="do_lower_case",
        action="store_true",
        help="Lower case input text.",
    )
    parser.add_argument(
        "--no-lower-case",
        dest="do_lower_case",
        action="store_false",
        help="Don't lower case input text.",
    )
    parser.set_defaults(do_lower_case=False)
    args = parser.parse_args()

    print(args)

    # Start from a clean output directory so stale tfrecords never mix in.
    utils.rmkdir(args.output_dir)
    if args.num_processes == 1:
        write_examples(0, args)
    else:
        # One worker process per shard; join them all before returning.
        jobs = []
        for i in range(args.num_processes):
            job = multiprocessing.Process(target=write_examples,
                                          args=(i, args))
            jobs.append(job)
            job.start()
        for job in jobs:
            job.join()
# Example #18 (score: 0)
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning.

    Trains and evaluates `config.num_trials` models (one per random seed),
    optionally writes test-set predictions, and can dump raw model
    predictions for the dev/train/test splits to pickle files.
    """

    # Setup for training
    results = []
    trial = 1
    # NOTE(review): heading_info is formatted once here, so every heading
    # reports "trial 1/..." even on later trials — confirm whether intended.
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        # Each trial gets its own model directory, e.g. "<model_dir>_1".
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                # Score each task on the test split using the checkpoint
                # selected by the dev-set evaluation above.
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        # Blank out answers whose no-answer odds exceed the
                        # threshold (SQuAD 2.0 "unanswerable" handling).
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )
        if config.do_predict:
            # NOTE(review): `results` (the eval-results list above) is
            # overwritten with raw predictions below — confirm this
            # shadowing is intentional.
            if "dev" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "dev")
                import pickle
                with open("predict_dev.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

            if "train" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "train")
                import pickle
                with open("predict_train.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

            if "test" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "test")
                import pickle
                with open("predict_test.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

        # Remove this trial's model dir unless all models are being kept or
        # this is the final trial.
        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
# Example #19 (score: 0)
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model.

    Builds a single-GPU Estimator, or a MirroredStrategy multi-GPU
    Estimator when more than one GPU is visible, then trains or evaluates.
    Returns the eval metrics dict when `do_eval` is set.

    Raises:
        ValueError: if not exactly one of do_train/do_eval is set.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # BUG FIX: get_available_gpus() returns a list of device names (its len
    # was taken below). The old code compared that list to the int 1, which
    # is always False, so the single-GPU branch was unreachable. Count the
    # devices instead.
    gpus = utils.get_available_gpus()
    num_gpus = len(gpus)
    utils.log("Found {} gpus".format(num_gpus))

    if num_gpus == 1:
        session_config = tf.ConfigProto(
            log_device_placement=True,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            # save_checkpoints_secs=3600,
            # tf_random_seed=FLAGS.seed,
            session_config=session_config,
            # keep_checkpoint_max=0,
            log_step_count_steps=100)
    else:
        # NOTE(review): this references the bare `tensorflow` name (and
        # tf.contrib, i.e. TF 1.x) — confirm the file imports `tensorflow`
        # in addition to `tf`.
        train_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None,
            cross_device_ops=tensorflow.contrib.distribute.
            AllReduceCrossDeviceOps('nccl', num_packs=num_gpus))
        eval_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None)

        session_config = tf.ConfigProto(
            # log_device_placement=True,
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=0,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            train_distribute=train_distribution_strategy,
            eval_distribute=eval_distribution_strategy,
            # save_checkpoints_secs=3600,
            # tf_random_seed=FLAGS.seed,
            session_config=session_config,
            # keep_checkpoint_max=0,
            log_step_count_steps=100)

    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={
                                           'train_batch_size':
                                           config.train_batch_size,
                                           'eval_batch_size':
                                           config.eval_batch_size
                                       })

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
# Example #20 (score: 0)
def train_or_eval(config: PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model.

    Horovod variant: one process per GPU; only the chief (rank 0) writes
    checkpoints and summaries. Returns the eval metrics dict when
    `do_eval` is set.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    # Pin each Horovod worker to its local GPU and let memory grow on demand.
    sess_cfg = tf.ConfigProto()
    sess_cfg.gpu_options.allow_growth = True
    sess_cfg.gpu_options.visible_device_list = str(hvd.local_rank())

    # model_dir is set for every rank on purpose: with model_dir=None the
    # Estimator falls back to /tmp checkpoints and fills the system disk.
    is_chief = hvd.rank() == 0
    run_cfg = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        session_config=sess_cfg,
        save_checkpoints_steps=(config.save_checkpoints_steps
                                if is_chief else None),
        save_summary_steps=100 if is_chief else 0,
        keep_checkpoint_max=config.keep_checkpoint_max,
        log_step_count_steps=10000)

    model_fn = model_fn_builder(config=config)

    hooks = []
    if hvd.size() > 1:
        # Start all workers from identical variable values.
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_cfg)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(
            input_fn=pretrain_data.get_input_fn(
                pretrain_tfrecords=config.pretrain_tfrecords,
                max_seq_length=config.max_seq_length,
                batch_size=config.train_batch_size,
                is_training=True,
                hvd=hvd,
                num_cpu_threads=8),
            hooks=hooks,
            max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        metrics = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(
                pretrain_tfrecords=config.pretrain_tfrecords,
                max_seq_length=config.max_seq_length,
                batch_size=config.eval_batch_size,
                is_training=False,
                hvd=hvd,
                num_cpu_threads=8),
            steps=config.num_eval_steps)
        for key in sorted(metrics.keys()):
            utils.log("  {:} = {:}".format(key, str(metrics[key])))
        return metrics
# Example #21 (score: 0)
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model.

    Builds a TPUEstimator when `config.use_tpu` is set, otherwise a plain
    Estimator (optionally warm-started from `config.saved_model`), then
    trains or evaluates. Returns the eval metrics dict when `do_eval` is
    set.

    Raises:
        ValueError: if not exactly one of do_train/do_eval is set.
    """
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    if config.use_tpu:
        is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        # A cluster resolver is only needed when connecting by TPU name.
        # (The redundant `config.use_tpu and` check was dropped: we are
        # already inside the `use_tpu` branch.)
        tpu_cluster_resolver = None
        if config.tpu_name:
            tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name,
                zone=config.tpu_zone,
                project=config.gcp_project)

        tpu_config = tf.estimator.tpu.TPUConfig(
            iterations_per_loop=config.iterations_per_loop,
            num_shards=config.num_tpu_cores,
            tpu_job_name=config.tpu_job_name,
            per_host_input_for_training=is_per_host,
        )
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            tpu_config=tpu_config,
        )
        model_fn = model_fn_builder(config=config)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
        )
    else:
        config_proto = tf.ConfigProto()
        config_proto.gpu_options.allow_growth = True

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            session_config=config_proto,
        )
        model_fn = model_fn_builder(config=config)

        # `warm_start_from=None` means "no warm start", so a falsy
        # `saved_model` reduces to the plain Estimator construction and the
        # old `estimator = None` + if/else dance is unnecessary.
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config,
            warm_start_from=config.saved_model or None)

    if config.do_train:
        utils.heading("Running training")
        estimator.train(
            input_fn=pretrain_data.get_input_fn(config, True),
            max_steps=config.num_train_steps,
        )
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(config, False),
            steps=config.num_eval_steps,
        )
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result