def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--corpus-dir", default='../data', help="Location of pre-training text files.") parser.add_argument("--vocab-file", default='vocab.txt', help="Location of vocabulary file.") parser.add_argument("--output-dir", default='tfrecord_data', help="Where to write out the tfrecords.") parser.add_argument("--max-seq-length", default=512, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=1, type=int, help="Parallelize across multiple processes.") parser.add_argument("--blanks-separate-docs", default=False, type=bool, help="Whether blank lines indicate document boundaries.") parser.add_argument("--do-lower-case", dest='do_lower_case', action='store_true', help="Lower case input text.") parser.add_argument("--no-lower-case", dest='do_lower_case', action='store_false', help="Don't lower case input text.") parser.set_defaults(do_lower_case=False) args = parser.parse_args() print(args) utils.rmkdir(args.output_dir) if args.num_processes == 1: write_examples(0, args) else: jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args)) jobs.append(job) job.start() for job in jobs: job.join()
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--data-dir", required=True, help="Location of data (vocab file, corpus, etc).") parser.add_argument("--max-seq-length", default=128, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=1, type=int, help="Parallelize across multiple processes.") parser.add_argument("--do-lower-case", dest='do_lower_case', action='store_true', help="Lower case input text.") parser.add_argument("--no-lower-case", dest='do_lower_case', action='store_false', help="Don't lower case input text.") parser.set_defaults(do_lower_case=True) args = parser.parse_args() utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords")) if args.num_processes == 1: write_examples(0, args) else: jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args)) jobs.append(job) job.start() for job in jobs: job.join()
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--corpus-dir", required=True, help="Location of pre-training text files.") parser.add_argument("--output-dir", required=True, help="Where to write out the tfrecords.") args = parser.parse_args() utils.rmkdir(args.output_dir) fnames = sorted(os.listdir(args.corpus_dir)) for file_no, fname in enumerate(fnames): input_file = os.path.join(args.corpus_dir, fname) output_file = os.path.join(args.output_dir, fname) print(f'Writing {fname}...') with open(input_file, 'r') as fi: with open(output_file, 'w') as fo: for line in fi: line = line.strip().replace('\n', '') if line: line = json.loads(line) article_text = ' '.join(line['article_text']).replace( '\n', '') abstract_text = ' '.join( line['abstract_text']).replace('<S>', '').replace( '\n', '') # empty lines to split docs text = f'{abstract_text} {article_text} \n\n' fo.write(text)
def run_finetuning(config: configure_finetuning.FinetuningConfig): """Run finetuning.""" hvd.init() config.model_dir = config.model_dir if hvd.rank() == 0 else \ os.path.join(config.model_dir, str(hvd.rank())) config.train_batch_size = config.train_batch_size // hvd.size() # Setup for training results = [] trial = 1 heading_info = "model={:}, trial {:}/{:}".format( config.model_name, trial, config.num_trials) heading = lambda msg: utils.heading(msg + ": " + heading_info) heading("Config") utils.log_config(config) generic_model_dir = config.model_dir tasks = task_builder.get_tasks(config) # Train and evaluate num_trials models with different random seeds while config.num_trials < 0 or trial <= config.num_trials: config.model_dir = generic_model_dir + "_" + str(trial) if config.do_train: utils.rmkdir(config.model_dir) model_runner = ModelRunner(config, tasks, hvd) if config.do_train: heading("Start training") model_runner.train() utils.log() if config.do_eval: heading("Run dev set evaluation") results.append(model_runner.evaluate()) write_results(config, results) if config.write_test_outputs and trial <= config.n_writes_test: heading("Running on the test set and writing the predictions") for task in tasks: # Currently only writing preds for GLUE and SQuAD 2.0 is supported if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp", "sts"]: for split in task.get_test_splits(): model_runner.write_classification_outputs([task], trial, split) elif task.name == "squad": scorer = model_runner.evaluate_task(task, "test", False) scorer.write_predictions() preds = utils.load_json(config.qa_preds_file("squad")) null_odds = utils.load_json(config.qa_na_file("squad")) for q, _ in preds.items(): if null_odds[q] > config.qa_na_threshold: preds[q] = "" utils.write_json(preds, config.test_predictions( task.name, "test", trial)) else: utils.log("Skipping task", task.name, "- writing predictions is not supported for this task") if trial != config.num_trials and (not config.keep_all_models): utils.rmrf(config.model_dir) trial += 1
def train_or_eval(config: configure_pretraining.PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" # initialize horovod hvd.init() if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) config.model_dir = config.model_dir if hvd.rank() == 0 else \ os.path.join(config.model_dir, str(hvd.rank())) config.train_batch_size = config.train_batch_size // hvd.size() config.eval_batch_size = config.eval_batch_size // hvd.size() is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2 tpu_cluster_resolver = None if config.use_tpu and config.tpu_name: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( config.tpu_name, zone=config.tpu_zone, project=config.gcp_project) tpu_config = tf.estimator.tpu.TPUConfig( iterations_per_loop=config.iterations_per_loop, num_shards=(config.num_tpu_cores if config.do_train else config.num_tpu_cores), tpu_job_name=config.tpu_job_name, per_host_input_for_training=is_per_host) session_config = tf.ConfigProto() session_config.gpu_options.visible_device_list = str(hvd.local_rank()) run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, session_config=session_config, tpu_config=tpu_config) model_fn = model_fn_builder(config=config) estimator = tf.estimator.tpu.TPUEstimator( use_tpu=config.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=config.train_batch_size, eval_batch_size=config.eval_batch_size) if config.do_train: utils.heading("Running training") hooks = [hvd.BroadcastGlobalVariablesHook(0)] estimator.train(input_fn=pretrain_data.get_input_fn(config, True, hvd), max_steps=config.num_train_steps, hooks=hooks) if config.do_eval: utils.heading("Running evaluation") result = estimator.evaluate(input_fn=pretrain_data.get_input_fn( config, False, hvd), steps=config.num_eval_steps) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--corpus-path", required=True, help="Location of pre-training text files.") parser.add_argument("--corpus-dir", required=True, help="Location of pre-training text files.") parser.add_argument("--vocab-file", required=True, help="Location of vocabulary file.") parser.add_argument("--output-dir", required=True, help="Where to write out the tfrecords.") parser.add_argument("--max-seq-length", default=128, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=1, type=int, help="Parallelize across multiple processes.") parser.add_argument("--blanks-separate-docs", action='store_true', help="Whether blank lines indicate document boundaries.") parser.add_argument("--do-lower-case", action='store_true', help="Lower case input text.") parser.add_argument("--num-out-files", default=2, type=int, help="Number of .tfrecord files") args = parser.parse_args() print(args) assert args.num_processes <= args.num_out_files utils.rmkdir(args.corpus_dir) utils.rmkdir(args.output_dir) split_corpus(corpus_path=args.corpus_path, tmp_dir=args.corpus_dir, num_processes=args.num_processes) if args.num_processes == 1: write_examples(0, args) else: jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args)) jobs.append(job) job.start() for job in jobs: job.join() utils.rmrf(args.corpus_dir)
def train_or_eval(config: configure_pretraining.PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug and config.do_train: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) if config.use_tpu: is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2 tpu_cluster_resolver = None if config.use_tpu and config.tpu_name: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( config.tpu_name, zone=config.tpu_zone, project=config.gcp_project) tpu_config = tf.estimator.tpu.TPUConfig( iterations_per_loop=config.iterations_per_loop, num_shards=config.num_tpu_cores, tpu_job_name=config.tpu_job_name, per_host_input_for_training=is_per_host) run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, keep_checkpoint_max=config.keep_checkpoint_max, tpu_config=tpu_config) model_fn = model_fn_builder(config=config) estimator = tf.estimator.tpu.TPUEstimator( use_tpu=config.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=config.train_batch_size, eval_batch_size=config.eval_batch_size) else: run_config = tf.estimator.RunConfig( model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, keep_checkpoint_max=config.keep_checkpoint_max) model_fn = model_fn_builder(config=config) estimator = tf.estimator.Estimator( model_fn=tensorflow.contrib.estimator.replicate_model_fn(model_fn), config=run_config, params={"batch_size": config.train_batch_size}) if config.do_train: utils.heading("Running training") estimator.train(input_fn=pretrain_data.get_input_fn(config, True), max_steps=config.num_train_steps) if config.do_eval: utils.heading("Running evaluation") result = estimator.evaluate(input_fn=pretrain_data.get_input_fn( config, False), steps=config.num_eval_steps) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result
def run_finetuning(config: configure_finetuning.FinetuningConfig): """Run finetuning.""" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu) # Setup for training results = [] trial = 1 heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial, config.num_trials) heading = lambda msg: utils.heading(msg + ": " + heading_info) heading("Config") utils.log_config(config) generic_model_dir = config.model_dir tasks = task_builder.get_tasks(config) # Train and evaluate num_trials models with different random seeds while config.num_trials < 0 or trial <= config.num_trials: config.model_dir = generic_model_dir + "_" + str(trial) if config.do_train: utils.rmkdir(config.model_dir) model_runner = ModelRunner(config, tasks) if config.do_train: heading("Start training") model_runner.train() utils.log() if config.do_eval: heading("Run dev set evaluation") model_runner.evaluate() # results.append(model_runner.evaluate()) # write_results(config, results) # if config.write_test_outputs and trial <= config.n_writes_test: # heading("Running on the test set and writing the predictions") # for task in tasks: # # Currently only writing preds for GLUE and SQuAD 2.0 is supported # if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp","sts","conv"]: # for split in task.get_test_splits(): # model_runner.write_classification_outputs([task], trial, split) # elif task.name == "squad": # scorer = model_runner.evaluate_task(task, "test", False) # scorer.write_predictions() # preds = utils.load_json(config.qa_preds_file("squad")) # null_odds = utils.load_json(config.qa_na_file("squad")) # for q, _ in preds.items(): # if null_odds[q] > config.qa_na_threshold: # preds[q] = "" # utils.write_json(preds, config.test_predictions( # task.name, "test", trial)) # else: # utils.log("Skipping task", task.name, # "- writing predictions is not supported for this task") if trial != config.num_trials and (not config.keep_all_models): utils.rmrf(config.model_dir) trial += 1
def train_or_eval(config: configure_pretraining.PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug and config.do_train: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) # warm_start_settings = None # if config.init_checkpoint: # from tensorflow.python.estimator.estimator import WarmStartSettings # warm_start_settings = WarmStartSettings(ckpt_to_initialize_from=config.init_checkpoint, # vars_to_warm_start=['^(?!.*global_step.*)(?!.*adam.*)(?!.*Adam.*).*$']) is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2 tpu_cluster_resolver = None if config.use_tpu and config.tpu_name: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( config.tpu_name, zone=config.tpu_zone, project=config.gcp_project) tpu_config = tf.estimator.tpu.TPUConfig( iterations_per_loop=config.iterations_per_loop, num_shards=config.num_tpu_cores, # tpu_job_name=config.tpu_job_name, per_host_input_for_training=is_per_host) run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, tpu_config=tpu_config) model_fn = model_fn_builder(config=config) estimator = tf.estimator.tpu.TPUEstimator( use_tpu=config.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=config.train_batch_size, eval_batch_size=config.eval_batch_size) if config.do_train: utils.heading("Running training") estimator.train(input_fn=pretrain_data.get_input_fn(config, True), max_steps=config.num_train_steps) if config.do_eval: utils.heading("Running evaluation") result = estimator.evaluate(input_fn=pretrain_data.get_input_fn( config, False), steps=config.num_eval_steps) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result
def train_or_eval(config: configure_pretraining.PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" if config.do_train == config.do_eval: raise ValueError("Exactly one of `do_train` or `do_eval` must be True.") if config.debug and config.do_train: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2 tpu_cluster_resolver = None if config.use_tpu: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(zone=config.tpu_zone, project=config.gcp_project) print('Running on TPU ', tpu_cluster_resolver.cluster_spec().as_dict()['worker']) if tpu_cluster_resolver: tf.config.experimental_connect_to_cluster(tpu_cluster_resolver) tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver) tpu_config = tf.estimator.tpu.TPUConfig( iterations_per_loop=config.iterations_per_loop, num_shards=config.num_tpu_cores, tpu_job_name=config.tpu_job_name, per_host_input_for_training=is_per_host) run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, keep_checkpoint_max=config.keep_checkpoint_max, tpu_config=tpu_config) model_fn = model_fn_builder(config=config) estimator = tf.estimator.tpu.TPUEstimator( use_tpu=config.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=config.train_batch_size, eval_batch_size=config.eval_batch_size) if config.do_train: utils.heading("Running training") estimator.train(input_fn=pretrain_data.get_input_fn(config, True), max_steps=config.num_train_steps) if config.do_eval: utils.heading("Running evaluation") result = estimator.evaluate( input_fn=pretrain_data.get_input_fn(config, False), steps=config.num_eval_steps) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--corpus-dir", required=True, help="Location of pre-training text files.") parser.add_argument("--vocab-file", required=True, help="Location of vocabulary file.") parser.add_argument("--output-dir", required=True, help="Where to write out the tfrecords.") parser.add_argument("--max-seq-length", default=128, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=1, type=int, help="Parallelize across multiple processes.") parser.add_argument( "--blanks-separate-docs", default=True, type=bool, help="Whether blank lines indicate document boundaries.") # toggle strip-accents and set default to True which is the default behavior parser.add_argument("--do-strip-accents", dest='strip_accents', action='store_true', help="Strip accents (default).") parser.add_argument("--no-strip-accents", dest='strip_accents', action='store_false', help="Don't strip accents.") parser.set_defaults(strip_accents=True) args = parser.parse_args() utils.rmkdir(args.output_dir) if args.num_processes == 1: write_examples(0, args) else: jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args)) jobs.append(job) job.start() for job in jobs: job.join()
def write_examples(job_id, args): """A single process creating and writing out pre-processed examples.""" job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id)) owt_dir = os.path.join(args.data_dir, "openwebtext") def log(*args): msg = " ".join(map(str, args)) print("Job {}:".format(job_id), msg) log("Creating example writer") example_writer = build_pretraining_dataset.ExampleWriter( job_id=job_id, vocab_file=os.path.join(args.data_dir, "vocab.txt"), output_dir=os.path.join(args.data_dir, "pretrain_small_tfrecords"), max_seq_length=args.max_seq_length, num_jobs=args.num_processes, blanks_separate_docs=False, do_lower_case=args.do_lower_case) log("Writing tf examples") fnames = sorted(tf.io.gfile.listdir(owt_dir)) fnames = [ f for (i, f) in enumerate(fnames) if i % args.num_processes == job_id ] random.shuffle(fnames) start_time = time.time() count = 0 for file_no, fname in enumerate(fnames): if count >= MAX_DATA_ROW: break count = count + 1 if file_no > 0 and file_no % 10 == 0: elapsed = time.time() - start_time log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, " "{:} examples written".format( file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed), int((len(fnames) - file_no) / (file_no / elapsed)), example_writer.n_written)) utils.rmkdir(job_tmp_dir) with tarfile.open(os.path.join(owt_dir, fname)) as f: f.extractall(job_tmp_dir) extracted_files = tf.io.gfile.listdir(job_tmp_dir) random.shuffle(extracted_files) for txt_fname in extracted_files: example_writer.write_examples(os.path.join(job_tmp_dir, txt_fname)) example_writer.finish() log("Done!")
def objective(params):
  num_epochs = params['num_epochs']
  lr = params['lr']
  batch_size = params['batch_size']
  config.num_train_epochs = num_epochs
  config.learning_rate = lr
  config.train_batch_size = batch_size
  suffix = "{}_{:.6}_{}".format(num_epochs, lr, batch_size)
  config.model_dir = generic_model_dir + "_opt_" + suffix
  utils.rmkdir(config.model_dir)

  model_runner = ModelRunner(config, tasks)
  utils.heading("Start training " + suffix)
  model_runner.train()
  utils.log()

  utils.heading("Run dev set evaluation " + suffix)
  result = list(model_runner.evaluate().values())[0]
  return {'loss': -result['f1'], 'status': STATUS_OK}
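# A minimal sketch (not part of the original code) of how `objective` above could be
# driven with hyperopt's `fmin`. It assumes the surrounding script already defines
# `objective`, `config`, `generic_model_dir`, `tasks`, and `utils`; the search-space
# bounds below are illustrative assumptions, not values taken from the source.
import numpy as np
from hyperopt import Trials, fmin, hp, tpe

search_space = {
    'num_epochs': hp.choice('num_epochs', [2, 3, 4]),
    'lr': hp.loguniform('lr', np.log(1e-5), np.log(1e-4)),
    'batch_size': hp.choice('batch_size', [16, 32]),
}
trials = Trials()
# Minimize the negative dev-set F1 returned by `objective`.
best = fmin(fn=objective, space=search_space, algo=tpe.suggest,
            max_evals=10, trials=trials)
utils.log("Best hyperparameters:", best)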
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--data-dir", required=True, help="Location of data (vocab file, corpus, etc).") parser.add_argument("--max-seq-length", default=128, type=int, help="Number of tokens per example.") parser.add_argument("--num-processes", default=1, type=int, help="Parallelize across multiple processes.") parser.add_argument("--do-lower-case", dest='do_lower_case', action='store_true', help="Lower case input text.") parser.add_argument("--no-lower-case", dest='do_lower_case', action='store_false', help="Don't lower case input text.") parser.set_defaults(do_lower_case=True) args = parser.parse_args() utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords")) manager = multiprocessing.Manager() bilm_list = manager.list() if args.num_processes == 1: write_examples(0, args, bilm_list) else: jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args, bilm_list)) jobs.append(job) job.start() for job in jobs: job.join() bilm = None for _bilm in bilm_list: bilm = _bilm if bilm is None else np.concatenate((bilm, _bilm), 1) bilm_file = os.path.join(args.data_dir, "bilm.npy") with open(bilm_file, 'wb') as f: np.save(f, bilm)
def write_examples(job_id, args): """A single process creating and writing out pre-processed examples.""" job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id)) owt_dir = os.path.join(args.data_dir, "wiki") def log(*args): msg = " ".join(map(str, args)) print("Job {}:".format(job_id), msg) log("Creating example writer") example_writer = build_pretraining_dataset.ExampleWriter( job_id=job_id, model_file=os.path.join(args.model_dir, "wiki-ja.model"), vocab_file=os.path.join(args.model_dir, "wiki-ja.vocab"), output_dir=os.path.join(args.model_dir, "pretrain_tfrecords"), max_seq_length=args.max_seq_length, num_jobs=args.num_processes, blanks_separate_docs=False, do_lower_case=args.do_lower_case) log("Writing tf examples") fnames = tf.io.gfile.listdir(owt_dir) fnames = [f for f in fnames if '.' not in f] fnames = sorted(fnames) fnames = [ f for (i, f) in enumerate(fnames) if i % args.num_processes == job_id ] random.shuffle(fnames) for file_no, fname in enumerate(fnames): print('file number : {} of job_id: {}'.format(file_no, job_id)) utils.rmkdir(job_tmp_dir) copy_tree(os.path.join(owt_dir, fname), job_tmp_dir) list_files = tf.io.gfile.listdir(job_tmp_dir) list_files = [fi for fi in list_files if fi != 'all.txt'] for file_name in list_files: example_writer.write_examples(os.path.join(job_tmp_dir, file_name)) example_writer.finish() log("Done!")
def run_finetuning(config: configure_finetuning.FinetuningConfig): """Run finetuning.""" tf.get_variable_scope().reuse_variables() #import pdb; pdb.set_trace() # Setup for training results = [] trial = 1 heading_info = "model={:}, trial {:}/{:}".format( config.model_name, trial, config.num_trials) heading = lambda msg: utils.heading(msg + ": " + heading_info) heading("Config") utils.log_config(config) generic_model_dir = config.model_dir tasks = task_builder.get_tasks(config) # Train and evaluate num_trials models with different random seeds while config.num_trials < 0 or trial <= config.num_trials: config.model_dir = generic_model_dir + "_" + str(trial) if config.do_train: utils.rmkdir(config.model_dir) model_runner = ModelRunner(config, tasks) if config.do_train: heading("Start training") model_runner.train() utils.log() if config.do_eval: heading("Run dev set evaluation") results.append(model_runner.evaluate()) write_results(config, results) if config.write_test_outputs and trial <= config.n_writes_test: heading("Running on the test set and writing the predictions") for task in tasks: # Currently only writing preds for GLUE and SQuAD 2.0 is supported if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp", "sts"]: for split in task.get_test_splits(): model_runner.write_classification_outputs([task], trial, split) elif task.name == "squad": scorer = model_runner.evaluate_task(task, "test", False) scorer.write_predictions() preds = utils.load_json(config.qa_preds_file("squad")) null_odds = utils.load_json(config.qa_na_file("squad")) for q, _ in preds.items(): if null_odds[q] > config.qa_na_threshold: preds[q] = "" utils.write_json(preds, config.test_predictions( task.name, "test", trial)) else: utils.log("Skipping task", task.name, "- writing predictions is not supported for this task") if trial != config.num_trials and (not config.keep_all_models): utils.rmrf(config.model_dir) trial += 1 # exporting the model if config.export_dir: # with tf.variable_scope(tf.get_variable_scope(), reuse=True): # model_runner = ModelRunner(config, tasks) # tf.gfile.MakeDirs(config.export_dir) # checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315") # squad_serving_input_fn = ( # build_squad_serving_input_fn(config.max_seq_length)) # utils.log("Starting to export model.") # subfolder = model_runner._estimator.export_saved_model( # export_dir_base=os.path.join(config.export_dir, "saved_model"), # serving_input_receiver_fn=squad_serving_input_fn) tf.get_variable_scope().reuse_variables() model_runner = ModelRunner(config, tasks) tf.gfile.MakeDirs(config.export_dir) checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315") squad_serving_input_fn = ( build_squad_serving_input_fn(config.max_seq_length)) utils.log("Starting to export model.") subfolder = model_runner._estimator.export_saved_model( export_dir_base=os.path.join(config.export_dir, "saved_model"), serving_input_receiver_fn=squad_serving_input_fn)
def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--corpus_dir", required=True, help="Location of pre-training text files.") parser.add_argument("--output_dir", required=True, help="Where to write out the tfrecords.") parser.add_argument("--vocab_file", required=True, help="Location of vocabulary file.") parser.add_argument("--max_seq_length", default=512, type=int, help="Number of tokens per example.") parser.add_argument( "--num_processes", default=4, type=int, help="Parallelize across multiple processes.", ) parser.add_argument( "--tokenizer_type", default="wordpiece", type=str, help= "Select the tokenizer algorithm to use. (wordpiece/mecab_wordpiece)", ) parser.add_argument( "--blanks-separate-docs", default=False, type=bool, help="Whether blank lines indicate document boundaries.", ) parser.add_argument( "--do-lower-case", dest="do_lower_case", action="store_true", help="Lower case input text.", ) parser.add_argument( "--no-lower-case", dest="do_lower_case", action="store_false", help="Don't lower case input text.", ) parser.set_defaults(do_lower_case=False) args = parser.parse_args() print(args) utils.rmkdir(args.output_dir) if args.num_processes == 1: write_examples(0, args) else: jobs = [] for i in range(args.num_processes): job = multiprocessing.Process(target=write_examples, args=(i, args)) jobs.append(job) job.start() for job in jobs: job.join()
def run_finetuning(config: configure_finetuning.FinetuningConfig): """Run finetuning.""" # Setup for training results = [] trial = 1 heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial, config.num_trials) heading = lambda msg: utils.heading(msg + ": " + heading_info) heading("Config") utils.log_config(config) generic_model_dir = config.model_dir tasks = task_builder.get_tasks(config) # Train and evaluate num_trials models with different random seeds while config.num_trials < 0 or trial <= config.num_trials: config.model_dir = generic_model_dir + "_" + str(trial) if config.do_train: utils.rmkdir(config.model_dir) model_runner = ModelRunner(config, tasks) if config.do_train: heading("Start training") model_runner.train() utils.log() if config.do_eval: heading("Run dev set evaluation") results.append(model_runner.evaluate()) if config.do_test: for task in tasks: test_score = model_runner.evaluate_task_test( task, results[-1][task.name]['checkpoint_path']) results[-1][task.name]["test_results"] = test_score write_results(config, results) if config.write_test_outputs and trial <= config.n_writes_test: heading("Running on the test set and writing the predictions") for task in tasks: # Currently only writing preds for GLUE and SQuAD 2.0 is supported if task.name in [ "cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp", "sts" ]: for split in task.get_test_splits(): model_runner.write_classification_outputs([task], trial, split) elif task.name == "squad": scorer = model_runner.evaluate_task( task, "test", False) scorer.write_predictions() preds = utils.load_json(config.qa_preds_file("squad")) null_odds = utils.load_json(config.qa_na_file("squad")) for q, _ in preds.items(): if null_odds[q] > config.qa_na_threshold: preds[q] = "" utils.write_json( preds, config.test_predictions(task.name, "test", trial)) else: utils.log( "Skipping task", task.name, "- writing predictions is not supported for this task" ) if config.do_predict: if "dev" in config.predict_split: results = model_runner.predict(tasks[0], config.predict_checkpoint_path, "dev") import pickle with open("predict_dev.pickle", "bw") as outfile: pickle.dump(results, outfile) if "train" in config.predict_split: results = model_runner.predict(tasks[0], config.predict_checkpoint_path, "train") import pickle with open("predict_train.pickle", "bw") as outfile: pickle.dump(results, outfile) if "test" in config.predict_split: results = model_runner.predict(tasks[0], config.predict_checkpoint_path, "test") import pickle with open("predict_test.pickle", "bw") as outfile: pickle.dump(results, outfile) if trial != config.num_trials and (not config.keep_all_models): utils.rmrf(config.model_dir) trial += 1
def train_or_eval(config: configure_pretraining.PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) num_gpus = utils.get_available_gpus() utils.log("Found {} gpus".format(len(num_gpus))) if num_gpus == 1: session_config = tf.ConfigProto( log_device_placement=True, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True)) run_config = tf.estimator.RunConfig( model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, # save_checkpoints_secs=3600, # tf_random_seed=FLAGS.seed, session_config=session_config, # keep_checkpoint_max=0, log_step_count_steps=100) else: train_distribution_strategy = tf.distribute.MirroredStrategy( devices=None, cross_device_ops=tensorflow.contrib.distribute. AllReduceCrossDeviceOps('nccl', num_packs=len(num_gpus))) eval_distribution_strategy = tf.distribute.MirroredStrategy( devices=None) session_config = tf.ConfigProto( # log_device_placement=True, inter_op_parallelism_threads=0, intra_op_parallelism_threads=0, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True)) run_config = tf.estimator.RunConfig( model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, train_distribute=train_distribution_strategy, eval_distribute=eval_distribution_strategy, # save_checkpoints_secs=3600, # tf_random_seed=FLAGS.seed, session_config=session_config, # keep_checkpoint_max=0, log_step_count_steps=100) model_fn = model_fn_builder(config=config) estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config, params={ 'train_batch_size': config.train_batch_size, 'eval_batch_size': config.eval_batch_size }) if config.do_train: utils.heading("Running training") estimator.train(input_fn=pretrain_data.get_input_fn(config, True), max_steps=config.num_train_steps) if config.do_eval: utils.heading("Running evaluation") result = estimator.evaluate(input_fn=pretrain_data.get_input_fn( config, False), steps=config.num_eval_steps) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result
def train_or_eval(config: PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug and config.do_train: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) # session config session_config = tf.ConfigProto() session_config.gpu_options.allow_growth = True session_config.gpu_options.visible_device_list = str( hvd.local_rank()) # one gpu per process # session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 # xla # session_config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT # xla # run config # согласно примеру: https://gist.github.com/alsrgv/34a32f30292f4e2c1fa29ec0d65dea26 # model_dir = config.model_dir if hvd.rank() == 0 else None # UPD: если model_dir == None, то Estimator по умолчанию сохраняет чекпоинты в /tmp, что сжирает системный диск run_config = tf.estimator.RunConfig( model_dir=config.model_dir, session_config=session_config, save_checkpoints_steps=config.save_checkpoints_steps if hvd.rank() == 0 else None, save_summary_steps=100 if hvd.rank() == 0 else 0, keep_checkpoint_max=config.keep_checkpoint_max, log_step_count_steps=10000) # model_fn model_fn = model_fn_builder(config=config) # training hooks training_hooks = [] if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) # estimator estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if config.do_train: utils.heading("Running training") input_fn = pretrain_data.get_input_fn( pretrain_tfrecords=config.pretrain_tfrecords, max_seq_length=config.max_seq_length, batch_size=config.train_batch_size, is_training=True, hvd=hvd, num_cpu_threads=8) estimator.train(input_fn=input_fn, hooks=training_hooks, max_steps=config.num_train_steps) if config.do_eval: utils.heading("Running evaluation") input_fn = pretrain_data.get_input_fn( pretrain_tfrecords=config.pretrain_tfrecords, max_seq_length=config.max_seq_length, batch_size=config.eval_batch_size, is_training=False, hvd=hvd, num_cpu_threads=8) result = estimator.evaluate(input_fn=input_fn, steps=config.num_eval_steps) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result
def train_or_eval(config: configure_pretraining.PretrainingConfig): """Run pre-training or evaluate the pre-trained model.""" if config.do_train == config.do_eval: raise ValueError( "Exactly one of `do_train` or `do_eval` must be True.") if config.debug: utils.rmkdir(config.model_dir) utils.heading("Config:") utils.log_config(config) if config.use_tpu: is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2 tpu_cluster_resolver = None if config.use_tpu and config.tpu_name: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( config.tpu_name, zone=config.tpu_zone, project=config.gcp_project) tpu_config = tf.estimator.tpu.TPUConfig( iterations_per_loop=config.iterations_per_loop, num_shards=config.num_tpu_cores, tpu_job_name=config.tpu_job_name, per_host_input_for_training=is_per_host, ) run_config = tf.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, tpu_config=tpu_config, ) model_fn = model_fn_builder(config=config) estimator = tf.estimator.tpu.TPUEstimator( use_tpu=config.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=config.train_batch_size, eval_batch_size=config.eval_batch_size, ) else: config_proto = tf.ConfigProto() config_proto.gpu_options.allow_growth = True run_config = tf.estimator.RunConfig( model_dir=config.model_dir, save_checkpoints_steps=config.save_checkpoints_steps, session_config=config_proto, ) model_fn = model_fn_builder(config=config) estimator = None if config.saved_model: estimator = tf.estimator.Estimator( model_fn=model_fn, config=run_config, warm_start_from=config.saved_model) else: estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) if config.do_train: utils.heading("Running training") estimator.train( input_fn=pretrain_data.get_input_fn(config, True), max_steps=config.num_train_steps, ) if config.do_eval: utils.heading("Running evaluation") result = estimator.evaluate( input_fn=pretrain_data.get_input_fn(config, False), steps=config.num_eval_steps, ) for key in sorted(result.keys()): utils.log(" {:} = {:}".format(key, str(result[key]))) return result