def generate_vectors(json_input_filename, w2v_dim, perplexity, theta, pca_dims, dim=2):
    """Combine each image's per-label word2vec vectors (weighted by label score)
    into a single vector, then embed all images in `dim` dimensions with
    Barnes-Hut t-SNE."""
    vectors = []
    most_dominant_labels = []
    image_ids = []
    label_map = utils.load_json(json_input_filename, w2v_dim)
    for image_id, label in label_map.iteritems():
        label_vectors = []
        label_scores = []
        label_desc = []
        for val in label:
            label_vectors.append(val['word2vec'])
            label_scores.append(val['score'])
            label_desc.append(str(''.join(
                c for c in val['description'] if c in string.printable)))
        output_vec = word2vec.linear_combination_vectors(
            vectors=label_vectors, coefficients=label_scores)
        vectors.append(output_vec)
        most_dominant_labels.append(label_desc[0])
        image_ids.append(image_id)
    embeddings = []
    for result in bh_tsne(vectors, perplexity=perplexity, initial_dims=pca_dims,
                          theta=theta, no_dims=dim):
        embeddings.append(result)
    embeddings = utils.scale_max_abs(embeddings)
    return embeddings, most_dominant_labels, image_ids
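# Sketch of a call to generate_vectors (argument values are illustrative, not from the
# source; utils.scale_max_abs is assumed to rescale by the maximum absolute value so the
# coordinates land roughly in [-1, 1]):
#
#   embeddings, labels, image_ids = generate_vectors(
#       "image_labels.json", w2v_dim=300, perplexity=30.0, theta=0.5, pca_dims=50)
#
# The three returned lists are aligned index-wise: embeddings[i] is the 2-D t-SNE
# coordinate for image_ids[i], and labels[i] is its most dominant label description.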
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data-dir", required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model-name", required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--hparams", default="{}",
                        help="JSON dict of model hyperparameters.")
    args = parser.parse_args()
    if args.hparams.endswith(".json"):
        hparams = utils.load_json(args.hparams)
    else:
        hparams = json.loads(args.hparams)
    tf.logging.set_verbosity(tf.logging.ERROR)
    run_finetuning(configure_finetuning.FinetuningConfig(
        args.model_name, args.data_dir, **hparams))
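# Illustrative invocation only (the script name and hyperparameter values below are
# assumptions for the example, not taken from the source):
#
#   python run_finetuning.py \
#       --data-dir /path/to/electra_data \
#       --model-name electra_small \
#       --hparams '{"task_names": ["mnli"], "num_train_epochs": 3}'
#
# Passing a path ending in .json instead of an inline dict makes main() load the
# hyperparameters from that file via utils.load_json.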
def _serialize_dataset(self, tasks, is_training, split):
    """Write out the dataset as tfrecords."""
    dataset_name = "_".join(sorted([task.name for task in tasks]))
    dataset_name += "_" + split
    dataset_prefix = os.path.join(self._config.preprocessed_data_dir, dataset_name)
    tfrecords_path = dataset_prefix + ".tfrecord"
    metadata_path = dataset_prefix + ".metadata"
    batch_size = (self._config.train_batch_size if is_training
                  else self._config.eval_batch_size)

    utils.log("Loading dataset", dataset_name)
    n_examples = None
    if tf.io.gfile.exists(metadata_path):
        n_examples = utils.load_json(metadata_path)["n_examples"]

    if n_examples is None:
        utils.log("Existing tfrecords not found, so creating them")
        examples = []
        for task in tasks:
            task_examples = task.get_examples(
                data_dir=self._config.data_dir,
                corpus=self._config.corpus,
                split=split)
            examples += task_examples
        if is_training:
            random.shuffle(examples)
        utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
        n_examples = self.serialize_examples(
            examples, is_training, tfrecords_path, batch_size)
        utils.write_json({"n_examples": n_examples}, metadata_path)

    input_fn = self._input_fn_builder(tfrecords_path, is_training)
    if is_training:
        steps = int(n_examples // batch_size * self._config.num_train_epochs)
    else:
        steps = n_examples // batch_size
    return input_fn, steps
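# Worked example of the steps arithmetic above (numbers are illustrative):
# with n_examples = 10000, train_batch_size = 32 and num_train_epochs = 3.0,
# training returns int(10000 // 32 * 3.0) = int(312 * 3.0) = 936 steps,
# while evaluation on the same dataset returns 10000 // 32 = 312 steps.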
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data-dir", required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model-name", required=True,
                        help="The name of the model being trained.")
    parser.add_argument("--hparams", default="{}",
                        help="JSON dict of model hyperparameters.")
    args = parser.parse_args()
    if args.hparams.endswith(".json"):
        hparams = utils.load_json(args.hparams)
    else:
        hparams = json.loads(args.hparams)
    tf.logging.set_verbosity(tf.logging.ERROR)
    train_or_eval(configure_pretraining.PretrainingConfig(
        args.model_name, args.data_dir, **hparams))
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data_dir", required=True, type=str,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model_name", required=True, type=str,
                        help="The name of the model being trained.")
    parser.add_argument("--hparams", default="{}", type=str,
                        help="JSON dict of model hyperparameters.")
    parser.add_argument("--use_tpu", action="store_true",
                        help="Whether to train on a TPU.")
    parser.add_argument("--mixed_precision", action="store_true",
                        help="Whether to use mixed precision.")
    args = parser.parse_args()
    if args.mixed_precision:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    if args.hparams.endswith(".json"):
        hparams = utils.load_json(args.hparams)
    else:
        hparams = json.loads(args.hparams)
    tf.logging.set_verbosity(tf.logging.ERROR)
    train_or_eval(configure_pretraining.PretrainingConfig(
        args.model_name, args.data_dir, args.use_tpu, args.mixed_precision,
        **hparams))
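# Illustrative invocation only (the script name and flag values are assumptions):
#
#   python run_pretraining.py --data_dir /path/to/pretraining_data \
#       --model_name electra_base --mixed_precision \
#       --hparams '{"train_batch_size": 256, "max_seq_length": 128}'
#
# TF_ENABLE_AUTO_MIXED_PRECISION=1 enables TensorFlow's automatic mixed-precision
# graph rewrite (TF 1.14+); main() exports it before any session is created so the
# rewrite can take effect.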
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""
    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(
        config.model_name, trial, config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        # Create a Neptune experiment tracking the full config for this trial
        neptune.create_experiment(name='tf-ft', params=vars(config))
        config.model_dir = generic_model_dir + "_" + str(trial) + '_' + str(
            random.randint(0, 10000))
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            eval_result = model_runner.evaluate()
            results.append(eval_result)
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli",
                                     "qqp", "sts"]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds, config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log("Skipping task", task.name,
                                  "- writing predictions is not supported for this task")

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
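# Hedged illustration of the SQuAD 2.0 null-odds thresholding used above (the data
# and threshold are made up for the example):
#
#   preds     = {"q1": "Nikola Tesla", "q2": "1943"}
#   null_odds = {"q1": -4.2, "q2": 7.9}
#
# With config.qa_na_threshold = 0.0, q2's null odds exceed the threshold, so its
# prediction is blanked to "" (the model abstains, i.e. predicts "no answer"),
# while q1 keeps its text answer before the predictions are written out.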
def process_single(in_file, out_file):
    # Load the detected faces and embeddings and run the classifier
    result = [(face_id, predict_gender(embed), predict_gender_score(embed))
              for face_id, embed in load_json(in_file)]
    save_json(result, out_file)
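# Minimal usage sketch (the file names are illustrative; predict_gender,
# predict_gender_score, load_json and save_json are assumed to be defined elsewhere
# in this module):
#
#   process_single("detected_faces.json", "gender_predictions.json")
#
# The input JSON is expected to be an iterable of (face_id, embedding) pairs, matching
# the unpacking in the list comprehension above.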
full_models.append(model_name)  # (fragment: the surrounding loop that populates full_models is not shown)
models = full_models

models_predictions = collections.OrderedDict()
for d in models:
    dire = os.path.join(data_dir, d)
    try:
        prediction = collections.OrderedDict()
        prediction['eval_all_nbest'] = filter_short_ans(utils.load_pickle(
            os.path.join(dire, 'models', 'electra_large', 'results',
                         '{}_qa'.format(task_name),
                         '{}_{}_all_nbest.pkl'.format(task_name, split))))
        prediction['squad_null_odds'] = utils.load_json(
            os.path.join(dire, 'models', 'electra_large', 'results',
                         '{}_qa'.format(task_name),
                         '{}_{}_null_odds.json'.format(task_name, split)))
        models_predictions[d] = prediction
    except Exception:
        utils.log("Error loading all_nbest.pkl & null_odds.json for model {}".format(d))
        continue

dataset = utils.load_json(os.path.join(
    data_dir, model_name_part, 'finetuning_data', task_name,
    '{}.json'.format(split)))['data']
qid_answers = collections.OrderedDict()
for article in dataset:
    for p in article['paragraphs']:
        for qa in p['qas']:  # (fragment continues: the loop body is not shown)
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""
    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(
        config.model_name, trial, config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli",
                                     "qqp", "sts"]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds, config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log("Skipping task", task.name,
                                  "- writing predictions is not supported for this task")

        if config.do_predict:
            import pickle
            # Dump raw predictions for each requested split; use a separate name so the
            # `results` list of eval metrics is not clobbered.
            for predict_split in ("dev", "train", "test"):
                if predict_split in config.predict_split:
                    predictions = model_runner.predict(
                        tasks[0], config.predict_checkpoint_path, predict_split)
                    with open("predict_{}.pickle".format(predict_split), "bw") as outfile:
                        pickle.dump(predictions, outfile)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
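# To read one of the dumped prediction files back later (sketch; the file name matches
# the pickle.dump calls above):
#
#   import pickle
#   with open("predict_dev.pickle", "rb") as f:
#       dev_predictions = pickle.load(f)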
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""
    tf.get_variable_scope().reuse_variables()

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(
        config.model_name, trial, config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli",
                                     "qqp", "sts"]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds, config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log("Skipping task", task.name,
                                  "- writing predictions is not supported for this task")

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1

    # Export the fine-tuned model as a SavedModel for serving
    if config.export_dir:
        tf.get_variable_scope().reuse_variables()
        model_runner = ModelRunner(config, tasks)
        tf.gfile.MakeDirs(config.export_dir)
        checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315")
        squad_serving_input_fn = build_squad_serving_input_fn(config.max_seq_length)
        utils.log("Starting to export model.")
        subfolder = model_runner._estimator.export_saved_model(
            export_dir_base=os.path.join(config.export_dir, "saved_model"),
            serving_input_receiver_fn=squad_serving_input_fn)
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""
    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(
        config.model_name, trial, config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading_info = "model={:}, trial {:}/{:}".format(
                config.model_name, trial, config.num_trials)
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            if config.write_eval_outputs and trial <= config.n_writes_test:
                heading("Running on the dev set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli",
                                     "qqp", "sts"]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "dev", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file(task.name + "_dev"))
                        null_odds = utils.load_json(config.qa_na_file(task.name + "_dev"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds, config.test_predictions(task.name, "dev", trial))
                    elif task.name == "cmrc2018" or task.name == "drcd":
                        scorer = model_runner.evaluate_task(task, "dev", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file(task.name + "_dev"))
                        if config.num_trials > 1:
                            utils.write_json(
                                preds, config.qa_preds_file(task.name + "_dev_" + str(trial)))
                    else:
                        utils.log("Skipping task", task.name,
                                  "- writing predictions is not supported for this task")
            else:
                heading("Run dev set evaluation")
                results.append(model_runner.evaluate(split="dev"))
                write_results(config, results)

        if config.do_test:
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli",
                                     "qqp", "sts"]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "eval", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file(task.name + "_eval"))
                        null_odds = utils.load_json(config.qa_na_file(task.name + "_eval"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds, config.test_predictions(task.name, "eval", trial))
                    elif task.name == "cmrc2018" or task.name == "drcd":
                        scorer = model_runner.evaluate_task(task, "eval", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file(task.name + "_eval"))
                        if config.num_trials > 1:
                            utils.write_json(
                                preds, config.qa_preds_file(task.name + "_eval_" + str(trial)))
                    elif task.name in ["ccks42ee", "ccks42single", "ccks42multi"]:
                        scorer = model_runner.evaluate_task(task, "eval", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file(task.name + "_eval"))
                        null_odds = utils.load_json(config.qa_na_file(task.name + "_eval"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds, config.qa_preds_file(task.name + "_eval_" + str(trial)))
                    elif task.name in ["ccks42ec", "ner", "ccks42num", "ccks42reg"]:
                        scorer = model_runner.evaluate_task(task, "eval", False)
                        scorer.write_predictions()
                    else:
                        utils.log("Skipping task", task.name,
                                  "- writing predictions is not supported for this task")
            else:
                heading("Run test set evaluation")
                results.append(model_runner.evaluate(split="eval"))
                write_results(config, results)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1