def serialize_examples(self, examples, is_training, output_file, batch_size):
    """Convert a set of `InputExample`s to a TFRecord file."""
    n_examples = 0
    with tf.io.TFRecordWriter(output_file) as writer:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 2000 == 0:
                utils.log("Writing example {:} of {:}".format(
                    ex_index, len(examples)))
            for tf_example in self._example_to_tf_example(
                    example, is_training,
                    log=self._config.log_examples and ex_index < 1):
                writer.write(tf_example.SerializeToString())
                n_examples += 1
        # add padding so the dataset is a multiple of batch_size
        while n_examples % batch_size != 0:
            writer.write(
                self._make_tf_example(task_id=len(
                    self._config.task_names)).SerializeToString())
            n_examples += 1
    return n_examples

def _serialize_dataset(self, tasks, is_training, split):
    """Write out the dataset as tfrecords."""
    dataset_name = "_".join(sorted([task.name for task in tasks]))
    dataset_name += "_" + split
    dataset_prefix = os.path.join(self._config.preprocessed_data_dir,
                                  dataset_name)
    print("dataset_prefix: ", dataset_prefix)
    tfrecords_path = dataset_prefix + ".tfrecord"
    metadata_path = dataset_prefix + ".metadata"
    batch_size = (self._config.train_batch_size
                  if is_training else self._config.eval_batch_size)

    utils.log("Loading dataset", dataset_name)
    n_examples = None
    if (self._config.use_tfrecords_if_existing
            and tf.io.gfile.exists(metadata_path)):
        n_examples = utils.load_json(metadata_path)["n_examples"]

    if n_examples is None:
        utils.log("Existing tfrecords not found so creating")
        examples = []
        for task in tasks:
            task_examples = task.get_examples(split)
            examples += task_examples
        if is_training:
            random.shuffle(examples)
        utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
        n_examples = self.serialize_examples(examples, is_training,
                                             tfrecords_path, batch_size)
        utils.write_json({"n_examples": n_examples}, metadata_path)

    input_fn = self._input_fn_builder(tfrecords_path, is_training)
    if is_training:
        steps = int(n_examples // batch_size * self._config.num_train_epochs)
    else:
        steps = n_examples // batch_size

    return input_fn, steps

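# Illustrative sketch (not part of the original code): because serialize_examples
# pads the TFRecord file up to a multiple of the batch size, the integer division
# used for `steps` above covers every real example in full batches.
# Hypothetical numbers for illustration only.
real_examples = 1042
batch_size = 32
padded = real_examples + (-real_examples) % batch_size  # 1056 records written, 14 of them padding
eval_steps = padded // batch_size                        # 33 full batches, no examples dropped
assert padded % batch_size == 0
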
def write_classification_outputs(self, tasks, trial, split):
    """Write classification predictions to disk."""
    utils.log("Writing out predictions for", tasks, split)
    predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
    results = self._estimator.predict(input_fn=predict_input_fn,
                                      yield_single_examples=True)
    # task name -> eid -> model-logits
    logits = collections.defaultdict(dict)
    for r in results:
        if r["task_id"] != len(self._tasks):
            r = utils.nest_dict(r, self._config.task_names)
            task_name = self._config.task_names[r["task_id"]]
            logits[task_name][r[task_name]["eid"]] = (
                r[task_name]["logits"]
                if "logits" in r[task_name] else r[task_name]["predictions"])
    for task_name in logits:
        utils.log("Pickling predictions for {:} {:} examples ({:})".format(
            len(logits[task_name]), task_name, split))
        if trial <= self._config.n_writes_test:
            utils.write_pickle(
                logits[task_name],
                self._config.test_predictions(task_name, split, trial))

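# Rough sketch (not the library's actual implementation) of what `utils.nest_dict`
# is doing above: the estimator returns a flat dict with task-name-prefixed keys,
# and the loop regroups them per task. Key names below are illustrative.
def nest_dict_sketch(flat, task_names):
    """Group 'taskname_key' entries under nested[taskname][key]; leave other keys top-level."""
    nested = {}
    for key, value in flat.items():
        task = next((t for t in task_names if key.startswith(t + "_")), None)
        if task is None:
            nested[key] = value
        else:
            nested.setdefault(task, {})[key[len(task) + 1:]] = value
    return nested

# nest_dict_sketch({"task_id": 0, "mnli_eid": 7, "mnli_logits": [0.1, 0.9]}, ["mnli"])
# -> {"task_id": 0, "mnli": {"eid": 7, "logits": [0.1, 0.9]}}
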
def get_examples(self, split):
    if split in self._examples:
        utils.log("N EXAMPLES", split, len(self._examples[split]))
        return self._examples[split]

    examples = []
    example_failures = [0]
    with tf.io.gfile.GFile(
            os.path.join(self.config.raw_data_dir(self.name),
                         split + ".jsonl"), "r") as f:
        for i, line in enumerate(f):
            if self.config.debug and i > 10:
                break
            paragraph = json.loads(line.strip())
            if "header" in paragraph:
                continue
            self._add_examples(examples, example_failures, paragraph, split)
    self._examples[split] = examples
    utils.log("{:} examples created, {:} failures".format(
        len(examples), example_failures[0]))
    return examples

def run_db_container(container_name: str, network: str, port: str) -> None:
    """Runs the database Docker container.

    Args:
        container_name (str): Name to use for the database container.
        network (str): Name of a Docker network to plug the database into.
        port (str): Host port on which the database will listen.
    """
    docker_image = CONFIG['DATABASE']['docker_image']
    if docker_utils.item_exists('container', container_name):
        utils.log(
            f"Container '{container_name}' already exists, not running '{docker_image}' image"
        )
        return
    if not docker_utils.item_exists('network', network):
        utils.raise_error(f"Docker network '{network}' doesn't exist")

    utils.log(f"Running '{docker_image}' container, name: {container_name}")
    utils.execute_cmd([
        'docker', 'run',
        '--detach',
        '--name', container_name,
        '--publish', f"{port}:5432",  # <host_port>:<container_port>
        '--network', network,
        '--env', f"POSTGRES_DB={os.environ.get('POSTGRES_DB')}",
        '--env', f"POSTGRES_USER={os.environ.get('POSTGRES_USER')}",
        '--env', f"POSTGRES_PASSWORD={os.environ.get('POSTGRES_PASSWORD')}",
        docker_image,
    ])
    time.sleep(3)  # Wait for the database to come up

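# Usage sketch (hypothetical names/port): assumes CONFIG['DATABASE']['docker_image']
# points at a Postgres image and that the Docker network already exists.
run_db_container(container_name='test-db', network='ci-network', port='15432')
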
def _load_glue(self,
               lines,
               split,
               text_a_loc,
               text_b_loc,
               label_loc,
               skip_first_line=False,
               eid_offset=0,
               swap=False):
    examples = []
    for (i, line) in enumerate(lines):
        try:
            if i == 0 and skip_first_line:
                continue
            eid = i - (1 if skip_first_line else 0) + eid_offset
            text_a = tokenization.convert_to_unicode(line[text_a_loc])
            if text_b_loc is None:
                text_b = None
            else:
                text_b = tokenization.convert_to_unicode(line[text_b_loc])
            if "test" in split or "diagnostic" in split:
                label = self._get_dummy_label()
            else:
                label = tokenization.convert_to_unicode(line[label_loc])
            if swap:
                text_a, text_b = text_b, text_a
            examples.append(
                InputExample(eid=eid,
                             task_name=self.name,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        except Exception as ex:
            utils.log("Error constructing example from line", i, "for task",
                      self.name + ":", ex)
            utils.log("Input causing the error:", line)
    return examples

def evaluate_task(self, task, split="dev", return_results=True):
    """Evaluate the current model over every saved checkpoint."""
    utils.log("Evaluating", task.name)
    eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
    checkpoints = sorted(
        [f for f in tf.gfile.ListDirectory(self._config.model_dir)
         if f[-6:] == ".index"],
        key=lambda x: int(x[11:-6]))
    checkpoints = [
        os.path.join(self._config.model_dir, checkpoint[:-6])
        for checkpoint in checkpoints
    ]

    best_scores = None
    best_scorer = None
    key = self._config.eval_key
    for checkpoint in checkpoints:
        if int(checkpoint.split("-")[-1]) == 0:
            continue
        results = self._estimator.predict(input_fn=eval_input_fn,
                                          yield_single_examples=True,
                                          checkpoint_path=checkpoint)
        scorer = task.get_scorer()
        for r in results:
            if r["task_id"] != len(self._tasks):  # ignore padding examples
                r = utils.nest_dict(r, self._config.task_names)
                scorer.update(r[task.name])
        scores = dict(scorer.get_results())
        scores["checkpoint_path"] = checkpoint
        if return_results:
            utils.log(task.name + ": " +
                      " - ".join("{}: {}".format(k, v)
                                 for k, v in scores.items()))
            utils.log()
        # Keep the best checkpoint according to `eval_key` (or the last one
        # when no key is configured).
        if key is None or best_scores is None or scores[key] > best_scores[key]:
            best_scores = scores
            best_scorer = scorer

    if return_results:
        utils.log("eval_results " + task.name + ": " +
                  " - ".join("{}: {}".format(k, v)
                             for k, v in best_scores.items()))
        return best_scores
    else:
        return best_scorer

def model_fn(features, labels, mode, params):
    """Build the model for training."""
    model = PretrainingModel(config, features,
                             mode == tf.estimator.ModeKeys.TRAIN)
    utils.log("Model is built!")
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(
            model.total_loss,
            config.learning_rate,
            config.num_train_steps,
            weight_decay_rate=config.weight_decay_rate,
            use_tpu=config.use_tpu,
            warmup_steps=config.num_warmup_steps,
            lr_decay_power=config.lr_decay_power,
            num_attention_heads=config.num_attention_heads,
            d_kernel_weight=config.d_kernel_weight,
            stepsize=config.stepsize,
        )
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=model.total_loss,
            train_op=train_op,
            training_hooks=[training_utils.ETAHook(
                {} if config.use_tpu else dict(loss=model.total_loss),
                config.num_train_steps, config.iterations_per_loop,
                config.use_tpu)])
    elif mode == tf.estimator.ModeKeys.EVAL:
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=model.total_loss,
            eval_metrics=model.eval_metrics,
            evaluation_hooks=[training_utils.ETAHook(
                {} if config.use_tpu else dict(loss=model.total_loss),
                config.num_eval_steps, config.iterations_per_loop,
                config.use_tpu, is_training=False)])
    else:
        raise ValueError("Only TRAIN and EVAL modes are supported")
    return output_spec

def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None):
    """Pretty-print model inputs."""
    batch_size = len(inputs.masked_lm_ids)
    provided_update_mask = (updates_mask is not None)
    if not provided_update_mask:
        updates_mask = np.zeros_like(inputs.input_ids)
    for i in range(batch_size):
        pos_to_tokid = {}
        for tokid, pos, weight in zip(inputs.masked_lm_ids[i],
                                      inputs.masked_lm_positions[i],
                                      inputs.masked_lm_weights[i]):
            if weight != 0:
                pos_to_tokid[pos] = tokid

        text = ""
        for pos, (tokid, tag, um) in enumerate(
                zip(inputs.input_ids[i], inputs.tag_ids[i], updates_mask[i])):
            token = inv_vocab[tokid]
            if tag == -1:
                tag = 0
            if token == "[PAD]":
                break
            if pos in pos_to_tokid:
                # token = RED + token + " (" + inv_vocab[pos_to_tokid[pos]] + ")" + ENDC
                token = token + " (" + inv_vocab[pos_to_tokid[pos]] + ")"
                if provided_update_mask:
                    assert um == 1
            else:
                if provided_update_mask:
                    assert um == 0
            # tag_print = GREEN + " _" + NAMES[tag] + "_ " + ENDC
            tag_print = " _" + NAMES[tag] + "_ "
            text += token + tag_print + " "
        utils.log(tokenization.printable_text(text))

def main() -> None:
    docopt.docopt(__doc__, version=CONFIG['DEFAULT']['script_version'])
    db_container = CONFIG['DATABASE']['database_test_container']
    network = CONFIG['DOCKER']['test_network']
    db_port = CONFIG['DATABASE']['test_port']

    set_envars(db_port)
    docker_utils.create_network(network)
    database.start(container=db_container,
                   network=network,
                   port=db_port,
                   migrations=True)

    utils.log('Running integration tests')
    completed_process = utils.execute_cmd(
        ['./gradlew', 'integrationTest', '--info'], pipe_stderr=True)

    docker_utils.rm_container(
        docker_utils.DockerContainer(db_container, rm_volumes=True))
    docker_utils.rm_network(network)

    if completed_process.stderr:
        utils.raise_error(completed_process.stderr.decode('utf8'))

def write_predictions(self):
    """Write final predictions (and the squared-error loss) to disk."""
    unique_id_to_result = {}
    for result in self._all_results:
        unique_id_to_result[result["unique_id"]] = result

    results = {}
    total_loss = 0.
    for example in self._eval_examples:
        example_id = example.qas_id if "squad" in self._name else example.qid
        features = self._task.featurize(example, False, for_eval=True)
        results[example_id] = []
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature[self._name + "_eid"]]
            result['targets'] = feature[self._name + "_f1_score"]
            total_loss += (result['targets'] - result['predictions'])**2
            results[example_id].append(result)
    total_loss /= len(results)
    utils.write_pickle(results, self._config.f1_predict_results_file)
    utils.log(f"total_loss: {total_loss}")

def model_fn(features, labels, mode, params):
    """The `model_fn` for TPUEstimator."""
    utils.log("Building model...")
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = FinetuningModel(config, tasks, is_training, features,
                            num_train_steps)

    # Load pre-trained weights from checkpoint
    init_checkpoint = config.init_checkpoint
    if pretraining_config is not None:
        init_checkpoint = tf.train.latest_checkpoint(
            pretraining_config.model_dir)
        utils.log("Using checkpoint", init_checkpoint)
    tvars = tf.trainable_variables()
    scaffold_fn = None
    if init_checkpoint:
        assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
            tvars, init_checkpoint)
        if config.use_tpu:
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()
            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    # Build model for training or prediction
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(
            model.loss,
            config.learning_rate,
            num_train_steps,
            weight_decay_rate=config.weight_decay_rate,
            use_tpu=config.use_tpu,
            warmup_proportion=config.warmup_proportion,
            layerwise_lr_decay_power=config.layerwise_lr_decay,
            n_transformer_layers=model.bert_config.num_hidden_layers)
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=model.loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn,
            training_hooks=[training_utils.ETAHook(
                {} if config.use_tpu else dict(loss=model.loss),
                num_train_steps, config.iterations_per_loop, config.use_tpu,
                10)])
    else:
        assert mode == tf.estimator.ModeKeys.PREDICT
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions=utils.flatten_dict(model.outputs),
            scaffold_fn=scaffold_fn)

    utils.log("Building complete")
    return output_spec

def featurize(self, example: TaggingExample, is_training, log=False):
    words_to_tokens = tokenize_and_align(self._tokenizer, example.words)
    input_ids = []
    tagged_positions = []
    for word_tokens in words_to_tokens:
        if len(words_to_tokens) + len(input_ids) + 1 > self.config.max_seq_length:
            input_ids.append(self._tokenizer.vocab["[SEP]"])
            break
        if "[CLS]" not in word_tokens and "[SEP]" not in word_tokens:
            tagged_positions.append(len(input_ids))
        for token in word_tokens:
            input_ids.append(self._tokenizer.vocab[token])

    if len(input_ids) >= self.config.max_seq_length:
        utils.log(
            "ERROR: Example (len = {}) is longer than max sequence length {}"
            .format(len(words_to_tokens), self.config.max_seq_length))
        utils.log(" * Sentence: {}".format(' '.join(example.words)))
        flat_list = [item for sublist in words_to_tokens for item in sublist]
        utils.log(" * Tokens: {}".format(' '.join(flat_list)))
        last_token = input_ids[-1]
        input_ids = input_ids[0:self.config.max_seq_length - 1]
        input_ids.append(last_token)

    pad = lambda x: x + [0] * (self.config.max_seq_length - len(x))
    labels = pad(example.labels[:self.config.max_seq_length])
    labeled_positions = pad(tagged_positions)
    labels_mask = pad([1.0] * len(tagged_positions))
    segment_ids = pad([1] * len(input_ids))
    input_mask = pad([1] * len(input_ids))
    input_ids = pad(input_ids)
    assert len(input_ids) == self.config.max_seq_length
    assert len(input_mask) == self.config.max_seq_length
    assert len(segment_ids) == self.config.max_seq_length
    assert len(labels) == self.config.max_seq_length
    assert len(labels_mask) == self.config.max_seq_length
    return {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "task_id": self.config.task_names.index(self.name),
        self.name + "_eid": example.eid,
        self.name + "_labels": labels,
        self.name + "_labels_mask": labels_mask,
        self.name + "_labeled_positions": labeled_positions
    }

def evaluate_task(self, task, split="dev", return_results=True):
    """Evaluate the current model."""
    utils.log("Evaluating", task.name)
    eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
    results = self._estimator.predict(input_fn=eval_input_fn,
                                      yield_single_examples=True)
    scorer = task.get_scorer()
    for r in results:
        if r["task_id"] != len(self._tasks):  # ignore padding examples
            r = utils.nest_dict(r, self._config.task_names)
            scorer.update(r[task.name])
    if return_results:
        utils.log(task.name + ": " + scorer.results_str())
        utils.log()
        return dict(scorer.get_results())
    else:
        return scorer

def vote1(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()
    bagging_all_nbest = collections.OrderedDict()
    for qid in qid_answers:
        bagging_preds[qid] = \
            (seq([nbest[qid][0] for nbest in all_nbest]).sorted(
                key=lambda x: x['probability'])).list()[-1]['text']
        bagging_all_nbest[qid] = \
            [(seq([nbest[qid][0] for nbest in all_nbest]).sorted(
                key=lambda x: x['probability'])).list()[-1]]
        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_pickle(
        bagging_all_nbest,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_all_nbest.pkl'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote1')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote1',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))

def vote2(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()
    for qid in qid_answers:
        preds_scores = (seq(all_nbest).map(lambda x: x[qid][0]).map(
            lambda x: (x['text'], x['probability']))).dict()
        compare = collections.defaultdict(lambda: 0.)
        for pred, score in preds_scores.items():
            compare[pred] += score
        compare = seq(compare.items()).sorted(lambda x: x[1]).reverse().list()
        bagging_preds[qid] = compare[0][0]
        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote2',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote2',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote2')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote2',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))

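# Minimal sketch (standalone, made-up numbers) of the aggregation vote2 performs:
# sum each candidate answer's probability across models, then take the arg-max.
from collections import defaultdict

candidates = [("Steve Smith", 0.61), ("Steve Smith", 0.55), ("Smith", 0.70)]
scores = defaultdict(float)
for text, prob in candidates:
    scores[text] += prob
best = max(scores.items(), key=lambda kv: kv[1])[0]  # "Steve Smith" (1.16 vs 0.70)
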
def evaluate_task(self, task, split="dev", return_results=True):
    """Evaluate the current model."""
    utils.log("Evaluating", task.name, split)
    eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
    results = self._estimator.predict(input_fn=eval_input_fn,
                                      yield_single_examples=True)
    if task.name in [
            "squad", "squadv1", "newsqa", "naturalqs", "triviaqa", "searchqa",
            "cmrc2018", "drcd", "ccks42ec", "ccks42ee", "ccks42single",
            "ccks42multi", "ner", "ccks42num", "ccks42reg"
    ]:
        scorer = task.get_scorer(split)
    else:
        scorer = task.get_scorer()
    for r in results:
        if r["task_id"] != len(self._tasks):  # ignore padding examples
            r = utils.nest_dict(r, self._config.task_names)
            scorer.update(r[task.name])
    if return_results:
        utils.log(task.name + ": " + scorer.results_str())
        utils.log()
        return dict(scorer.get_results())
    else:
        return scorer

def featurize(self, example: NerExample, is_training, log=False,
              for_eval=False):
    all_features = []
    query_tokens = self._tokenizer.tokenize(example.text_b)
    if len(query_tokens) > self.config.max_query_length:
        query_tokens = query_tokens[0:self.config.max_query_length]

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.words):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    # The -3 accounts for [CLS], [SEP] and [SEP]
    max_tokens_for_doc = self.config.max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length", "orig_start", "orig_end"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        orig_start = tok_to_orig_index[start_offset]
        orig_end = tok_to_orig_index[start_offset + length - 1]
        doc_spans.append(
            _DocSpan(start=start_offset, length=length,
                     orig_start=orig_start, orig_end=orig_end))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, self.config.doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
        tokens = []
        token_to_orig_map = {}
        token_is_max_context = {}
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
            is_max_context = tagging_utils._check_is_max_context(
                doc_spans, doc_span_index, split_token_index)
            token_is_max_context[len(tokens)] = is_max_context
            tokens.append(all_doc_tokens[split_token_index])
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        for token in query_tokens:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        labels = example.labels[doc_span.orig_start:doc_span.orig_end + 1]
        labeled_positions = orig_to_tok_index[doc_span.orig_start:
                                              doc_span.orig_end + 1]
        labels_mask = [1] * len(labeled_positions)

        # Zero-pad up to the sequence length.
        pad = lambda x: x + [0] * (self.config.max_seq_length - len(x))
        input_ids = pad(input_ids)
        input_mask = pad(input_mask)
        segment_ids = pad(segment_ids)
        labels = pad(labels)
        labels_mask = pad(labels_mask)
        labeled_positions = pad(labeled_positions)

        assert len(input_ids) == self.config.max_seq_length
        assert len(input_mask) == self.config.max_seq_length
        assert len(segment_ids) == self.config.max_seq_length
        assert len(labels) == self.config.max_seq_length
        assert len(labels_mask) == self.config.max_seq_length
        assert len(labeled_positions) == self.config.max_seq_length

        if log:
            utils.log("*** Example ***")
            utils.log("doc_span_index: %s" % doc_span_index)
            utils.log("doc_span_orig_start: %s" % doc_span.orig_start)
            utils.log("doc_span_start: %s" % doc_span.start)
            utils.log("token_to_orig_map: %s" % " ".join([
                "%d:%d" % (x, y)
                for (x, y) in six.iteritems(token_to_orig_map)
            ]))
            utils.log("token_is_max_context: %s" % " ".join([
                "%d:%s" % (x, y)
                for (x, y) in six.iteritems(token_is_max_context)
            ]))
            utils.log("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            utils.log("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            utils.log("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            utils.log("labels: %s" % " ".join([str(x) for x in labels]))
            utils.log("labels_mask: %s" % " ".join([str(x) for x in labels_mask]))
            utils.log("labeled_positions: %s" %
                      " ".join([str(x) for x in labeled_positions]))

        features = {
            "task_id": self.config.task_names.index(self.name),
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
            self.name + "_eid": (1000 * example.eid) + doc_span_index,
            self.name + "_labels": labels,
            self.name + "_labels_mask": labels_mask,
            self.name + "_labeled_positions": labeled_positions
        }
        if for_eval:
            features.update({
                self.name + "_doc_span_index": doc_span_index,
                self.name + "_doc_span_orig_start": doc_span.orig_start,
                self.name + "_doc_span_start": doc_span.start,
                self.name + "_token_to_orig_map": token_to_orig_map,
                self.name + "_token_is_max_context": token_is_max_context,
            })
        all_features.append(features)
    return all_features

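# Note on the feature ids above (illustrative only): each sliding-window feature
# packs the example id and the span index as 1000 * example.eid + doc_span_index,
# so it can be decoded with divmod, assuming fewer than 1000 spans per example.
example_eid, doc_span_index = divmod(42007, 1000)  # -> (42, 7)
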
def featurize(self, example: QAExample, is_training, log=False,
              for_eval=False):
    all_features = []
    query_tokens = self._tokenizer.tokenize(example.question_text)
    if len(query_tokens) > self.config.max_query_length:
        query_tokens = query_tokens[0:self.config.max_query_length]

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = self._tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    tok_start_position = None
    tok_end_position = None
    if is_training and example.is_impossible:
        tok_start_position = -1
        tok_end_position = -1
    if is_training and not example.is_impossible:
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position,
            self._tokenizer, example.orig_answer_text)

    # The -3 accounts for [CLS], [SEP] and [SEP]
    max_tokens_for_doc = self.config.max_seq_length - len(query_tokens) - 3

    # We can have documents that are longer than the maximum sequence length.
    # To deal with this we do a sliding window approach, where we take chunks
    # of up to our max length with a stride of `doc_stride`.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == len(all_doc_tokens):
            break
        start_offset += min(length, self.config.doc_stride)

    for (doc_span_index, doc_span) in enumerate(doc_spans):
        tokens = []
        token_to_orig_map = {}
        token_is_max_context = {}
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in query_tokens:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
            is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                   split_token_index)
            token_is_max_context[len(tokens)] = is_max_context
            tokens.append(all_doc_tokens[split_token_index])
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.config.max_seq_length
        assert len(input_mask) == self.config.max_seq_length
        assert len(segment_ids) == self.config.max_seq_length

        start_position = None
        end_position = None
        if is_training and not example.is_impossible:
            # For training, if our document chunk does not contain an
            # annotation we throw it out, since there is nothing to predict.
            doc_start = doc_span.start
            doc_end = doc_span.start + doc_span.length - 1
            out_of_span = False
            if not (tok_start_position >= doc_start
                    and tok_end_position <= doc_end):
                out_of_span = True
            if out_of_span:
                start_position = 0
                end_position = 0
            else:
                doc_offset = len(query_tokens) + 2
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset
        if is_training and example.is_impossible:
            start_position = 0
            end_position = 0

        if log:
            utils.log("*** Example ***")
            utils.log("doc_span_index: %s" % doc_span_index)
            utils.log("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            utils.log("token_to_orig_map: %s" % " ".join([
                "%d:%d" % (x, y)
                for (x, y) in six.iteritems(token_to_orig_map)
            ]))
            utils.log("token_is_max_context: %s" % " ".join([
                "%d:%s" % (x, y)
                for (x, y) in six.iteritems(token_is_max_context)
            ]))
            utils.log("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            utils.log("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            utils.log("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            if is_training and example.is_impossible:
                utils.log("impossible example")
            if is_training and not example.is_impossible:
                answer_text = " ".join(tokens[start_position:(end_position + 1)])
                utils.log("start_position: %d" % start_position)
                utils.log("end_position: %d" % end_position)
                utils.log("answer: %s" %
                          (tokenization.printable_text(answer_text)))

        features = {
            "task_id": self.config.task_names.index(self.name),
            self.name + "_eid": (1000 * example.eid) + doc_span_index,
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
        }
        if for_eval:
            features.update({
                self.name + "_doc_span_index": doc_span_index,
                self.name + "_tokens": tokens,
                self.name + "_token_to_orig_map": token_to_orig_map,
                self.name + "_token_is_max_context": token_is_max_context,
            })
        if is_training:
            features.update({
                self.name + "_start_positions": start_position,
                self.name + "_end_positions": end_position,
                self.name + "_is_impossible": example.is_impossible
            })
        all_features.append(features)
    return all_features

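# Standalone sketch of the sliding-window span generation above, with made-up
# sizes (10 wordpieces, a 4-token window, stride 2). Not part of the original code.
import collections

_DocSpanDemo = collections.namedtuple("DocSpanDemo", ["start", "length"])

def make_doc_spans(n_doc_tokens, max_tokens_for_doc, doc_stride):
    doc_spans, start_offset = [], 0
    while start_offset < n_doc_tokens:
        length = min(n_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpanDemo(start=start_offset, length=length))
        if start_offset + length == n_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

# make_doc_spans(10, 4, 2) -> [DocSpanDemo(start=0, length=4),
#   DocSpanDemo(start=2, length=4), DocSpanDemo(start=4, length=4),
#   DocSpanDemo(start=6, length=4)]
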
def _add_examples(self, examples, example_failures, paragraph, split):
    paragraph_text = paragraph["context"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    if self.name in [
            "sacqa", "cmrc2018", "ccks42ee", "ccks42single", "ccks42multi"
    ]:  # for chinese
        prev_is_chinese = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace or prev_is_chinese or is_chinese_char(c):
                    doc_tokens.append(c)
                    prev_is_chinese = True if is_chinese_char(c) else False
                else:
                    doc_tokens[-1] += c
                    prev_is_chinese = False
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)
    else:
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

    for qa in paragraph["qas"]:
        qas_id = qa["id"] if "id" in qa else None
        qid = qa["qid"] if "qid" in qa else None
        question_text = qa["question"]
        start_position = None
        end_position = None
        orig_answer_text = None
        is_impossible = False
        if split == "train":
            if self.v2:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                if "detected_answers" in qa:  # MRQA format
                    answer = qa["detected_answers"][0]
                    answer_offset = answer["char_spans"][0][0]
                else:  # SQuAD format
                    answer = qa["answers"][0]
                    answer_offset = answer["answer_start"]
                orig_answer_text = answer["text"]
                answer_length = len(orig_answer_text)
                start_position = char_to_word_offset[answer_offset]
                if answer_offset + answer_length - 1 >= len(char_to_word_offset):
                    utils.log("End position is out of document!")
                    example_failures[0] += 1
                    continue
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]

                # Only add answers where the text can be exactly recovered from
                # the document. If this CAN'T happen it's likely due to weird
                # Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                if self.name in [
                        "sacqa", "cmrc2018", "ccks42ee", "ccks42single",
                        "ccks42multi"
                ]:  # for chinese, no whitespace needed
                    actual_text = "".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = "".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                else:
                    actual_text = " ".join(
                        doc_tokens[start_position:(end_position + 1)])
                    cleaned_answer_text = " ".join(
                        tokenization.whitespace_tokenize(orig_answer_text))
                actual_text = actual_text.lower()
                cleaned_answer_text = cleaned_answer_text.lower()
                if actual_text.find(cleaned_answer_text) == -1:
                    utils.log("Could not find answer: '{:}': '{:}' in doc vs. "
                              "'{:}' in provided answer".format(
                                  qas_id,
                                  tokenization.printable_text(actual_text),
                                  tokenization.printable_text(
                                      cleaned_answer_text)))
                    example_failures[0] += 1
                    continue
            else:
                start_position = -1
                end_position = -1
                orig_answer_text = ""

        example = QAExample(task_name=self.name,
                            eid=len(examples),
                            qas_id=qas_id,
                            qid=qid,
                            question_text=question_text,
                            doc_tokens=doc_tokens,
                            orig_answer_text=orig_answer_text,
                            start_position=start_position,
                            end_position=end_position,
                            is_impossible=is_impossible)
        examples.append(example)

def get_final_text(config: configure_finetuning.FinetuningConfig, pred_text,
                   orig_text):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to
    # the span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for i, c in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, dict(ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=config.do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if config.debug:
            utils.log("Unable to find text: '%s' in '%s'" %
                      (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if config.debug:
            utils.log("Length not equal after stripping spaces: '%s' vs '%s'",
                      orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if config.debug:
            utils.log("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if config.debug:
            utils.log("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text

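# Usage sketch mirroring the docstring's example (assumes a config with
# do_lower_case=True and debug=False; not part of the original code).
# The model's normalized span "steve smith" is projected back onto the raw text,
# recovering the original casing and dropping the trailing "'s".
final = get_final_text(config, pred_text="steve smith", orig_text="Steve Smith's")
# expected: "Steve Smith"
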
def model_fn(features, labels, mode, params):
    """Build the model for training."""
    if config.masking_strategy in (pretrain_helpers.ADVERSARIAL_STRATEGY,
                                   pretrain_helpers.MIX_ADV_STRATEGY):
        model = AdversarialPretrainingModel(
            config, features, mode == tf.estimator.ModeKeys.TRAIN)
    elif config.masking_strategy == pretrain_helpers.RW_STRATEGY:
        ratio = []
        with open(config.ratio_file, "r") as fin:
            for line in fin:
                line = line.strip()
                if line:
                    tok = line.split()
                    ratio.append(float(tok[1]))
        model = RatioBasedPretrainingModel(
            config, features, ratio, mode == tf.estimator.ModeKeys.TRAIN)
    else:
        model = PretrainingModel(config, features,
                                 mode == tf.estimator.ModeKeys.TRAIN)
    utils.log("Model is built!")

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if config.init_checkpoint:
        (assignment_map, initialized_variable_names
         ) = modeling.get_assignment_map_from_checkpoint(
             tvars, config.init_checkpoint)
        tf.train.init_from_checkpoint(config.init_checkpoint, assignment_map)

    utils.log("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        utils.log("  name = %s, shape = %s%s", var.name, var.shape,
                  init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
        if config.masking_strategy == pretrain_helpers.ADVERSARIAL_STRATEGY:
            student_train_op = optimization.create_optimizer(
                model.mlm_loss,
                config.learning_rate,
                config.num_train_steps,
                weight_decay_rate=config.weight_decay_rate,
                use_tpu=config.use_tpu,
                warmup_steps=config.num_warmup_steps,
                lr_decay_power=config.lr_decay_power)
            teacher_train_op = optimization.create_optimizer(
                model.teacher_loss,
                config.teacher_learning_rate,
                config.num_train_steps,
                lr_decay_power=config.lr_decay_power)
            train_op = tf.group(student_train_op, teacher_train_op)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=model.total_loss,
                train_op=train_op,
                training_hooks=[
                    training_utils.ETAHook(
                        dict(loss=model.mlm_loss,
                             teacher_loss=model.teacher_loss,
                             reward=model._baseline),
                        config.num_train_steps, config.iterations_per_loop,
                        config.use_tpu)
                ])
        else:
            train_op = optimization.create_optimizer(
                model.total_loss,
                config.learning_rate,
                config.num_train_steps,
                weight_decay_rate=config.weight_decay_rate,
                use_tpu=config.use_tpu,
                warmup_steps=config.num_warmup_steps,
                lr_decay_power=config.lr_decay_power)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=model.total_loss,
                train_op=train_op,
                training_hooks=[
                    training_utils.ETAHook(dict(loss=model.total_loss),
                                           config.num_train_steps,
                                           config.iterations_per_loop,
                                           config.use_tpu)
                ])
    elif mode == tf.estimator.ModeKeys.EVAL:
        output_spec = tf.estimator.EstimatorSpec(
            mode=mode,
            loss=model.total_loss,
            eval_metric_ops=model.eval_metrics,
            evaluation_hooks=[
                training_utils.ETAHook(dict(loss=model.total_loss),
                                       config.num_eval_steps,
                                       config.iterations_per_loop,
                                       config.use_tpu,
                                       is_training=False)
            ])
    else:
        raise ValueError("Only TRAIN and EVAL modes are supported")
    return output_spec

try:
    prediction = collections.OrderedDict()
    prediction['eval_all_nbest'] = filter_short_ans(
        utils.load_pickle(
            os.path.join(dire, 'models', 'electra_large', 'results',
                         '{}_qa'.format(task_name),
                         '{}_{}_all_nbest.pkl'.format(task_name, split))))
    prediction['squad_null_odds'] = utils.load_json(
        os.path.join(dire, 'models', 'electra_large', 'results',
                     '{}_qa'.format(task_name),
                     '{}_{}_null_odds.json'.format(task_name, split)))
    models_predictions[d] = prediction
except Exception:
    utils.log("Error at loading all_nbest.pkl & null_odds.json for model {}"
              .format(d))
    continue

dataset = utils.load_json(
    os.path.join(data_dir, model_name_part, 'finetuning_data', task_name,
                 '{}.json'.format(split)))['data']
qid_answers = collections.OrderedDict()
for article in dataset:
    for p in article['paragraphs']:
        for qa in p['qas']:
            qid = qa['id']
            gold_answers = [
                a['text'] for a in qa['answers']
                if normalize_answer(a['text'])
            ]
            if not gold_answers:
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model."""
    if config.do_train == config.do_eval:
        raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    num_gpus = utils.get_available_gpus()
    utils.log("Found {} gpus".format(len(num_gpus)))

    if len(num_gpus) == 1:  # single-GPU setup
        session_config = tf.ConfigProto(
            log_device_placement=True,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))
        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            # save_checkpoints_secs=3600,
            # tf_random_seed=FLAGS.seed,
            session_config=session_config,
            # keep_checkpoint_max=0,
            log_step_count_steps=100)
    else:
        train_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None,
            cross_device_ops=tensorflow.contrib.distribute.
            AllReduceCrossDeviceOps('nccl', num_packs=len(num_gpus)))
        eval_distribution_strategy = tf.distribute.MirroredStrategy(devices=None)
        session_config = tf.ConfigProto(
            # log_device_placement=True,
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=0,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))
        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            train_distribute=train_distribution_strategy,
            eval_distribute=eval_distribution_strategy,
            # save_checkpoints_secs=3600,
            # tf_random_seed=FLAGS.seed,
            session_config=session_config,
            # keep_checkpoint_max=0,
            log_step_count_steps=100)

    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={
            'train_batch_size': config.train_batch_size,
            'eval_batch_size': config.eval_batch_size
        })

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(
            input_fn=pretrain_data.get_input_fn(config, False),
            steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result

def count_params():
    n = np.sum([
        np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
    ])
    utils.log("Model size: %dK" % (n / 1000))

def featurize(self, example: InputExample, is_training, log=False):
    """Turn an InputExample into a dict of features."""
    tokens_a = self._tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = self._tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, self.config.max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > self.config.max_seq_length - 2:
            tokens_a = tokens_a[0:(self.config.max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it
    # makes it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < self.config.max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == self.config.max_seq_length
    assert len(input_mask) == self.config.max_seq_length
    assert len(segment_ids) == self.config.max_seq_length

    if log:
        utils.log("  Example {:}".format(example.eid))
        utils.log("    tokens: {:}".format(" ".join(
            [tokenization.printable_text(x) for x in tokens])))
        utils.log("    input_ids: {:}".format(" ".join(map(str, input_ids))))
        utils.log("    input_mask: {:}".format(" ".join(map(str, input_mask))))
        utils.log("    segment_ids: {:}".format(" ".join(map(str, segment_ids))))

    eid = example.eid
    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "task_id": self.config.task_names.index(self.name),
        self.name + "_eid": eid,
    }
    self._add_features(features, example, log)
    return features

def train(self):
    utils.log("Training for {:} steps".format(self.train_steps))
    self._estimator.train(input_fn=self._train_input_fn,
                          max_steps=self.train_steps)

def vote3(dataset, all_nbest, all_odds, qid_answers, qid_questions, models,
          split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()

    def post_process(question, candi, weight=1):
        question = question.lower()
        first_token = candi['text'].split()[0]
        th = 0.
        if "when" in question:
            if first_token in [
                    'before', 'after', 'about', 'around', 'from', 'during'
            ]:
                candi['probability'] += th
        elif "where" in question:
            if first_token in [
                    'in', 'at', 'on', 'behind', 'from', 'through', 'between',
                    'throughout'
            ]:
                candi['probability'] += th
        elif "whose" in question:
            if "'s" in candi['text']:
                candi['probability'] += th
        elif "which" in question:
            if first_token == "the":
                candi['probability'] += th
        candi['probability'] *= weight
        return candi

    cof = 0.2
    for qid in qid_answers:
        question = qid_questions[qid]
        post_process_candidates = (seq(zip(all_nbest, models)).map(
            lambda x: (x[0][qid], cof if 'lr_epoch_results' in x[1] else 1.)
        ).map(lambda x: seq(x[0]).map(
            lambda y: post_process(question, y, x[1])).list()).flatten()).list()
        preds_probs = collections.defaultdict(lambda: [])
        for pred in post_process_candidates:
            preds_probs[pred['text']].append(pred['probability'])
        for pred in post_process_candidates:
            preds_probs[pred['text']] = np.mean(
                preds_probs[pred['text']]).__float__()
        bagging_preds[qid] = (seq(preds_probs.items()).sorted(
            lambda x: x[1]).reverse().map(lambda x: x[0])).list()[0]
        bagging_odds[qid] = np.mean([
            odds[qid] * cof if 'lr_epoch_results' in model else odds[qid]
            for odds, model in zip(all_odds, models)
        ])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote3',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote3',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote3')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote3',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))

def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""
    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        print("#################################################")
        print(tasks)
        print(vars(config))
        print("#################################################")

        # Create Neptune Experiment
        neptune.create_experiment(name='tf-ft', params=vars(config))

        config.model_dir = generic_model_dir + "_" + str(trial) + '_' + str(
            random.randint(0, 10000))
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            eval_result = model_runner.evaluate()
            results.append(eval_result)
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is
                    # supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs(
                                [task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task")

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1

def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""
    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is
                    # supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs(
                                [task], trial, split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task")

        if config.do_predict:
            import pickle
            if "dev" in config.predict_split:
                results = model_runner.predict(
                    tasks[0], config.predict_checkpoint_path, "dev")
                with open("predict_dev.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)
            if "train" in config.predict_split:
                results = model_runner.predict(
                    tasks[0], config.predict_checkpoint_path, "train")
                with open("predict_train.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)
            if "test" in config.predict_split:
                results = model_runner.predict(
                    tasks[0], config.predict_checkpoint_path, "test")
                with open("predict_test.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
