Example #1
 def serialize_examples(self, examples, is_training, output_file,
                        batch_size):
     """Convert a set of `InputExample`s to a TFRecord file."""
     n_examples = 0
     with tf.io.TFRecordWriter(output_file) as writer:
         for (ex_index, example) in enumerate(examples):
             if ex_index % 2000 == 0:
                 utils.log("Writing example {:} of {:}".format(
                     ex_index, len(examples)))
             for tf_example in self._example_to_tf_example(
                     example,
                     is_training,
                     log=self._config.log_examples and ex_index < 1):
                 writer.write(tf_example.SerializeToString())
                 n_examples += 1
         # add padding so the dataset is a multiple of batch_size
         while n_examples % batch_size != 0:
             writer.write(
                 self._make_tf_example(task_id=len(
                     self._config.task_names)).SerializeToString())
             n_examples += 1
     return n_examples
    def _serialize_dataset(self, tasks, is_training, split):
        """Write out the dataset as tfrecords."""
        dataset_name = "_".join(sorted([task.name for task in tasks]))
        dataset_name += "_" + split
        dataset_prefix = os.path.join(self._config.preprocessed_data_dir,
                                      dataset_name)
        print("dataset_prefix: ", dataset_prefix)
        tfrecords_path = dataset_prefix + ".tfrecord"
        metadata_path = dataset_prefix + ".metadata"
        batch_size = (self._config.train_batch_size
                      if is_training else self._config.eval_batch_size)

        utils.log("Loading dataset", dataset_name)
        n_examples = None
        if (self._config.use_tfrecords_if_existing
                and tf.io.gfile.exists(metadata_path)):
            n_examples = utils.load_json(metadata_path)["n_examples"]

        if n_examples is None:
            utils.log("Existing tfrecords not found so creating")
            examples = []
            for task in tasks:
                task_examples = task.get_examples(split)
                examples += task_examples
            if is_training:
                random.shuffle(examples)
            utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
            n_examples = self.serialize_examples(examples, is_training,
                                                 tfrecords_path, batch_size)
            utils.write_json({"n_examples": n_examples}, metadata_path)

        input_fn = self._input_fn_builder(tfrecords_path, is_training)
        if is_training:
            steps = int(n_examples // batch_size *
                        self._config.num_train_epochs)
        else:
            steps = n_examples // batch_size

        return input_fn, steps
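
Every snippet on this page calls a small `utils.log` helper. Its implementation is not shown here; a minimal sketch, assuming an ELECTRA-style logging utility that simply joins its arguments into one line and flushes stdout, could look like this:

import sys

def log(*args):
    # Hedged sketch: join all positional arguments and print them immediately.
    msg = " ".join(map(str, args))
    sys.stdout.write(msg + "\n")
    sys.stdout.flush()

This would explain calls such as `utils.log("Loading dataset", dataset_name)` that pass several arguments instead of a single pre-formatted string.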
Example #3
 def write_classification_outputs(self, tasks, trial, split):
     """Write classification predictions to disk."""
     utils.log("Writing out predictions for", tasks, split)
     predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
     results = self._estimator.predict(input_fn=predict_input_fn,
                                       yield_single_examples=True)
     # task name -> eid -> model-logits
     logits = collections.defaultdict(dict)
     for r in results:
         if r["task_id"] != len(self._tasks):
             r = utils.nest_dict(r, self._config.task_names)
             task_name = self._config.task_names[r["task_id"]]
             logits[task_name][r[task_name]["eid"]] = (
                 r[task_name]["logits"] if "logits" in r[task_name] else
                 r[task_name]["predictions"])
     for task_name in logits:
         utils.log("Pickling predictions for {:} {:} examples ({:})".format(
             len(logits[task_name]), task_name, split))
         if trial <= self._config.n_writes_test:
             utils.write_pickle(
                 logits[task_name],
                 self._config.test_predictions(task_name, split, trial))
Example #4
    def get_examples(self, split):
        if split in self._examples:
            utils.log("N EXAMPLES", split, len(self._examples[split]))
            return self._examples[split]

        examples = []
        example_failures = [0]
        with tf.io.gfile.GFile(
                os.path.join(self.config.raw_data_dir(self.name),
                             split + ".jsonl"), "r") as f:
            for i, line in enumerate(f):
                if self.config.debug and i > 10:
                    break
                paragraph = json.loads(line.strip())
                if "header" in paragraph:
                    continue
                self._add_examples(examples, example_failures, paragraph,
                                   split)
        self._examples[split] = examples
        utils.log("{:} examples created, {:} failures".format(
            len(examples), example_failures[0]))
        return examples
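
The reader above expects one JSON object per line and skips any record carrying a "header" key; the remaining records are handed to `self._add_examples`, which (see Example #20 below) reads their "context" and "qas" fields. A hypothetical two-line input illustrating that layout (the header's contents are an assumption):

sample_jsonl_lines = [
    '{"header": {"split": "dev"}}',                    # skipped by the "header" check
    '{"context": "Some paragraph text.", "qas": []}',  # passed to self._add_examples
]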
Example #5
def run_db_container(container_name: str, network: str, port: str) -> None:
    """Runs the database Docker container.

    Args:
        container_name (str): Name to use for the database container.
        network (str): Name of a Docker network to plug the database into.
        port (str): Host port on which the database will listen.
    """
    docker_image = CONFIG['DATABASE']['docker_image']

    if docker_utils.item_exists('container', container_name):
        utils.log(
            f"Container '{container_name}' already exists, not running '{docker_image}' image"
        )
        return
    if not docker_utils.item_exists('network', network):
        utils.raise_error(f"Docker network '{network}' doesn't exist")

    utils.log(f"Running '{docker_image}' container, name: {container_name}")
    utils.execute_cmd([
        'docker',
        'run',
        '--detach',
        '--name',
        container_name,
        '--publish',
        f"{port}:5432",  # <host_port>:<container_port>
        '--network',
        network,
        '--env',
        f"POSTGRES_DB={os.environ.get('POSTGRES_DB')}",
        '--env',
        f"POSTGRES_USER={os.environ.get('POSTGRES_USER')}",
        '--env',
        f"POSTGRES_PASSWORD={os.environ.get('POSTGRES_PASSWORD')}",
        docker_image,
    ])
    time.sleep(3)  # Wait for the database to come up
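
A hedged usage sketch for the function above; the container name, network, port and environment values are hypothetical, and `CONFIG` is assumed to be populated as in the surrounding project:

import os

# Hypothetical credentials; the real scripts read these from the environment.
os.environ.setdefault('POSTGRES_DB', 'app_db')
os.environ.setdefault('POSTGRES_USER', 'app_user')
os.environ.setdefault('POSTGRES_PASSWORD', 'change-me')

run_db_container(container_name='app-db', network='app-net', port='5433')
# Roughly equivalent to:
#   docker run --detach --name app-db --publish 5433:5432 --network app-net \
#     --env POSTGRES_DB=... --env POSTGRES_USER=... --env POSTGRES_PASSWORD=... <docker_image>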
Example #6
 def _load_glue(self,
                lines,
                split,
                text_a_loc,
                text_b_loc,
                label_loc,
                skip_first_line=False,
                eid_offset=0,
                swap=False):
     examples = []
     # print(line)
     for (i, line) in enumerate(lines):
         try:
             if i == 0 and skip_first_line:
                 continue
             eid = i - (1 if skip_first_line else 0) + eid_offset
             text_a = tokenization.convert_to_unicode(line[text_a_loc])
             if text_b_loc is None:
                 text_b = None
             else:
                 text_b = tokenization.convert_to_unicode(line[text_b_loc])
             if "test" in split or "diagnostic" in split:
                 label = self._get_dummy_label()
             else:
                 label = tokenization.convert_to_unicode(line[label_loc])
             if swap:
                 text_a, text_b = text_b, text_a
             examples.append(
                 InputExample(eid=eid,
                              task_name=self.name,
                              text_a=text_a,
                              text_b=text_b,
                              label=label))
         except Exception as ex:
             utils.log("Error constructing example from line", i,
                       "for task", self.name + ":", ex)
             utils.log("Input causing the error:", line)
     return examples
Example #7
 def evaluate_task(self, task, split="dev", return_results=True):
     """Evaluate the current model."""
     utils.log("Evaluating", task.name)
     eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
     checkpoints = sorted([
         f for f in tf.gfile.ListDirectory(self._config.model_dir)
         if f[-6:] == ".index"
     ],
                          key=lambda x: int(x[11:-6]))
     checkpoints = [
         os.path.join(self._config.model_dir, checkpoint[:-6])
         for checkpoint in checkpoints
     ]
     best_scores = None
     best_scorer = None
     key = self._config.eval_key
     for checkpoint in checkpoints:
         if int(checkpoint.split("-")[-1]) == 0: continue
         results = self._estimator.predict(input_fn=eval_input_fn,
                                           yield_single_examples=True,
                                           checkpoint_path=checkpoint)
         scorer = task.get_scorer()
         for r in results:
             if r["task_id"] != len(self._tasks):  # ignore padding examples
                 r = utils.nest_dict(r, self._config.task_names)
                 scorer.update(r[task.name])
         scores = dict(scorer.get_results())
         scores["checkpoint_path"] = checkpoint
         if return_results:
             utils.log(task.name + ": " +
                       " - ".join("{}: {}".format(k, v)
                                  for k, v in scores.items()))
             utils.log()
             if key is None or best_scores is None or scores[
                     key] > best_scores[key]:
                 best_scores = scores
         else:
             if key is None or best_scores is None or scores[
                     key] > best_scores[key]:
                 best_scores = scores
                 best_scorer = scorer
     if return_results:
         utils.log("eval_results " + task.name + ": " +
                   " - ".join("{}: {}".format(k, v)
                              for k, v in best_scores.items()))
         return best_scores
     else:
         return best_scorer
Example #8
 def model_fn(features, labels, mode, params):
     """Build the model for training."""
     model = PretrainingModel(config, features,
                              mode == tf.estimator.ModeKeys.TRAIN)
     utils.log("Model is built!")
     if mode == tf.estimator.ModeKeys.TRAIN:
         train_op = optimization.create_optimizer(
             model.total_loss, config.learning_rate, config.num_train_steps,
             weight_decay_rate=config.weight_decay_rate,
             use_tpu=config.use_tpu,
             warmup_steps=config.num_warmup_steps,
             lr_decay_power=config.lr_decay_power,
             num_attention_heads=config.num_attention_heads,
             d_kernel_weight=config.d_kernel_weight,
             stepsize=config.stepsize,
         )
         output_spec = tf.estimator.tpu.TPUEstimatorSpec(
             mode=mode,
             loss=model.total_loss,
             train_op=train_op,
             training_hooks=[training_utils.ETAHook(
                 {} if config.use_tpu else dict(loss=model.total_loss),
                 config.num_train_steps, config.iterations_per_loop,
                 config.use_tpu)]
         )
     elif mode == tf.estimator.ModeKeys.EVAL:
         output_spec = tf.estimator.tpu.TPUEstimatorSpec(
             mode=mode,
             loss=model.total_loss,
             eval_metrics=model.eval_metrics,
             evaluation_hooks=[training_utils.ETAHook(
                 {} if config.use_tpu else dict(loss=model.total_loss),
                 config.num_eval_steps, config.iterations_per_loop,
                 config.use_tpu, is_training=False)])
     else:
         raise ValueError("Only TRAIN and EVAL modes are supported")
     return output_spec
Example #9
def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None):
    """Pretty-print model inputs."""
    batch_size = len(inputs.masked_lm_ids)
    provided_update_mask = (updates_mask is not None)
    if not provided_update_mask:
        updates_mask = np.zeros_like(inputs.input_ids)
    for i in range(batch_size):
        pos_to_tokid = {}
        for tokid, pos, weight in zip(inputs.masked_lm_ids[i],
                                      inputs.masked_lm_positions[i],
                                      inputs.masked_lm_weights[i]):
            if weight == 0:
                pass
            else:
                pos_to_tokid[pos] = tokid

        text = ""

        for pos, (tokid, tag, um) in enumerate(
                zip(inputs.input_ids[i], inputs.tag_ids[i], updates_mask[i])):
            token = inv_vocab[tokid]
            if tag == -1:
                tag = 0
            if token == "[PAD]":
                break
            if pos in pos_to_tokid:
                # token = RED + token + " (" + inv_vocab[pos_to_tokid[pos]] + ")" + ENDC
                token = token + " (" + inv_vocab[pos_to_tokid[pos]] + ")"
                if provided_update_mask:
                    assert um == 1
            else:
                if provided_update_mask:
                    assert um == 0
            # tag_print = GREEN + " _" + NAMES[tag] + "_ " + ENDC
            tag_print = " _" + NAMES[tag] + "_ "
            text += token + tag_print + " "
        utils.log(tokenization.printable_text(text))
Example #10
def main() -> None:
    docopt.docopt(__doc__, version=CONFIG['DEFAULT']['script_version'])

    db_container = CONFIG['DATABASE']['database_test_container']
    network = CONFIG['DOCKER']['test_network']
    db_port = CONFIG['DATABASE']['test_port']
    set_envars(db_port)

    docker_utils.create_network(network)
    database.start(container=db_container,
                   network=network,
                   port=db_port,
                   migrations=True)

    utils.log('Running integration tests')
    completed_process = utils.execute_cmd(
        ['./gradlew', 'integrationTest', '--info'], pipe_stderr=True)

    docker_utils.rm_container(
        docker_utils.DockerContainer(db_container, rm_volumes=True))
    docker_utils.rm_network(network)

    if completed_process.stderr:
        utils.raise_error(completed_process.stderr.decode('utf8'))
Example #11
    def write_predictions(self):
        """Write final predictions to the json file."""
        unique_id_to_result = {}
        for result in self._all_results:
            unique_id_to_result[result["unique_id"]] = result

        results = {}
        total_loss = 0.
        for example in self._eval_examples:
            example_id = example.qas_id if "squad" in self._name else example.qid
            features = self._task.featurize(example, False, for_eval=True)

            results[example_id] = []
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature[self._name + "_eid"]]
                result['targets'] = feature[self._name + "_f1_score"]

                total_loss += (result['targets'] - result['predictions'])**2

                results[example_id].append(result)
        total_loss /= len(results)

        utils.write_pickle(results, self._config.f1_predict_results_file)
        utils.log(f"total_loss: {total_loss}")
    def model_fn(features, labels, mode, params):
        """The `model_fn` for TPUEstimator."""
        utils.log("Building model...")
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        model = FinetuningModel(
            config, tasks, is_training, features, num_train_steps)

        # Load pre-trained weights from checkpoint
        init_checkpoint = config.init_checkpoint
        if pretraining_config is not None:
            init_checkpoint = tf.train.latest_checkpoint(
                pretraining_config.model_dir)
            utils.log("Using checkpoint", init_checkpoint)
        tvars = tf.trainable_variables()
        scaffold_fn = None
        if init_checkpoint:
            assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint)
            if config.use_tpu:
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        init_checkpoint, assignment_map)
                    return tf.train.Scaffold()
                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        # Build model for training or prediction
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                model.loss, config.learning_rate, num_train_steps,
                weight_decay_rate=config.weight_decay_rate,
                use_tpu=config.use_tpu,
                warmup_proportion=config.warmup_proportion,
                layerwise_lr_decay_power=config.layerwise_lr_decay,
                n_transformer_layers=model.bert_config.num_hidden_layers
            )
            output_spec = tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=model.loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn,
                training_hooks=[training_utils.ETAHook(
                    {} if config.use_tpu else dict(loss=model.loss),
                    num_train_steps, config.iterations_per_loop, config.use_tpu, 10)])
        else:
            assert mode == tf.estimator.ModeKeys.PREDICT
            output_spec = tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions=utils.flatten_dict(model.outputs),
                scaffold_fn=scaffold_fn)

        utils.log("Building complete")
        return output_spec
Example #13
    def featurize(self, example: TaggingExample, is_training, log=False):
        words_to_tokens = tokenize_and_align(self._tokenizer, example.words)
        input_ids = []
        tagged_positions = []
        for word_tokens in words_to_tokens:
            if len(words_to_tokens) + len(
                    input_ids) + 1 > self.config.max_seq_length:
                input_ids.append(self._tokenizer.vocab["[SEP]"])
                break
            if "[CLS]" not in word_tokens and "[SEP]" not in word_tokens:
                tagged_positions.append(len(input_ids))
            for token in word_tokens:
                input_ids.append(self._tokenizer.vocab[token])

        if len(input_ids) >= self.config.max_seq_length:
            utils.log(
                "ERROR: Example (len = {}) is longer than max sequence lenght {}"
                .format(len(words_to_tokens), self.config.max_seq_length))
            utils.log(" * Sentence: {}".format(' '.join(example.words)))
            flat_list = [
                item for sublist in words_to_tokens for item in sublist
            ]
            utils.log(" * Token: {}".format(' '.join(flat_list)))
            last_token = input_ids[-1]
            input_ids = input_ids[0:self.config.max_seq_length - 1]
            input_ids.append(last_token)

        pad = lambda x: x + [0] * (self.config.max_seq_length - len(x))
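        # Illustration (hypothetical max_seq_length of 8): pad([1, 1, 1]) -> [1, 1, 1, 0, 0, 0, 0, 0]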
        labels = pad(example.labels[:self.config.max_seq_length])
        labeled_positions = pad(tagged_positions)
        labels_mask = pad([1.0] * len(tagged_positions))
        segment_ids = pad([1] * len(input_ids))
        input_mask = pad([1] * len(input_ids))
        input_ids = pad(input_ids)
        assert len(input_ids) == self.config.max_seq_length
        assert len(input_mask) == self.config.max_seq_length
        assert len(segment_ids) == self.config.max_seq_length
        assert len(labels) == self.config.max_seq_length
        assert len(labels_mask) == self.config.max_seq_length

        return {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
            "task_id": self.config.task_names.index(self.name),
            self.name + "_eid": example.eid,
            self.name + "_labels": labels,
            self.name + "_labels_mask": labels_mask,
            self.name + "_labeled_positions": labeled_positions
        }
Example #14
 def evaluate_task(self, task, split="dev", return_results=True):
     """Evaluate the current model."""
     utils.log("Evaluating", task.name)
     eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
     results = self._estimator.predict(input_fn=eval_input_fn,
                                       yield_single_examples=True)
     scorer = task.get_scorer()
     for r in results:
         if r["task_id"] != len(self._tasks):  # ignore padding examples
             r = utils.nest_dict(r, self._config.task_names)
             scorer.update(r[task.name])
     if return_results:
         utils.log(task.name + ": " + scorer.results_str())
         utils.log()
         return dict(scorer.get_results())
     else:
         return scorer
Example #15
def vote1(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()
    bagging_all_nbest = collections.OrderedDict()

    for qid in qid_answers:
        bagging_preds[qid] = \
          (seq([nbest[qid][0] for nbest in all_nbest]).sorted(key=lambda x: x['probability'])).list()[-1]['text']
        bagging_all_nbest[qid] = \
          [(seq([nbest[qid][0] for nbest in all_nbest]).sorted(key=lambda x: x['probability'])).list()[-1]]
        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_pickle(
        bagging_all_nbest,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_all_nbest.pkl'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote1')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote1',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
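
The `seq(...)` chains above come from what appears to be the PyFunctional library; sorting each question's per-model first-best answers by probability and taking the last element selects the highest-probability one. A plain-Python equivalent of that selection, shown only for clarity:

best = max((nbest[qid][0] for nbest in all_nbest), key=lambda x: x['probability'])
# bagging_preds[qid] would then be best['text'], and bagging_all_nbest[qid] would be [best].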
Example #16
def vote2(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()

    for qid in qid_answers:
        preds_scores = (seq(all_nbest).map(lambda x: x[qid][0]).map(
            lambda x: (x['text'], x['probability']))).dict()
        compare = collections.defaultdict(lambda: 0.)
        for pred, score in preds_scores.items():
            compare[pred] += score
        compare = seq(compare.items()).sorted(lambda x: x[1]).reverse().list()
        bagging_preds[qid] = compare[0][0]

        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote2',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote2',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote2')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote2',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
Example #17
 def evaluate_task(self, task, split="dev", return_results=True):
     """Evaluate the current model."""
     utils.log("Evaluating", task.name, split)
     eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
     results = self._estimator.predict(input_fn=eval_input_fn,
                                       yield_single_examples=True)
     if task.name in [
             "squad", "squadv1", "newsqa", "naturalqs", "triviaqa",
             "searchqa", "cmrc2018", "drcd", "ccks42ec", "ccks42ee",
             "ccks42single", "ccks42multi", "ner", "ccks42num", "ccks42reg"
     ]:
         scorer = task.get_scorer(split)
     else:
         scorer = task.get_scorer()
     for r in results:
         if r["task_id"] != len(self._tasks):  # ignore padding examples
             r = utils.nest_dict(r, self._config.task_names)
             scorer.update(r[task.name])
     if return_results:
         utils.log(task.name + ": " + scorer.results_str())
         utils.log()
         return dict(scorer.get_results())
     else:
         return scorer
Example #18
    def featurize(self,
                  example: NerExample,
                  is_training,
                  log=False,
                  for_eval=False):
        all_features = []
        query_tokens = self._tokenizer.tokenize(example.text_b)

        if len(query_tokens) > self.config.max_query_length:
            query_tokens = query_tokens[0:self.config.max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.words):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = self._tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = self.config.max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length", "orig_start", "orig_end"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            orig_start = tok_to_orig_index[start_offset]
            orig_end = tok_to_orig_index[start_offset + length - 1]
            doc_spans.append(
                _DocSpan(start=start_offset,
                         length=length,
                         orig_start=orig_start,
                         orig_end=orig_end))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, self.config.doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(
                    tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = tagging_utils._check_is_max_context(
                    doc_spans, doc_span_index, split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            labels = example.labels[doc_span.orig_start:doc_span.orig_end + 1]
            labeled_positions = orig_to_tok_index[doc_span.orig_start:doc_span.
                                                  orig_end + 1]
            labels_mask = [1] * len(labeled_positions)

            # Zero-pad up to the sequence length.
            pad = lambda x: x + [0] * (self.config.max_seq_length - len(x))
            input_ids = pad(input_ids)
            input_mask = pad(input_mask)
            segment_ids = pad(segment_ids)
            labels = pad(labels)
            labels_mask = pad(labels_mask)
            labeled_positions = pad(labeled_positions)

            assert len(input_ids) == self.config.max_seq_length
            assert len(input_mask) == self.config.max_seq_length
            assert len(segment_ids) == self.config.max_seq_length
            assert len(labels) == self.config.max_seq_length
            assert len(labels_mask) == self.config.max_seq_length
            assert len(labeled_positions) == self.config.max_seq_length

            if log:
                utils.log("*** Example ***")
                utils.log("doc_span_index: %s" % doc_span_index)
                utils.log("doc_span_orig_start: %s" % doc_span.orig_start)
                utils.log("doc_span_start: %s" % doc_span.start)
                utils.log("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_orig_map)
                ]))
                utils.log("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                utils.log("input_ids: %s" %
                          " ".join([str(x) for x in input_ids]))
                utils.log("input_mask: %s" %
                          " ".join([str(x) for x in input_mask]))
                utils.log("segment_ids: %s" %
                          " ".join([str(x) for x in segment_ids]))
                utils.log("labels: %s" % " ".join([str(x) for x in labels]))
                utils.log("labels_mask: %s" %
                          " ".join([str(x) for x in labels_mask]))
                utils.log("labeled_positions: %s" %
                          " ".join([str(x) for x in labeled_positions]))

            features = {
                "task_id": self.config.task_names.index(self.name),
                "input_ids": input_ids,
                "input_mask": input_mask,
                "segment_ids": segment_ids,
                self.name + "_eid": (1000 * example.eid) + doc_span_index,
                self.name + "_labels": labels,
                self.name + "_labels_mask": labels_mask,
                self.name + "_labeled_positions": labeled_positions
            }
            if for_eval:
                features.update({
                    self.name + "_doc_span_index":
                    doc_span_index,
                    self.name + "_doc_span_orig_start":
                    doc_span.orig_start,
                    self.name + "_doc_span_start":
                    doc_span.start,
                    self.name + "_token_to_orig_map":
                    token_to_orig_map,
                    self.name + "_token_is_max_context":
                    token_is_max_context,
                })
            all_features.append(features)
        return all_features
Example #19
    def featurize(self,
                  example: QAExample,
                  is_training,
                  log=False,
                  for_eval=False):
        all_features = []
        query_tokens = self._tokenizer.tokenize(example.question_text)

        if len(query_tokens) > self.config.max_query_length:
            query_tokens = query_tokens[0:self.config.max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = self._tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training and example.is_impossible:
            tok_start_position = -1
            tok_end_position = -1
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position +
                                                     1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position,
                self._tokenizer, example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = self.config.max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, self.config.doc_stride)
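        # Worked illustration with hypothetical numbers: if len(all_doc_tokens) == 300,
        # max_tokens_for_doc == 128 and doc_stride == 64, the loop above yields spans
        # starting at 0, 64, 128 and 192 with lengths 128, 128, 128 and 108, so every
        # token is covered and neighbouring spans overlap by 64 tokens.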

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(
                    tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < self.config.max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == self.config.max_seq_length
            assert len(input_mask) == self.config.max_seq_length
            assert len(segment_ids) == self.config.max_seq_length

            start_position = None
            end_position = None
            if is_training and not example.is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                out_of_span = False
                if not (tok_start_position >= doc_start
                        and tok_end_position <= doc_end):
                    out_of_span = True
                if out_of_span:
                    start_position = 0
                    end_position = 0
                else:
                    doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            if is_training and example.is_impossible:
                start_position = 0
                end_position = 0

            if log:
                utils.log("*** Example ***")
                utils.log("doc_span_index: %s" % doc_span_index)
                utils.log(
                    "tokens: %s" %
                    " ".join([tokenization.printable_text(x) for x in tokens]))
                utils.log("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y)
                    for (x, y) in six.iteritems(token_to_orig_map)
                ]))
                utils.log("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y)
                    for (x, y) in six.iteritems(token_is_max_context)
                ]))
                utils.log("input_ids: %s" %
                          " ".join([str(x) for x in input_ids]))
                utils.log("input_mask: %s" %
                          " ".join([str(x) for x in input_mask]))
                utils.log("segment_ids: %s" %
                          " ".join([str(x) for x in segment_ids]))
                if is_training and example.is_impossible:
                    utils.log("impossible example")
                if is_training and not example.is_impossible:
                    answer_text = " ".join(
                        tokens[start_position:(end_position + 1)])
                    utils.log("start_position: %d" % start_position)
                    utils.log("end_position: %d" % end_position)
                    utils.log("answer: %s" %
                              (tokenization.printable_text(answer_text)))

            features = {
                "task_id": self.config.task_names.index(self.name),
                self.name + "_eid": (1000 * example.eid) + doc_span_index,
                "input_ids": input_ids,
                "input_mask": input_mask,
                "segment_ids": segment_ids,
            }
            if for_eval:
                features.update({
                    self.name + "_doc_span_index":
                    doc_span_index,
                    self.name + "_tokens":
                    tokens,
                    self.name + "_token_to_orig_map":
                    token_to_orig_map,
                    self.name + "_token_is_max_context":
                    token_is_max_context,
                })
            if is_training:
                features.update({
                    self.name + "_start_positions":
                    start_position,
                    self.name + "_end_positions":
                    end_position,
                    self.name + "_is_impossible":
                    example.is_impossible
                })
            all_features.append(features)
        return all_features
Example #20
    def _add_examples(self, examples, example_failures, paragraph, split):
        paragraph_text = paragraph["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        if self.name in [
                "sacqa", "cmrc2018", "ccks42ee", "ccks42single", "ccks42multi"
        ]:  # for chinese
            prev_is_chinese = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace or prev_is_chinese or is_chinese_char(
                            c):
                        doc_tokens.append(c)
                        prev_is_chinese = True if is_chinese_char(c) else False
                    else:
                        doc_tokens[-1] += c
                        prev_is_chinese = False
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
        else:
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
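        # Hedged illustration: for paragraph_text == "a bc" the loop above produces
        # doc_tokens == ["a", "bc"] and char_to_word_offset == [0, 0, 1, 1], i.e. every
        # character (whitespace included) maps to the index of the most recently
        # started token.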

        for qa in paragraph["qas"]:
            qas_id = qa["id"] if "id" in qa else None
            qid = qa["qid"] if "qid" in qa else None
            question_text = qa["question"]
            start_position = None
            end_position = None
            orig_answer_text = None
            is_impossible = False
            if split == "train":
                if self.v2:
                    is_impossible = qa["is_impossible"]
                if not is_impossible:
                    if "detected_answers" in qa:  # MRQA format
                        answer = qa["detected_answers"][0]
                        answer_offset = answer["char_spans"][0][0]
                    else:  # SQuAD format
                        answer = qa["answers"][0]
                        answer_offset = answer["answer_start"]
                    orig_answer_text = answer["text"]
                    answer_length = len(orig_answer_text)
                    start_position = char_to_word_offset[answer_offset]
                    if answer_offset + answer_length - 1 >= len(
                            char_to_word_offset):
                        utils.log("End position is out of document!")
                        example_failures[0] += 1
                        continue
                    end_position = char_to_word_offset[answer_offset +
                                                       answer_length - 1]

                    # Only add answers where the text can be exactly recovered from the
                    # document. If this CAN'T happen it's likely due to weird Unicode
                    # stuff so we will just skip the example.
                    #
                    # Note that this means for training mode, every example is NOT
                    # guaranteed to be preserved.
                    if self.name in [
                            "sacqa", "cmrc2018", "ccks42ee", "ccks42single",
                            "ccks42multi"
                    ]:  # for chinese, no whitespace needed
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                    else:
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                    actual_text = actual_text.lower()
                    cleaned_answer_text = cleaned_answer_text.lower()
                    if actual_text.find(cleaned_answer_text) == -1:
                        utils.log(
                            "Could not find answer: '{:}': '{:}' in doc vs. "
                            "'{:}' in provided answer".format(
                                qas_id,
                                tokenization.printable_text(actual_text),
                                tokenization.printable_text(
                                    cleaned_answer_text)))
                        example_failures[0] += 1
                        continue
                else:
                    start_position = -1
                    end_position = -1
                    orig_answer_text = ""

            example = QAExample(task_name=self.name,
                                eid=len(examples),
                                qas_id=qas_id,
                                qid=qid,
                                question_text=question_text,
                                doc_tokens=doc_tokens,
                                orig_answer_text=orig_answer_text,
                                start_position=start_position,
                                end_position=end_position,
                                is_impossible=is_impossible)
            examples.append(example)
Example #21
def get_final_text(config: configure_finetuning.FinetuningConfig, pred_text,
                   orig_text):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for i, c in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, dict(ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = tokenization.BasicTokenizer(do_lower_case=config.do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if config.debug:
            utils.log("Unable to find text: '%s' in '%s'" %
                      (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if config.debug:
            utils.log("Length not equal after stripping spaces: '%s' vs '%s'",
                      orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if config.debug:
            utils.log("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if config.debug:
            utils.log("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
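
A hedged usage sketch mirroring the example in the comment at the top of the function (`config` stands for any FinetuningConfig with `do_lower_case=True`):

final = get_final_text(config, "steve smith", "Steve Smith's")
# Expected: "Steve Smith" (the prediction projected back onto the original casing).
# If the character alignment heuristic fails, the function returns orig_text unchanged.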
Example #22
    def model_fn(features, labels, mode, params):
        """Build the model for training."""
        if config.masking_strategy == pretrain_helpers.ADVERSARIAL_STRATEGY or config.masking_strategy == pretrain_helpers.MIX_ADV_STRATEGY:
            model = AdversarialPretrainingModel(
                config, features, mode == tf.estimator.ModeKeys.TRAIN)
        elif config.masking_strategy == pretrain_helpers.RW_STRATEGY:
            ratio = []
            with open(config.ratio_file, "r") as fin:
                for line in fin:
                    line = line.strip()
                    if line:
                        tok = line.split()
                        ratio.append(float(tok[1]))
            model = RatioBasedPretrainingModel(
                config, features, ratio, mode == tf.estimator.ModeKeys.TRAIN)
        else:
            model = PretrainingModel(config, features,
                                     mode == tf.estimator.ModeKeys.TRAIN)
        utils.log("Model is built!")

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        if config.init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, config.init_checkpoint)
            tf.train.init_from_checkpoint(config.init_checkpoint,
                                          assignment_map)

        utils.log("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            utils.log("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if config.masking_strategy == pretrain_helpers.ADVERSARIAL_STRATEGY:
                student_train_op = optimization.create_optimizer(
                    model.mlm_loss,
                    config.learning_rate,
                    config.num_train_steps,
                    weight_decay_rate=config.weight_decay_rate,
                    use_tpu=config.use_tpu,
                    warmup_steps=config.num_warmup_steps,
                    lr_decay_power=config.lr_decay_power)
                teacher_train_op = optimization.create_optimizer(
                    model.teacher_loss,
                    config.teacher_learning_rate,
                    config.num_train_steps,
                    lr_decay_power=config.lr_decay_power)
                train_op = tf.group(student_train_op, teacher_train_op)
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=model.total_loss,
                    train_op=train_op,
                    training_hooks=[
                        training_utils.ETAHook(
                            dict(loss=model.mlm_loss,
                                 teacher_loss=model.teacher_loss,
                                 reward=model._baseline),
                            config.num_train_steps, config.iterations_per_loop,
                            config.use_tpu)
                    ])
            else:
                train_op = optimization.create_optimizer(
                    model.total_loss,
                    config.learning_rate,
                    config.num_train_steps,
                    weight_decay_rate=config.weight_decay_rate,
                    use_tpu=config.use_tpu,
                    warmup_steps=config.num_warmup_steps,
                    lr_decay_power=config.lr_decay_power)
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=model.total_loss,
                    train_op=train_op,
                    training_hooks=[
                        training_utils.ETAHook(dict(loss=model.total_loss),
                                               config.num_train_steps,
                                               config.iterations_per_loop,
                                               config.use_tpu)
                    ])
        elif mode == tf.estimator.ModeKeys.EVAL:
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=model.total_loss,
                eval_metric_ops=model.eval_metrics,
                evaluation_hooks=[
                    training_utils.ETAHook(dict(loss=model.total_loss),
                                           config.num_eval_steps,
                                           config.iterations_per_loop,
                                           config.use_tpu,
                                           is_training=False)
                ])
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported")
        return output_spec
    try:
        prediction = collections.OrderedDict()
        prediction['eval_all_nbest'] = filter_short_ans(
            utils.load_pickle(
                (os.path.join(dire, 'models', 'electra_large', 'results',
                              '{}_qa'.format(task_name),
                              '{}_{}_all_nbest.pkl'.format(task_name,
                                                           split)))))
        prediction['squad_null_odds'] = utils.load_json(
            (os.path.join(dire, 'models', 'electra_large', 'results',
                          '{}_qa'.format(task_name),
                          '{}_{}_null_odds.json'.format(task_name, split))))
        models_predictions[d] = prediction
    except:
        utils.log(
            "Error at loading all_nbest.pkl & null_odds.json for model {}".
            format(d))
        continue

dataset = \
  utils.load_json((os.path.join(data_dir, model_name_part, 'finetuning_data', task_name, '{}.json'.format(split))))[
    'data']
qid_answers = collections.OrderedDict()
for article in dataset:
    for p in article['paragraphs']:
        for qa in p['qas']:
            qid = qa['id']
            gold_answers = [
                a['text'] for a in qa['answers'] if normalize_answer(a['text'])
            ]
            if not gold_answers:
Example #24
def train_or_eval(config: configure_pretraining.PretrainingConfig):
    """Run pre-training or evaluate the pre-trained model."""
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    utils.log_config(config)

    num_gpus = utils.get_available_gpus()
    utils.log("Found {} gpus".format(len(num_gpus)))

    if len(num_gpus) == 1:
        session_config = tf.ConfigProto(
            log_device_placement=True,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            # save_checkpoints_secs=3600,
            # tf_random_seed=FLAGS.seed,
            session_config=session_config,
            # keep_checkpoint_max=0,
            log_step_count_steps=100)
    else:
        train_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None,
            cross_device_ops=tensorflow.contrib.distribute.
            AllReduceCrossDeviceOps('nccl', num_packs=len(num_gpus)))
        eval_distribution_strategy = tf.distribute.MirroredStrategy(
            devices=None)

        session_config = tf.ConfigProto(
            # log_device_placement=True,
            inter_op_parallelism_threads=0,
            intra_op_parallelism_threads=0,
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(allow_growth=True))

        run_config = tf.estimator.RunConfig(
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            train_distribute=train_distribution_strategy,
            eval_distribute=eval_distribution_strategy,
            # save_checkpoints_secs=3600,
            # tf_random_seed=FLAGS.seed,
            session_config=session_config,
            # keep_checkpoint_max=0,
            log_step_count_steps=100)

    model_fn = model_fn_builder(config=config)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={
                                           'train_batch_size':
                                           config.train_batch_size,
                                           'eval_batch_size':
                                           config.eval_batch_size
                                       })

    if config.do_train:
        utils.heading("Running training")
        estimator.train(input_fn=pretrain_data.get_input_fn(config, True),
                        max_steps=config.num_train_steps)
    if config.do_eval:
        utils.heading("Running evaluation")
        result = estimator.evaluate(input_fn=pretrain_data.get_input_fn(
            config, False),
                                    steps=config.num_eval_steps)
        for key in sorted(result.keys()):
            utils.log("  {:} = {:}".format(key, str(result[key])))
        return result
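
A minimal sketch of how an entry point like train_or_eval is typically invoked from the command line. The flag names, the JSON hparams handling, and the PretrainingConfig(model_name, data_dir, **kwargs) signature are assumptions modeled on the upstream ELECTRA repository, not taken from the snippet above.

import argparse
import json

import configure_pretraining  # module referenced by the type hints above


def main():
    # Hypothetical CLI wrapper around train_or_eval().
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", required=True,
                        help="Location of pretraining data and model outputs.")
    parser.add_argument("--model-name", required=True,
                        help="Name of the model being pre-trained.")
    parser.add_argument("--hparams", default="{}",
                        help="JSON dict of hyperparameter overrides.")
    args = parser.parse_args()
    train_or_eval(configure_pretraining.PretrainingConfig(
        args.model_name, args.data_dir, **json.loads(args.hparams)))


if __name__ == "__main__":
    main()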
Beispiel #25
0
 def count_params():
     n = np.sum([
         np.prod(v.get_shape().as_list())
         for v in tf.trainable_variables()
     ])
     utils.log("Model size: %dK" % (n / 1000))
Beispiel #26
0
    def featurize(self, example: InputExample, is_training, log=False):
        """Turn an InputExample into a dict of features."""
        tokens_a = self._tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = self._tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b,
                               self.config.max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self.config.max_seq_length - 2:
                tokens_a = tokens_a[0:(self.config.max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it
        # makes it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.config.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.config.max_seq_length
        assert len(input_mask) == self.config.max_seq_length
        assert len(segment_ids) == self.config.max_seq_length

        if log:
            utils.log("  Example {:}".format(example.eid))
            utils.log("    tokens: {:}".format(" ".join(
                [tokenization.printable_text(x) for x in tokens])))
            utils.log("    input_ids: {:}".format(" ".join(map(str,
                                                               input_ids))))
            utils.log("    input_mask: {:}".format(" ".join(
                map(str, input_mask))))
            utils.log("    segment_ids: {:}".format(" ".join(
                map(str, segment_ids))))

        eid = example.eid
        features = {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids,
            "task_id": self.config.task_names.index(self.name),
            self.name + "_eid": eid,
        }
        self._add_features(features, example, log)
        return features
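
To make the packing convention described in the long comment above concrete, here is a small hand-built sketch of the [CLS]/[SEP] layout, segment ids, and padding for the example sentence pair. Whitespace splitting stands in for the real WordPiece tokenizer and max_seq_length=16 is an arbitrary choice; both are illustrative assumptions.

# Toy version of the featurize() layout, without a tokenizer or vocabulary.
max_seq_length = 16
tokens_a = "is this jacksonville ?".split()
tokens_b = "no it is not .".split()

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_mask = [1] * len(tokens)

# Zero-pad up to max_seq_length, mirroring featurize() above.
padding = max_seq_length - len(tokens)
segment_ids += [0] * padding
input_mask += [0] * padding

print(tokens)
# ['[CLS]', 'is', 'this', 'jacksonville', '?', '[SEP]', 'no', 'it', 'is', 'not', '.', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]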
Beispiel #27
0
 def train(self):
     utils.log("Training for {:} steps".format(self.train_steps))
     self._estimator.train(input_fn=self._train_input_fn,
                           max_steps=self.train_steps)
Beispiel #28
0
def vote3(dataset, all_nbest, all_odds, qid_answers, qid_questions, models,
          split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()

    def post_process(question, candi, weight=1):
        question = question.lower()
        first_token = candi['text'].split()[0]
        th = 0.
        if "when" in question:
            if first_token in [
                    'before', 'after', 'about', 'around', 'from', 'during'
            ]:
                candi['probability'] += th
        elif "where" in question:
            if first_token in [
                    'in', 'at', 'on', 'behind', 'from', 'through', 'between',
                    'throughout'
            ]:
                candi['probability'] += th
        elif "whose" in question:
            if "'s" in candi['text']:
                candi['probability'] += th
        elif "which" in question:
            if first_token == "the":
                candi['probability'] += th
        candi['probability'] *= weight
        return candi

    cof = 0.2

    for qid in qid_answers:
        question = qid_questions[qid]
        post_process_candidates = (seq(zip(all_nbest, models)).map(lambda x: (
            x[0][qid], cof if 'lr_epoch_results' in x[1] else 1.)).map(
                lambda x: seq(x[0]).map(lambda y: post_process(
                    question, y, x[1])).list()).flatten()).list()
        preds_probs = collections.defaultdict(lambda: [])
        for pred in post_process_candidates:
            preds_probs[pred['text']].append(pred['probability'])
        for pred in post_process_candidates:
            preds_probs[pred['text']] = np.mean(
                preds_probs[pred['text']]).__float__()
        bagging_preds[qid] = (seq(preds_probs.items()).sorted(
            lambda x: x[1]).reverse().map(lambda x: x[0])).list()[0]
        bagging_odds[qid] = np.mean([
            odds[qid] * cof if 'lr_epoch_results' in model else odds[qid]
            for odds, model in zip(all_odds, models)
        ])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote3',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote3',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote3')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote3',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
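
Stripped of the question-type post-processing, the heart of vote3 is averaging each candidate answer's probability across the models' n-best lists and averaging the per-model null odds. A toy, self-contained sketch of that idea; the model outputs below are invented for illustration.

import collections

import numpy as np

# Two models' n-best lists for a single question id (made-up data).
all_nbest_for_qid = [
    [{'text': 'in 1887', 'probability': 0.70}, {'text': '1887', 'probability': 0.25}],
    [{'text': '1887', 'probability': 0.60}, {'text': 'in 1887', 'probability': 0.30}],
]
all_odds_for_qid = [-3.1, -2.2]  # per-model no-answer scores

# Average each candidate text's probability across models, keep the best one.
preds_probs = collections.defaultdict(list)
for nbest in all_nbest_for_qid:
    for candi in nbest:
        preds_probs[candi['text']].append(candi['probability'])
bagging_pred = max(preds_probs, key=lambda text: np.mean(preds_probs[text]))

# Average the per-model no-answer scores the same way vote3 does.
bagging_odds = float(np.mean(all_odds_for_qid))
print(bagging_pred, bagging_odds)  # -> in 1887 and roughly -2.65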
Beispiel #29
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:

        print("#################################################")
        print(tasks)

        t = vars(config)
        print(t)
        print("#################################################")

        # Create Neptune Experiment
        neptune.create_experiment(name='tf-ft', params=vars(config))

        config.model_dir = generic_model_dir + "_" + str(trial) + '_' + str(
            random.randint(0, 10000))
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            eval_result = model_runner.evaluate()
            results.append(eval_result)

            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
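
The SQuAD branch above blanks out answers whose no-answer score exceeds config.qa_na_threshold before writing them. A tiny standalone sketch of that step; the question ids, answers, odds, and threshold value are all made up.

# Toy version of the null-odds filtering used when writing SQuAD predictions.
qa_na_threshold = -2.75
preds = {"q1": "in 1887", "q2": "Denver Broncos"}
null_odds = {"q1": -5.0, "q2": 1.3}

for q, _ in preds.items():
    if null_odds[q] > qa_na_threshold:
        preds[q] = ""  # treat the question as unanswerable

print(preds)  # {'q1': 'in 1887', 'q2': ''}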
Beispiel #30
0
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )
        if config.do_predict:
            import pickle

            # Write raw predictions for each requested split; kept in a
            # separate variable so the eval `results` list above is untouched.
            for predict_split in ["dev", "train", "test"]:
                if predict_split in config.predict_split:
                    predictions = model_runner.predict(
                        tasks[0], config.predict_checkpoint_path,
                        predict_split)
                    with open("predict_{}.pickle".format(predict_split),
                              "wb") as outfile:
                        pickle.dump(predictions, outfile)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
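
The prediction pickles written in the do_predict branch can be read back with pickle.load; the file name below matches the ones written above, and what it contains depends on what ModelRunner.predict returns.

import pickle

# Load the raw predictions written during do_predict.
with open("predict_dev.pickle", "rb") as infile:
    dev_predictions = pickle.load(infile)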