Example #1
    def run(self) -> None:
        has_reader = self._dataset_reader is not None
        index = 0
        start_time = time.time()
        print('Starting prediction')
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(),
                                        self._batch_size):
                for model_input_instance, result in zip(
                        batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(
                        index, result, str(model_input_instance))
                    index = index + 1
        else:
            for batch_json in lazy_groups_of(self._get_json_data(),
                                             self._batch_size):
                for model_input_json, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json))
                    index = index + 1
        end_time = time.time()

        print('Ending prediction')
        print('Total Time = ', (end_time - start_time), ' s')
        if self._output_file is not None:
            self._output_file.close()
Example #2
    def run(self) -> None:
        has_reader = self._dataset_reader is not None
        index = 0
        if has_reader:
            for batch in tqdm(lazy_groups_of(self._get_instance_data(),
                                             self._batch_size),
                              total=self.total_size,
                              unit="batches"):
                for model_input_instance, result in zip(
                        batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(
                        index, result, str(model_input_instance))
                    index = index + 1
        else:
            for batch_json in tqdm(lazy_groups_of(self._get_json_data(),
                                                  self._batch_size),
                                   total=self.total_size,
                                   unit="batches"):
                for model_input_json, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json))
                    index = index + 1

        if self._output_file is not None:
            self._output_file.close()
Example #3
    def _memory_sized_lists(
            self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
        """
        Breaks the dataset into "memory-sized" lists of instances,
        which it yields up one at a time until it gets through a full epoch.

        For example, if the dataset is already an in-memory list, and each epoch
        represents one pass through the dataset, it just yields back the dataset.
        Whereas if the dataset is lazily read from disk and we've specified to
        load 1000 instances at a time, then it yields lists of 1000 instances each.
        """
        lazy = is_lazy(instances)

        # Get an iterator over the next epoch worth of instances.
        iterator = self._take_instances(instances, self._instances_per_epoch)

        # We have four different cases to deal with:

        # With lazy instances and no guidance about how many to load into memory,
        # we just load ``batch_size`` instances at a time:
        if lazy and self._max_instances_in_memory is None:
            yield from lazy_groups_of(iterator, self._batch_size)
        # If we specified max instances in memory, lazy or not, we just
        # load ``max_instances_in_memory`` instances at a time:
        elif self._max_instances_in_memory is not None:
            yield from lazy_groups_of(iterator, self._max_instances_in_memory)
        # If we have non-lazy instances, and we want all instances each epoch,
        # then we just yield back the list of instances:
        elif self._instances_per_epoch is None:
            yield ensure_list(instances)
        # In the final case we have non-lazy instances, we want a specific number
        # of instances each epoch, and we didn't specify how many instances to load
        # into memory. So we convert the whole iterator to a list:
        else:
            yield list(iterator)
Example #4
 def run(self) -> None:
     has_reader = self._dataset_reader is not None
     index = 0
     if has_reader:
         for batch in lazy_groups_of(self._get_instance_data(),
                                     self._batch_size):
             for model_input_instance, result in zip(
                     batch, self._predict_instances(batch)):
                 self._maybe_print_to_console_and_file(
                     index, result, str(model_input_instance))
                 index = index + 1
     else:
         ids_ = []
         vecs = []
         for batch_json in tqdm(
                 lazy_groups_of(self._get_json_data(), self._batch_size)):
             for model_input_json, result in zip(
                     batch_json, self._predict_json(batch_json)):
                 scalar_mix = (
                     torch.Tensor(
                         result['activation_encoder_layer_0']).unsqueeze(0)
                     + -20 * torch.Tensor(
                         result['activation_encoder_layer_1']).unsqueeze(0)
                     +
                     torch.Tensor(result['activation_theta']).unsqueeze(0))
                 vecs.append(scalar_mix)
                 ids_.append(
                     torch.IntTensor([model_input_json['index']
                                      ]).unsqueeze(0))
                 index = index + 1
     torch.save((torch.cat(ids_, 0), torch.cat(vecs, 0)), self._output_file)
Example #5
    def run(self) -> None:
        has_reader = self._dataset_reader is not None
        index = 0
        if has_reader:
            if self._extend_vocab:
                instances = list(self._get_instance_data())
                self._predictor._model.vocab.extend_from_instances(
                    instances=instances)
                self._predictor._model.extend_embedder_vocab()
            else:
                instances = self._get_instance_data()

            for batch in lazy_groups_of(instances, self._batch_size):
                for model_input_instance, result in zip(
                        batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(
                        index, result, str(model_input_instance))
                    index = index + 1
        else:
            for batch_json in lazy_groups_of(self._get_json_data(),
                                             self._batch_size):
                for model_input_json, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json))
                    index = index + 1

        if self._output_file is not None:
            self._output_file.close()
Example #6
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        # First break the dataset into memory-sized lists:
        for instance_list in self._memory_sized_lists(instances):
            # Group the instances into two groups by their 'is_flesh_event' field.
            flesh_iterator = \
                (instance for instance in instance_list if instance.fields['is_flesh_event'].metadata is True)
            sketch_iterator = \
                (instance for instance in instance_list if instance.fields['is_flesh_event'].metadata is False)

            # Break each memory-sized list into batches and collect them into one list so we can shuffle it.
            batches = list()
            excess: Deque[Instance] = deque()
            for batch_instances in lazy_groups_of(flesh_iterator,
                                                  self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                        batch_instances, excess):
                    batches.append(Batch(possibly_smaller_batches))
            if excess:
                batches.append(Batch(excess))
            excess: Deque[Instance] = deque()
            # Then break each memory-sized list into batches.
            for batch_instances in lazy_groups_of(sketch_iterator,
                                                  self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                        batch_instances, excess):
                    batches.append(Batch(possibly_smaller_batches))
            if excess:
                batches.append(Batch(excess))
            # shuffle
            if shuffle:
                random.shuffle(batches)
            # generate
            for batch in batches:
                yield batch
Example #7
    def _memory_sized_lists(self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
        """
        Breaks the dataset into "memory-sized" lists of instances,
        which it yields up one at a time until it gets through a full epoch.

        For example, if the dataset is already an in-memory list, and each epoch
        represents one pass through the dataset, it just yields back the dataset.
        Whereas if the dataset is lazily read from disk and we've specified to
        load 1000 instances at a time, then it yields lists of 1000 instances each.
        """
        lazy = is_lazy(instances)

        # Get an iterator over the next epoch worth of instances.
        iterator = self._take_instances(instances, self._instances_per_epoch)

        # We have four different cases to deal with:

        # With lazy instances and no guidance about how many to load into memory,
        # we just load ``batch_size`` instances at a time:
        if lazy and self._max_instances_in_memory is None:
            yield from lazy_groups_of(iterator, self._batch_size)
        # If we specified max instances in memory, lazy or not, we just
        # load ``max_instances_in_memory`` instances at a time:
        elif self._max_instances_in_memory is not None:
            yield from lazy_groups_of(iterator, self._max_instances_in_memory)
        # If we have non-lazy instances, and we want all instances each epoch,
        # then we just yield back the list of instances:
        elif self._instances_per_epoch is None:
            yield ensure_list(instances)
        # In the final case we have non-lazy instances, we want a specific number
        # of instances each epoch, and we didn't specify how many instances to load
        # into memory. So we convert the whole iterator to a list:
        else:
            yield list(iterator)
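The docstring above boils down to one helper: a lazily produced stream is consumed in fixed-size chunks. A minimal illustration with made-up data (assuming lazy_groups_of is importable from allennlp.common.util, as in these examples):

from allennlp.common.util import lazy_groups_of

def lazy_instance_stream():
    # Stand-in for a dataset reader that yields instances lazily.
    for i in range(10):
        yield f"instance_{i}"

# With lazy data and no max_instances_in_memory, chunks of batch_size (here 3)
# are yielded one at a time, so only a handful of items are ever held in memory.
for memory_sized_list in lazy_groups_of(lazy_instance_stream(), 3):
    print(len(memory_sized_list))  # 3, 3, 3, 1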
Example #8
 def predict(self, sentence: Union[str, List[str], List[List[str]],
                                   List[data.Sentence]]):
     if isinstance(sentence, str):
         return data.Sentence.from_dict(
             self.predict_json({"sentence": sentence}))
     elif isinstance(sentence, list):
         if len(sentence) == 0:
             return []
         example = sentence[0]
         if isinstance(example, str) or isinstance(example, list):
             sentences = []
             for sentences_batch in util.lazy_groups_of(
                     sentence, self.batch_size):
                 sentences_batch = self.predict_batch_json(
                     [self._to_input_json(s) for s in sentences_batch])
                 sentences.extend(sentences_batch)
             return sentences
         elif isinstance(example, data.Sentence):
             sentences = []
             for sentences_batch in util.lazy_groups_of(
                     sentence, self.batch_size):
                 sentences_batch = self.predict_batch_instance(
                     [self._to_input_instance(s) for s in sentences_batch])
                 sentences.extend(sentences_batch)
             return sentences
         else:
             raise ValueError(
                 "List must have either sentences as str, List[str] or Sentence object."
             )
     else:
         raise ValueError("Input must be either string or list of strings.")
Example #9
    def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
        """
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output["token_embedding"]
            mask = output["mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
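A hypothetical usage sketch for the method above; `elmo_bilm` stands for an instance of the ELMo module that defines it, and the vocabulary is made up:

# Precompute CNN embeddings once for a fixed vocabulary ...
vocabulary_tokens = ["the", "cat", "sat", "on", "mat"]
elmo_bilm.create_cached_cnn_embeddings(vocabulary_tokens)
# ... after which the three attributes described in the docstring are populated.
print(elmo_bilm._bos_embedding.shape)
print(elmo_bilm._eos_embedding.shape)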
Example #10
    def create_cached_cnn_embeddings(self, tokens):
        u"""
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output[u"token_embedding"]
            mask = output[u"mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
Example #11
    def run(self) -> None:
        has_reader = self._dataset_reader is not None
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                for result in self._predict_instances(batch):
                    self._maybe_print_to_console_and_file(result)
        else:
            for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                for model_input, result in zip(batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(result, json.dumps(model_input))

        if self._output_file is not None:
            self._output_file.close()
Example #12
def get_loss_per_candidate_squad(
    index,
    model,
    trigger_token_ids,
    cand_trigger_token_ids,
    vocab,
    dev_dataset,
    span_start,
    span_end,
):
    """
    Similar to get_loss_per_candidate, except that we use multiple batches (in this case 4) rather than one
    to evaluate the top trigger token candidates.
    """
    if isinstance(cand_trigger_token_ids[0], (numpy.int64, int)):
        print("Only 1 candidate for index detected, not searching")
        return trigger_token_ids
    model.get_metrics(reset=True)
    loss_per_candidate = []
    iterator = BasicIterator(batch_size=32)
    batch_count = 0
    curr_loss = 0.0
    for batch in lazy_groups_of(iterator(dev_dataset,
                                         num_epochs=1,
                                         shuffle=True),
                                group_size=1):
        if batch_count > 4:
            continue
        batch_count = batch_count + 1
        curr_loss += (evaluate_batch_squad(
            model, batch, trigger_token_ids, vocab, span_start,
            span_end)["loss"].cpu().detach().numpy())
    loss_per_candidate.append((deepcopy(trigger_token_ids), curr_loss))

    for cand_id in range(len(cand_trigger_token_ids[0])):
        temp_trigger_token_ids = deepcopy(trigger_token_ids)
        temp_trigger_token_ids[index] = cand_trigger_token_ids[index][cand_id]
        loss = 0
        batch_count = 0
        for batch in lazy_groups_of(iterator(dev_dataset,
                                             num_epochs=1,
                                             shuffle=True),
                                    group_size=1):
            if batch_count > 4:
                continue
            batch_count = batch_count + 1
            loss += (evaluate_batch_squad(
                model, batch, temp_trigger_token_ids, vocab, span_start,
                span_end)["loss"].cpu().detach().numpy())
        loss_per_candidate.append((deepcopy(temp_trigger_token_ids), loss))
    return loss_per_candidate
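A hypothetical call sketch for the function above; the model, vocab, dataset and trigger/candidate id arrays are all assumed to exist already:

losses = get_loss_per_candidate_squad(
    index=0,
    model=model,
    trigger_token_ids=trigger_token_ids,
    cand_trigger_token_ids=cand_trigger_token_ids,
    vocab=vocab,
    dev_dataset=dev_dataset,
    span_start=0,
    span_end=0,
)
# Each entry is (candidate trigger_token_ids, summed loss over the sampled batches),
# so e.g. the candidate with the highest loss can be picked as the next trigger.
best_trigger, best_loss = max(losses, key=lambda pair: pair[1])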
Example #13
def get_mixer(it1,
              ds1,
              it2,
              ds2,
              num_gpus=1,
              id1=0,
              which_mixer='bm',
              min_pct_of_ds2=0.0,
              shuffle=True):
    #Pdb().set_trace()
    num_batches1 = math.ceil(it1.get_num_batches(ds1) / num_gpus)
    if ds2 is None:
        num_batches2 = 0
    else:
        num_batches2 = math.ceil(it2.get_num_batches(ds2) / num_gpus)

    id2 = (id1 + 1) % 2
    if (which_mixer in ['em', 'bm']) or (num_batches1 == 0) or (num_batches2
                                                                == 0):
        total_batches = num_batches1 + num_batches2
        num_epochs1 = 1
        num_epochs2 = 1
    elif which_mixer == 'cm':
        total_batches = round(
            max(2 * num_batches1, 2 * min_pct_of_ds2 * num_batches2))
        # total_batches = 2*max(num_batches1, num_batches2)
        #total_batches = 2*num_batches1
        # num_epochs1 = math.ceil(total_batches/(2*num_batches1))
        num_epochs1 = round(total_batches / (2 * num_batches1))
        #num_epochs1 = 1
        num_epochs2 = math.ceil(total_batches / (2 * num_batches2))
    else:
        raise "incorrect value of which_mixer {}".format(which_mixer)

    raw_g1 = it1(ds1, num_epochs=num_epochs1, shuffle=shuffle)
    raw_g2 = it2(ds2, num_epochs=num_epochs2, shuffle=shuffle)
    g1 = lazy_groups_of(raw_g1, num_gpus)
    if ds2 is None:
        g2 = None
    else:
        g2 = lazy_groups_of(raw_g2, num_gpus)
    if (which_mixer == 'em') or (num_batches1 == 0) or (num_batches2 == 0):
        mixer = mix_generators_em(g1, g2, id1)
    elif which_mixer == 'bm':
        mixer = mix_generators_bm(g1, g2, num_batches1, num_batches2, id1)
    elif which_mixer == 'cm':
        mixer = mix_generators_cm(g1, g2, id1)
    #
    return (mixer, total_batches)
Example #14
    def validate(self, trainer: 'CallbackTrainer'):
        # If the trainer has MovingAverage objects, use their weights for validation.
        for moving_average in self.moving_averages:
            moving_average.assign_average_value()

        with torch.no_grad():
            # We have a validation set, so compute all the metrics on it.
            logger.info("Validating")

            trainer.model.eval()

            num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

            raw_val_generator = self.iterator(self.instances,
                                              num_epochs=1,
                                              shuffle=False)
            val_generator = lazy_groups_of(raw_val_generator, num_gpus)
            num_validation_batches = math.ceil(
                self.iterator.get_num_batches(self.instances) / num_gpus)
            val_generator_tqdm = Tqdm.tqdm(val_generator,
                                           total=num_validation_batches)

            batches_this_epoch = 0
            val_loss = 0
            for batch_group in val_generator_tqdm:

                loss = trainer.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()
                if self.loss_tracker is not None:
                    # Update the validation regular / irregular loss status.
                    if trainer.model._effective_encoder is trainer.model._sketch_encoder:
                        self.loss_tracker.cumulated_regular_loss += loss.detach(
                        ).cpu().numpy()
                        self.loss_tracker.regular_batch_count += 1.
                    else:
                        self.loss_tracker.cumulated_irregular_loss += loss.detach(
                        ).cpu().numpy()
                        self.loss_tracker.irregular_batch_count += 1.
                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    trainer.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

            trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                            val_loss,
                                                            batches_this_epoch,
                                                            reset=True)

        # If the trainer has a moving average, restore
        for moving_average in self.moving_averages:
            moving_average.restore()
Example #15
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            instance_list = sort_by_padding(instance_list,
                                            self._sorting_keys,
                                            self.vocab,
                                            self._padding_noise)

            batches = []
            for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
                    batches.append(Batch(possibly_smaller_batches))

            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                random.shuffle(batches)
            else:
                logger.warning("shuffle parameter is set to False,"
                               " while bucket iterators by definition change the order of your data.")
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            yield from batches
Example #16
    def _create_batches(self, instances, shuffle):
        for instance_list in self._memory_sized_lists(instances):

            instance_list = sort_by_padding(instance_list,
                                            self._sorting_keys,
                                            self.vocab,
                                            self._padding_noise)

            batches = []
            for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
                    batches.append(Batch(possibly_smaller_batches))

            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                random.shuffle(batches)
            else:
                logger.warning(u"shuffle parameter is set to False,"
                               u" while bucket iterators by definition change the order of your data.")
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            for batch in batches:
                yield batch
Example #17
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            instance_list = sort_by_padding(instance_list, self._sorting_keys,
                                            self.vocab, self._padding_noise)

            batches = []
            for batch_instances in lazy_groups_of(iter(instance_list),
                                                  self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                        batch_instances):
                    batches.append(Batch(possibly_smaller_batches))

            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                # NOTE: if shuffle is false, the data will still be in a different order
                # because of the bucket sorting.
                random.shuffle(batches)
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            yield from batches
Example #18
    def _test_epoch(self):
        """
        Validate after training an epoch

        :return: A log that contains information about validation

        Note:
            The validation metrics in log must have the key 'val_metrics'.
        """
        self.valid_iter, self.valid_num_batches = self.data_loader.get_iterator_and_num_batches(
            'test')
        self.valid_iter = lazy_groups_of(self.valid_iter, self.n_gpu_use)

        self.model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch_idx, data in enumerate(self.valid_iter):
                output = self._run_model(data)
                loss = output['loss']
                total_val_loss += loss.item()

        metrics = self.model.get_metrics(True)
        metrics.update({
            'test_loss': total_val_loss / self.valid_num_batches,
        })

        return metrics
Example #19
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        # First break the dataset into memory-sized lists:
        for instance_list in self._memory_sized_lists(instances):
            if shuffle:
                random.shuffle(instance_list)

            # Divvy up the instances based on their value of the "partition_key" field.
            hoppers: Dict[str, List[Instance]] = defaultdict(list)
            for instance in instance_list:
                partition = instance.fields[
                    self._partition_key].metadata  # type: ignore
                hoppers[partition].append(instance)

            # Get a `lazy_groups_of` iterator over each set of homogeneous instances.
            batches = {
                key: lazy_groups_of(iter(hopper), self._batch_size)
                for key, hopper in hoppers.items()
            }

            remaining = set(batches)

            # Yield batches in a round-robin fashion until none are left.
            while remaining:
                # TODO: shuffle keys before each batch creation.
                # Another approach can be to sample a task proportional to its data probability.
                # Then sample a batch from that task.
                # Data prob can be updated once the batch is sampled.
                for key, lazy_batches in batches.items():
                    if key in remaining:
                        try:
                            batch = next(lazy_batches)
                            yield Batch(batch)
                        except StopIteration:
                            remaining.remove(key)
Example #20
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            instance_list = sort_by_padding(instance_list,
                                            self._sorting_keys,
                                            self.vocab,
                                            self._padding_noise)

            batches = []
            excess: Deque[Instance] = deque()
            for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances, excess):
                    if self._skip_smaller_batches and len(possibly_smaller_batches) < self._batch_size:
                        continue
                    batches.append(Batch(possibly_smaller_batches))
            if excess and (not self._skip_smaller_batches or len(excess) == self._batch_size):
                batches.append(Batch(excess))

            # TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
            # num_gpu batches together, shuffle and then expand the groups.
            # This guards against imbalanced batches across GPUs.
            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                # NOTE: if shuffle is false, the data will still be in a different order
                # because of the bucket sorting.
                random.shuffle(batches)
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            yield from batches
Example #21
 def _create_batches(self, instances: Iterable[Instance],
                     shuffle: bool) -> Iterable[Batch]:
     # First break the dataset into memory-sized lists:
     epoch_number = self._epochs[id(instances)]
     filtered_instances = []
     for instance in instances:
         if 'epoch_numbers' in instance.fields:
             epoch_numbers = instance.fields['epoch_numbers'].array
             if epoch_number in epoch_numbers:
                 filtered_instances.append(instance)
                 continue
             elif -1 in epoch_numbers:
                 filtered_instances.append(instance)
                 continue
             else:
                 continue
         filtered_instances.append(instance)
     for instance_list in self._memory_sized_lists(filtered_instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         excess: Deque[Instance] = deque()
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                     batch_instances, excess):
                 batch = Batch(possibly_smaller_batches)
                 yield batch
         if excess:
             yield Batch(excess)
Example #22
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        # First break the dataset into memory-sized lists:
        for instance_list in self._memory_sized_lists(instances):
            if shuffle:
                random.shuffle(instance_list)

            # Divvy up the instances based on their value of the "partition_key" field.
            hoppers: Dict[str, List[Instance]] = defaultdict(list)
            for instance in instance_list:
                partition = instance.fields[self._partition_key].metadata  # type: ignore
                hoppers[partition].append(instance)

            # Get a `lazy_groups_of` iterator over each set of homogeneous instances.
            batches = {key: lazy_groups_of(iter(hopper), self._batch_size) for key, hopper in hoppers.items()}

            remaining = set(batches)

            # Yield batches in a round-robin fashion until none are left.
            while remaining:
                for key, lazy_batches in batches.items():
                    if key in remaining:
                        try:
                            batch = next(lazy_batches)
                            if not self._skip_smaller_batches or len(batch) == self._batch_size:
                                yield Batch(batch)
                        except StopIteration:
                            remaining.remove(key)
Example #23
def get_average_grad_squad(model, vocab, trigger_token_ids, dev_dataset,
                           span_start, span_end):
    """
    Same as get_average_grad() in utils.py, except that we use the entire development set to
    compute the gradient for the triggers (rather than one batch).
    """
    batch_count = 0
    optimizer = optim.Adam(model.parameters())
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    for batch in lazy_groups_of(iterator(dev_dataset,
                                         num_epochs=1,
                                         shuffle=True),
                                group_size=1):
        optimizer.zero_grad()
        utils.extracted_grads = []  # clear existing stored grads
        loss = evaluate_batch_squad(model, batch, trigger_token_ids, vocab,
                                    span_start, span_end)['loss']
        loss.backward()
        if batch_count == 0:
            grads = torch.sum(utils.extracted_grads[0], dim=0).detach()[0:len(
                trigger_token_ids)]  # index 0 is the passage
        else:
            grads += torch.sum(utils.extracted_grads[0],
                               dim=0).detach()[0:len(trigger_token_ids)]
        batch_count = batch_count + 1

    averaged_grad = grads / batch_count
    return averaged_grad.cpu()
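A hypothetical call sketch; model, vocab, trigger_token_ids and dev_dataset are assumed to exist, and span_start/span_end mark the target answer span:

averaged_grad = get_average_grad_squad(
    model, vocab, trigger_token_ids, dev_dataset, span_start=0, span_end=0)
# One averaged gradient row per trigger token.
print(averaged_grad.shape)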
Example #24
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            instance_list = sort_by_padding(instance_list,
                                            self._sorting_keys,
                                            self.vocab,
                                            self._padding_noise)

            batches = []
            excess: Deque[Instance] = deque()
            for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances, excess):
                    batches.append(Batch(possibly_smaller_batches))
            if excess:
                batches.append(Batch(excess))

            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
            if shuffle:
                # NOTE: if shuffle is false, the data will still be in a different order
                # because of the bucket sorting.
                random.shuffle(batches)
            if move_to_front:
                batches.insert(0, penultimate_batch)
                batches.insert(0, last_batch)

            yield from batches
Example #25
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        print("val gene called")
        batches_this_epoch = 0
        val_loss = 0
        try:
            few_shot = val_generator.__next__()
        except:
            print("Error could not do few shot validation")
            return val_loss, batches_this_epoch
        self.reptile_inner_update(few_shot)
        self.model.eval()

        with torch.no_grad():
            for batch_group in val_generator_tqdm:
                loss = self.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # You shouldn't necessarily have to compute a loss for validation, so we allow for
                    # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                    # currently only used as the divisor for the loss function, so we can safely only
                    # count those batches for which we actually have a loss.  If this variable ever
                    # gets used for something else, we might need to change things around a bit.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    self.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Example #26
    def _instances_to_batches(
        self, instance_iterator: Iterable[Instance], move_to_device
    ) -> Iterator[TensorDict]:
        instance_iterator = (self._index_instance(instance) for instance in instance_iterator)

        if move_to_device and self.cuda_device is not None:
            tensorize = lambda batch: nn_util.move_to_device(  # noqa: E731
                self.collate_fn(batch), self.cuda_device
            )
        else:
            tensorize = self.collate_fn

        if self.batch_sampler is not None:
            instance_chunks: Iterable[List[Instance]]

            if self.max_instances_in_memory is not None:
                instance_chunks = lazy_groups_of(instance_iterator, self.max_instances_in_memory)
            else:
                instance_chunks = [list(instance_iterator)]

            for instances in instance_chunks:
                batches = (
                    [instances[i] for i in batch_indices]
                    for batch_indices in self.batch_sampler.get_batch_indices(instances)
                )
                for batch in batches:
                    yield tensorize(batch)
        else:
            # Safe to assume this is not `None` when `self.batch_sampler` is `None`.
            assert self.batch_size is not None

            if self.shuffle:
                if self.max_instances_in_memory is not None:
                    instance_iterator = shuffle_iterable(
                        instance_iterator,
                        self.max_instances_in_memory,
                    )
                else:
                    # At this point we've already loaded the instances in memory and indexed them,
                    # so this won't take long.
                    instance_iterator = list(instance_iterator)
                    random.shuffle(instance_iterator)

            for batch in lazy_groups_of(instance_iterator, self.batch_size):
                if self.drop_last and len(batch) < self.batch_size:
                    break
                yield tensorize(batch)
Example #27
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        # First break the dataset into memory-sized lists:
        for instance_list in self._memory_sized_lists(instances):
            instances_w_epoch_num = 0
            for instance in instances:
                if "epoch_num" in instance.fields:
                    instances_w_epoch_num += 1

            print(f"\nInstances: {len(instance_list)}")

            epochs_list = list(self._epochs.values())
            assert len(epochs_list) == 1, f"Multiple epoch keys: {self._epochs}"
            epoch_num = epochs_list[0]
            if self._track_epoch:
                for instance in instance_list:
                    instance.fields["epoch_num"] = epoch_num

            supervision_dict = defaultdict(int)
            qtype_dict = defaultdict(int)
            for instance in instance_list:
                for key in self.supervision_keys:
                    supervision_dict[key] += 1 if instance[key].metadata else 0
                qtype_dict[instance["qtypes"].metadata] += 1

            print(f"QType: {qtype_dict}")

            # These QType instances will not be kept in the first curriculum even if supervised
            NO_CURRICULUM = [
                constants.COUNT_filter_find_qtype,
                constants.MAX_filter_find_qtype,
                constants.MIN_filter_find_qtype,
                constants.NUM_filter_find_qtype,
            ]

            filtered_instance_list = []
            if self.filter_instances and epoch_num < self.filter_for_epochs:
                for instance in instance_list:
                    if (
                        any(instance[key].metadata is True for key in self.supervision_keys)
                        and not instance["qtypes"].metadata in NO_CURRICULUM
                    ):
                        filtered_instance_list.append(instance)
            else:
                filtered_instance_list = instance_list

            print(f"SupervisionDict: {supervision_dict}")
            print(f"Filtered Instances: {len(filtered_instance_list)}")

            if shuffle:
                random.shuffle(filtered_instance_list)
            iterator = iter(filtered_instance_list)
            excess: Deque[Instance] = deque()
            # Then break each memory-sized list into batches.
            for batch_instances in lazy_groups_of(iterator, self._batch_size):
                for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances, excess):
                    batch = Batch(possibly_smaller_batches)
                    yield batch
            if excess:
                yield Batch(excess)
Example #28
 def test_lazy_groups_of(self):
     xs = [1, 2, 3, 4, 5, 6, 7]
     groups = util.lazy_groups_of(iter(xs), group_size=3)
     assert next(groups) == [1, 2, 3]
     assert next(groups) == [4, 5, 6]
     assert next(groups) == [7]
     with pytest.raises(StopIteration):
         _ = next(groups)
Example #29
 def retrieve_cluster(target_data):
     encoder_outs = []
     for batch in lazy_groups_of(iterator(target_data, num_epochs=1, shuffle=True), group_size=1):
         output = evaluate_batch(model, batch, trigger_token_ids=None, snli=False)
         encoder_outs.append(output['encoder_out'].detach().cpu())
         # break
     encoder_outs = torch.cat(encoder_outs, 0) #bs * 128
     return encoder_outs, torch.mean(encoder_outs,0)
Example #30
    def __iter__(self) -> Iterable[List[int]]:

        indices = self._argsort_by_padding(self.data_source)
        for group in lazy_groups_of(indices, self.batch_size):
            batch_indices = list(group)
            if self.drop_last and len(batch_indices) < self.batch_size:
                continue
            yield batch_indices
Example #31
 def test_lazy_groups_of(self):
     xs = [1, 2, 3, 4, 5, 6, 7]
     groups = util.lazy_groups_of(iter(xs), group_size=3)
     assert next(groups) == [1, 2, 3]
     assert next(groups) == [4, 5, 6]
     assert next(groups) == [7]
     with pytest.raises(StopIteration):
         _ = next(groups)
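For reference, a minimal sketch of a lazy_groups_of helper with the behaviour the test above exercises; this is an illustrative implementation, not necessarily AllenNLP's exact code:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

A = TypeVar("A")

def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]:
    """Yield successive lists of at most `group_size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group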
Example #32
    def run(self):
        has_reader = self._dataset_reader is not None
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(),
                                        self._batch_size):
                for result in self._predict_instances(batch):
                    self._maybe_print_to_console_and_file(result)
        else:
            for batch_json in lazy_groups_of(self._get_json_data(),
                                             self._batch_size):
                for model_input, result in izip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        result, json.dumps(model_input))

        if self._output_file is not None:
            self._output_file.close()
Example #33
def get_accuracy(model,
                 dev_dataset,
                 vocab,
                 trigger_token_ids=None,
                 snli=False,
                 reset_metric=True):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with
    triggers prepended for the whole dev_dataset.
    """
    if reset_metric:
        model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case
    if snli:
        iterator = BucketIterator(batch_size=128,
                                  sorting_keys=[("premise", "num_tokens")])
    else:
        iterator = BucketIterator(batch_size=128,
                                  sorting_keys=[("tokens", "num_tokens")])
    iterator.index_with(vocab)

    logits = []
    labels = []
    print_string = []
    if trigger_token_ids:
        for idx in trigger_token_ids:
            print_string += [vocab.get_token_from_index(idx)]

    for batch in lazy_groups_of(iterator(dev_dataset,
                                         num_epochs=1,
                                         shuffle=False),
                                group_size=1):
        output = evaluate_batch(model, batch, trigger_token_ids, snli)
        logits.append(output['logits'].detach().cpu().numpy())
        labels.append(output['labels'].detach().cpu().numpy())

    logits = np.concatenate(logits, 0)
    labels = np.concatenate(labels, 0)
    preds_int = np.argmax(logits, 1)
    success_idx = np.where(labels != preds_int)[0]
    acc = accuracy_score(labels, preds_int)
    if len(np.unique(labels)) > 1:
        f1_weighted = f1_score(labels, preds_int, average="weighted")
        try:
            f1 = f1_score(labels, preds_int)
        except:
            f1 = f1_weighted
    else:
        f1 = 'N/A'
        f1_weighted = 'N/A'

    try:
        auc = roc_auc_score(labels, preds_int)
        auc = "{:.4f}".format(auc)
    except:
        auc = "N/A"

    return acc, auc, f1, f1_weighted, success_idx
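A hypothetical call sketch for the function above; model, dev_dataset and vocab are assumed to exist already:

acc, auc, f1, f1_weighted, success_idx = get_accuracy(
    model, dev_dataset, vocab, trigger_token_ids=None, snli=False)
print(f"accuracy={acc:.4f}  auc={auc}  f1={f1}  weighted-f1={f1_weighted}")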
Example #34
    def _create_batches(self, instances: Iterable[Instance],
                        shuffle: bool) -> Iterable[Batch]:
        for instance_list in self._memory_sized_lists(instances):

            # WARNING: Assumes that defaultdict preserves insertion order, i.e. Python >= 3.6.
            # Divvy up the instances based on their value of the "partition_key" field.
            hoppers: Dict[str, List[Instance]] = defaultdict(list)
            for instance in instance_list:
                partition = instance.fields[
                    self._partition_key].metadata  # type: ignore
                hoppers[partition].append(instance)

            # Shuffle each partition separately
            if shuffle:
                # NOTE: if shuffle is false, the data will still be in a different order
                # because of the bucket sorting.
                for k, v in hoppers.items():
                    random.shuffle(v)

            # Sort each partition separately
            hoppers = {
                key: sort_by_padding(value, self._sorting_keys, self.vocab,
                                     self._padding_noise)
                for key, value in hoppers.items()
            }

            instance_list = sort_by_padding(instance_list, self._sorting_keys,
                                            self.vocab, self._padding_noise)
            # Get a `lazy_groups_of` iterator over each set of homogeneous instances.
            batches = {
                key: lazy_groups_of(iter(hopper), self._batch_size)
                for key, hopper in hoppers.items()
            }

            remaining = set(batches)
            # Yield batches in a round-robin fashion until none are left.

            keys = batches.keys()

            # TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
            # num_gpu batches together, shuffle and then expand the groups.
            # This guards against imbalanced batches across GPUs.
            move_to_front = self._biggest_batch_first and len(batches) > 1
            if move_to_front:
                # We'll actually pop the last _two_ batches, because the last one might not be full.
                last_batch = batches[keys[0]].pop()
                penultimate_batch = batches[keys[0]].pop()
                batches[keys[0]].insert(0, penultimate_batch)
                batches[keys[0]].insert(0, last_batch)

            while remaining:
                for key, lazy_batches in batches.items():
                    if key in remaining:
                        try:
                            batch = next(lazy_batches)
                            yield Batch(batch)
                        except StopIteration:
                            remaining.remove(key)
Example #35
 def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             yield Batch(batch_instances)
Example #36
 def _iter_batches(self) -> Iterator[TensorDict]:
     if self.shuffle:
         random.shuffle(self.instances)
     for batch in lazy_groups_of(self.iter_instances(), self.batch_size):
         tensor_dict = allennlp_collate(batch)
         if self.cuda_device is not None:
             tensor_dict = nn_util.move_to_device(tensor_dict,
                                                  self.cuda_device)
         yield tensor_dict
Example #37
 def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
     # First break the dataset into memory-sized lists:
     for instance_list in self._memory_sized_lists(instances):
         if shuffle:
             random.shuffle(instance_list)
         iterator = iter(instance_list)
         # Then break each memory-sized list into batches.
         for batch_instances in lazy_groups_of(iterator, self._batch_size):
             for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
                 batch = Batch(possibly_smaller_batches)
                 yield batch
Example #38
    def embed_sentences(self,
                        sentences: Iterable[List[str]],
                        batch_size: int = DEFAULT_BATCH_SIZE) -> Iterable[numpy.ndarray]:
        """
        Computes the ELMo embeddings for an iterable of sentences.

        Parameters
        ----------
        sentences : ``Iterable[List[str]]``, required
            An iterable of tokenized sentences.
        batch_size : ``int``, required
            The number of sentences ELMo should process at once.

        Returns
        -------
            A list of tensors, each representing the ELMo vectors for the input sentence at the same index.
        """
        for batch in lazy_groups_of(iter(sentences), batch_size):
            yield from self.embed_batch(batch)
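A hypothetical usage sketch; `elmo_embedder` stands for an ElmoEmbedder-style object exposing the method above, and the sentences are made up:

sentences = [["First", "sentence", "."], ["Another", "one", "."]]
for vectors in elmo_embedder.embed_sentences(sentences, batch_size=2):
    print(vectors.shape)  # the ELMo vectors for one input sentence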
Example #39
    def embed_sentences(self,
                        sentences: Iterable[List[str]],
                        batch_size: int = DEFAULT_BATCH_SIZE) -> Iterable[numpy.ndarray]:
        """
        Computes the ELMo embeddings for an iterable of sentences.

        Please note that ELMo has internal state and will give different results for the same input.
        See the comment under the class definition.

        Parameters
        ----------
        sentences : ``Iterable[List[str]]``, required
            An iterable of tokenized sentences.
        batch_size : ``int``, required
            The number of sentences ELMo should process at once.

        Returns
        -------
            A list of tensors, each representing the ELMo vectors for the input sentence at the same index.
        """
        for batch in lazy_groups_of(iter(sentences), batch_size):
            yield from self.embed_batch(batch)