def run(self) -> None:
    """
    Run prediction over every input, report each result and the elapsed time.

    If ``self._dataset_reader`` is set, inputs come from
    ``self._get_instance_data()`` and are predicted with
    ``self._predict_instances``; otherwise they come from
    ``self._get_json_data()`` and are predicted with ``self._predict_json``.
    Each result is passed to ``self._maybe_print_to_console_and_file``
    together with a running index and a string form of its input.

    The output file, when present, is closed even if prediction raises, so
    a failure part-way through does not leak the handle.
    """
    has_reader = self._dataset_reader is not None
    index = 0
    start_time = time.time()
    print('Starting prediction')
    try:
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(),
                                        self._batch_size):
                for model_input_instance, result in zip(
                        batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(
                        index, result, str(model_input_instance))
                    index += 1
        else:
            for batch_json in lazy_groups_of(self._get_json_data(),
                                             self._batch_size):
                for model_input_json, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json))
                    index += 1
        end_time = time.time()
        print('Ending prediction')
        print('Total Time = ', (end_time - start_time), ' s')
    finally:
        # Always release the output handle, including on error paths.
        if self._output_file is not None:
            self._output_file.close()
def run(self) -> None:
    """
    Run prediction over every input while showing a tqdm progress bar.

    If ``self._dataset_reader`` is set, inputs come from
    ``self._get_instance_data()`` and are predicted with
    ``self._predict_instances``; otherwise they come from
    ``self._get_json_data()`` and are predicted with ``self._predict_json``.
    Batches are wrapped in ``tqdm`` with ``total=self.total_size`` and
    ``unit="batches"``. Each result is passed to
    ``self._maybe_print_to_console_and_file`` with a running index.

    The output file, when present, is closed even if prediction raises.
    """
    has_reader = self._dataset_reader is not None
    index = 0
    try:
        if has_reader:
            for batch in tqdm(lazy_groups_of(self._get_instance_data(),
                                             self._batch_size),
                              total=self.total_size, unit="batches"):
                for model_input_instance, result in zip(
                        batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(
                        index, result, str(model_input_instance))
                    index += 1
        else:
            for batch_json in tqdm(lazy_groups_of(self._get_json_data(),
                                                  self._batch_size),
                                   total=self.total_size, unit="batches"):
                for model_input_json, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json))
                    index += 1
    finally:
        # Always release the output handle, including on error paths.
        if self._output_file is not None:
            self._output_file.close()
def _memory_sized_lists(
        self, instances: Iterable[Instance]) -> Iterable[List[Instance]]:
    """
    Yield one epoch's worth of instances as a series of in-memory lists.

    If the dataset is already an in-memory list and an epoch is one full
    pass, the dataset itself is yielded back. If the dataset is read lazily
    and a chunk size was configured (say 1000), lists of that many
    instances are yielded one after another.
    """
    instances_are_lazy = is_lazy(instances)
    # Iterator over however many instances make up the next epoch.
    epoch_instances = self._take_instances(instances,
                                           self._instances_per_epoch)
    chunk_size = self._max_instances_in_memory

    if chunk_size is not None:
        # An explicit in-memory limit wins, lazy or not.
        yield from lazy_groups_of(epoch_instances, chunk_size)
    elif instances_are_lazy:
        # Lazy data with no limit given: load one batch worth at a time.
        yield from lazy_groups_of(epoch_instances, self._batch_size)
    elif self._instances_per_epoch is None:
        # In-memory data, whole dataset per epoch: hand the list back.
        yield ensure_list(instances)
    else:
        # In-memory data with a fixed number of instances per epoch and no
        # in-memory limit: materialise this epoch's instances as one list.
        yield list(epoch_instances)
def run(self) -> None:
    """
    Predict over all inputs; in JSON mode, save mixed activations to disk.

    With a dataset reader configured, every instance prediction is passed
    to ``self._maybe_print_to_console_and_file`` with a running index.

    Without one, each JSON prediction is reduced to
    ``activation_encoder_layer_0 - 20 * activation_encoder_layer_1 +
    activation_theta`` (each given a new leading dimension), and the input's
    ``'index'`` value is collected alongside it. The concatenated ids and
    vectors are written with ``torch.save`` to ``self._output_file``.
    """
    if self._dataset_reader is not None:
        index = 0
        for batch in lazy_groups_of(self._get_instance_data(),
                                    self._batch_size):
            predictions = self._predict_instances(batch)
            for model_input_instance, result in zip(batch, predictions):
                self._maybe_print_to_console_and_file(
                    index, result, str(model_input_instance))
                index += 1
        return

    ids_ = []
    vecs = []
    json_batches = lazy_groups_of(self._get_json_data(), self._batch_size)
    for batch_json in tqdm(json_batches):
        predictions = self._predict_json(batch_json)
        for model_input_json, result in zip(batch_json, predictions):
            layer_0 = torch.Tensor(
                result['activation_encoder_layer_0']).unsqueeze(0)
            layer_1 = torch.Tensor(
                result['activation_encoder_layer_1']).unsqueeze(0)
            theta = torch.Tensor(result['activation_theta']).unsqueeze(0)
            # Same left-to-right arithmetic as before: (l0 + -20*l1) + theta.
            vecs.append(layer_0 + -20 * layer_1 + theta)
            ids_.append(
                torch.IntTensor([model_input_json['index']]).unsqueeze(0))
    torch.save((torch.cat(ids_, 0), torch.cat(vecs, 0)), self._output_file)
def run(self) -> None:
    """
    Run prediction over every input, optionally extending the vocabulary.

    If ``self._dataset_reader`` is set, instances come from
    ``self._get_instance_data()``. When ``self._extend_vocab`` is true they
    are first materialised into a list, the model vocabulary is extended
    from them and ``extend_embedder_vocab()`` is called on the model before
    predicting. Without a reader, JSON inputs from ``self._get_json_data()``
    are predicted with ``self._predict_json``.

    Each result is passed to ``self._maybe_print_to_console_and_file`` with
    a running index. The output file, when present, is closed even if
    prediction raises.
    """
    has_reader = self._dataset_reader is not None
    index = 0
    try:
        if has_reader:
            if self._extend_vocab:
                # Must be a list: it is consumed once for the vocabulary
                # and once more for prediction.
                instances = list(self._get_instance_data())
                self._predictor._model.vocab.extend_from_instances(
                    instances=instances)
                self._predictor._model.extend_embedder_vocab()
            else:
                instances = self._get_instance_data()
            for batch in lazy_groups_of(instances, self._batch_size):
                for model_input_instance, result in zip(
                        batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(
                        index, result, str(model_input_instance))
                    index += 1
        else:
            for batch_json in lazy_groups_of(self._get_json_data(),
                                             self._batch_size):
                for model_input_json, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json))
                    index += 1
    finally:
        # Always release the output handle, including on error paths.
        if self._output_file is not None:
            self._output_file.close()
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches whose instances all share one ``'is_flesh_event'`` value.

    Each memory-sized list is split into instances whose
    ``fields['is_flesh_event'].metadata`` is ``True`` and those where it is
    ``False`` (identity comparison, so any other value is dropped). Each
    group is batched separately, the batches are pooled, shuffled if
    ``shuffle`` is set, and then yielded.
    """
    for instance_list in self._memory_sized_lists(instances):
        batches = []
        # Flesh batches are built first, then sketch batches, so the
        # unshuffled order matches the two-pass original.
        for wanted_flag in (True, False):
            members = (
                inst for inst in instance_list
                if inst.fields['is_flesh_event'].metadata is wanted_flag)
            # Each group gets its own overflow queue.
            excess: Deque[Instance] = deque()
            for group in lazy_groups_of(members, self._batch_size):
                batches.extend(
                    Batch(small) for small in
                    self._ensure_batch_is_sufficiently_small(group, excess))
            if excess:
                batches.append(Batch(excess))

        if shuffle:
            random.shuffle(batches)

        yield from batches
def _memory_sized_lists(self,
                        instances: Iterable[Instance]) -> Iterable[List[Instance]]:
    """
    Split one epoch of the dataset into "memory-sized" lists and yield them.

    An in-memory dataset consumed in full each epoch is yielded back as a
    single list. A lazily-read dataset with a configured chunk size (e.g.
    1000) is yielded as successive lists of that many instances.
    """
    lazy = is_lazy(instances)
    # Iterator limited to the instances belonging to the next epoch.
    iterator = self._take_instances(instances, self._instances_per_epoch)

    # Pick the chunk size first; ``None`` means "do not chunk".
    if self._max_instances_in_memory is not None:
        # An explicit in-memory limit applies to lazy and non-lazy data.
        group_size = self._max_instances_in_memory
    elif lazy:
        # Lazy data and no limit given: one batch worth at a time.
        group_size = self._batch_size
    else:
        group_size = None

    if group_size is not None:
        yield from lazy_groups_of(iterator, group_size)
        return

    if self._instances_per_epoch is None:
        # Non-lazy, every instance each epoch: return the dataset as a list.
        yield ensure_list(instances)
    else:
        # Non-lazy, fixed number of instances per epoch, no in-memory limit:
        # turn the whole epoch iterator into one list.
        yield list(iterator)
def predict(self, sentence: Union[str, List[str], List[List[str]],
                                  List[data.Sentence]]):
    """
    Predict for a single raw sentence or for a list of sentences.

    * ``str``: predicted via ``predict_json`` and wrapped with
      ``data.Sentence.from_dict``.
    * empty list: returns ``[]``.
    * list whose first element is a ``str`` or ``list``: items are converted
      with ``_to_input_json`` and predicted in chunks of ``self.batch_size``
      via ``predict_batch_json``.
    * list whose first element is a ``data.Sentence``: items are converted
      with ``_to_input_instance`` and predicted via
      ``predict_batch_instance``.

    Only the first element decides how the whole list is handled.

    Raises
    ------
    ValueError
        If the input is neither a string nor a list, or the list elements
        are of an unsupported type.
    """
    if isinstance(sentence, str):
        return data.Sentence.from_dict(
            self.predict_json({"sentence": sentence}))

    if not isinstance(sentence, list):
        raise ValueError("Input must be either string or list of strings.")

    if not sentence:
        return []

    first = sentence[0]
    if isinstance(first, (str, list)):
        convert, predict_batch = self._to_input_json, self.predict_batch_json
    elif isinstance(first, data.Sentence):
        convert = self._to_input_instance
        predict_batch = self.predict_batch_instance
    else:
        raise ValueError(
            "List must have either sentences as str, List[str] or Sentence object."
        )

    predictions = []
    for chunk in util.lazy_groups_of(sentence, self.batch_size):
        predictions.extend(predict_batch([convert(item) for item in chunk]))
    return predictions
def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
    """
    Precompute and cache context-independent word vectors for ``tokens``.

    Only ``self._token_embedder`` (per the original notes, the character
    convolutions and highway layers of ELMo) is run, so later forward passes
    can look word ids up in an embedding instead of recomputing them.

    Sets three attributes:

    _word_embedding : ``Embedding``
        Built from the vectors of the words in ``tokens``.
    _bos_embedding : ``torch.Tensor``
        The vector for the BOS token.
    _eos_embedding : ``torch.Tensor``
        The vector for the EOS token.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens to precompute vectors for.
    """
    vocabulary = [ELMoCharacterMapper.bos_token,
                  ELMoCharacterMapper.eos_token] + tokens
    timesteps = 32
    batch_size = 32

    # First chunk the flat token list into rows of ``timesteps`` tokens,
    # then group ``batch_size`` rows per embedder call.
    token_rows = lazy_groups_of(iter(vocabulary), timesteps)
    device = get_device_of(next(self.parameters()))

    pieces = []
    for row_batch in lazy_groups_of(token_rows, batch_size):
        # Shape (batch_size, timesteps, 50)
        char_ids = batch_to_ids(row_batch)
        # NOTE: the model may already have been moved to a device by the
        # time this is called. When called from the constructor this will
        # probably run on the CPU, which is acceptable for a few
        # convolutions.
        if device >= 0:
            char_ids = char_ids.cuda(device)

        embedder_output = self._token_embedder(char_ids)
        stripped, _ = remove_sentence_boundaries(
            embedder_output["token_embedding"], embedder_output["mask"])
        pieces.append(stripped.view(-1, stripped.size(-1)))

    # Padding in the final batch can leave trailing rows, so clip to the
    # number of tokens actually requested.
    num_tokens = len(vocabulary)
    full_embedding = torch.cat(pieces, 0)[:num_tokens, :]
    # Rows 0 and 1 are BOS and EOS; the rest are the caller's tokens.
    word_vectors = full_embedding[2:num_tokens, :]
    vocab_size, embedding_dim = word_vectors.size()

    from allennlp.modules.token_embedders import Embedding  # type: ignore
    self._bos_embedding = full_embedding[0, :]
    self._eos_embedding = full_embedding[1, :]
    self._word_embedding = Embedding(vocab_size,  # type: ignore
                                     embedding_dim,
                                     weight=word_vectors.data,
                                     trainable=self._requires_grad,
                                     padding_index=0)
def create_cached_cnn_embeddings(self, tokens):
    u"""
    Cache uncontextual word vectors for ``tokens``.

    ``self._token_embedder`` is run once over the tokens (prefixed with the
    BOS and EOS tokens) and the resulting vectors are stored so that later
    forward passes can use an embedding lookup instead.

    Sets three attributes:

    _word_embedding : ``Embedding``
        Built from the vectors of the words in ``tokens``.
    _bos_embedding : ``torch.Tensor``
        The vector for the BOS token.
    _eos_embedding : ``torch.Tensor``
        The vector for the EOS token.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        The tokens to precompute vectors for.
    """
    all_tokens = [ELMoCharacterMapper.bos_token,
                  ELMoCharacterMapper.eos_token] + tokens
    token_count = len(all_tokens)
    timesteps = 32
    batch_size = 32

    device = get_device_of(next(self.parameters()))
    # Rows of ``timesteps`` tokens, consumed ``batch_size`` rows at a time.
    rows = lazy_groups_of(iter(all_tokens), timesteps)

    collected = []
    for batch in lazy_groups_of(rows, batch_size):
        # Shape (batch_size, timesteps, 50)
        ids = batch_to_ids(batch)
        # NOTE: handles a model that was already placed on a device. When
        # called from the constructor this will probably run on the CPU;
        # it is only a few convolutions.
        if device >= 0:
            ids = ids.cuda(device)

        result = self._token_embedder(ids)
        vectors, _ = remove_sentence_boundaries(result[u"token_embedding"],
                                                result[u"mask"])
        collected.append(vectors.view(-1, vectors.size(-1)))

    # Drop trailing rows that came from padding in the last batch.
    full_embedding = torch.cat(collected, 0)[:token_count, :]
    # Skip the BOS and EOS rows to get the word table.
    embedding = full_embedding[2:token_count, :]
    vocab_size, embedding_dim = embedding.size()

    from allennlp.modules.token_embedders import Embedding  # type: ignore
    self._bos_embedding = full_embedding[0, :]
    self._eos_embedding = full_embedding[1, :]
    self._word_embedding = Embedding(vocab_size,  # type: ignore
                                     embedding_dim,
                                     weight=embedding.data,
                                     trainable=self._requires_grad,
                                     padding_index=0)
def run(self) -> None:
    """
    Run prediction over every input and report each result.

    If ``self._dataset_reader`` is set, instances from
    ``self._get_instance_data()`` are predicted with
    ``self._predict_instances`` and only the result is reported. Otherwise
    JSON inputs from ``self._get_json_data()`` are predicted with
    ``self._predict_json`` and the result is reported with the JSON-dumped
    input.

    The output file, when present, is closed even if prediction raises.
    """
    has_reader = self._dataset_reader is not None
    try:
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(),
                                        self._batch_size):
                for result in self._predict_instances(batch):
                    self._maybe_print_to_console_and_file(result)
        else:
            for batch_json in lazy_groups_of(self._get_json_data(),
                                             self._batch_size):
                for model_input, result in zip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        result, json.dumps(model_input))
    finally:
        # Always release the output handle, including on error paths.
        if self._output_file is not None:
            self._output_file.close()
def get_loss_per_candidate_squad(
        index,
        model,
        trigger_token_ids,
        cand_trigger_token_ids,
        vocab,
        dev_dataset,
        span_start,
        span_end,
):
    """
    Score the current trigger and every candidate replacement at ``index``.

    Similar to get_loss_per_candidate, except that the loss is summed over
    several batches (the first five produced by a shuffled
    ``BasicIterator(batch_size=32)``) rather than a single one.

    Parameters
    ----------
    index : position in ``trigger_token_ids`` being replaced.
    model, vocab, span_start, span_end : forwarded to
        ``evaluate_batch_squad``.
    trigger_token_ids : the current trigger token ids.
    cand_trigger_token_ids : per-position candidate token ids. If its first
        element is a plain integer there is nothing to search over.
    dev_dataset : dataset the batches are drawn from.

    Returns
    -------
    ``trigger_token_ids`` unchanged when only one candidate exists;
    otherwise a list of ``(token_ids, summed_loss)`` tuples, the first
    entry being the unmodified trigger.
    """
    if isinstance(cand_trigger_token_ids[0], (numpy.int64, int)):
        print("Only 1 candidate for index detected, not searching")
        return trigger_token_ids

    model.get_metrics(reset=True)
    iterator = BasicIterator(batch_size=32)
    max_batches = 5  # the original counter check (> 4) admits five batches

    def _summed_loss(token_ids, total):
        """Add the loss of ``token_ids`` over the first batches to ``total``."""
        evaluated = 0
        for batch in lazy_groups_of(
                iterator(dev_dataset, num_epochs=1, shuffle=True),
                group_size=1):
            if evaluated >= max_batches:
                # Stop here rather than walking the rest of the dataset
                # without evaluating anything.
                break
            evaluated += 1
            total += (evaluate_batch_squad(
                model, batch, token_ids, vocab, span_start,
                span_end)["loss"].cpu().detach().numpy())
        return total

    loss_per_candidate = [
        (deepcopy(trigger_token_ids), _summed_loss(trigger_token_ids, 0.0))
    ]
    for cand_id in range(len(cand_trigger_token_ids[0])):
        temp_trigger_token_ids = deepcopy(trigger_token_ids)
        temp_trigger_token_ids[index] = cand_trigger_token_ids[index][cand_id]
        loss = _summed_loss(temp_trigger_token_ids, 0)
        loss_per_candidate.append((deepcopy(temp_trigger_token_ids), loss))
    return loss_per_candidate
def get_mixer(it1,
              ds1,
              it2,
              ds2,
              num_gpus=1,
              id1=0,
              which_mixer='bm',
              min_pct_of_ds2=0.0,
              shuffle=True):
    """
    Build a generator that mixes batch groups from two datasets.

    Parameters
    ----------
    it1, it2 : iterators exposing ``get_num_batches(ds)`` and callable as
        ``it(ds, num_epochs=..., shuffle=...)``.
    ds1, ds2 : the datasets; ``ds2`` may be ``None``.
    num_gpus : batches per group (passed to ``lazy_groups_of``).
    id1 : forwarded to the ``mix_generators_*`` helper.
    which_mixer : one of ``'em'``, ``'bm'`` or ``'cm'``.
    min_pct_of_ds2 : for ``'cm'``, scales how much of ``ds2`` contributes to
        the total number of batches.
    shuffle : forwarded to both iterators.

    Returns
    -------
    ``(mixer, total_batches)``

    Raises
    ------
    ValueError
        If ``which_mixer`` is not recognised and both datasets have batches.
    """
    num_batches1 = math.ceil(it1.get_num_batches(ds1) / num_gpus)
    if ds2 is None:
        num_batches2 = 0
    else:
        num_batches2 = math.ceil(it2.get_num_batches(ds2) / num_gpus)

    # With either side empty there is nothing to balance, so 'cm' falls
    # back to a single pass (this also avoids dividing by zero below).
    either_empty = (num_batches1 == 0) or (num_batches2 == 0)

    if (which_mixer in ['em', 'bm']) or either_empty:
        total_batches = num_batches1 + num_batches2
        num_epochs1 = 1
        num_epochs2 = 1
    elif which_mixer == 'cm':
        total_batches = round(
            max(2 * num_batches1, 2 * min_pct_of_ds2 * num_batches2))
        num_epochs1 = round(total_batches / (2 * num_batches1))
        num_epochs2 = math.ceil(total_batches / (2 * num_batches2))
    else:
        # A bare string cannot be raised in Python 3; use a real exception.
        raise ValueError(
            "incorrect value of which_mixer {}".format(which_mixer))

    g1 = lazy_groups_of(it1(ds1, num_epochs=num_epochs1, shuffle=shuffle),
                        num_gpus)
    if ds2 is None:
        # Do not call the second iterator at all without a dataset.
        g2 = None
    else:
        g2 = lazy_groups_of(
            it2(ds2, num_epochs=num_epochs2, shuffle=shuffle), num_gpus)

    if (which_mixer == 'em') or either_empty:
        mixer = mix_generators_em(g1, g2, id1)
    elif which_mixer == 'bm':
        mixer = mix_generators_bm(g1, g2, num_batches1, num_batches2, id1)
    else:  # 'cm' -- any other value was rejected above
        mixer = mix_generators_cm(g1, g2, id1)

    return (mixer, total_batches)
def validate(self, trainer: 'CallbackTrainer'):
    """
    Run one validation pass and store the metrics on ``trainer.val_metrics``.

    Moving-average weights (if any) are assigned for the pass and restored
    afterwards. If ``self.loss_tracker`` is set, each batch loss is also
    added to its "regular" totals when the model's ``_effective_encoder``
    is its ``_sketch_encoder``, and to the "irregular" totals otherwise.
    """
    # Validate with the averaged weights of any MovingAverage objects.
    for moving_average in self.moving_averages:
        moving_average.assign_average_value()

    with torch.no_grad():
        logger.info("Validating")
        model = trainer.model
        model.eval()

        num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access
        val_generator = lazy_groups_of(
            self.iterator(self.instances, num_epochs=1, shuffle=False),
            num_gpus)
        num_validation_batches = math.ceil(
            self.iterator.get_num_batches(self.instances) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)

        batches_this_epoch = 0
        val_loss = 0
        for batch_group in val_generator_tqdm:
            loss = trainer.batch_loss(batch_group, for_training=False)
            if loss is not None:
                # A validation loss is optional, so ``loss`` may be None.
                # ``batches_this_epoch`` is only the divisor for the mean
                # loss, hence only batches that produced a loss are counted.
                loss_value = loss.detach().cpu().numpy()
                batches_this_epoch += 1
                val_loss += loss_value

                tracker = self.loss_tracker
                if tracker is not None:
                    # update validation regular / irregular loss status
                    if trainer.model._effective_encoder is trainer.model._sketch_encoder:
                        tracker.cumulated_regular_loss += loss_value
                        tracker.regular_batch_count += 1.
                    else:
                        tracker.cumulated_irregular_loss += loss_value
                        tracker.irregular_batch_count += 1.

            # Refresh the progress-bar text with the running metrics.
            val_metrics = training_util.get_metrics(
                trainer.model, val_loss, batches_this_epoch)
            val_generator_tqdm.set_description(
                training_util.description_from_metrics(val_metrics),
                refresh=False)

        trainer.val_metrics = training_util.get_metrics(trainer.model,
                                                        val_loss,
                                                        batches_this_epoch,
                                                        reset=True)

    # Put the original (non-averaged) weights back.
    for moving_average in self.moving_averages:
        moving_average.restore()
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches built from padding-sorted instances.

    Each memory-sized list is sorted with ``sort_by_padding``, cut into
    batches of ``self._batch_size`` (further split by
    ``_ensure_batch_is_sufficiently_small``), optionally shuffled, and
    yielded. With ``self._biggest_batch_first`` the last two batches are
    held out of the shuffle and placed at the front.
    """
    for instance_list in self._memory_sized_lists(instances):
        ordered = sort_by_padding(instance_list, self._sorting_keys,
                                  self.vocab, self._padding_noise)

        batches = [
            Batch(small_batch)
            for group in lazy_groups_of(iter(ordered), self._batch_size)
            for small_batch in self._ensure_batch_is_sufficiently_small(group)
        ]

        front = []
        if self._biggest_batch_first and len(batches) > 1:
            # Take the last _two_ batches, since the final one may not be full.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
            front = [last_batch, penultimate_batch]

        if shuffle:
            random.shuffle(batches)
        else:
            logger.warning("shuffle parameter is set to False,"
                           " while bucket iterators by definition change the order of your data.")

        yield from front + batches
def _create_batches(self, instances, shuffle):
    """
    Yield batches built from padding-sorted instances.

    Each memory-sized list is sorted with ``sort_by_padding``, cut into
    batches of ``self._batch_size`` (further split by
    ``_ensure_batch_is_sufficiently_small``), optionally shuffled, and
    yielded. With ``self._biggest_batch_first`` the last two batches are
    held out of the shuffle and placed at the front. A warning is logged
    when ``shuffle`` is false.
    """
    for instance_list in self._memory_sized_lists(instances):

        instance_list = sort_by_padding(instance_list,
                                        self._sorting_keys,
                                        self.vocab,
                                        self._padding_noise)

        batches = []
        for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
            for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
                batches.append(Batch(possibly_smaller_batches))

        move_to_front = self._biggest_batch_first and len(batches) > 1
        if move_to_front:
            # We'll actually pop the last _two_ batches, because the last one might not be full.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
        if shuffle:
            random.shuffle(batches)
        else:
            logger.warning(u"shuffle parameter is set to False,"
                           u" while bucket iterators by definition change the order of your data.")
        if move_to_front:
            batches.insert(0, penultimate_batch)
            batches.insert(0, last_batch)

        # ``batches`` is a list, which has no ``.next()`` method. A plain
        # loop is the portable stand-in for ``yield from`` and works on
        # both Python 2 and Python 3.
        for batch in batches:
            yield batch
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches of similarly-padded instances.

    Instances in each memory-sized list are sorted with ``sort_by_padding``
    and batched. ``shuffle`` randomises the batch order only; the order of
    the data differs from the input even without it because of the sort.
    When ``self._biggest_batch_first`` is set and more than one batch
    exists, the last two batches skip the shuffle and are emitted first.
    """
    for chunk in self._memory_sized_lists(instances):
        sorted_chunk = sort_by_padding(chunk, self._sorting_keys,
                                       self.vocab, self._padding_noise)

        batches = []
        for group in lazy_groups_of(iter(sorted_chunk), self._batch_size):
            batches.extend(
                Batch(piece)
                for piece in self._ensure_batch_is_sufficiently_small(group))

        if self._biggest_batch_first and len(batches) > 1:
            # Hold back the last _two_ batches; the very last may be partial.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
            leading = [last_batch, penultimate_batch]
        else:
            leading = []

        if shuffle:
            random.shuffle(batches)

        yield from leading
        yield from batches
def _test_epoch(self):
    """
    Evaluate the model on the 'test' split.

    The iterator and batch count come from
    ``self.data_loader.get_iterator_and_num_batches('test')`` and are stored
    on ``self.valid_iter`` / ``self.valid_num_batches``; the iterator is
    grouped into chunks of ``self.n_gpu_use``.

    :return: The dict from ``self.model.get_metrics(True)`` with an added
        ``'test_loss'`` entry: the summed loss divided by
        ``self.valid_num_batches``.
    """
    test_iter, num_batches = self.data_loader.get_iterator_and_num_batches(
        'test')
    self.valid_num_batches = num_batches
    self.valid_iter = lazy_groups_of(test_iter, self.n_gpu_use)

    self.model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for data in self.valid_iter:
            total_val_loss += self._run_model(data)['loss'].item()

    metrics = self.model.get_metrics(True)
    metrics.update(test_loss=total_val_loss / self.valid_num_batches)
    return metrics
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches that each hold instances from a single partition.

    Instances in every memory-sized list are grouped by the metadata of
    their ``self._partition_key`` field. One batch is then taken from each
    partition in turn, in first-seen order, until every partition is
    exhausted.
    """
    for instance_list in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(instance_list)

        # Sort the instances into one hopper per partition value.
        hoppers: Dict[str, List[Instance]] = defaultdict(list)
        for instance in instance_list:
            key = instance.fields[self._partition_key].metadata  # type: ignore
            hoppers[key].append(instance)

        # One lazy batch stream per partition; exhausted ones are removed.
        active = {
            key: lazy_groups_of(iter(hopper), self._batch_size)
            for key, hopper in hoppers.items()
        }

        # TODO: shuffle keys before each batch creation.
        # Another approach can be to sample a task proportional to its data
        # probability, then sample a batch from that task. Data prob can be
        # updated once the batch is sampled.
        exhausted = object()
        while active:
            # Snapshot the items so entries can be deleted mid-round.
            for key, stream in list(active.items()):
                batch = next(stream, exhausted)
                if batch is exhausted:
                    del active[key]
                else:
                    yield Batch(batch)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield padding-sorted batches, optionally dropping undersized ones.

    Each memory-sized list is sorted with ``sort_by_padding`` and batched.
    When ``self._skip_smaller_batches`` is set, batches with fewer than
    ``self._batch_size`` instances are dropped, and leftover ``excess``
    instances are kept only if they form exactly one full batch.
    ``self._biggest_batch_first`` moves the last two batches to the front,
    outside of any shuffling.
    """
    for instance_list in self._memory_sized_lists(instances):
        ordered = sort_by_padding(instance_list, self._sorting_keys,
                                  self.vocab, self._padding_noise)

        batches = []
        excess: Deque[Instance] = deque()
        for group in lazy_groups_of(iter(ordered), self._batch_size):
            for piece in self._ensure_batch_is_sufficiently_small(group, excess):
                keep = not self._skip_smaller_batches or len(piece) >= self._batch_size
                if keep:
                    batches.append(Batch(piece))
        if excess:
            if not self._skip_smaller_batches or len(excess) == self._batch_size:
                batches.append(Batch(excess))

        # TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
        # num_gpu batches together, shuffle and then expand the groups.
        # This guards against imbalanced batches across GPUs.
        head = []
        if self._biggest_batch_first and len(batches) > 1:
            # The last _two_ batches are taken, as the last may not be full.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
            head = [last_batch, penultimate_batch]

        if shuffle:
            # NOTE: even without shuffling the data order differs from the
            # input because of the bucket sorting.
            random.shuffle(batches)

        yield from head + batches
def _create_batches(self, instances: Iterable[Instance],
                    shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches of the instances that are active in the current epoch.

    The current epoch is looked up in ``self._epochs`` by ``id(instances)``.
    An instance without an ``'epoch_numbers'`` field is always kept; one
    with the field is kept only if its array contains the current epoch
    number or ``-1``.
    """
    epoch_number = self._epochs[id(instances)]

    def _is_active(instance) -> bool:
        """Tell whether ``instance`` takes part in the current epoch."""
        if 'epoch_numbers' not in instance.fields:
            return True
        epoch_numbers = instance.fields['epoch_numbers'].array
        return epoch_number in epoch_numbers or -1 in epoch_numbers

    filtered_instances = [inst for inst in instances if _is_active(inst)]

    # Break the kept instances into memory-sized lists, then into batches.
    for instance_list in self._memory_sized_lists(filtered_instances):
        if shuffle:
            random.shuffle(instance_list)

        excess: Deque[Instance] = deque()
        for group in lazy_groups_of(iter(instance_list), self._batch_size):
            for piece in self._ensure_batch_is_sufficiently_small(
                    group, excess):
                yield Batch(piece)
        if excess:
            yield Batch(excess)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield single-partition batches in round-robin order.

    Instances in every memory-sized list are grouped by the metadata of
    their ``self._partition_key`` field; partitions then take turns
    contributing one batch each until all are exhausted. When
    ``self._skip_smaller_batches`` is set, batches that do not have exactly
    ``self._batch_size`` instances are silently dropped.
    """
    for instance_list in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(instance_list)

        # Bucket the instances by their partition value.
        hoppers: Dict[str, List[Instance]] = defaultdict(list)
        for instance in instance_list:
            hoppers[instance.fields[self._partition_key].metadata].append(instance)  # type: ignore

        # A lazy stream of batches for each homogeneous bucket.
        streams = {}
        for key, hopper in hoppers.items():
            streams[key] = lazy_groups_of(iter(hopper), self._batch_size)

        remaining = set(streams)
        while remaining:
            for key, stream in streams.items():
                if key not in remaining:
                    continue
                batch = next(stream, None)
                if batch is None:
                    # This partition has no more batches.
                    remaining.remove(key)
                    continue
                if not self._skip_smaller_batches or len(batch) == self._batch_size:
                    yield Batch(batch)
def get_average_grad_squad(model, vocab, trigger_token_ids, dev_dataset,
                           span_start, span_end):
    """
    Average the trigger-token gradients over the whole development set.

    Same as get_average_grad() in utils.py, except that every batch of
    ``dev_dataset`` is used rather than a single one. For each batch the
    loss from ``evaluate_batch_squad`` is backpropagated, and
    ``utils.extracted_grads[0]`` is summed over dimension 0 and truncated to
    the first ``len(trigger_token_ids)`` rows.

    Side effect: ``utils.extracted_grads`` is reset to ``[]`` before each
    batch.

    Returns
    -------
    The per-batch gradients summed and divided by the number of batches,
    moved to the CPU.

    Raises
    ------
    ValueError
        If ``dev_dataset`` yields no batches (previously this surfaced as an
        UnboundLocalError).
    """
    optimizer = optim.Adam(model.parameters())
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)

    num_trigger_tokens = len(trigger_token_ids)
    grads = None
    batch_count = 0
    for batch in lazy_groups_of(iterator(dev_dataset,
                                         num_epochs=1,
                                         shuffle=True),
                                group_size=1):
        optimizer.zero_grad()
        utils.extracted_grads = []  # clear existing stored grads
        loss = evaluate_batch_squad(model, batch, trigger_token_ids, vocab,
                                    span_start, span_end)['loss']
        loss.backward()

        # index 0 is passage
        batch_grad = torch.sum(utils.extracted_grads[0],
                               dim=0).detach()[0:num_trigger_tokens]
        if grads is None:
            grads = batch_grad
        else:
            grads += batch_grad
        batch_count += 1

    if grads is None:
        raise ValueError(
            "dev_dataset produced no batches; cannot average gradients")

    averaged_grad = grads / batch_count
    return averaged_grad.cpu()
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches of similarly-padded instances, including any overflow.

    Each memory-sized list is sorted with ``sort_by_padding`` and batched;
    instances left in the shared ``excess`` queue afterwards form one
    final batch. ``self._biggest_batch_first`` moves the last two batches
    to the front, outside of any shuffling.
    """
    for chunk in self._memory_sized_lists(instances):
        chunk = sort_by_padding(chunk, self._sorting_keys, self.vocab,
                                self._padding_noise)

        excess: Deque[Instance] = deque()
        batches = [
            Batch(piece)
            for group in lazy_groups_of(iter(chunk), self._batch_size)
            for piece in self._ensure_batch_is_sufficiently_small(group, excess)
        ]
        if excess:
            batches.append(Batch(excess))

        pinned = []
        if self._biggest_batch_first and len(batches) > 1:
            # Pin the last _two_ batches, as the final one may be partial.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
            pinned = [last_batch, penultimate_batch]

        if shuffle:
            # NOTE: without shuffling the order still differs from the
            # input, because of the bucket sorting.
            random.shuffle(batches)

        yield from pinned
        yield from batches
def _validation_loss(self) -> Tuple[float, int]:
    """
    Computes the validation loss after a few-shot inner update.

    The first group of validation batches is passed to
    ``self.reptile_inner_update``; the remaining groups are evaluated
    without gradients.

    Returns
    -------
    ``(val_loss, batches_this_epoch)``: the summed loss and the number of
    batch groups that produced a loss. Both are ``0`` when no few-shot
    group could be drawn.

    Moving-average weights (if any) are assigned for the pass and are
    always restored, including on the early return and on errors.
    """
    logger.info("Validating")

    # Replace parameter values with the shadow values from the moving averages.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    batches_this_epoch = 0
    val_loss = 0
    try:
        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        num_gpus = len(self._cuda_devices)

        raw_val_generator = val_iterator(self._validation_data,
                                         num_epochs=1,
                                         shuffle=False)
        val_generator = lazy_groups_of(raw_val_generator, num_gpus)
        num_validation_batches = math.ceil(
            val_iterator.get_num_batches(self._validation_data) / num_gpus)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        print("val gene called")

        try:
            few_shot = next(val_generator)
        except Exception:  # deliberately broad: few-shot setup is best-effort
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Error could not do few shot validation")
            return val_loss, batches_this_epoch

        self.reptile_inner_update(few_shot)
        self.model.eval()
        with torch.no_grad():
            for batch_group in val_generator_tqdm:

                loss = self.batch_loss(batch_group, for_training=False)
                if loss is not None:
                    # A validation loss is optional, so ``loss`` may be
                    # None. ``batches_this_epoch`` is only the divisor for
                    # the mean loss, so only batches that produced a loss
                    # are counted.
                    batches_this_epoch += 1
                    val_loss += loss.detach().cpu().numpy()

                # Update the description with the latest metrics
                val_metrics = training_util.get_metrics(
                    self.model, val_loss, batches_this_epoch)
                description = training_util.description_from_metrics(
                    val_metrics)
                val_generator_tqdm.set_description(description,
                                                   refresh=False)
    finally:
        # Now restore the original parameter values, on every exit path.
        if self._moving_average is not None:
            self._moving_average.restore()

    return val_loss, batches_this_epoch
def _instances_to_batches(
    self, instance_iterator: Iterable[Instance], move_to_device
) -> Iterator[TensorDict]:
    """
    Index the instances, group them into batches and yield collated batches.

    Every instance is passed through ``self._index_instance``. Batches are
    collated with ``self.collate_fn`` and, when ``move_to_device`` is truthy
    and ``self.cuda_device`` is set, moved to that device.

    With a ``batch_sampler``, instances are loaded in chunks of
    ``max_instances_in_memory`` (or all at once) and batches follow the
    sampler's indices. Without one, instances are optionally shuffled and
    cut into groups of ``batch_size``; with ``drop_last`` iteration stops at
    the first short batch.
    """
    indexed = (self._index_instance(instance) for instance in instance_iterator)

    if move_to_device and self.cuda_device is not None:

        def tensorize(batch):
            return nn_util.move_to_device(self.collate_fn(batch), self.cuda_device)

    else:
        tensorize = self.collate_fn

    if self.batch_sampler is not None:
        instance_chunks: Iterable[List[Instance]]
        if self.max_instances_in_memory is None:
            instance_chunks = [list(indexed)]
        else:
            instance_chunks = lazy_groups_of(indexed, self.max_instances_in_memory)

        for chunk in instance_chunks:
            for batch_indices in self.batch_sampler.get_batch_indices(chunk):
                yield tensorize([chunk[i] for i in batch_indices])
        return

    # Safe to assume this is not `None` when `self.batch_sampler` is `None`.
    assert self.batch_size is not None

    if self.shuffle:
        if self.max_instances_in_memory is None:
            # Everything is already loaded and indexed by now, so
            # materialising the list is cheap.
            indexed = list(indexed)
            random.shuffle(indexed)
        else:
            indexed = shuffle_iterable(
                indexed,
                self.max_instances_in_memory,
            )

    for batch in lazy_groups_of(indexed, self.batch_size):
        if self.drop_last and len(batch) < self.batch_size:
            break
        yield tensorize(batch)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches built from ``instances``, optionally restricted to a curriculum.

    For every memory-sized list of instances this method:

    1. optionally stamps each instance with the current epoch number
       (``self._track_epoch``);
    2. prints per-question-type and per-supervision-key counts;
    3. while ``self.filter_instances`` is set and the epoch number is below
       ``self.filter_for_epochs``, keeps only instances that have at least one
       supervision key whose metadata is ``True`` and whose question type is not
       in the excluded list;
    4. optionally shuffles the surviving instances and cuts them into batches of
       ``self._batch_size``, further split by
       ``self._ensure_batch_is_sufficiently_small``.

    Parameters
    ----------
    instances : ``Iterable[Instance]``
        The instances to batch.
    shuffle : ``bool``
        Whether to shuffle each memory-sized list before batching.
    """
    # These QType instances will not be kept in the first curriculum even if supervised
    no_curriculum_qtypes = (
        constants.COUNT_filter_find_qtype,
        constants.MAX_filter_find_qtype,
        constants.MIN_filter_find_qtype,
        constants.NUM_filter_find_qtype,
    )

    # First break the dataset into memory-sized lists:
    for instance_list in self._memory_sized_lists(instances):
        print(f"\nInstances: {len(instance_list)}")

        epochs_list = list(self._epochs.values())
        assert len(epochs_list) == 1, f"Multiple epoch keys: {self._epochs}"
        epoch_num = epochs_list[0]
        if self._track_epoch:
            for instance in instance_list:
                instance.fields["epoch_num"] = epoch_num

        supervision_dict = defaultdict(int)
        qtype_dict = defaultdict(int)
        for instance in instance_list:
            for key in self.supervision_keys:
                # Adding 0 still registers the key, so every key shows up in the printout.
                supervision_dict[key] += 1 if instance[key].metadata else 0
            qtype_dict[instance["qtypes"].metadata] += 1

        print(f"QType: {qtype_dict}")

        if self.filter_instances and epoch_num < self.filter_for_epochs:
            filtered_instance_list = [
                instance
                for instance in instance_list
                if any(instance[key].metadata is True for key in self.supervision_keys)
                and instance["qtypes"].metadata not in no_curriculum_qtypes
            ]
        else:
            filtered_instance_list = instance_list

        print(f"SupervisionDict: {supervision_dict}")
        print(f"Filtered Instances: {len(filtered_instance_list)}")

        if shuffle:
            random.shuffle(filtered_instance_list)
        iterator = iter(filtered_instance_list)
        excess: Deque[Instance] = deque()
        # Then break each memory-sized list into batches.
        for batch_instances in lazy_groups_of(iterator, self._batch_size):
            for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(
                batch_instances, excess
            ):
                yield Batch(possibly_smaller_batches)
        # Flush whatever was left over from this memory-sized list.
        if excess:
            yield Batch(excess)
def test_lazy_groups_of(self):
    """``lazy_groups_of`` yields full groups, then the short remainder, then stops."""
    grouped = util.lazy_groups_of(iter([1, 2, 3, 4, 5, 6, 7]), group_size=3)

    for expected_group in ([1, 2, 3], [4, 5, 6], [7]):
        assert next(grouped) == expected_group

    # The iterator must be exhausted once the remainder has been produced.
    with pytest.raises(StopIteration):
        next(grouped)
def retrieve_cluster(target_data):
    """
    Collect the ``encoder_out`` output for every batch of ``target_data``.

    Batches are drawn (shuffled, single epoch) from the module-level ``iterator``
    and evaluated with the module-level ``model`` without trigger tokens.

    Returns
    -------
    A tuple ``(encoder_outs, mean)``: all per-batch encoder outputs concatenated
    along dimension 0, and their mean over that dimension.
    """
    batch_groups = lazy_groups_of(
        iterator(target_data, num_epochs=1, shuffle=True), group_size=1
    )
    per_batch_outputs = [
        evaluate_batch(model, group, trigger_token_ids=None, snli=False)['encoder_out']
        .detach()
        .cpu()
        for group in batch_groups
    ]
    stacked = torch.cat(per_batch_outputs, 0)  # bs * 128 (per the original author's note)
    return stacked, torch.mean(stacked, 0)
def __iter__(self) -> Iterable[List[int]]:
    """
    Yield batches of indices, ``self.batch_size`` at a time.

    The indices come from ``self._argsort_by_padding(self.data_source)``. When
    ``self.drop_last`` is set, groups shorter than ``self.batch_size`` are
    skipped rather than yielded.
    """
    ordered = self._argsort_by_padding(self.data_source)
    for chunk in lazy_groups_of(ordered, self.batch_size):
        batch_indices = list(chunk)
        if not self.drop_last or len(batch_indices) >= self.batch_size:
            yield batch_indices
def run(self):
    """
    Run prediction over all inputs in batches and emit every result.

    When a dataset reader is configured, inputs are read as instances and only
    the result is emitted; otherwise inputs are read as JSON and each result is
    emitted together with the JSON-serialised input it was produced from.

    The output file, if any, is closed when prediction finishes, including
    when prediction fails part-way through with an exception.
    """
    has_reader = self._dataset_reader is not None
    try:
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                for result in self._predict_instances(batch):
                    self._maybe_print_to_console_and_file(result)
        else:
            for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                for model_input, result in izip(
                        batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        result, json.dumps(model_input))
    finally:
        # Always release the file handle so partial output is flushed on failure too.
        if self._output_file is not None:
            self._output_file.close()
def get_accuracy(model, dev_dataset, vocab, trigger_token_ids=None, snli=False, reset_metric=True):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset. Otherwise, gets accuracy with
    triggers prepended for the whole dev_dataset.

    Parameters
    ----------
    model
        Model to evaluate; it is switched to eval mode.
    dev_dataset
        Dataset to iterate over (one epoch, unshuffled, batches of 128).
    vocab
        Vocabulary used to index the iterator.
    trigger_token_ids
        Passed through to ``evaluate_batch``.
    snli
        Selects the sorting field of the bucket iterator (``"premise"`` when
        true, ``"tokens"`` otherwise) and is passed through to ``evaluate_batch``.
    reset_metric
        When true, the model's metrics are reset before evaluation.

    Returns
    -------
    ``(acc, auc, f1, f1_weighted, success_idx)`` where ``auc`` is a string
    formatted to four decimals or ``"N/A"``, ``f1``/``f1_weighted`` are
    ``'N/A'`` when only one distinct label is present, and ``success_idx``
    holds the positions where the predicted class differs from the label.
    """
    if reset_metric:
        model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case

    sorting_field = "premise" if snli else "tokens"
    iterator = BucketIterator(batch_size=128, sorting_keys=[(sorting_field, "num_tokens")])
    iterator.index_with(vocab)

    logits = []
    labels = []
    for batch in lazy_groups_of(iterator(dev_dataset, num_epochs=1, shuffle=False), group_size=1):
        output = evaluate_batch(model, batch, trigger_token_ids, snli)
        logits.append(output['logits'].detach().cpu().numpy())
        labels.append(output['labels'].detach().cpu().numpy())

    logits = np.concatenate(logits, 0)
    labels = np.concatenate(labels, 0)
    preds_int = np.argmax(logits, 1)
    success_idx = np.where(labels != preds_int)[0]
    acc = accuracy_score(labels, preds_int)

    if len(np.unique(labels)) > 1:
        f1_weighted = f1_score(labels, preds_int, average="weighted")
        try:
            f1 = f1_score(labels, preds_int)
        except Exception:
            # Best effort: fall back to the weighted score when the default
            # averaging cannot be computed for these labels.
            f1 = f1_weighted
    else:
        f1 = 'N/A'
        f1_weighted = 'N/A'

    try:
        auc = "{:.4f}".format(roc_auc_score(labels, preds_int))
    except Exception:
        # Best effort: AUC is reported as unavailable rather than aborting evaluation.
        auc = "N/A"

    return acc, auc, f1, f1_weighted, success_idx
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield homogeneous batches, alternating round-robin between partitions.

    Each memory-sized list of instances is split into partitions keyed by the
    metadata of the ``self._partition_key`` field. Every partition is optionally
    shuffled, sorted by padding, and cut into batches of ``self._batch_size``.
    Batches are then yielded one partition at a time, in turn, until every
    partition is exhausted.

    When ``self._biggest_batch_first`` is set and there is more than one
    partition, the last two batches of the first partition are moved to the
    front of that partition's batch sequence.

    Parameters
    ----------
    instances : ``Iterable[Instance]``
        The instances to batch.
    shuffle : ``bool``
        Whether to shuffle each partition before it is sorted by padding.
    """
    for instance_list in self._memory_sized_lists(instances):
        # WARNING: Assumes that defaultdict is ordered i.e > python 3.6
        # Divvy up the instances based on their value of the "partition_key" field.
        hoppers: Dict[str, List[Instance]] = defaultdict(list)
        for instance in instance_list:
            partition = instance.fields[self._partition_key].metadata  # type: ignore
            hoppers[partition].append(instance)

        # Shuffle each partition separately
        if shuffle:
            # NOTE: if shuffle is false, the data will still be in a different order
            # because of the bucket sorting.
            for hopper in hoppers.values():
                random.shuffle(hopper)

        # Sort each partition separately
        hoppers = {
            key: sort_by_padding(value, self._sorting_keys, self.vocab, self._padding_noise)
            for key, value in hoppers.items()
        }

        # Get a `lazy_groups_of` iterator over each set of homogeneous instances.
        batches = {
            key: lazy_groups_of(iter(hopper), self._batch_size)
            for key, hopper in hoppers.items()
        }

        # TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
        # num_gpu batches together, shuffle and then expand the groups.
        # This guards against imbalanced batches across GPUs.
        move_to_front = self._biggest_batch_first and len(batches) > 1
        if move_to_front:
            # The lazy iterator cannot be reordered in place, so materialize the
            # first partition's batches (its instances are already in memory).
            first_key = next(iter(batches))
            first_batches = list(batches[first_key])
            if len(first_batches) > 1:
                # We'll actually pop the last _two_ batches, because the last one
                # might not be full.
                last_batch = first_batches.pop()
                penultimate_batch = first_batches.pop()
                first_batches.insert(0, penultimate_batch)
                first_batches.insert(0, last_batch)
            batches[first_key] = iter(first_batches)

        # Yield batches in a round-robin fashion until none are left.
        remaining = set(batches)
        while remaining:
            for key, lazy_batches in batches.items():
                if key not in remaining:
                    continue
                try:
                    batch = next(lazy_batches)
                except StopIteration:
                    remaining.remove(key)
                else:
                    yield Batch(batch)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield ``Batch`` objects of up to ``self._batch_size`` instances each.

    The dataset is first cut into memory-sized lists; each list is optionally
    shuffled in place and then grouped into batches.
    """
    for memory_chunk in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(memory_chunk)
        grouped = lazy_groups_of(iter(memory_chunk), self._batch_size)
        yield from (Batch(group) for group in grouped)
def _iter_batches(self) -> Iterator[TensorDict]:
    """
    Yield collated tensor dicts, one per group of ``self.batch_size`` instances.

    ``self.instances`` is shuffled in place first when ``self.shuffle`` is set.
    Each collated batch is moved to ``self.cuda_device`` when one is configured.
    """
    if self.shuffle:
        random.shuffle(self.instances)

    for group in lazy_groups_of(self.iter_instances(), self.batch_size):
        collated = allennlp_collate(group)
        if self.cuda_device is None:
            yield collated
        else:
            yield nn_util.move_to_device(collated, self.cuda_device)
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield ``Batch`` objects, splitting any group that is too large.

    The dataset is cut into memory-sized lists; each list is optionally shuffled
    in place, grouped by ``self._batch_size``, and every group is passed through
    ``self._ensure_batch_is_sufficiently_small`` before being wrapped in a ``Batch``.
    """
    for memory_chunk in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(memory_chunk)
        for group in lazy_groups_of(iter(memory_chunk), self._batch_size):
            yield from map(Batch, self._ensure_batch_is_sufficiently_small(group))
def embed_sentences(self,
                    sentences: Iterable[List[str]],
                    batch_size: int = DEFAULT_BATCH_SIZE) -> Iterable[numpy.ndarray]:
    """
    Lazily computes the ELMo embeddings for an iterable of sentences.

    Sentences are consumed ``batch_size`` at a time and each group is passed to
    ``self.embed_batch``; the results are yielded one by one.

    Parameters
    ----------
    sentences : ``Iterable[List[str]]``, required
        An iterable of tokenized sentences.
    batch_size : ``int``, optional (default = ``DEFAULT_BATCH_SIZE``)
        The number of sentences ELMo should process at once.

    Returns
    -------
    An iterable of arrays, each representing the ELMo vectors for the input
    sentence at the same position.
    """
    sentence_iterator = iter(sentences)
    for sentence_group in lazy_groups_of(sentence_iterator, batch_size):
        for embedding in self.embed_batch(sentence_group):
            yield embedding
def embed_sentences(self,
                    sentences: Iterable[List[str]],
                    batch_size: int = DEFAULT_BATCH_SIZE) -> Iterable[numpy.ndarray]:
    """
    Lazily computes the ELMo embeddings for an iterable of sentences.

    Please note that ELMo has internal state and will give different results for
    the same input. See the comment under the class definition.

    Parameters
    ----------
    sentences : ``Iterable[List[str]]``, required
        An iterable of tokenized sentences.
    batch_size : ``int``, optional (default = ``DEFAULT_BATCH_SIZE``)
        The number of sentences ELMo should process at once.

    Returns
    -------
    An iterable of arrays, each representing the ELMo vectors for the input
    sentence at the same position.
    """
    yield from (
        embedding
        for sentence_group in lazy_groups_of(iter(sentences), batch_size)
        for embedding in self.embed_batch(sentence_group)
    )