def read(self, file_path: str) -> Iterable[Instance]:
    """
    Returns an ``Iterable`` containing all the instances in the specified dataset.

    If ``self.lazy`` is False, this calls ``self._read()``, ensures that the result is a list,
    then returns the resulting list.

    If ``self.lazy`` is True, this returns an object whose ``__iter__`` method calls
    ``self._read()`` each iteration. In this case your implementation of ``_read()`` must
    also be lazy (that is, not load all instances into memory at once), otherwise you will
    get a ``ConfigurationError``.

    In either case, the returned ``Iterable`` can be iterated over multiple times. It's
    unlikely you want to override this function, but if you do your result should likewise
    be repeatedly iterable.
    """
    lazy = getattr(self, 'lazy', None)
    if lazy is None:
        logger.warning("DatasetReader.lazy is not set, "
                       "did you forget to call the superclass constructor?")

    if lazy:
        return _LazyInstances(lambda: iter(self._read(file_path)))
    else:
        instances = self._read(file_path)
        if not isinstance(instances, list):
            instances = [instance for instance in Tqdm.tqdm(instances)]
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return instances
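# Hedged usage sketch (not part of the snippet above): a minimal DatasetReader subclass to
# show the read() contract. The one-text-per-line file format, the class name and the field
# name "text" are illustrative assumptions. With lazy=True, read() returns an object whose
# __iter__ re-invokes _read(); with lazy=False it materializes and validates a list up front.
from typing import Iterable

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


class LineTextReader(DatasetReader):
    def __init__(self, lazy: bool = False):
        super().__init__(lazy=lazy)  # skipping this triggers the "lazy is not set" warning above
        self._indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, "r", encoding="utf8") as data_file:
            for line in data_file:
                tokens = [Token(t) for t in line.strip().split()]
                yield Instance({"text": TextField(tokens, self._indexers)})


# reader = LineTextReader(lazy=True)
# for instance in reader.read("train.txt"):  # each full pass calls _read() again
#     ...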
loader = IrTupleDatasetReader(lazy=True, lowercase=True)
vocab = Vocabulary.from_files(args.vocab)

if args.qrel:
    qrels = load_reference(args.qrel)

not_judged = 0
oov_queries = 0
non_oov_queries = 0
oov_count_list = []

instances = loader.read(args.query)

with open(args.out_file_oov, "w", encoding="utf8") as out_file_oov:
    with open(args.out_file_no_oov, "w", encoding="utf8") as out_file_non_oov:
        for i in Tqdm.tqdm(instances):
            id_str = i["source_tokens"].tokens[0].text
            if args.qrel and int(id_str) not in qrels:
                not_judged += 1
                continue

            i.index_fields(vocab)
            indexes = i["target_tokens"]._indexed_tokens["tokens"]
            if 1 in indexes:  # index 1 is the unknown token, so we have an OOV query
                oov_queries += 1
                oov_count_list.append(sum(1 for t in indexes if t == 1))
                out_file_oov.write(id_str + "\t" + " ".join(
def _read_embeddings_from_text_file( file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens" ) -> torch.FloatTensor: """ Read pre-trained word vectors from an eventually compressed text file, possibly contained inside an archive with multiple files. The text file is assumed to be utf-8 encoded with space-separated fields: [word] [dim 1] [dim 2] ... Lines that contain more numerical tokens than `embedding_dim` raise a warning and are skipped. The remainder of the docstring is identical to `_read_pretrained_embeddings_file`. """ tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values()) vocab_size = vocab.get_vocab_size(namespace) char_embeddings = {} embeddings = {} # First we read the embeddings from the file, only keeping vectors for the words we need. logger.info("Reading pretrained embeddings from file") with EmbeddingsTextFile(file_uri) as embeddings_file: for line in Tqdm.tqdm(embeddings_file): token = line.split(" ", 1)[0] if token in tokens_to_keep: fields = line.rstrip().split(" ") if len(fields) - 1 != embedding_dim: # Sometimes there are funny unicode parsing problems that lead to different # fields lengths (e.g., a word with a unicode space character that splits # into more than one column). We skip those lines. Note that if you have # some kind of long header, this could result in all of your lines getting # skipped. It's hard to check for that here; you just have to look in the # embedding_misses_file and at the model summary to make sure things look # like they are supposed to. logger.warning( "Found line with wrong number of dimensions (expected: %d; actual: %d): %s", embedding_dim, len(fields) - 1, line, ) continue vector = numpy.asarray(fields[1:], dtype="float32") for char in list(token): if char in char_embeddings: char_embeddings[char] = (char_embeddings[char][0] + vector, char_embeddings[char][1] + 1) else: char_embeddings[char] = (vector, 1) embeddings[token] = vector if not embeddings: raise ConfigurationError( "No embeddings of correct dimension found; you probably " "misspecified your embedding_dim parameter, or didn't " "pre-populate your Vocabulary" ) char_embeddings = {char: char_embeddings[char][0] / char_embeddings[char][1] for char in char_embeddings} chars = set(char_embeddings.keys()) all_embeddings = numpy.asarray(list(embeddings.values())) embeddings_mean = float(numpy.mean(all_embeddings)) embeddings_std = float(numpy.std(all_embeddings)) # Now we initialize the weight matrix for an embedding layer, starting with random vectors, # then filling in the word vectors we just read. logger.info("Initializing pre-trained embedding layer") embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_( embeddings_mean, embeddings_std ) num_tokens_found = 0 index_to_token = vocab.get_index_to_token_vocabulary(namespace) for i in range(vocab_size): token = index_to_token[i] # If we don't have a pre-trained vector for this word, we'll just leave this row alone, # so the word has a random initialization. if token in embeddings: embedding_matrix[i] = torch.FloatTensor(embeddings[token]) num_tokens_found += 1 elif len(set(token) - chars) == 0: embedding_matrix[i] = torch.FloatTensor([char_embeddings[char] for char in list(token)]).sum(dim=-2) num_tokens_found += 1 else: logger.debug( "Token %s was not found in the embedding file. Initialising randomly.", token ) logger.info( "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size ) return embedding_matrix
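# Hedged illustration (separate from the function above) of its character fallback: each
# character seen in the pretrained vectors gets the mean of the word vectors it occurred in,
# and an OOV token whose characters are all known is embedded as the sum of those means.
# The two toy 2-d vectors are made-up values for demonstration only.
import numpy

pretrained = {"cat": numpy.array([1.0, 0.0]), "cap": numpy.array([0.0, 1.0])}

char_sums = {}  # char -> (running sum of vectors, count)
for word, vector in pretrained.items():
    for char in word:
        total, count = char_sums.get(char, (numpy.zeros_like(vector), 0))
        char_sums[char] = (total + vector, count + 1)
char_means = {char: total / count for char, (total, count) in char_sums.items()}

oov_token = "tac"  # not in `pretrained`, but every character is
fallback_vector = sum(char_means[char] for char in oov_token)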
def search_learning_rate( trainer: GradientDescentTrainer, start_lr: float = 1e-5, end_lr: float = 10, num_batches: int = 100, linear_steps: bool = False, stopping_factor: float = None, ) -> Tuple[List[float], List[float]]: """ Runs training loop on the model using [`GradientDescentTrainer`](../training/trainer.md#gradientdescenttrainer) increasing learning rate from `start_lr` to `end_lr` recording the losses. # Parameters trainer: `GradientDescentTrainer` start_lr : `float` The learning rate to start the search. end_lr : `float` The learning rate upto which search is done. num_batches : `int` Number of batches to run the learning rate finder. linear_steps : `bool` Increase learning rate linearly if False exponentially. stopping_factor : `float` Stop the search when the current loss exceeds the best loss recorded by multiple of stopping factor. If `None` search proceeds till the `end_lr` # Returns (learning_rates, losses) : `Tuple[List[float], List[float]]` Returns list of learning rates and corresponding losses. Note: The losses are recorded before applying the corresponding learning rate """ if num_batches <= 10: raise ConfigurationError( "The number of iterations for learning rate finder should be greater than 10." ) trainer.model.train() infinite_generator = itertools.cycle(trainer.data_loader) train_generator_tqdm = Tqdm.tqdm(infinite_generator, total=num_batches) learning_rates = [] losses = [] best = 1e9 if linear_steps: lr_update_factor = (end_lr - start_lr) / num_batches else: lr_update_factor = (end_lr / start_lr)**(1.0 / num_batches) for i, batch in enumerate(train_generator_tqdm): if linear_steps: current_lr = start_lr + (lr_update_factor * i) else: current_lr = start_lr * (lr_update_factor**i) for param_group in trainer.optimizer.param_groups: param_group["lr"] = current_lr trainer.optimizer.zero_grad() loss = trainer.batch_outputs(batch, for_training=True)["loss"] loss.backward() loss = loss.detach().cpu().item() if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best): logger.info( f"Loss ({loss}) exceeds stopping_factor * lowest recorded loss." ) break trainer.rescale_gradients() trainer.optimizer.step() learning_rates.append(current_lr) losses.append(loss) if loss < best and i > 10: best = loss if i == num_batches: break return learning_rates, losses
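# Hedged usage sketch for the function above. `trainer` is assumed to be an already
# constructed GradientDescentTrainer; the usual next step is to plot loss against learning
# rate on a logarithmic axis and pick a value a bit below the loss minimum.
import matplotlib.pyplot as plt

learning_rates, losses = search_learning_rate(trainer, start_lr=1e-6, end_lr=1.0, num_batches=200)

plt.plot(learning_rates, losses)
plt.xscale("log")
plt.xlabel("learning rate")
plt.ylabel("loss")
plt.savefig("lr_search.png")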
def _instances_to_cache_file(self, cache_filename, instances) -> None:
    with open(cache_filename, "w") as cache:
        for instance in Tqdm.tqdm(instances):
            cache.write(self.serialize_instance(instance) + "\n")
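# Hedged sketch of the reading counterpart to the method above, assuming a
# `deserialize_instance` that mirrors the `serialize_instance` used there; AllenNLP's
# DatasetReader ships a method along these lines.
def _instances_from_cache_file(self, cache_filename) -> Iterable[Instance]:
    with open(cache_filename, "r") as cache_file:
        for line in Tqdm.tqdm(cache_file):
            yield self.deserialize_instance(line.strip())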
    lazy=True, max_doc_length=180, max_query_length=30,
    tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()))  # already spacy tokenized, so that it is faster

_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"), ("doc_neg_tokens", "num_tokens")])
_iterator.index_with(vocab)

for epoch in range(2):
    for batch in Tqdm.tqdm(_iterator(_triple_loader.read(config["train_data"]), num_epochs=1)):
        # todo train loop
        pass

#
# eval (duplicate for validation inside train loop - but rename "_iterator", since
# otherwise it will overwrite the original train iterator, which is instantiated outside the loop)
#
_tuple_loader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)  # not spacy tokenized already (default is spacy)

_iterator = BucketIterator(batch_size=128,
                           sorting_keys=[("doc_tokens", "num_tokens"), ("query_tokens", "num_tokens")])
def _validation_loss( self, epoch: int ) -> Tuple[float, float, int, List[Dict[str, torch.Tensor]]]: """ Computes the validation loss. Returns it and the number of batches. Also returns list of predictions. """ logger.info("Validating") self._pytorch_model.eval() # Replace parameter values with the shadow values from the moving averages. if self._moving_average is not None: self._moving_average.assign_average_value() if self._validation_data_loader is not None: validation_data_loader = self._validation_data_loader else: raise ConfigurationError( "Validation results cannot be calculated without a validation_data_loader" ) val_generator_tqdm = Tqdm.tqdm(validation_data_loader) batches_this_epoch = 0 val_loss = 0 val_reg_loss = 0 done_early = False preds = [] for batch in val_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing validation early! " "This implies that there is an imbalance in your validation " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batch_outputs = self.batch_outputs(batch, for_training=False) loss = batch_outputs.get("loss") reg_loss = batch_outputs.get("reg_loss") if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is # currently only used as the divisor for the loss function, so we can safely only # count those batches for which we actually have a loss. If this variable ever # gets used for something else, we might need to change things around a bit. batches_this_epoch += 1 val_loss += loss.detach().cpu().numpy() if reg_loss is not None: val_reg_loss += reg_loss.detach().cpu().numpy() # Update the description with the latest metrics val_metrics = training_util.get_metrics( self.model, val_loss, val_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=self.cuda_device, ) description = training_util.description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) if self.dataset_writer: output_dict = self.model.make_output_human_readable( batch_outputs) output_dict = split_up(output_dict, batch["order_metadata"]) preds.extend(output_dict) for callback in self._batch_callbacks: callback( self, [batch], [batch_outputs], epoch, batches_this_epoch, is_training=False, is_master=self._master, ) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)." ) # Indicate that we're done so that any workers that have remaining data stop validation early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Now restore the original parameter values. 
if self._moving_average is not None:
    self._moving_average.restore()

return val_loss, val_reg_loss, batches_this_epoch, preds
args = parser.parse_args()

#
# work
#

qrels = load_reference(args.qrel)
result_files = glob.glob(args.results)

best_mrr = (0, "", 0)
best_relevant = (0, "", 0)

with open(args.out_file, "w", encoding="utf8") as metric_file:
    for t, res_file in Tqdm.tqdm(enumerate(result_files)):
        try:
            res_candidate = load_candidate(res_file, args.cutoff)
            for i in Tqdm.tqdm(range(1, args.cutoff)):
                metrics = compute_metrics(qrels, res_candidate, i)
                if i == 1 and t == 0:
                    metric_file.write("sep=,\nFile,Cutoff," + ",".join(k for k, v in metrics.items()) + "\n")

                if metrics["QueriesWithRelevant"] > best_relevant[0]:
                    best_relevant = (metrics["QueriesWithRelevant"], res_file, i)
                    print("got new best QueriesWithRelevant", best_relevant)
# if early stopping has been triggered during validation, exit from the epoch
if earlyStop is True:
    break

perf_monitor.start_block(monitorModel)
perf_start_inst = 0

# prep model for training
model.train()

# Creating a label tensor filled with ones --> will be needed for the margin ranking loss.
# It should be initialized in each outer loop, since in the last iteration of the inner cycle
# the size of the tensor will probably change.
label = torch.ones(trainBatchSize).cuda()

batchCounter = 0

# Train loop
for batch in Tqdm.tqdm(_iterator(_triple_loader.read(config["train_data"]), num_epochs=1)):
    iterCounter += 1
    batch = move_to_device(batch, 0)
    # batch = Parameter(batch).to(device)
    batchCounter += 1
    model.train()
    # todo train loop
    # at the beginning of each train loop, clean the optimizer (zero_grad() method)
    optimAdam.zero_grad()

    # retrieve the current batch size --> the iterators do not guarantee a fixed batch size
    # (the last one will probably be smaller) --> so we retrieve the number of tokens from e.g. the query
    currentBatchSize = batch["query_tokens"]["tokens"].shape[0]  # for the batch size, th
    # based on the slides, the model will be trained with triplets:
def main(file, embeddings, model, emb_wt_key, namespace, output_dir): archive = load_archive(model) config = archive.config os.makedirs(output_dir, exist_ok=True) config.to_file(os.path.join(output_dir, CONFIG_NAME)) model = archive.model # first expand the vocabulary dataset_reader = DatasetReader.from_params(config.pop('dataset_reader')) instances = dataset_reader.read(file) vocab = model.vocab # get all the tokens in the new file namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict( lambda: defaultdict(int)) for instance in Tqdm.tqdm(instances): instance.count_vocab_items(namespace_token_counts) old_token_size = vocab.get_vocab_size(namespace) print("Before expansion: Number of instances in {} namespace: {}".format( namespace, old_token_size)) if namespace not in namespace_token_counts: logger.error( "No tokens found for namespace: {} in the new input file".format( namespace)) # identify the new tokens in the new instances token_to_add = set() token_hits = 0 for token, count in namespace_token_counts[namespace].items(): if token not in vocab._token_to_index[namespace]: # new token, must add token_to_add.add(token) else: token_hits += 1 print("Found {} existing tokens and {} new tokens in {}".format( token_hits, len(token_to_add), file)) # add the new tokens to the vocab for token in token_to_add: vocab.add_token_to_namespace(token=token, namespace=namespace) archived_parameters = dict(model.named_parameters()) # second, expand the embedding matrix for name, weights in archived_parameters.items(): # find the wt matrix for the embeddings if name == emb_wt_key: if weights.dim() != 2: logger.error( "Expected an embedding matrix for the parameter: {} instead" "found {} tensor".format(emb_wt_key, weights.shape)) emb_dim = weights.shape[-1] print("Before expansion: Size of emb matrix: {}".format( weights.shape)) # Loading embeddings for old and new tokens since that is cleaner than copying all # the embedding loading logic here all_embeddings = _read_pretrained_embeddings_file( embeddings, emb_dim, vocab, namespace) # concatenate the new entries i.e last token_to_add embeddings to the original weights if len(token_to_add) > 0: weights.data = torch.cat( [weights.data, all_embeddings[-len(token_to_add):, :]]) print("After expansion: Size of emb matrix: {}".format( weights.shape)) # save the files needed by the model archiver model_path = os.path.join(output_dir, "weight.th") model_state = model.state_dict() torch.save(model_state, model_path) vocab.save_to_files(os.path.join(output_dir, "vocabulary")) archive_model(output_dir, weights="weight.th") # more debug messages new_token_size = vocab.get_vocab_size(namespace) for name, weights in archived_parameters.items(): if name == emb_wt_key: print("Size of emb matrix: {}".format(weights.shape)) print("After expansion: Number of instances in {} namespace: {}".format( namespace, new_token_size))
""" Take a src and tgt files (standard input in opennmt and fairseq) and join them into a single tsv """ import os from allennlp.common import Tqdm def gen_line(path): file = open(path) for line in file: yield line file.close() if __name__ == '__main__': src_path = '../data/bbc/val.txt.src' tgt_path = '../data/bbc/val.txt.tgt' src = gen_line(src_path) tgt = gen_line(tgt_path) new_lines = [] i = 0 for src_line, tgt_line in Tqdm.tqdm(zip(src, tgt)): new_line = src_line.strip() + '\t' + tgt_line.strip() + '\n' new_lines.append(new_line) file = open( os.path.join('../data/bbc/', os.path.basename(src_path).split('.')[0] + '.tsv'), 'w') file.writelines(new_lines) file.close()
def evaluate_model(model, config, _logger, cuda_device, eval_tsv, eval_batch_count, use_cache=False): model.eval() # turning off training validation_results = {} fill_cache = False cached_batches = None try: if use_cache: global evaluate_cache if eval_tsv not in evaluate_cache: fill_cache = True evaluate_cache[eval_tsv] = [] cached_batches = evaluate_cache[eval_tsv] if not use_cache or fill_cache: validation_queue, validation_processes, validation_exit = get_multiprocess_batch_queue( "eval-batches", multiprocess_validation_loader, glob.glob(eval_tsv), config, _logger, queue_size=200) #time.sleep(len(validation_processes)) # fill the queue _logger.info("[eval_model] --- Start validation with queue.size:" + str(validation_queue.qsize())) else: _logger.info("[eval_model] --- Start validation with cache size:" + str(len(cached_batches))) with torch.no_grad(): for i in Tqdm.tqdm(range(0, eval_batch_count), disable=config["tqdm_disabled"]): if not use_cache or fill_cache: batch_orig = validation_queue.get() if fill_cache: cached_batches.append(batch_orig) else: batch_orig = cached_batches[i] batch = move_to_device(copy.deepcopy(batch_orig), cuda_device) output = model.forward(batch["query_tokens"], batch["doc_tokens"], batch["query_length"], batch["doc_length"]) output = output.cpu( ) # get the output back to the cpu - in one piece for sample_i, sample_query_id in enumerate( batch_orig["query_id"]): # operate on cpu memory sample_query_id = int(sample_query_id) sample_doc_id = int( batch_orig["doc_id"] [sample_i]) # again operate on cpu memory if sample_query_id not in validation_results: validation_results[sample_query_id] = [] validation_results[sample_query_id].append( (sample_doc_id, float(output[sample_i]))) #if not use_cache or fill_cache and i % 100 == 0: # only to check for performance regresion # if validation_queue.qsize() < 10: # _logger.warning("validation_queue.qsize() < 10") if not use_cache or fill_cache: # make sure we didn't make a mistake in the configuration / data preparation if validation_queue.qsize() != 0: _logger.error( "validation_queue.qsize() is not empty after evaluation") validation_exit.set() # allow sub-processes to exit except BaseException as e: _logger.info('-' * 89) _logger.exception('[eval_model] Got exception: ') print( "----- Attention! - something went wrong in eval_model (see logger) ----- " ) if not use_cache or fill_cache: for proc in validation_processes: if proc.is_alive(): proc.terminate() raise e return validation_results
args = parser.parse_args()

#
# compare (different mrr gains up, same, down)
# -------------------------------
#

res = load_candidate_from_stream_with_score(open(args.res_in, "r"))  # ,space_for_rank=30000)

candidate_set = None
if args.candidate_file:
    candidate_set = parse_candidate_set(args.candidate_file, 1000)

with open(args.res_out, "w", encoding="utf8") as res_out:
    for query, data in Tqdm.tqdm(res.items()):
        out_count = 0
        for (pid, rank, score) in data:
            if out_count == args.top_n:
                break
            if candidate_set is not None:
                if candidate_set[query][pid] <= args.cs_n:
                    res_out.write(str(query) + " Q0 " + str(pid) + " " + str(out_count) + " " + str(score) + " " + args.run_id + "\n")
                    out_count += 1
            else:
                res_out.write(str(query) + " Q0 " + str(pid) + " " + str(out_count) + " " + str(score) + " " + args.run_id + "\n")
def read(self, file_path: str) -> Iterable[Instance]: """ Returns an ``Iterable`` containing all the instances in the specified dataset. If ``self.lazy`` is False, this calls ``self._read()``, ensures that the result is a list, then returns the resulting list. If ``self.lazy`` is True, this returns an object whose ``__iter__`` method calls ``self._read()`` each iteration. In this case your implementation of ``_read()`` must also be lazy (that is, not load all instances into memory at once), otherwise you will get a ``ConfigurationError``. In either case, the returned ``Iterable`` can be iterated over multiple times. It's unlikely you want to override this function, but if you do your result should likewise be repeatedly iterable. """ lazy = getattr(self, 'lazy', None) if lazy is None: logger.warning( "DatasetReader.lazy is not set, " "did you forget to call the superclass constructor?") if lazy: return _LazyInstances(lambda: iter(self._read(file_path))) else: if self.cache_path is not None: # create a key for the file based on the reader config hash_ = self.get_hash(file_path) pathlib.Path(self.cache_path).mkdir(parents=True, exist_ok=True) cache_file = os.path.join(self.cache_path, (hash_ + '.cache')) if not os.path.exists(cache_file) or self.overwrite_cache: instances = self._read(file_path) if not isinstance(instances, list): instances = [ instance for instance in Tqdm.tqdm(instances) ] if not instances: raise ConfigurationError( "No instances were read from the given filepath {}. " "Is the path correct?".format(file_path)) logger.info(f'caching instances to file: {cache_file}') with open(cache_file, 'wb') as cache: dill.dump(instances, cache) else: logger.info( f'Reading instances from cache file: {cache_file}') # instances = [] # with open(cache_file, 'rb') as cache: # start = time.time() # instances = [] # for line in Tqdm.tqdm(cache): # instances.append(self.deserialize_instance(line.strip())) # print(time.time()-start) with open(cache_file, 'rb') as f_in: instances = dill.load(f_in) else: instances = self._read(file_path) if not isinstance(instances, list): instances = [instance for instance in Tqdm.tqdm(instances)] if not instances: raise ConfigurationError( "No instances were read from the given filepath {}. " "Is the path correct?".format(file_path)) return instances
def _validation_loss(self) -> Tuple[float, int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating") self._pytorch_model.eval() # Replace parameter values with the shadow values from the moving averages. if self._moving_average is not None: self._moving_average.assign_average_value() if self._validation_iterator is not None: val_iterator = self._validation_iterator else: val_iterator = self.iterator val_generator = val_iterator(self._validation_data, num_epochs=1, shuffle=False) num_validation_batches = val_iterator.get_num_batches( self._validation_data) val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 val_loss = 0 done_early = False for batch in val_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing validation early! " "This implies that there is an imbalance in your validation " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break loss = self.batch_loss(batch, for_training=False) if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is # currently only used as the divisor for the loss function, so we can safely only # count those batches for which we actually have a loss. If this variable ever # gets used for something else, we might need to change things around a bit. batches_this_epoch += 1 val_loss += loss.detach().cpu().numpy() # Update the description with the latest metrics val_metrics = training_util.get_metrics( self.model, val_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) description = training_util.description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)." ) # Indicate that we're done so that any workers that have remaining data stop validation early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Now restore the original parameter values. if self._moving_average is not None: self._moving_average.restore() return val_loss, batches_this_epoch
def search_learning_rate(trainer: Trainer, start_lr: float = 1e-5, end_lr: float = 10, num_batches: int = 100, linear_steps: bool = False, stopping_factor: float = None) -> Tuple[List[float], List[float]]: """ Runs training loop on the model using :class:`~allennlp.training.trainer.Trainer` increasing learning rate from ``start_lr`` to ``end_lr`` recording the losses. Parameters ---------- trainer: :class:`~allennlp.training.trainer.Trainer` start_lr: ``float`` The learning rate to start the search. end_lr: ``float`` The learning rate upto which search is done. num_batches: ``int`` Number of batches to run the learning rate finder. linear_steps: ``bool`` Increase learning rate linearly if False exponentially. stopping_factor: ``float`` Stop the search when the current loss exceeds the best loss recorded by multiple of stopping factor. If ``None`` search proceeds till the ``end_lr`` Returns ------- (learning_rates, losses): ``Tuple[List[float], List[float]]`` Returns list of learning rates and corresponding losses. Note: The losses are recorded before applying the corresponding learning rate """ if num_batches <= 10: raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.') trainer.model.train() train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle) train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches) learning_rates = [] losses = [] best = 1e9 if linear_steps: lr_update_factor = (end_lr - start_lr) / num_batches else: lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches) for i, batch in enumerate(train_generator_tqdm): if linear_steps: current_lr = start_lr + (lr_update_factor * i) else: current_lr = start_lr * (lr_update_factor ** i) for param_group in trainer.optimizer.param_groups: param_group['lr'] = current_lr trainer.optimizer.zero_grad() loss = trainer.batch_loss(batch, for_training=True) loss.backward() loss = loss.detach().cpu().item() if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best): logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.') break trainer.rescale_gradients() trainer.optimizer.step() learning_rates.append(current_lr) losses.append(loss) if loss < best and i > 10: best = loss if i == num_batches: break return learning_rates, losses
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info(f"Epoch: {epoch}/{self._num_epochs - 1}") cpu_memory_usage = [] for worker, memory in common_util.peak_memory_mb().items(): cpu_memory_usage.append((worker, memory)) logger.info(f"Worker {worker} memory usage MB: {memory}") gpu_memory_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_memory_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") for component_optimizer in self.component_optimizers.values(): component_optimizer.reset_loss('train') self.model.train() # Get tqdm for the training batches batch_generator = iter(self.data_loader) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) logger.info("Training") num_training_batches: Union[int, float] try: len_data_loader = len(self.data_loader) num_training_batches = math.ceil( len_data_loader / self._num_gradient_accumulation_steps) except TypeError: num_training_batches = float("inf") batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) self._last_log = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 done_early = False for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total for component_optimizer in self.component_optimizers.values(): component_optimizer.zero_grad() batch_group_metrics = [] meta_batch = deepcopy(batch_group) # Train the Sub Models first for name, sub_model in self._pytorch_model.component_models.items( ): component_optimizer = self.component_optimizers[name] batch_group_outputs, metrics = component_optimizer.process_batch_group( batch_group, True, batch_num_total, batches_this_epoch, True) batch_group_metrics.append(metrics) for i, batch_outputs in enumerate(batch_group_outputs): component_output = batch_outputs["output"] component_output = component_output.detach() meta_batch[i][name] = component_output meta_optimizer = self.component_optimizers["meta"] meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group( meta_batch, True, batch_num_total, batches_this_epoch, False) # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) batch_group_metrics.append(meta_metrics) all_metrics = ChainMap(*batch_group_metrics) description = training_util.description_from_metrics(all_metrics) batch_group_generator_tqdm.set_description(description, refresh=False) for (worker, memory) in cpu_memory_usage: metrics["worker_" + str(worker) + "_memory_MB"] = memory for (gpu_num, memory) in gpu_memory_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return all_metrics
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps ) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps ) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm( batch_group_generator, total=num_training_batches ) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss, effective_batch_size = self.batch_loss(batch, for_training=True) if not loss: # weak supervision can have no loss continue if torch.isnan(loss): raise ValueError("nan loss encountered") denom = effective_batch_size if effective_batch_size else len(batch_group) loss = loss / denom loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch() and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7) ) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch() and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum(training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if ( self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master ): last_save_time = time.time() self._save_checkpoint( "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time))) ) # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
def _validation_loss(self, epoch: int) -> Tuple[float, float, int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating") self.model.eval() # Replace parameter values with the shadow values from the moving averages. if self._moving_average is not None: self._moving_average.assign_average_value() if self._validation_data_loader is not None: validation_data_loader = self._validation_data_loader else: raise ConfigurationError( "Validation results cannot be calculated without a validation_data_loader" ) val_generator_tqdm = Tqdm.tqdm(validation_data_loader) for component_optimizer in self.component_optimizers.values(): component_optimizer.reset_loss('validation') batches_this_epoch = 0 done_early = False for batch in val_generator_tqdm: batches_this_epoch += 1 batch_metrics = [] batch_group = [batch] meta_batch = deepcopy(batch_group) # Train the Sub Models first for name, sub_model in self._pytorch_model.component_models.items( ): component_optimizer = self.component_optimizers[name] batch_group_outputs, metrics = component_optimizer.process_batch_group( batch_group, for_training=False, batches_this_epoch=batches_this_epoch) batch_metrics.append(metrics) for i, batch_outputs in enumerate(batch_group_outputs): meta_batch[i][name] = batch_outputs["output"] meta_optimizer = self.component_optimizers["meta"] meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group( meta_batch, for_training=False, batches_this_epoch=batches_this_epoch) batch_metrics.append(meta_metrics) all_metrics = ChainMap(*batch_metrics) description = training_util.description_from_metrics(all_metrics) val_generator_tqdm.set_description(description, refresh=False) # Now restore the original parameter values. if self._moving_average is not None: self._moving_average.restore() return all_metrics
# # load data & create vocab # ------------------------------- # loader = IrTupleDatasetReader(lazy=True,lowercase=args.lowercase) def getInstances(): for file in args.dataset_files: instances = loader.read(file) for i in instances: yield Instance({"text":i["target_tokens"]}) namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int)) for instance in Tqdm.tqdm(getInstances()): instance.count_vocab_items(namespace_token_counts) #with open(args.out_dir,"w",encoding="utf8") as out: # for n in namespace_token_counts: # #out.write("--"+n+"\n") # for w,i in namespace_token_counts[n].items(): # out.write(w+"\t"+str(i)+"\n") vocab = Vocabulary(namespace_token_counts, min_count={"tokens":100}) vocab.save_to_files(args.out_dir) #vocab = Vocabulary(namespace_token_counts, min_count={"tokens":50}) #vocab.save_to_files(args.out_dir2) #vocab = Vocabulary(namespace_token_counts, min_count={"tokens":10})
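# Hedged usage note: the directory written by vocab.save_to_files(args.out_dir) above can be
# loaded back later, as other snippets in this collection do:
vocab = Vocabulary.from_files(args.out_dir)
print("tokens namespace size:", vocab.get_vocab_size("tokens"))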
print('Model', config["model"], 'total parameters:',
      sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

#
# train
#

_triple_reader = IrTripleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_triple_reader = _triple_reader.read(config["train_data"])
_triple_reader.index_with(vocab)
loader = PyTorchDataLoader(_triple_reader, batch_size=32)

for epoch in range(2):
    for batch in Tqdm.tqdm(loader):
        # todo train loop
        pass

#
# eval (duplicate for validation inside train loop - but rename "loader", since
# otherwise it will overwrite the original train iterator, which is instantiated outside the loop)
#

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["test_data"])
_tuple_reader.index_with(vocab)
loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

for batch in Tqdm.tqdm(loader):
def read(self, file_path: str) -> Dataset: """ Returns an `Iterable` containing all the instances in the specified dataset. If `self.lazy` is False, this calls `self._read()`, ensures that the result is a list, then returns the resulting list. If `self.lazy` is True, this returns an object whose `__iter__` method calls `self._read()` each iteration. In this case your implementation of `_read()` must also be lazy (that is, not load all instances into memory at once), otherwise you will get a `ConfigurationError`. In either case, the returned `Iterable` can be iterated over multiple times. It's unlikely you want to override this function, but if you do your result should likewise be repeatedly iterable. """ lazy = getattr(self, "lazy", None) if lazy is None: logger.warning( "DatasetReader.lazy is not set, " "did you forget to call the superclass constructor?") if self._cache_directory: cache_file = self._get_cache_location_for_file_path(file_path) else: cache_file = None if lazy: instances: Iterable[Instance] = _LazyInstances( self._read, file_path, cache_file, self.deserialize_instance, self.serialize_instance, ) if self.max_instances is not None: instances = itertools.islice(instances, 0, self.max_instances) else: # First we read the instances, either from a cache or from the original file. if cache_file and os.path.exists(cache_file): instances = self._instances_from_cache_file(cache_file) else: instances = self._read(file_path) if self.max_instances is not None: if isinstance(instances, list): instances = instances[:self.max_instances] else: instances = itertools.islice(instances, 0, self.max_instances) # Then some validation. if not isinstance(instances, list): instances = [instance for instance in Tqdm.tqdm(instances)] if not instances: raise ConfigurationError( "No instances were read from the given filepath {}. " "Is the path correct?".format(file_path)) # And finally we write to the cache if we need to. if cache_file and not os.path.exists(cache_file): logger.info(f"Caching instances to {cache_file}") self._instances_to_cache_file(cache_file, instances) instances = AllennlpDataset(instances) return instances
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) cpu_memory_usage = [] for worker, memory in common_util.peak_memory_mb().items(): cpu_memory_usage.append((worker, memory)) logger.info(f"Worker {worker} memory usage MB: {memory}") gpu_memory_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_memory_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") regularization_penalty = self.model.get_regularization_penalty() train_loss = 0.0 batch_loss = 0.0 if regularization_penalty is not None: train_reg_loss = 0.0 batch_reg_loss = 0.0 else: train_reg_loss = None batch_reg_loss = None # Set the model to "train" mode. self.model_engine.train() # Get tqdm for the training batches batch_generator = iter(self.data_loader) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) logger.info("Training") num_training_batches: Union[int, float] try: len_data_loader = len(self.data_loader) num_training_batches = math.ceil( len_data_loader / self._num_gradient_accumulation_steps) except TypeError: num_training_batches = float("inf") # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown batch_group_generator_tqdm = batch_group_generator if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) self._last_log = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 done_early = False for batch_group in batch_group_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() batch_group_outputs = [] for batch in batch_group: with amp.autocast(self._use_amp): batch_outputs = self.batch_outputs(batch, for_training=True) batch_group_outputs.append(batch_outputs) loss = batch_outputs.get("loss") reg_loss = batch_outputs.get("reg_loss") if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) batch_loss = loss.item() train_loss += batch_loss if reg_loss is not None: reg_loss = reg_loss / len(batch_group) batch_reg_loss = reg_loss.item() train_reg_loss += batch_reg_loss self.model_engine.backward(loss) self.model_engine.step() param_updates = None if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # Get the magnitude of parameter updates for logging. We need to do some # computation before and after the optimizer step, and it's expensive because of # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so # we don't do this every batch, only when it's requested. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } if self._scaler is not None: self._scaler.step(self.optimizer) self._scaler.update() else: self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) else: if self._scaler is not None: self._scaler.step(self.optimizer) self._scaler.update() else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batch_loss, batch_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=self.cuda_device, ) if self._master: # Updating tqdm only for the master as the trainers wouldn't have one description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) self._tensorboard.log_batch( self.model, self.optimizer, 0., # batch_grad_norm, metrics, batch_group, param_updates, ) self._checkpointer.maybe_save_checkpoint( self, epoch, batches_this_epoch) for callback in self._batch_callbacks: callback( self, batch_group, batch_group_outputs, epoch, batches_this_epoch, is_training=True, is_master=self._master, ) metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batch_loss=None, batch_reg_loss=None, num_batches=batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=self.cuda_device, ) for (worker, memory) in cpu_memory_usage: metrics["worker_" + str(worker) + "_memory_MB"] = memory for (gpu_num, memory) in gpu_memory_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
def run( # type: ignore self, model: Model, dataset: DatasetDict, split: str = "validation", data_loader: Optional[Lazy[TangoDataLoader]] = None, ) -> EvaluationResult: """ Runs an evaluation on a dataset. * `model` is the model we want to evaluate. * `dataset` is the dataset we want to evaluate on. * `split` is the name of the split we want to evaluate on. * `data_loader` gives you the chance to choose a custom dataloader for the evaluation. By default this step evaluates on batches of 32 instances each. """ concrete_data_loader: TangoDataLoader if data_loader is None: concrete_data_loader = BatchSizeDataLoader(dataset.splits[split], batch_size=32, shuffle=False) else: concrete_data_loader = data_loader.construct( instances=dataset.splits[split]) if torch.cuda.device_count() > 0: cuda_device = torch.device(0) else: cuda_device = torch.device("cpu") check_for_gpu(cuda_device) generator_tqdm = Tqdm.tqdm(iter(concrete_data_loader)) # Number of batches in instances. predictions: List[Dict[str, Any]] = [] # Number of batches where the model produces a loss. loss_count = 0 batch_count = 0 # Cumulative loss total_loss = 0.0 with torch.inference_mode(): model.eval() for batch in concrete_data_loader: batch_count += 1 batch = move_to_device(batch, cuda_device) output_dict = model(**batch) metrics = model.get_metrics() loss = output_dict.pop("loss", None) if loss is not None: loss_count += 1 total_loss += loss.item() metrics["loss"] = total_loss / loss_count if any( metric_name.startswith("_") for metric_name in metrics): self.logger.warning_once( 'Metrics with names beginning with "_" will ' "not be logged to the tqdm progress bar.") description = (", ".join([ "%s: %.2f" % (name, value) for name, value in metrics.items() if not name.startswith("_") ]) + " ||") generator_tqdm.set_description(description, refresh=False) output_dict = sanitize(output_dict) # This is write-only code, but it's quite fast. predictions.extend( dict(zip(output_dict.keys(), x)) for x in zip(*output_dict.values())) final_metrics = model.get_metrics(reset=True) if loss_count > 0: # Sanity check if loss_count != batch_count: raise RuntimeError( "The model you are trying to evaluate only sometimes produced a loss!" ) final_metrics["loss"] = total_loss / loss_count return self.EvaluationResult(final_metrics, predictions)
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 train_reg_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = iter(self.data_loader) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) logger.info("Training") num_training_batches = math.ceil( len(self.data_loader) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 done_early = False for batch_group in batch_group_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() batch_group_outputs = [] for batch in batch_group: batch_outputs = self.batch_outputs(batch, for_training=True) batch_group_outputs.append(batch_outputs) loss = batch_outputs["loss"] reg_loss = batch_outputs["reg_loss"] if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) reg_loss = reg_loss / len(batch_group) if self._opt_level is not None: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() train_reg_loss += reg_loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) param_updates = None if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # Get the magnitude of parameter updates for logging. We need to do some # computation before and after the optimizer step, and it's expensive because of # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so # we don't do this every batch, only when it's requested. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) self._tensorboard.log_batch(self.model, self.optimizer, batch_grad_norm, metrics, batch_group, param_updates) if self._master: self._checkpointer.maybe_save_checkpoint( self, epoch, batches_this_epoch) for callback in self._batch_callbacks: callback( self, batch_group, batch_group_outputs, epoch, batches_this_epoch, is_training=True, ) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
def search_learning_rate( trainer: Trainer, start_lr: float = 1e-5, end_lr: float = 10, num_batches: int = 100, linear_steps: bool = False, stopping_factor: float = None) -> Tuple[List[float], List[float]]: """ Runs training loop on the model using :class:`~allennlp.training.trainer.Trainer` increasing learning rate from ``start_lr`` to ``end_lr`` recording the losses. Parameters ---------- trainer: :class:`~allennlp.training.trainer.Trainer` start_lr: ``float`` The learning rate to start the search. end_lr: ``float`` The learning rate upto which search is done. num_batches: ``int`` Number of batches to run the learning rate finder. linear_steps: ``bool`` Increase learning rate linearly if False exponentially. stopping_factor: ``float`` Stop the search when the current loss exceeds the best loss recorded by multiple of stopping factor. If ``None`` search proceeds till the ``end_lr`` Returns ------- (learning_rates, losses): ``Tuple[List[float], List[float]]`` Returns list of learning rates and corresponding losses. Note: The losses are recorded before applying the corresponding learning rate """ if num_batches <= 10: raise ConfigurationError( 'The number of iterations for learning rate finder should be greater than 10.' ) trainer.model.train() train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle) train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches) learning_rates = [] losses = [] best = 1e9 if linear_steps: lr_update_factor = (end_lr - start_lr) / num_batches else: lr_update_factor = (end_lr / start_lr)**(1.0 / num_batches) for i, batch in enumerate(train_generator_tqdm): if linear_steps: current_lr = start_lr + (lr_update_factor * i) else: current_lr = start_lr * (lr_update_factor**i) for param_group in trainer.optimizer.param_groups: param_group['lr'] = current_lr trainer.optimizer.zero_grad() loss = trainer.batch_loss(batch, for_training=True) loss.backward() loss = loss.detach().cpu().item() if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best): logger.info( f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.' ) break trainer.rescale_gradients() trainer.optimizer.step() learning_rates.append(current_lr) losses.append(loss) if loss < best and i > 10: best = loss if i == num_batches: break return learning_rates, losses
training_batch_size = int(config["batch_size_train"])
# the label is always 1: it tells the MarginRankingLoss criterion that the first input
# (the positive document) should score higher than the second; created once and cached on the GPU
label = torch.ones(training_batch_size).cuda(cuda_device)

# helper vars for quick checking if we should validate during the epoch
validate_every_n_batches = config["validate_every_n_batches"]
do_validate_every_n_batches = validate_every_n_batches > -1

#s_pos = torch.cuda.Stream()
#s_neg = torch.cuda.Stream()

#
# train loop
# -------------------------------
#
for i in Tqdm.tqdm(range(0, config["training_batch_count"]), disable=config["tqdm_disabled"]):

    batch = training_queue.get()
    current_batch_size = batch["query_tokens"]["tokens"].shape[0]

    batch = move_to_device(batch, cuda_device)

    optimizer.zero_grad()
    if embedding_optimizer:
        embedding_optimizer.zero_grad()

    #with torch.cuda.stream(s_pos):
    output_pos = model.forward(batch["query_tokens"], batch["doc_pos_tokens"], batch["query_length"],
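#
# A minimal sketch (an assumption, not shown in the truncated snippet above) of how the
# pairwise step is expected to continue: the model produces one relevance score per
# query-document pair, and torch.nn.MarginRankingLoss with the all-ones `label` pushes the
# positive score above the negative one. The margin value and the `label` slicing are
# illustrative.
#
import torch

criterion = torch.nn.MarginRankingLoss(margin=1.0, reduction="mean")


def pairwise_loss(output_pos: torch.Tensor, output_neg: torch.Tensor, label: torch.Tensor) -> torch.Tensor:
    # loss = mean(max(0, margin - (score_pos - score_neg)))
    # the label is sliced because the last batch can be smaller than the pre-allocated label tensor
    return criterion(output_pos, output_neg, label[:output_pos.shape[0]])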
parser.add_argument('--dataset-files', nargs='+', action='store', dest='dataset_files',
                    help='file format <id>\t<sequence text>', required=True)

args = parser.parse_args()


#
# load data & compute idf values
# -------------------------------
#

loader = IrTupleDatasetReader(lazy=True, source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(), lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):
        # count each token at most once per document (document frequency, not term frequency)
        token_set = set(tok.text.lower() for tok in instance["target_tokens"].tokens)
        for token_text in token_set:
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1
        total_documents += 1

with open(args.out_dir, "w", encoding="utf8") as out:
    for token, count in all_tokens.items():
        out.write(token + " " + f'{math.log(total_documents / count):1.20f}' + "\n")
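#
# Assumed companion helper (not part of the script above) showing how the written idf file can
# be read back: one "<token> <idf>" pair per line, separated by a single space.
#
from typing import Dict


def load_idf_file(path: str) -> Dict[str, float]:
    idf: Dict[str, float] = {}
    with open(path, "r", encoding="utf8") as in_file:
        for line in in_file:
            token, value = line.rstrip("\n").rsplit(" ", 1)
            idf[token] = float(value)
    return idf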
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) cpu_memory_usage = [] for worker, memory in common_util.peak_cpu_memory().items(): cpu_memory_usage.append((worker, memory)) logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") gpu_memory_usage = [] for gpu, memory in common_util.peak_gpu_memory().items(): gpu_memory_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") regularization_penalty = self.model.get_regularization_penalty() train_loss = 0.0 batch_loss = 0.0 train_reg_loss = None if regularization_penalty is None else 0.0 batch_reg_loss = None if regularization_penalty is None else 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = iter(self.data_loader) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps ) logger.info("Training") num_training_batches: Union[int, float] try: len_data_loader = len(self.data_loader) num_training_batches = math.ceil( len_data_loader / self._num_gradient_accumulation_steps ) except TypeError: num_training_batches = float("inf") # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's # progress is shown if self._primary: batch_group_generator_tqdm = Tqdm.tqdm( batch_group_generator, total=num_training_batches ) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 done_early = False for batch_group in batch_group_generator_tqdm: if done_early: break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total # Zero gradients. # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()` # because it avoids a read op when the gradients are first updated below. for param_group in self.optimizer.param_groups: for p in param_group["params"]: p.grad = None batch_loss = 0.0 batch_group_outputs = [] for batch in batch_group: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced." 
) break with amp.autocast(self._use_amp): batch_outputs = self.batch_outputs(batch, for_training=True) batch_group_outputs.append(batch_outputs) loss = batch_outputs["loss"] reg_loss = batch_outputs.get("reg_loss") if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) batch_loss += loss.item() if reg_loss is not None: reg_loss = reg_loss / len(batch_group) batch_reg_loss = reg_loss.item() train_reg_loss += batch_reg_loss # type: ignore if self._scaler is not None: self._scaler.scale(loss).backward() else: loss.backward() if len(batch_group_outputs) <= 0: continue train_loss += batch_loss batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._scaler is not None: self._scaler.step(self.optimizer) self._scaler.update() else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batch_loss, batch_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=self.cuda_device, ) if self._primary: # Updating tqdm only for the primary as the trainers wouldn't have one description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) if self._checkpointer is not None: self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in self._callbacks: callback.on_batch( self, batch_group, batch_group_outputs, metrics, epoch, batches_this_epoch, is_training=True, is_primary=self._primary, batch_grad_norm=batch_grad_norm, ) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batch_loss=None, batch_reg_loss=None, num_batches=batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=self.cuda_device, ) for (worker, memory) in cpu_memory_usage: metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024) for (gpu_num, memory) in gpu_memory_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics
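#
# Self-contained sketch of the mixed-precision pattern used in the epoch loop above
# (torch.cuda.amp autocast plus GradScaler). `model`, `optimizer`, and `batch` are assumed to
# exist, and the batch is assumed to be a dict of keyword arguments whose forward pass returns
# a dict with a "loss" entry, mirroring the trainer's convention.
#
import torch
from torch.cuda import amp


def amp_training_step(model, optimizer, batch, scaler: amp.GradScaler, use_amp: bool = True) -> float:
    optimizer.zero_grad()
    with amp.autocast(enabled=use_amp):
        loss = model(**batch)["loss"]
    if use_amp:
        # scale the loss to avoid fp16 gradient underflow; the scaler unscales before stepping
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    else:
        loss.backward()
        optimizer.step()
    return loss.detach().item()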
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 done_early = False for batch_group in batch_group_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
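#
# Standalone version (an assumption, using torch.utils.tensorboard directly rather than the
# trainer's TensorboardWriter) of the update-magnitude logging above: given parameter snapshots
# taken before optimizer.step(), log ||update|| / ||parameter|| for every parameter.
#
import torch
from torch.utils.tensorboard import SummaryWriter


def log_update_ratios(model: torch.nn.Module,
                      param_updates: dict,
                      writer: SummaryWriter,
                      step: int,
                      eps: float = 1e-7) -> None:
    for name, param in model.named_parameters():
        update_norm = torch.norm(param_updates[name].view(-1))
        param_norm = torch.norm(param.detach().view(-1)).cpu()
        writer.add_scalar("gradient_update/" + name, (update_norm / (param_norm + eps)).item(), step)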
def _multi_worker_islice(
    self,
    iterable: Iterable[Any],
    transform: Optional[Callable[[Any], Instance]] = None,
    ensure_lazy: bool = False,
) -> Iterable[Instance]:
    """
    Helper method that determines which raw instances to skip based on the current
    node rank (for distributed training) and worker ID (for multi-process data loading).

    # Parameters

    iterable : `Iterable[Any]`
        An iterable that yields raw data that can be transformed into `Instance`s
        through the `transform` function.
    transform : `Optional[Callable[[Any], Instance]]`, optional (default = `None`)
        An optional function that will be applied to the raw data generated
        by `iterable` to create `Instance`s. This is used, e.g., when reading
        cached data.
    ensure_lazy : `bool`, optional (default = `False`)
        If `True`, a `ConfigurationError` will be raised if `iterable`
        is a list instead of a lazy generator type.

    # Returns

    `Iterable[Instance]`
        The sharded iterable of instances handled by this node and worker.
    """
    if ensure_lazy and isinstance(iterable, (list, tuple)):
        raise ConfigurationError("For a lazy dataset reader, _read() must return a generator")

    wrap_with_tqdm = True
    start_index = 0
    step_size = 1
    if not self.manual_distributed_sharding and util.is_distributed():
        start_index = dist.get_rank()
        step_size = dist.get_world_size()

    worker_info = None if self.manual_multi_process_sharding else get_worker_info()
    if worker_info:
        warnings.warn(
            "Using multi-process data loading without setting "
            "DatasetReader.manual_multi_process_sharding to True.\n"
            "Did you forget to set this?\n"
            "If you're not handling the multi-process sharding logic within your "
            "_read() method, there is probably no benefit to using more than one "
            "worker.",
            UserWarning,
        )

        # Scale `start_index` by `num_workers`, then shift by worker `id`.
        start_index *= worker_info.num_workers
        start_index += worker_info.id

        # Scale `step_size` by `num_workers`.
        step_size *= worker_info.num_workers

        if worker_info.id > 0:
            # We only want to log with tqdm from the main loader process.
            wrap_with_tqdm = False

    islice = itertools.islice(iterable, start_index, self.max_instances, step_size)
    if wrap_with_tqdm:
        islice = Tqdm.tqdm(islice, desc="reading instances")

    if transform is not None:
        return (transform(x) for x in islice)
    return islice
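#
# Toy illustration of the strided sharding performed by `_multi_worker_islice`: each of the
# `world_size * num_workers` consumers starts at a different offset and advances by the total
# number of consumers, so every raw example is read exactly once. The helper below is an
# assumption written for illustration, not part of the dataset reader.
#
import itertools
from typing import Iterable, List


def shard(iterable: Iterable[int], rank: int, world_size: int, worker_id: int, num_workers: int) -> List[int]:
    start_index = rank * num_workers + worker_id
    step_size = world_size * num_workers
    return list(itertools.islice(iterable, start_index, None, step_size))


# With 2 ranks and 2 loader workers, the first 8 examples are split as:
#   shard(range(8), rank=0, world_size=2, worker_id=0, num_workers=2) -> [0, 4]
#   shard(range(8), rank=0, world_size=2, worker_id=1, num_workers=2) -> [1, 5]
#   shard(range(8), rank=1, world_size=2, worker_id=0, num_workers=2) -> [2, 6]
#   shard(range(8), rank=1, world_size=2, worker_id=1, num_workers=2) -> [3, 7]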
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a text file, which may be compressed and may be
    contained inside an archive with multiple files. The text file is assumed to be utf-8
    encoded with space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
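#
# Hedged usage sketch: load pre-trained vectors for an existing vocabulary and wrap them in a
# standard torch.nn.Embedding. The helper name and `trainable` flag are assumptions, and
# `padding_idx=0` assumes the vocabulary reserves index 0 for padding.
#
import torch


def build_embedding_layer(vocab: Vocabulary, file_uri: str, embedding_dim: int,
                          trainable: bool = False) -> torch.nn.Embedding:
    weight = _read_embeddings_from_text_file(file_uri, embedding_dim, vocab)
    embedding = torch.nn.Embedding(
        num_embeddings=weight.shape[0],
        embedding_dim=embedding_dim,
        padding_idx=0,
        _weight=weight,
    )
    embedding.weight.requires_grad = trainable
    return embedding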
args = parser.parse_args()


#
# cut a result (run) file down to the top-n ranked documents per query
# -------------------------------
#
max_out_rank = args.top_n
#max_cs_n = args.top_n

#res = load_candidate(args.res_in,space_for_rank=30000)
#candidate_set = parse_candidate_set(args.candidate_file)

with open(args.res_out, "w", encoding="utf8") as res_out:
    with open(args.res_in, "r", encoding="utf8") as res_in:
        for l in Tqdm.tqdm(res_in):
            l_split = l.strip().split()
            if len(l_split) == 4:    # own format: rank is the 3rd column
                rank = int(l_split[2])
            elif len(l_split) == 6:  # original trec format: rank is the 4th column
                rank = int(l_split[3])
            else:
                # skip malformed lines instead of silently reusing the previous line's rank
                continue
            if rank <= max_out_rank:
                res_out.write(l)

#for query,data in Tqdm.tqdm(res.items()):
#    out_count = 0
#    for (pid,rank,score) in data:
#        if out_count == max_out_rank:
#            break
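#
# Illustrative helper (an assumption, not part of the script above) that makes the two line
# layouts explicit: the 4-column "own format" keeps the rank in the third column, while the
# standard 6-column TREC run format "<qid> Q0 <docid> <rank> <score> <run tag>" keeps it in
# the fourth column.
#
def parse_rank(line: str) -> int:
    parts = line.strip().split()
    if len(parts) == 4:
        return int(parts[2])
    if len(parts) == 6:
        return int(parts[3])
    raise ValueError("Unrecognized result line: " + repr(line))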