Example #1
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning("DatasetReader.lazy is not set, "
                           "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            instances = self._read(file_path)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError("No instances were read from the given filepath {}. "
                                         "Is the path correct?".format(file_path))
            return instances
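
The lazy branch works because the returned object calls ``_read()`` again on every ``__iter__``, which is what makes the result safe to iterate more than once. A minimal sketch of such a wrapper (hypothetical, not AllenNLP's actual ``_LazyInstances``):

from typing import Callable, Iterator


class LazyInstancesSketch:
    """Re-runs the supplied generator factory on every iteration."""

    def __init__(self, instance_generator: Callable[[], Iterator]):
        self.instance_generator = instance_generator

    def __iter__(self) -> Iterator:
        # Every pass gets a fresh iterator, so the object can be iterated repeatedly.
        return self.instance_generator()


# Stand-in for _LazyInstances(lambda: iter(self._read(file_path))):
wrapped = LazyInstancesSketch(lambda: iter(["a", "b", "c"]))
assert list(wrapped) == list(wrapped) == ["a", "b", "c"]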
Example #2
loader = IrTupleDatasetReader(lazy=True, lowercase=True)
vocab = Vocabulary.from_files(args.vocab)
if args.qrel:
    qrels = load_reference(args.qrel)

not_judged = 0
oov_queries = 0
non_oov_queries = 0
oov_count_list = []
instances = loader.read(args.query)

with open(args.out_file_oov, "w", encoding="utf8") as out_file_oov:
    with open(args.out_file_no_oov, "w", encoding="utf8") as out_file_non_oov:

        for i in Tqdm.tqdm(instances):
            id_str = i["source_tokens"].tokens[0].text
            if args.qrel and int(id_str) not in qrels:
                not_judged += 1
                continue

            i.index_fields(vocab)

            indexes = i["target_tokens"]._indexed_tokens["tokens"]

            if 1 in i["target_tokens"]._indexed_tokens["tokens"]:
                # we have an OOV query
                oov_queries += 1
                oov_count_list.append(sum(1 for t in indexes if t == 1))

                out_file_oov.write(id_str + "\t" + " ".join(
Example #3
def _read_embeddings_from_text_file(
        file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a text file that may be compressed, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match `embedding_dim` raise a warning and are skipped.

    The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    char_embeddings = {}
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                for char in list(token):
                    if char in char_embeddings:
                        char_embeddings[char] = (char_embeddings[char][0] + vector, char_embeddings[char][1] + 1)
                    else:
                        char_embeddings[char] = (vector, 1)
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    char_embeddings = {char: char_embeddings[char][0] / char_embeddings[char][1] for char in char_embeddings}
    chars = set(char_embeddings.keys())

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std
    )
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        elif len(set(token) - chars) == 0:
            embedding_matrix[i] = torch.FloatTensor([char_embeddings[char] for char in list(token)]).sum(dim=-2)
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.", token
            )

    logger.info(
        "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size
    )

    return embedding_matrix
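
The character-level fallback above keeps, for every character, the running mean of all word vectors that contain it, and sums those means for an OOV token whose characters are all covered. A toy sketch of that logic with made-up two-dimensional vectors:

import numpy as np

word_vectors = {"cat": np.array([1.0, 0.0]), "act": np.array([0.0, 1.0])}

char_sums = {}
for word, vec in word_vectors.items():
    for ch in word:
        total, count = char_sums.get(ch, (np.zeros_like(vec), 0))
        char_sums[ch] = (total + vec, count + 1)
char_means = {ch: total / count for ch, (total, count) in char_sums.items()}

oov = "tac"  # unseen word, but every character appears in the known words
fallback = sum(char_means[ch] for ch in oov)
print(fallback)  # [1.5 1.5]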
Example #4
def search_learning_rate(
    trainer: GradientDescentTrainer,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
) -> Tuple[List[float], List[float]]:
    """
    Runs training loop on the model using [`GradientDescentTrainer`](../training/trainer.md#gradientdescenttrainer)
    increasing learning rate from `start_lr` to `end_lr` recording the losses.

    # Parameters

    trainer: `GradientDescentTrainer`
    start_lr : `float`
        The learning rate to start the search.
    end_lr : `float`
        The learning rate up to which the search is done.
    num_batches : `int`
        Number of batches to run the learning rate finder.
    linear_steps : `bool`
        If `True`, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor : `float`
        Stop the search when the current loss exceeds the best recorded loss
        multiplied by the stopping factor. If `None`, the search proceeds until `end_lr`.

    # Returns

    (learning_rates, losses) : `Tuple[List[float], List[float]]`
        Returns list of learning rates and corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate
    """
    if num_batches <= 10:
        raise ConfigurationError(
            "The number of iterations for learning rate finder should be greater than 10."
        )

    trainer.model.train()

    infinite_generator = itertools.cycle(trainer.data_loader)
    train_generator_tqdm = Tqdm.tqdm(infinite_generator, total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr)**(1.0 / num_batches)

    for i, batch in enumerate(train_generator_tqdm):

        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor**i)

        for param_group in trainer.optimizer.param_groups:
            param_group["lr"] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_outputs(batch, for_training=True)["loss"]
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss)
                                            or loss > stopping_factor * best):
            logger.info(
                f"Loss ({loss}) exceeds stopping_factor * lowest recorded loss."
            )
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
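
The two update rules reduce to a constant additive increment (linear) or a constant multiplicative factor (exponential); either way the schedule reaches `end_lr` after `num_batches` steps. A quick standalone check of that arithmetic with the default values:

start_lr, end_lr, num_batches = 1e-5, 10.0, 100

linear_factor = (end_lr - start_lr) / num_batches
exp_factor = (end_lr / start_lr) ** (1.0 / num_batches)

linear_lrs = [start_lr + linear_factor * i for i in range(num_batches + 1)]
exp_lrs = [start_lr * exp_factor ** i for i in range(num_batches + 1)]

assert abs(linear_lrs[-1] - end_lr) < 1e-9
assert abs(exp_lrs[-1] - end_lr) < 1e-6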
Example #5
    def _instances_to_cache_file(self, cache_filename, instances) -> None:
        with open(cache_filename, "w") as cache:
            for instance in Tqdm.tqdm(instances):
                cache.write(self.serialize_instance(instance) + "\n")
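
A hypothetical counterpart for reading such a cache back in, assuming the reader also provides a `deserialize_instance` method that inverts `serialize_instance` (the name mirrors the snippet above and is not a verified API):

    def _instances_from_cache_file_sketch(self, cache_filename):
        with open(cache_filename, "r") as cache:
            for line in Tqdm.tqdm(cache):
                yield self.deserialize_instance(line.strip())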
Example #6
    lazy=True,
    max_doc_length=180,
    max_query_length=30,
    tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()
                            ))  # already spacy tokenized, so that it is faster

_iterator = BucketIterator(batch_size=64,
                           sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                         ("doc_neg_tokens", "num_tokens")])

_iterator.index_with(vocab)

for epoch in range(2):

    for batch in Tqdm.tqdm(
            _iterator(_triple_loader.read(config["train_data"]),
                      num_epochs=1)):
        # todo train loop
        pass

#
# eval (duplicate for validation inside train loop - but rename "_iterator", since
# otherwise it will overwrite the original train iterator, which is instantiated outside the loop)
#

_tuple_loader = IrLabeledTupleDatasetReader(
    lazy=True, max_doc_length=180,
    max_query_length=30)  # not spacy tokenized already (default is spacy)
_iterator = BucketIterator(batch_size=128,
                           sorting_keys=[("doc_tokens", "num_tokens"),
                                         ("query_tokens", "num_tokens")])
Example #7
    def _validation_loss(
            self, epoch: int
    ) -> Tuple[float, float, int, List[Dict[str, torch.Tensor]]]:
        """
        Computes the validation loss. Returns it and the number of batches.
        Also returns list of predictions.
        """
        logger.info("Validating")

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        val_generator_tqdm = Tqdm.tqdm(validation_data_loader)
        batches_this_epoch = 0
        val_loss = 0
        val_reg_loss = 0
        done_early = False
        preds = []
        for batch in val_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing validation early! "
                        "This implies that there is an imbalance in your validation "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batch_outputs = self.batch_outputs(batch, for_training=False)
            loss = batch_outputs.get("loss")
            reg_loss = batch_outputs.get("reg_loss")
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()
                if reg_loss is not None:
                    val_reg_loss += reg_loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                val_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

            if self.dataset_writer:
                output_dict = self.model.make_output_human_readable(
                    batch_outputs)
                output_dict = split_up(output_dict, batch["order_metadata"])
                preds.extend(output_dict)

            for callback in self._batch_callbacks:
                callback(
                    self,
                    [batch],
                    [batch_outputs],
                    epoch,
                    batches_this_epoch,
                    is_training=False,
                    is_master=self._master,
                )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)."
            )
            # Indicate that we're done so that any workers that have remaining data stop validation early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, val_reg_loss, batches_this_epoch, preds
Example #8
args = parser.parse_args()

#
# work
#

qrels = load_reference(args.qrel)
result_files = glob.glob(args.results)

best_mrr = (0, "", 0)
best_relevant = (0, "", 0)

with open(args.out_file, "w", encoding="utf8") as metric_file:

    for t, res_file in Tqdm.tqdm(enumerate(result_files)):
        try:
            res_candidate = load_candidate(res_file, args.cutoff)
            for i in Tqdm.tqdm(range(1, args.cutoff)):

                metrics = compute_metrics(qrels, res_candidate, i)

                if i == 1 and t == 0:
                    metric_file.write("sep=,\nFile,Cutoff," +
                                      ",".join(k for k, v in metrics.items()) +
                                      "\n")

                if metrics["QueriesWithRelevant"] > best_relevant[0]:
                    best_relevant = (metrics["QueriesWithRelevant"], res_file,
                                     i)
                    print("got new best QueriesWithRelevant", best_relevant)
Example #9
File: train.py Project: bgeszti/ML
    # if early stopping has been triggered during validation, exit from the epoch
    if earlyStop is True:
        break

    perf_monitor.start_block(monitorModel)
    perf_start_inst = 0
    # prep model for training
    model.train()
    # Create a label tensor filled with ones --> needed for the margin ranking loss.
    # It should be re-initialized in each outer loop, since the last batch of the inner
    # loop will probably have a different size.
    label = torch.ones(trainBatchSize).cuda()
    batchCounter = 0
    # Train loop
    for batch in Tqdm.tqdm(
            _iterator(_triple_loader.read(config["train_data"]),
                      num_epochs=1)):
        iterCounter += 1
        batch = move_to_device(batch, 0)
        # batch  = Parameter(batch).to(device)
        batchCounter += 1
        model.train()
        # todo train loop
        # at the beginning of each train loop, clear the optimizer's gradients (zero_grad() method)
        optimAdam.zero_grad()
        # retrieve the current batch size --> The iterators do not guarantee a fixed batch size
        # (the last one will probably be smaller) --> so we will retrieve the number of tokens from e.g. the query
        currentBatchSize = batch["query_tokens"]["tokens"].shape[0]
        # for the batch size, th

        # based on the slides, the model will be trained with triplets:
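
A toy sketch (made-up scores) of the margin-ranking setup the comments above describe: the label tensor of ones tells `torch.nn.MarginRankingLoss` that the first input, the relevant document's score, should exceed the second by at least the margin:

import torch

loss_fn = torch.nn.MarginRankingLoss(margin=1.0)
score_pos = torch.tensor([2.5, 0.1])    # scores for relevant documents
score_neg = torch.tensor([1.0, 0.4])    # scores for non-relevant documents
label = torch.ones(score_pos.shape[0])  # "first input should rank higher"

loss = loss_fn(score_pos, score_neg, label)
print(loss.item())  # 0.65: only the second pair violates the margin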
Example #10
def main(file, embeddings, model, emb_wt_key, namespace, output_dir):
    archive = load_archive(model)
    config = archive.config
    os.makedirs(output_dir, exist_ok=True)
    config.to_file(os.path.join(output_dir, CONFIG_NAME))

    model = archive.model
    # first expand the vocabulary
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    instances = dataset_reader.read(file)
    vocab = model.vocab

    # get all the tokens in the new file
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
        lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    old_token_size = vocab.get_vocab_size(namespace)
    print("Before expansion: Number of instances in {} namespace: {}".format(
        namespace, old_token_size))
    if namespace not in namespace_token_counts:
        logger.error(
            "No tokens found for namespace: {} in the new input file".format(
                namespace))
    # identify the new tokens in the new instances
    token_to_add = set()
    token_hits = 0
    for token, count in namespace_token_counts[namespace].items():
        if token not in vocab._token_to_index[namespace]:
            # new token, must add
            token_to_add.add(token)
        else:
            token_hits += 1
    print("Found {} existing tokens and {} new tokens in {}".format(
        token_hits, len(token_to_add), file))

    # add the new tokens to the vocab
    for token in token_to_add:
        vocab.add_token_to_namespace(token=token, namespace=namespace)
    archived_parameters = dict(model.named_parameters())

    # second, expand the embedding matrix
    for name, weights in archived_parameters.items():
        # find the wt matrix for the embeddings
        if name == emb_wt_key:
            if weights.dim() != 2:
                logger.error(
                    "Expected an embedding matrix for the parameter: {} instead"
                    "found {} tensor".format(emb_wt_key, weights.shape))
            emb_dim = weights.shape[-1]
            print("Before expansion: Size of emb matrix: {}".format(
                weights.shape))
            # Loading embeddings for old and new tokens since that is cleaner than copying all
            # the embedding loading logic here
            all_embeddings = _read_pretrained_embeddings_file(
                embeddings, emb_dim, vocab, namespace)
            # concatenate the new entries i.e last token_to_add embeddings to the original weights
            if len(token_to_add) > 0:
                weights.data = torch.cat(
                    [weights.data, all_embeddings[-len(token_to_add):, :]])
            print("After expansion: Size of emb matrix: {}".format(
                weights.shape))

    # save the files needed by the model archiver
    model_path = os.path.join(output_dir, "weight.th")
    model_state = model.state_dict()
    torch.save(model_state, model_path)
    vocab.save_to_files(os.path.join(output_dir, "vocabulary"))
    archive_model(output_dir, weights="weight.th")

    # more debug messages
    new_token_size = vocab.get_vocab_size(namespace)
    for name, weights in archived_parameters.items():
        if name == emb_wt_key:
            print("Size of emb matrix: {}".format(weights.shape))
    print("After expansion: Number of instances in {} namespace: {}".format(
        namespace, new_token_size))
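
The embedding-expansion step boils down to appending one new row per added token below the archived weight matrix. A minimal illustration with made-up sizes:

import torch

old_weights = torch.randn(5, 3)  # archived embedding matrix: 5 tokens, dim 3
new_rows = torch.randn(2, 3)     # embeddings for 2 newly added tokens
expanded = torch.cat([old_weights, new_rows])

assert expanded.shape == (7, 3)  # new tokens end up as the last rows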
Example #11
"""
Take src and tgt files (the standard input format in opennmt and fairseq) and join them into a single tsv file.
"""
import os

from allennlp.common import Tqdm


def gen_line(path):
    file = open(path)
    for line in file:
        yield line
    file.close()


if __name__ == '__main__':
    src_path = '../data/bbc/val.txt.src'
    tgt_path = '../data/bbc/val.txt.tgt'
    src = gen_line(src_path)
    tgt = gen_line(tgt_path)
    new_lines = []
    i = 0
    for src_line, tgt_line in Tqdm.tqdm(zip(src, tgt)):
        new_line = src_line.strip() + '\t' + tgt_line.strip() + '\n'
        new_lines.append(new_line)
    file = open(
        os.path.join('../data/bbc/',
                     os.path.basename(src_path).split('.')[0] + '.tsv'), 'w')
    file.writelines(new_lines)
    file.close()
Example #12
def evaluate_model(model,
                   config,
                   _logger,
                   cuda_device,
                   eval_tsv,
                   eval_batch_count,
                   use_cache=False):

    model.eval()  # turning off training
    validation_results = {}
    fill_cache = False
    cached_batches = None

    try:
        if use_cache:
            global evaluate_cache
            if eval_tsv not in evaluate_cache:
                fill_cache = True
                evaluate_cache[eval_tsv] = []
            cached_batches = evaluate_cache[eval_tsv]

        if not use_cache or fill_cache:
            validation_queue, validation_processes, validation_exit = get_multiprocess_batch_queue(
                "eval-batches",
                multiprocess_validation_loader,
                glob.glob(eval_tsv),
                config,
                _logger,
                queue_size=200)
            #time.sleep(len(validation_processes))  # fill the queue
            _logger.info("[eval_model] --- Start validation with queue.size:" +
                         str(validation_queue.qsize()))
        else:
            _logger.info("[eval_model] --- Start validation with cache size:" +
                         str(len(cached_batches)))

        with torch.no_grad():
            for i in Tqdm.tqdm(range(0, eval_batch_count),
                               disable=config["tqdm_disabled"]):

                if not use_cache or fill_cache:
                    batch_orig = validation_queue.get()
                    if fill_cache:
                        cached_batches.append(batch_orig)
                else:
                    batch_orig = cached_batches[i]

                batch = move_to_device(copy.deepcopy(batch_orig), cuda_device)

                output = model.forward(batch["query_tokens"],
                                       batch["doc_tokens"],
                                       batch["query_length"],
                                       batch["doc_length"])
                output = output.cpu(
                )  # get the output back to the cpu - in one piece

                for sample_i, sample_query_id in enumerate(
                        batch_orig["query_id"]):  # operate on cpu memory

                    sample_query_id = int(sample_query_id)
                    sample_doc_id = int(
                        batch_orig["doc_id"]
                        [sample_i])  # again operate on cpu memory

                    if sample_query_id not in validation_results:
                        validation_results[sample_query_id] = []

                    validation_results[sample_query_id].append(
                        (sample_doc_id, float(output[sample_i])))

                #if not use_cache or fill_cache and i % 100 == 0: # only to check for performance regression
                #    if validation_queue.qsize() < 10:
                #        _logger.warning("validation_queue.qsize() < 10")

        if not use_cache or fill_cache:
            # make sure we didn't make a mistake in the configuration / data preparation
            if validation_queue.qsize() != 0:
                _logger.error(
                    "validation_queue.qsize() is not empty after evaluation")

            validation_exit.set()  # allow sub-processes to exit

    except BaseException as e:
        _logger.info('-' * 89)
        _logger.exception('[eval_model] Got exception: ')
        print(
            "----- Attention! - something went wrong in eval_model (see logger) ----- "
        )

        if not use_cache or fill_cache:
            for proc in validation_processes:
                if proc.is_alive():
                    proc.terminate()
        raise e

    return validation_results
Example #13
args = parser.parse_args()

#
# compare (different mrr gains up,same,down)
# -------------------------------
#
res = load_candidate_from_stream_with_score(open(args.res_in,
                                                 "r"))  #,space_for_rank=30000)

candidate_set = None
if args.candidate_file:
    candidate_set = parse_candidate_set(args.candidate_file, 1000)

with open(args.res_out, "w", encoding="utf8") as res_out:
    for query, data in Tqdm.tqdm(res.items()):
        out_count = 0
        for (pid, rank, score) in data:
            if out_count == args.top_n:
                break

            if candidate_set is not None:
                if candidate_set[query][pid] <= args.cs_n:
                    res_out.write(
                        str(query) + " Q0 " + str(pid) + " " + str(out_count) +
                        " " + str(score) + " " + args.run_id + "\n")
                    out_count += 1
            else:
                res_out.write(
                    str(query) + " Q0 " + str(pid) + " " + str(out_count) +
                    " " + str(score) + " " + args.run_id + "\n")
Example #14
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")
        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            if self.cache_path is not None:
                # create a key for the file based on the reader config
                hash_ = self.get_hash(file_path)
                pathlib.Path(self.cache_path).mkdir(parents=True,
                                                    exist_ok=True)
                cache_file = os.path.join(self.cache_path, (hash_ + '.cache'))
                if not os.path.exists(cache_file) or self.overwrite_cache:
                    instances = self._read(file_path)
                    if not isinstance(instances, list):
                        instances = [
                            instance for instance in Tqdm.tqdm(instances)
                        ]
                    if not instances:
                        raise ConfigurationError(
                            "No instances were read from the given filepath {}. "
                            "Is the path correct?".format(file_path))
                    logger.info(f'caching instances to file: {cache_file}')

                    with open(cache_file, 'wb') as cache:
                        dill.dump(instances, cache)
                else:
                    logger.info(
                        f'Reading instances from cache file: {cache_file}')
                    # instances = []
                    # with open(cache_file, 'rb') as cache:
                    #     start   = time.time()
                    #     instances = []
                    #     for line in Tqdm.tqdm(cache):
                    #         instances.append(self.deserialize_instance(line.strip()))
                    #     print(time.time()-start)
                    with open(cache_file, 'rb') as f_in:
                        instances = dill.load(f_in)
            else:
                instances = self._read(file_path)
                if not isinstance(instances, list):
                    instances = [instance for instance in Tqdm.tqdm(instances)]
                if not instances:
                    raise ConfigurationError(
                        "No instances were read from the given filepath {}. "
                        "Is the path correct?".format(file_path))
            return instances
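
The cache key in this variant comes from `self.get_hash(file_path)`, whose implementation is not shown. A hypothetical sketch of what such a hash might combine (the helper name and config dict are assumptions, not the project's actual code):

import hashlib

def get_hash_sketch(file_path: str, reader_config: dict) -> str:
    # Combine the data path with the reader settings that influence the
    # produced instances, so a config change invalidates the cache.
    key = file_path + repr(sorted(reader_config.items()))
    return hashlib.sha256(key.encode("utf8")).hexdigest()

print(get_hash_sketch("train.tsv", {"lazy": False, "max_doc_length": 180}))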
Example #15
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        val_generator = val_iterator(self._validation_data,
                                     num_epochs=1,
                                     shuffle=False)
        num_validation_batches = val_iterator.get_num_batches(
            self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        done_early = False
        for batch in val_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing validation early! "
                        "This implies that there is an imbalance in your validation "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            loss = self.batch_loss(batch, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)."
            )
            # Indicate that we're done so that any workers that have remaining data stop validation early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Example #16
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs training loop on the model using :class:`~allennlp.training.trainer.Trainer`
    increasing learning rate from ``start_lr`` to ``end_lr`` recording the losses.

    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate to start the search.
    end_lr: ``float``
        The learning rate up to which the search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best recorded loss
        multiplied by the stopping factor. If ``None``, the search proceeds until ``end_lr``.

    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Returns list of learning rates and corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    train_generator = trainer.iterator(trainer.train_data,
                                       shuffle=trainer.shuffle)
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    for i, batch in enumerate(train_generator_tqdm):

        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor ** i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best):
            logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
Example #17
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info(f"Epoch: {epoch}/{self._num_epochs - 1}")
        cpu_memory_usage = []
        for worker, memory in common_util.peak_memory_mb().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage MB: {memory}")
        gpu_memory_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        for component_optimizer in self.component_optimizers.values():
            component_optimizer.reset_loss('train')

        self.model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps)
        except TypeError:
            num_training_batches = float("inf")

        batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                               total=num_training_batches)

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False

        for batch_group in batch_group_generator_tqdm:

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            for component_optimizer in self.component_optimizers.values():
                component_optimizer.zero_grad()

            batch_group_metrics = []

            meta_batch = deepcopy(batch_group)

            # Train the Sub Models first
            for name, sub_model in self._pytorch_model.component_models.items(
            ):
                component_optimizer = self.component_optimizers[name]
                batch_group_outputs, metrics = component_optimizer.process_batch_group(
                    batch_group, True, batch_num_total, batches_this_epoch,
                    True)
                batch_group_metrics.append(metrics)

                for i, batch_outputs in enumerate(batch_group_outputs):
                    component_output = batch_outputs["output"]
                    component_output = component_output.detach()
                    meta_batch[i][name] = component_output

            meta_optimizer = self.component_optimizers["meta"]
            meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group(
                meta_batch, True, batch_num_total, batches_this_epoch, False)

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            batch_group_metrics.append(meta_metrics)

            all_metrics = ChainMap(*batch_group_metrics)

            description = training_util.description_from_metrics(all_metrics)
            batch_group_generator_tqdm.set_description(description,
                                                       refresh=False)

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory

        return all_metrics
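
`ChainMap` is what merges the per-component metric dicts into a single view: lookups search the maps left to right, so the first dict that defines a key wins. A standalone illustration with made-up metrics:

from collections import ChainMap

component_metrics = [{"loss": 0.7, "acc": 0.81}, {"loss": 0.9}]
meta_metrics = {"meta_loss": 0.5}

all_metrics = ChainMap(*component_metrics, meta_metrics)
assert all_metrics["loss"] == 0.7  # first map wins on key clashes
assert dict(all_metrics) == {"loss": 0.7, "acc": 0.81, "meta_loss": 0.5}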
Example #18
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps
        )
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss, effective_batch_size = self.batch_loss(batch, for_training=True)
                if not loss:  # weak supervision can have no loss
                    continue
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                denom = effective_batch_size if effective_batch_size else len(batch_group)
                loss = loss / denom
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch() and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name, update_norm / (param_norm + 1e-7)
                    )
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch() and self._master:
                self._tensorboard.log_histograms(self.model, histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(training_util.get_batch_size(batch) for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size", batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if (
                self._model_save_interval is not None
                and (time.time() - last_save_time > self._model_save_interval)
                and self._master
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                    "{0}.{1}".format(epoch, training_util.time_to_str(int(last_save_time)))
                )

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
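
Dividing each micro-batch loss by the size of the batch group before calling backward() is what makes accumulated gradients match a single larger batch. A toy check of that scaling with assumed values (two micro-batches of one example each):

import torch

w = torch.tensor([1.0], requires_grad=True)
micro_batches = [torch.tensor([2.0]), torch.tensor([4.0])]

for x in micro_batches:
    loss = (w * x).sum() / len(micro_batches)
    loss.backward()  # gradients accumulate in w.grad across micro-batches

print(w.grad)  # tensor([3.]) == mean of the per-example gradients (2 and 4)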
Example #19
    def _validation_loss(self, epoch: int) -> Tuple[float, float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self.model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        val_generator_tqdm = Tqdm.tqdm(validation_data_loader)

        for component_optimizer in self.component_optimizers.values():
            component_optimizer.reset_loss('validation')

        batches_this_epoch = 0
        done_early = False

        for batch in val_generator_tqdm:
            batches_this_epoch += 1

            batch_metrics = []
            batch_group = [batch]
            meta_batch = deepcopy(batch_group)

            # Train the Sub Models first
            for name, sub_model in self._pytorch_model.component_models.items(
            ):
                component_optimizer = self.component_optimizers[name]
                batch_group_outputs, metrics = component_optimizer.process_batch_group(
                    batch_group,
                    for_training=False,
                    batches_this_epoch=batches_this_epoch)
                batch_metrics.append(metrics)

                for i, batch_outputs in enumerate(batch_group_outputs):
                    meta_batch[i][name] = batch_outputs["output"]

            meta_optimizer = self.component_optimizers["meta"]
            meta_batch_outputs, meta_metrics = meta_optimizer.process_batch_group(
                meta_batch,
                for_training=False,
                batches_this_epoch=batches_this_epoch)
            batch_metrics.append(meta_metrics)

            all_metrics = ChainMap(*batch_metrics)
            description = training_util.description_from_metrics(all_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return all_metrics
Example #20
#
# load data & create vocab
# -------------------------------
#  

loader = IrTupleDatasetReader(lazy=True, lowercase=args.lowercase)


def getInstances():
    for file in args.dataset_files:
        instances = loader.read(file)
        for i in instances:
            yield Instance({"text":i["target_tokens"]})

namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for instance in Tqdm.tqdm(getInstances()):
    instance.count_vocab_items(namespace_token_counts)

#with open(args.out_dir,"w",encoding="utf8") as out:
#    for n in namespace_token_counts:
#        #out.write("--"+n+"\n")
#        for w,i in namespace_token_counts[n].items():
#            out.write(w+"\t"+str(i)+"\n")

vocab = Vocabulary(namespace_token_counts, min_count={"tokens":100})
vocab.save_to_files(args.out_dir)

#vocab = Vocabulary(namespace_token_counts, min_count={"tokens":50})
#vocab.save_to_files(args.out_dir2)

#vocab = Vocabulary(namespace_token_counts, min_count={"tokens":10})
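
The min_count={"tokens": 100} argument drops every token seen fewer than 100 times before the vocabulary is built. A toy illustration of that thresholding with made-up counts:

from collections import Counter

counts = Counter({"the": 250, "retrieval": 120, "zebra": 3})
min_count = 100

kept = sorted(tok for tok, c in counts.items() if c >= min_count)
print(kept)  # ['retrieval', 'the'] -- "zebra" falls below the threshold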
Example #21
print('Model',config["model"],'total parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

#
# train
#

_triple_reader = IrTripleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_triple_reader = _triple_reader.read(config["train_data"])
_triple_reader.index_with(vocab)
loader = PyTorchDataLoader(_triple_reader, batch_size=32)

for epoch in range(2):

    for batch in Tqdm.tqdm(loader):
        # todo train loop
        pass


#
# eval (duplicate for validation inside train loop - but rename "loader", since
# otherwise it will overwrite the original train iterator, which is instantiated outside the loop)
#

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["test_data"])
_tuple_reader.index_with(vocab)
loader = PyTorchDataLoader(_tuple_reader, batch_size=128)

for batch in Tqdm.tqdm(loader):
Example #22
    def read(self, file_path: str) -> Dataset:
        """
        Returns an `Iterable` containing all the instances
        in the specified dataset.

        If `self.lazy` is False, this calls `self._read()`,
        ensures that the result is a list, then returns the resulting list.

        If `self.lazy` is True, this returns an object whose
        `__iter__` method calls `self._read()` each iteration.
        In this case your implementation of `_read()` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a `ConfigurationError`.

        In either case, the returned `Iterable` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, "lazy", None)

        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")

        if self._cache_directory:
            cache_file = self._get_cache_location_for_file_path(file_path)
        else:
            cache_file = None

        if lazy:
            instances: Iterable[Instance] = _LazyInstances(
                self._read,
                file_path,
                cache_file,
                self.deserialize_instance,
                self.serialize_instance,
            )
            if self.max_instances is not None:
                instances = itertools.islice(instances, 0, self.max_instances)
        else:
            # First we read the instances, either from a cache or from the original file.
            if cache_file and os.path.exists(cache_file):
                instances = self._instances_from_cache_file(cache_file)
            else:
                instances = self._read(file_path)

            if self.max_instances is not None:
                if isinstance(instances, list):
                    instances = instances[:self.max_instances]
                else:
                    instances = itertools.islice(instances, 0,
                                                 self.max_instances)

            # Then some validation.
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))

            # And finally we write to the cache if we need to.
            if cache_file and not os.path.exists(cache_file):
                logger.info(f"Caching instances to {cache_file}")
                self._instances_to_cache_file(cache_file, instances)

            instances = AllennlpDataset(instances)

        return instances
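
`itertools.islice` is what caps `max_instances` above without materializing a lazy stream. A quick standalone illustration:

import itertools

def generate_instances():
    # Stand-in for a lazy _read(): an endless stream of instances.
    for i in itertools.count():
        yield f"instance_{i}"

capped = itertools.islice(generate_instances(), 0, 3)
print(list(capped))  # ['instance_0', 'instance_1', 'instance_2']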
Example #23
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_memory_mb().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage MB: {memory}")
        gpu_memory_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0

        if regularization_penalty is not None:
            train_reg_loss = 0.0
            batch_reg_loss = 0.0
        else:
            train_reg_loss = None
            batch_reg_loss = None
        # Set the model to "train" mode.
        self.model_engine.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps)
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        batch_group_generator_tqdm = batch_group_generator
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch,
                                                       for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs.get("loss")
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss = loss.item()
                    train_loss += batch_loss
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss

                self.model_engine.backward(loss)
                self.model_engine.step()

            param_updates = None
            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }

                if self._scaler is not None:
                    self._scaler.step(self.optimizer)
                    self._scaler.update()
                else:
                    self.optimizer.step()

                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                if self._scaler is not None:
                    self._scaler.step(self.optimizer)
                    self._scaler.update()
                else:
                    self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if self._master:
                # Updating tqdm only for the master as the trainers wouldn't have one
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(
                    self.model,
                    self.optimizer,
                    0.,  # batch_grad_norm,
                    metrics,
                    batch_group,
                    param_updates,
                )

                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)

            for callback in self._batch_callbacks:
                callback(
                    self,
                    batch_group,
                    batch_group_outputs,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_master=self._master,
                )

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
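
# A minimal gradient-accumulation sketch (not from the original source): the loss of
# every micro-batch in a group is divided by the group size before `backward()`, so
# the accumulated gradient matches one large batch, which is what the
# `loss = loss / len(batch_group)` line above does.  A synthetic squared-output loss
# stands in for the real training loss.
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch_group = [torch.randn(8, 4) for _ in range(4)]  # one accumulation group

optimizer.zero_grad()
for micro_batch in batch_group:
    loss = model(micro_batch).pow(2).mean() / len(batch_group)
    loss.backward()  # gradients accumulate across the group
optimizer.step()     # a single optimizer step per group
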
Example No. 24
    def run(  # type: ignore
        self,
        model: Model,
        dataset: DatasetDict,
        split: str = "validation",
        data_loader: Optional[Lazy[TangoDataLoader]] = None,
    ) -> EvaluationResult:
        """
        Runs an evaluation on a dataset.

        * `model` is the model we want to evaluate.
        * `dataset` is the dataset we want to evaluate on.
        * `split` is the name of the split we want to evaluate on.
        * `data_loader` gives you the chance to choose a custom dataloader for the evaluation.
          By default this step evaluates on batches of 32 instances each.
        """

        concrete_data_loader: TangoDataLoader
        if data_loader is None:
            concrete_data_loader = BatchSizeDataLoader(dataset.splits[split],
                                                       batch_size=32,
                                                       shuffle=False)
        else:
            concrete_data_loader = data_loader.construct(
                instances=dataset.splits[split])

        if torch.cuda.device_count() > 0:
            cuda_device = torch.device(0)
        else:
            cuda_device = torch.device("cpu")
        check_for_gpu(cuda_device)

        generator_tqdm = Tqdm.tqdm(iter(concrete_data_loader))

        predictions: List[Dict[str, Any]] = []
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Number of batches in total.
        batch_count = 0
        # Cumulative loss.
        total_loss = 0.0

        with torch.inference_mode():
            model.eval()

            for batch in concrete_data_loader:
                batch_count += 1
                batch = move_to_device(batch, cuda_device)
                output_dict = model(**batch)

                metrics = model.get_metrics()

                loss = output_dict.pop("loss", None)
                if loss is not None:
                    loss_count += 1
                    total_loss += loss.item()
                    metrics["loss"] = total_loss / loss_count

                    if any(
                            metric_name.startswith("_")
                            for metric_name in metrics):
                        self.logger.warning_once(
                            'Metrics with names beginning with "_" will '
                            "not be logged to the tqdm progress bar.")

                    description = (", ".join([
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]) + " ||")
                    generator_tqdm.set_description(description, refresh=False)

                output_dict = sanitize(output_dict)

                # This is write-only code, but it's quite fast.
                predictions.extend(
                    dict(zip(output_dict.keys(), x))
                    for x in zip(*output_dict.values()))

            final_metrics = model.get_metrics(reset=True)

        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / loss_count

        return self.EvaluationResult(final_metrics, predictions)
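
# A small sketch (not from the original source) of the dict-transposition trick used
# above ("write-only code"): zip the batched values column-wise and pair each row back
# up with the keys to get one prediction dict per instance.
batched_output = {"label": ["pos", "neg"], "score": [0.9, 0.2]}
per_instance = [
    dict(zip(batched_output.keys(), row))
    for row in zip(*batched_output.values())
]
assert per_instance == [{"label": "pos", "score": 0.9},
                        {"label": "neg", "score": 0.2}]
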
Example No. 25
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_reg_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches = math.ceil(
            len(self.data_loader) / self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs["loss"]
                reg_loss = batch_outputs["reg_loss"]
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                reg_loss = reg_loss / len(batch_group)
                if self._opt_level is not None:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss += loss.item()
                train_reg_loss += reg_loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            param_updates = None
            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(self.model, self.optimizer,
                                            batch_grad_norm, metrics,
                                            batch_group, param_updates)

            if self._master:
                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)
                for callback in self._batch_callbacks:
                    callback(
                        self,
                        batch_group,
                        batch_group_outputs,
                        epoch,
                        batches_this_epoch,
                        is_training=True,
                    )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
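
# A minimal sketch (not from the original source) of what a helper like
# `common_util.lazy_groups_of` does: lazily pull fixed-size groups of batches off an
# iterator, which is how the gradient-accumulation groups above are formed
# (the last group may be smaller).
import itertools
from typing import Iterator, List, TypeVar

T = TypeVar("T")


def lazy_groups_of_sketch(iterator: Iterator[T], group_size: int) -> Iterator[List[T]]:
    while True:
        group = list(itertools.islice(iterator, group_size))
        if not group:
            return
        yield group


assert list(lazy_groups_of_sketch(iter(range(7)), 3)) == [[0, 1, 2], [3, 4, 5], [6]]
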
Example No. 26
def search_learning_rate(
        trainer: Trainer,
        start_lr: float = 1e-5,
        end_lr: float = 10,
        num_batches: int = 100,
        linear_steps: bool = False,
        stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs training loop on the model using :class:`~allennlp.training.trainer.Trainer`
    increasing learning rate from ``start_lr`` to ``end_lr`` recording the losses.
    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate to start the search.
    end_lr: ``float``
        The learning rate upto which search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Returns list of learning rates and corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate
    """
    if num_batches <= 10:
        raise ConfigurationError(
            'The number of iterations for learning rate finder should be greater than 10.'
        )

    trainer.model.train()

    train_generator = trainer.iterator(trainer.train_data,
                                       shuffle=trainer.shuffle)
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr)**(1.0 / num_batches)

    for i, batch in enumerate(train_generator_tqdm):

        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor**i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss)
                                            or loss > stopping_factor * best):
            logger.info(
                f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.'
            )
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
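
# A short numeric sketch (not from the original source) of the two update rules above:
# the linear schedule adds a constant increment per batch, while the exponential
# schedule multiplies by a constant factor, so both reach `end_lr` after `num_batches` steps.
import math

start_lr, end_lr, num_batches = 1e-5, 10.0, 100

linear_increment = (end_lr - start_lr) / num_batches
exponential_factor = (end_lr / start_lr) ** (1.0 / num_batches)

linear_schedule = [start_lr + linear_increment * i for i in range(num_batches + 1)]
exponential_schedule = [start_lr * exponential_factor ** i for i in range(num_batches + 1)]

assert math.isclose(linear_schedule[-1], end_lr)
assert math.isclose(exponential_schedule[-1], end_lr)
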
            training_batch_size = int(config["batch_size_train"])
            # The label is always set to 1, indicating that the first input is the positive
            # document (see criterion: MarginRankingLoss); the tensor is created once and kept on the GPU.
            label = torch.ones(training_batch_size).cuda(cuda_device)

            # helper vars for quick checking if we should validate during the epoch
            validate_every_n_batches = config["validate_every_n_batches"]
            do_validate_every_n_batches = validate_every_n_batches > -1

            #s_pos = torch.cuda.Stream()
            #s_neg = torch.cuda.Stream()

            #
            # train loop
            # -------------------------------
            #
            for i in Tqdm.tqdm(range(0, config["training_batch_count"]),
                               disable=config["tqdm_disabled"]):

                batch = training_queue.get()

                current_batch_size = batch["query_tokens"]["tokens"].shape[0]

                batch = move_to_device(batch, cuda_device)

                optimizer.zero_grad()
                if embedding_optimizer:
                    embedding_optimizer.zero_grad()

                #with torch.cuda.stream(s_pos):
                output_pos = model.forward(batch["query_tokens"],
                                           batch["doc_pos_tokens"],
                                           batch["query_length"],
parser.add_argument('--dataset-files', nargs='+', action='store', dest='dataset_files',
                    help='file format <id>\t<sequence text>', required=True)

args = parser.parse_args()


#
# load data & create vocab
# -------------------------------
#  

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):

        token_set = set(tok.text.lower() for tok in instance["target_tokens"].tokens)
        for token_text in token_set:
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1

        total_documents += 1

with open(args.out_dir, "w", encoding="utf8") as out:
    for token, count in all_tokens.items():
        out.write(token + " " + f'{math.log(total_documents / count):1.20f}' + "\n")
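
# A tiny numeric sketch (not from the original script): the value written per token
# above is a plain inverse document frequency, log(N / df), where N is the number of
# documents read and df the number of documents containing the token.  The counts
# below are made up for illustration.
import math

total_documents = 1000
document_frequency = {"the": 990, "neural": 42}
idf = {token: math.log(total_documents / df) for token, df in document_frequency.items()}
assert idf["neural"] > idf["the"]  # rarer tokens receive a higher weight
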
Example No. 29
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_cpu_memory().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}")
        gpu_memory_usage = []
        for gpu, memory in common_util.peak_gpu_memory().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0
        train_reg_loss = None if regularization_penalty is None else 0.0
        batch_reg_loss = None if regularization_penalty is None else 0.0

        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps
            )
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's
        # progress is shown
        if self._primary:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if done_early:
                break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            # Zero gradients.
            # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
            # because it avoids a read op when the gradients are first updated below.
            for param_group in self.optimizer.param_groups:
                for p in param_group["params"]:
                    p.grad = None

            batch_loss = 0.0
            batch_group_outputs = []
            for batch in batch_group:
                if self._distributed:
                    # Check whether the other workers have stopped already (due to differing amounts of
                    # data in each). If so, we can't proceed because we would hang when we hit the
                    # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                    # here because NCCL process groups apparently don't support BoolTensor.
                    done = torch.tensor(0, device=self.cuda_device)
                    torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
                    if done.item() > 0:
                        done_early = True
                        logger.warning(
                            f"Worker {torch.distributed.get_rank()} finishing training early! "
                            "This implies that there is an imbalance in your training "
                            "data across the workers and that some amount of it will be "
                            "ignored. A small amount of this is fine, but a major imbalance "
                            "should be avoided. Note: This warning will appear unless your "
                            "data is perfectly balanced."
                        )
                        break

                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch, for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs["loss"]
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss += loss.item()
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss  # type: ignore

                if self._scaler is not None:
                    self._scaler.scale(loss).backward()
                else:
                    loss.backward()
            if len(batch_group_outputs) <= 0:
                continue

            train_loss += batch_loss

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._scaler is not None:
                self._scaler.step(self.optimizer)
                self._scaler.update()
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if self._primary:
                # Updating tqdm only for the primary as the trainers wouldn't have one
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

                if self._checkpointer is not None:
                    self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

            for callback in self._callbacks:
                callback.on_batch(
                    self,
                    batch_group,
                    batch_group_outputs,
                    metrics,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_primary=self._primary,
                    batch_grad_norm=batch_grad_norm,
                )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024)
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024)
        return metrics
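
# A minimal mixed-precision sketch (not from the original source) of the torch.cuda.amp
# pattern used above: autocast around the forward pass and a GradScaler that scales the
# loss before backward(), then unscales and steps the optimizer.  With no GPU available
# both pieces are simply disabled and the code runs in plain FP32.
import torch

use_amp = torch.cuda.is_available()
device = torch.device("cuda" if use_amp else "cpu")
model = torch.nn.Linear(10, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

batch = torch.randn(32, 10, device=device)
optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=use_amp):
    loss = model(batch).pow(2).mean()
scaler.scale(loss).backward()
scaler.step(optimizer)  # unscales the gradients, then calls optimizer.step()
scaler.update()
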
Example No. 30
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
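
# A minimal sketch (not from the original source) of the update-magnitude logging done
# above: copy the parameters to CPU before `optimizer.step()`, subtract the post-step
# values, and report the ratio of update norm to parameter norm (the quantity logged
# as "gradient_update/<name>").
import torch

model = torch.nn.Linear(3, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

loss = model(torch.randn(16, 3)).sum()
loss.backward()

param_snapshots = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
optimizer.step()
for name, p in model.named_parameters():
    update = param_snapshots[name] - p.detach().cpu()
    ratio = (torch.norm(update.view(-1)) / (torch.norm(p.detach().cpu().view(-1)) + 1e-7)).item()
    print(f"gradient_update/{name}: {ratio:.2e}")
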
Example No. 31
    def _multi_worker_islice(
        self,
        iterable: Iterable[Any],
        transform: Optional[Callable[[Any], Instance]] = None,
        ensure_lazy: bool = False,
    ) -> Iterable[Instance]:
        """
        Helper method that determines which raw instances to skip based on the current
        node rank (for distributed training) and worker ID (for multi-process data loading).

        # Parameters

        iterable : `Iterable[Any]`
            An iterable that yields raw data that can be transformed into `Instance`s
            through the `transform` function.
        transform : `Optional[Callable[[Any], Instance]]`, optional (default = `None`)
            An optional function that will be applied to the raw data generated
            by `iterable` to create `Instance`s. This is used, e.g., when reading
            cached data.
        ensure_lazy : `bool`, optional (default = `False`)
            If `True`, a `ConfigurationError` error will be raised if `iterable`
            is a list instead of a lazy generator type.

        # Returns

        `Iterable[Instance]`
        """
        if ensure_lazy and isinstance(iterable, (list, tuple)):
            raise ConfigurationError(
                "For a lazy dataset reader, _read() must return a generator")

        wrap_with_tqdm = True
        start_index = 0
        step_size = 1
        if not self.manual_distributed_sharding and util.is_distributed():
            start_index = dist.get_rank()
            step_size = dist.get_world_size()
        worker_info = None if self.manual_multi_process_sharding else get_worker_info()
        if worker_info:
            warnings.warn(
                "Using multi-process data loading without setting "
                "DatasetReader.manual_multi_process_sharding to True.\n"
                "Did you forget to set this?\n"
                "If you're not handling the multi-process sharding logic within your "
                "_read() method, there is probably no benefit to using more than one "
                "worker.",
                UserWarning,
            )
            # Scale `start_index` by `num_workers`, then shift by worker `id`.
            start_index *= worker_info.num_workers
            start_index += worker_info.id
            # Scale `step_size` by `num_workers`.
            step_size *= worker_info.num_workers
            if worker_info.id > 0:
                # We only want to log with tqdm from the main loader process.
                wrap_with_tqdm = False

        islice = itertools.islice(iterable, start_index, self.max_instances,
                                  step_size)
        if wrap_with_tqdm:
            islice = Tqdm.tqdm(islice, desc="reading instances")

        if transform is not None:
            return (transform(x) for x in islice)
        return islice
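
# A small sketch (not from the original source) of the strided sharding that
# `_multi_worker_islice` performs: each (distributed rank, dataloader worker) pair
# starts at a different offset and strides by the total number of consumers, so
# together they cover the dataset exactly once.  The 2 x 2 setup is hypothetical.
import itertools

data = list(range(12))
world_size, num_workers = 2, 2
step_size = world_size * num_workers

shards = []
for rank in range(world_size):
    for worker_id in range(num_workers):
        start_index = rank * num_workers + worker_id
        shards.append(list(itertools.islice(data, start_index, None, step_size)))

assert sorted(sum(shards, [])) == data  # no overlap, nothing skipped
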
Example No. 32
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
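
# A brief sketch (not from the original source) of how the returned matrix is typically
# consumed: wrap it in a torch.nn.Embedding so vocabulary indices look up their
# (partly pre-trained, partly randomly initialised) vectors.  The random matrix here
# is only a stand-in for the real `embedding_matrix`.
import torch

vocab_size, embedding_dim = 5, 3
embedding_matrix = torch.randn(vocab_size, embedding_dim)
embedder = torch.nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
token_ids = torch.tensor([0, 2, 4])
vectors = embedder(token_ids)  # shape: (3, embedding_dim)
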
Example No. 33
args = parser.parse_args()

#
# compare (different MRR gains: up, same, down)
# -------------------------------
#
max_out_rank = args.top_n
#max_cs_n = args.top_n

#res = load_candidate(args.res_in,space_for_rank=30000)
#candidate_set = parse_candidate_set(args.candidate_file)

with open(args.res_out, "w", encoding="utf8") as res_out:
    with open(args.res_in, "r", encoding="utf8") as res_in:

        for l in Tqdm.tqdm(res_in):
            l_split = l.strip().split()

            if len(l_split) == 4:  # own format
                rank = int(l_split[2])
            elif len(l_split) == 6:  # original trec format
                rank = int(l_split[3])
            else:
                continue  # skip lines that match neither format (avoids reusing a stale rank)

            if rank <= max_out_rank:
                res_out.write(l)

    #for query,data in Tqdm.tqdm(res.items()):
    #    out_count = 0
    #    for (pid,rank,score) in data:
    #        if out_count == max_out_rank:
    #            break