Example #1
def prepare_global_logging(params) -> None:
    """
    This function configures three global logging attributes: streaming stdout and stderr
    to a file as well as the terminal, setting the formatting for the Python logging
    library, and setting the update interval for the Tqdm progress bar.
    Note that this function does not set the logging level, which is set in ``allennlp/run.py``.
    Parameters
    ----------
    serialization_dir : ``str``, required.
        The directory to stream logs to.
    file_friendly_logging : ``bool``, required.
        Whether logs should clean the output to prevent carriage returns
        (used to update progress bars on a single terminal line).
    """
    serialization_dir = params['serialization_dir']
    file_friendly_logging = params['file_friendly_logging']
    Tqdm.set_slower_interval(file_friendly_logging)
    std_out_file = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(
        std_out_file,  # type: ignore
        sys.stdout,
        file_friendly_logging)
    sys.stderr = TeeLogger(
        os.path.join(serialization_dir, "stderr.log"),  # type: ignore
        sys.stderr,
        file_friendly_logging)

    logging.init_logger(log_file=std_out_file)
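
The redirection above leans on the library's TeeLogger, which mirrors a stream into a log file while still writing to the terminal. Below is a minimal, self-contained sketch of that tee idea using only the standard library; the SimpleTee class and the file name are illustrative, not the stog/AllenNLP implementation.

import sys

class SimpleTee:
    """Write everything both to a log file and to the original stream."""
    def __init__(self, filename, terminal):
        self.terminal = terminal
        self.log = open(filename, "a", encoding="utf-8")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

# Redirect stdout so prints land both on the terminal and in stdout.log.
sys.stdout = SimpleTee("stdout.log", sys.stdout)
print("this line goes to the terminal and to stdout.log")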
Example #2
    def extend_from_instances(self,
                              params: Params,
                              instances: Iterable['adi.Instance'] = ()) -> None:
        """
        Extends an already generated vocabulary using a collection of instances.
        """
        min_count = params.pop("min_count", None)
        max_vocab_size = pop_max_vocab_size(params)
        non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES)
        pretrained_files = params.pop("pretrained_files", {})
        min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None)
        only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False)
        tokens_to_add = params.pop("tokens_to_add", None)
        params.assert_empty("Vocabulary - from dataset")

        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)
        self._extend(counter=namespace_token_counts,
                     min_count=min_count,
                     max_vocab_size=max_vocab_size,
                     non_padded_namespaces=non_padded_namespaces,
                     pretrained_files=pretrained_files,
                     only_include_pretrained_words=only_include_pretrained_words,
                     tokens_to_add=tokens_to_add,
                     min_pretrained_embeddings=min_pretrained_embeddings)
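
The pop-with-default calls above follow the library's Params pattern: read every known key, then assert the config is empty so that unrecognised keys (usually typos) fail loudly. Below is a rough plain-dict analogue of that pattern; read_vocab_config is a hypothetical helper, and the default namespaces are only assumed to mirror DEFAULT_NON_PADDED_NAMESPACES.

def read_vocab_config(params: dict) -> dict:
    # Pop known keys with defaults, mirroring the Params.pop(...) calls above.
    config = {
        "min_count": params.pop("min_count", None),
        "max_vocab_size": params.pop("max_vocab_size", None),
        "non_padded_namespaces": params.pop("non_padded_namespaces", ("*tags", "*labels")),
    }
    # Anything left over is an unrecognised key -- fail loudly, like params.assert_empty().
    if params:
        raise ValueError("Unrecognised vocabulary parameters: {}".format(sorted(params)))
    return config

print(read_vocab_config({"min_count": {"tokens": 3}}))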
Example #3
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            instances = self._read(file_path)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))
            return instances
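
In the lazy branch, _LazyInstances presumably wraps a generator factory so that every call to __iter__ re-runs _read(); that is what makes the returned object repeatedly iterable without caching instances in memory. A minimal sketch of that pattern with made-up names:

from typing import Callable, Iterator

class LazyIterable:
    """Re-runs a generator factory on every iteration, so the object is repeatedly iterable."""
    def __init__(self, instance_generator: Callable[[], Iterator[int]]) -> None:
        self.instance_generator = instance_generator

    def __iter__(self) -> Iterator[int]:
        return self.instance_generator()

def _toy_read(file_path: str) -> Iterator[int]:
    # Stand-in for DatasetReader._read(): yield instances one at a time.
    for i in range(3):
        yield i

data = LazyIterable(lambda: iter(_toy_read("dummy.txt")))
print(list(data), list(data))  # both passes work: [0, 1, 2] [0, 1, 2]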
Example #4
    def from_instances(cls,
                       instances: Iterable['adi.Instance'],
                       min_count: Dict[str, int] = None,
                       max_vocab_size: Union[int, Dict[str, int]] = None,
                       non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                       pretrained_files: Optional[Dict[str, str]] = None,
                       only_include_pretrained_words: bool = False,
                       tokens_to_add: Dict[str, List[str]] = None,
                       min_pretrained_embeddings: Dict[str, int] = None) -> 'Vocabulary':
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters, to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        return cls(counter=namespace_token_counts,
                   min_count=min_count,
                   max_vocab_size=max_vocab_size,
                   non_padded_namespaces=non_padded_namespaces,
                   pretrained_files=pretrained_files,
                   only_include_pretrained_words=only_include_pretrained_words,
                   tokens_to_add=tokens_to_add,
                   min_pretrained_embeddings=min_pretrained_embeddings)
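
count_vocab_items is expected to accumulate per-namespace token counts into the nested defaultdict built above. A self-contained sketch of that counting structure, with an invented "tokens" namespace and toy sentences standing in for real Instances:

from collections import defaultdict
from typing import Dict

namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))

# Stand-in for instance.count_vocab_items(...): bump per-namespace token counts.
for sentence in [["the", "cat", "sat"], ["the", "dog"]]:
    for token in sentence:
        namespace_token_counts["tokens"][token] += 1

print(dict(namespace_token_counts["tokens"]))  # {'the': 2, 'cat': 1, 'sat': 1, 'dog': 1}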
Example #5
def http_get(url: str, temp_file: IO) -> None:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = Tqdm.tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
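
A hedged usage sketch of the same streaming-download pattern, written against requests and the standard tqdm package directly (assuming the Tqdm wrapper above is a thin shim over it); the download helper and the URL are hypothetical.

import requests
from tqdm import tqdm

def download(url: str, out_path: str) -> None:
    req = requests.get(url, stream=True)
    total = int(req.headers.get("Content-Length", 0)) or None
    with open(out_path, "wb") as out, tqdm(unit="B", total=total) as progress:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks, as above
                progress.update(len(chunk))
                out.write(chunk)

# download("https://example.com/model.tar.gz", "model.tar.gz")  # hypothetical URL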
Example #6
    def _validate_dev(self, epoch):
        """
        Computes the dev loss. Returns it and the number of batches.
        """
        logger.info("Validating on dev")

        self._model.eval()

        # TODO: edge loss is wrong when _dev_iterator is used.
        if False:  # self._dev_iterator is not None:
            dev_iterator = self._dev_iterator
        else:
            dev_iterator = self._iterator

        dev_generator = dev_iterator(instances=self._dev_dataset,
                                     shuffle=False,
                                     num_epochs=1)

        num_dev_batches = dev_iterator.get_num_batches(self._dev_dataset)
        dev_generator_tqdm = Tqdm.tqdm(dev_generator, total=num_dev_batches)
        batches_this_epoch = 0
        dev_loss = 0
        for batch in dev_generator_tqdm:

            loss = self._batch_loss(batch, for_training=True)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                dev_loss += loss.item()

            # Update the description with the latest metrics
            if self._n_gpus > 1:
                dev_metrics = self._model.module.get_metrics()
            else:
                dev_metrics = self._model.get_metrics()
            description = self._description_from_metrics(dev_metrics)
            dev_generator_tqdm.set_description(description, refresh=False)

        if self._n_gpus > 1:
            return self._model.module.get_metrics(reset=True,
                                                  mimick_test=epoch > 50,
                                                  validation=True)
        else:
            return self._model.get_metrics(reset=True,
                                           mimick_test=epoch > 50,
                                           validation=True)
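
Per the comment inside the loop, the divisor for the reported dev loss should only count batches that actually produced a loss. A tiny stand-alone sketch of that averaging rule (average_loss is a hypothetical helper, not part of the trainer):

from typing import Iterable, Optional

def average_loss(batch_losses: Iterable[Optional[float]]) -> float:
    """Average only over batches that actually produced a loss."""
    total, counted = 0.0, 0
    for loss in batch_losses:
        if loss is not None:
            total += loss
            counted += 1
    return total / counted if counted else 0.0

print(average_loss([0.5, None, 1.5]))  # 1.0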
Example #7
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from stog.modules.token_embedders import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
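
A simplified, standard-library-only version of the same parsing step, which keeps the leading whitespace-delimited field of each line; unlike EmbeddingsTextFile it assumes a plain, uncompressed UTF-8 text file.

from typing import List

def read_tokens_from_text_embeddings(path: str) -> List[str]:
    """Collect the first space-separated field (the token) from every line."""
    tokens: List[str] = []
    with open(path, encoding="utf-8") as embeddings_file:
        for line_number, line in enumerate(embeddings_file, start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                tokens.append(line[:token_end])
            else:
                print("Skipping line number {}: no space-separated vector found".format(line_number))
    return tokens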
Example #8
def evaluate(model, instances, iterator, device):
    with torch.no_grad():
        model.eval()
        model.decode_type = 'mst'

        test_generator = iterator(instances=instances,
                                  shuffle=False,
                                  num_epochs=1)

        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(test_generator,
                                   total=iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = move_to_device(batch, device)
            model(batch, for_training=True)
            metrics = model.get_metrics()
            description = ', '.join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
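
The progress-bar description is just the metrics dict flattened into "name: value" pairs. A stand-alone sketch of that formatting; description_from_metrics is a hypothetical re-implementation, not the trainer's own helper.

def description_from_metrics(metrics: dict) -> str:
    """Build the 'name: value' progress-bar description used above."""
    return ', '.join("%s: %.2f" % (name, value) for name, value in metrics.items()) + " ||"

print(description_from_metrics({"UAS": 0.8312, "LAS": 0.7954}))  # UAS: 0.83, LAS: 0.80 ||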
Example #9
    def _train_epoch(self, epoch):
        logger.info('Epoch {}/{}'.format(epoch, self._num_epochs - 1))
        logger.info(f'Peak CPU memory usage MB: {peak_memory_mb()}')
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        training_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        # TODO: How to deal with the CUDA device. Typically I set CUDA_VISIBLE_DEVICES before executing the script, so it's always 0.
        train_generator = self._iterator(instances=self._training_dataset,
                                         shuffle=self._shuffle,
                                         num_epochs=1)

        num_training_batches = self._iterator.get_num_batches(
            self._training_dataset)

        logger.info('Training...')
        last_save_time = time.time()
        batches_this_epoch = 0
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)

        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._num_trained_batches += 1

            self._optimizer.zero_grad()
            loss = self._batch_loss(batch, for_training=True)
            loss.backward()
            training_loss += loss.item()
            self._optimizer.step()

            # Update the description with the latest metrics
            if self._n_gpus > 1:
                metrics = self._model.module.get_metrics()
            else:
                metrics = self._model.get_metrics()
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._num_trained_batches % self._summary_interval == 0:
                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics[self._dev_metric],
                                                   self._num_trained_batches)
                self._metrics_to_tensorboard(
                    self._num_trained_batches,
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval):
                last_save_time = time.time()
                self._save_checkpoint('{0}.{1}'.format(
                    epoch, time_to_str(int(last_save_time))), [],
                                      is_best=False)
        logger.info('Finished one epoch.')
        if self._n_gpus > 1:
            return self._model.module.get_metrics(reset=True)
        else:
            return self._model.get_metrics(reset=True)
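
The core of the loop is the standard PyTorch update cycle: zero_grad, forward, backward, step, then accumulate loss.item(). A self-contained sketch with a toy linear model and random batches standing in for the iterator:

import torch

# A tiny stand-in model and optimizer, just to exercise the same update cycle.
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.MSELoss()

model.train()
training_loss = 0.0
num_batches = 5
for _ in range(num_batches):  # stand-in for the batch generator
    inputs = torch.randn(8, 4)
    targets = torch.randn(8, 1)

    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    training_loss += loss.item()
    optimizer.step()

print("mean training loss:", training_loss / num_batches)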
Example #10
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens",
                                    amr: bool = False) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a possibly compressed text file, optionally contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set()
    for token in vocab.get_token_to_index_vocabulary(namespace):
        # TODO: Is there a better way to do this? Currently we have a very specific 'amr' param.
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)

    vocab_size = vocab.get_vocab_size(namespace)

    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(
                        embeddings[normalized_token])
                    num_tokens_found += 1
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
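
The assembly step at the end can be sketched independently of the vocabulary machinery: draw random rows matching the pretrained mean and std, then overwrite the rows of tokens that have a pretrained vector. build_embedding_matrix below is a hypothetical condensation of that logic, exercised with a toy two-vector "file".

import numpy
import torch

def build_embedding_matrix(index_to_token: dict, embeddings: dict, embedding_dim: int) -> torch.FloatTensor:
    """Random rows drawn from the pretrained mean/std, overwritten where a pretrained vector exists."""
    all_vectors = numpy.asarray(list(embeddings.values()))
    mean, std = float(all_vectors.mean()), float(all_vectors.std())
    matrix = torch.FloatTensor(len(index_to_token), embedding_dim).normal_(mean, std)
    for i, token in index_to_token.items():
        if token in embeddings:
            matrix[i] = torch.FloatTensor(embeddings[token])
    return matrix

vectors = {"cat": numpy.ones(3, dtype="float32"), "dog": numpy.zeros(3, dtype="float32")}
print(build_embedding_matrix({0: "cat", 1: "dog", 2: "@@UNKNOWN@@"}, vectors, 3))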