Example #1
def prepare_global_logging(serialization_dir: str, file_friendly_logging: bool) -> None:
    """
    This function configures 3 global logging attributes: streaming stdout and stderr
    to a file as well as the terminal, setting the format for the Python logging
    library, and setting the update interval for the Tqdm progress bar.

    Note that this function does not set the logging level, which is set in ``allennlp/run.py``.

    Parameters
    ----------
    serialization_dir : ``str``, required.
        The directory to stream logs to.
    file_friendly_logging : ``bool``, required.
        Whether logs should clean the output to prevent carriage returns
        (used to update progress bars on a single terminal line).
    """
    Tqdm.set_slower_interval(file_friendly_logging)
    std_out_file = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(std_out_file, # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), # type: ignore
                           sys.stderr,
                           file_friendly_logging)

    stdout_handler = logging.FileHandler(std_out_file)
    stdout_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(stdout_handler)
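A minimal usage sketch (not part of the original snippet), assuming prepare_global_logging as defined above is in scope; the directory path is only a placeholder.

import logging
import os

serialization_dir = "/tmp/example_run"
os.makedirs(serialization_dir, exist_ok=True)

prepare_global_logging(serialization_dir, file_friendly_logging=True)

# From here on, records reach the terminal and are mirrored to
# stdout.log / stderr.log inside the serialization directory.
logging.getLogger(__name__).info("training started")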
Example #2
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._model.eval()

        val_generator = self._iterator(self._validation_data,
                                       num_epochs=1,
                                       cuda_device=self._iterator_device,
                                       for_training=False)
        num_validation_batches = self._iterator.get_num_batches(self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:
            batches_this_epoch += 1

            loss = self._batch_loss(batch, for_training=False)
            val_loss += loss.data.cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = self._get_metrics(val_loss, batches_this_epoch)
            description = self._description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
Example #3
    def from_instances(cls,
                       instances: Iterable['adi.Instance'],
                       min_count: Dict[str, int] = None,
                       max_vocab_size: Union[int, Dict[str, int]] = None,
                       non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                       pretrained_files: Optional[Dict[str, str]] = None,
                       only_include_pretrained_words: bool = False,
                       tokens_to_add: Dict[str, List[str]] = None,
                       min_pretrained_embeddings: Dict[str, int] = None) -> 'Vocabulary':
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        return cls(counter=namespace_token_counts,
                   min_count=min_count,
                   max_vocab_size=max_vocab_size,
                   non_padded_namespaces=non_padded_namespaces,
                   pretrained_files=pretrained_files,
                   only_include_pretrained_words=only_include_pretrained_words,
                   tokens_to_add=tokens_to_add,
                   min_pretrained_embeddings=min_pretrained_embeddings)
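A minimal usage sketch, assuming `instances` is an iterable of AllenNLP Instance objects produced by a dataset reader; the per-namespace dictionaries mirror the parameters documented above.

from allennlp.data import Vocabulary

vocab = Vocabulary.from_instances(
    instances,
    min_count={"tokens": 3},           # drop tokens seen fewer than 3 times
    max_vocab_size={"tokens": 50000},  # cap the "tokens" namespace
)
print(vocab.get_vocab_size("tokens"))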
Example #4
    def extend_from_instances(self,
                              params: Params,
                              instances: Iterable['adi.Instance'] = ()) -> None:
        """
        Extends an already generated vocabulary using a collection of instances.
        """
        min_count = params.pop("min_count", None)
        max_vocab_size = pop_max_vocab_size(params)
        non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES)
        pretrained_files = params.pop("pretrained_files", {})
        min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None)
        only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False)
        tokens_to_add = params.pop("tokens_to_add", None)
        params.assert_empty("Vocabulary - from dataset")

        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)
        self._extend(counter=namespace_token_counts,
                     min_count=min_count,
                     max_vocab_size=max_vocab_size,
                     non_padded_namespaces=non_padded_namespaces,
                     pretrained_files=pretrained_files,
                     only_include_pretrained_words=only_include_pretrained_words,
                     tokens_to_add=tokens_to_add,
                     min_pretrained_embeddings=min_pretrained_embeddings)
Example #5
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = util.move_to_device(batch, cuda_device)
            model(**batch)
            metrics = model.get_metrics()
            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        return model.get_metrics(reset=True)
Example #6
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url, allow_redirects=True)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            req = requests.get(url, stream=True)
            content_length = req.headers.get('Content-Length')
            total = int(content_length) if content_length is not None else None
            progress = Tqdm.tqdm(unit="B", total=total)
            for chunk in req.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
            progress.close()

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w') as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path
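A minimal usage sketch, assuming get_from_cache and its helpers (url_to_filename, DATASET_CACHE) are defined as above; the URL and cache directory are placeholders.

import json

path = get_from_cache("https://example.com/data/train.txt", cache_dir="/tmp/cache")
print(path)  # /tmp/cache/<url-and-etag-derived filename>

# Every cached file gets a ".json" sidecar recording where it came from.
with open(path + ".json") as meta_file:
    print(json.load(meta_file))  # {"url": ..., "etag": ...}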
Example #7
def http_get(url: str, temp_file: IO) -> None:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = Tqdm.tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk: # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
Example #8
    def embed_file(self,
                   input_file: IO,
                   output_file_path: str,
                   output_format: str = "all",
                   batch_size: int = DEFAULT_BATCH_SIZE) -> None:
        """
        Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
        The ELMo embeddings are written out in HDF5 format, where each sentence is saved in a dataset.

        Parameters
        ----------
        input_file : ``IO``, required
            A file with one tokenized sentence per line.
        output_file_path : ``str``, required
            A path to the output hdf5 file.
        output_format : ``str``, optional, (default = "all")
            The embeddings to output.  Must be one of "all", "top", or "average".
        batch_size : ``int``, optional, (default = 64)
            The number of sentences to process in ELMo at one time.
        """

        assert output_format in ["all", "top", "average"]

        # Tokenizes the sentences.
        sentences = [line.strip() for line in input_file if line.strip()]
        split_sentences = [sentence.split() for sentence in sentences]
        # Uses the sentence as the key.
        embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))

        logger.info("Processing sentences.")
        with h5py.File(output_file_path, 'w') as fout:
            for key, embeddings in Tqdm.tqdm(embedded_sentences):
                if key in fout.keys():
                    logger.warning(f"Key already exists in {output_file_path}, skipping: {key}")
                else:
                    if output_format == "all":
                        output = embeddings
                    elif output_format == "top":
                        output = embeddings[2]
                    elif output_format == "average":
                        output = numpy.average(embeddings, axis=0)

                    fout.create_dataset(
                            key,
                            output.shape, dtype='float32',
                            data=output
                    )
        input_file.close()
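A short sketch of reading the HDF5 file written above, assuming "elmo_out.h5" was produced by embed_file with output_format="all"; each key is an input sentence and each dataset holds its embeddings.

import h5py

with h5py.File("elmo_out.h5", "r") as fin:
    for sentence in fin.keys():
        embeddings = fin[sentence][()]  # numpy array, e.g. (num_layers, num_tokens, dim) for "all"
        print(sentence, embeddings.shape)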
Example #9
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
Example #10
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
Example #11
def get_from_cache(url: str, cache_dir: str = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = DATASET_CACHE

    os.makedirs(cache_dir, exist_ok=True)

    # make HEAD request to check ETag
    response = requests.head(url)
    if response.status_code != 200:
        raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    etag = response.headers.get("ETag")
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        _, temp_filename = tempfile.mkstemp()
        logger.info("%s not found in cache, downloading to %s", url, temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, cache_path)
        logger.info("removing temp file %s", temp_filename)
        os.remove(temp_filename)

    return cache_path
Example #12
def main(serialization_directory, device):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    """

    config = Params.from_file(os.path.join(serialization_directory, "config.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(model.vocab)

    model_predictions = []
    batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False)
    for batch in Tqdm.tqdm(batches):
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    for instance, prediction in zip(instances, model_predictions):
        fields = instance.fields
        try:
            # Most sentences have a verbal predicate, but not all.
            verb_index = fields["verb_indicator"].labels.index(1)
        except ValueError:
            verb_index = None

        gold_tags = fields["tags"].labels
        sentence = fields["tokens"].tokens

        write_to_conll_eval_file(prediction_file, gold_file,
                                 verb_index, sentence, prediction, gold_tags)
    prediction_file.close()
    gold_file.close()
Example #13
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index : (index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({"input_tokens": input_field, "output_tokens": output_field})
Example #14
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    model.eval()

    iterator = data_iterator(instances,
                             num_epochs=1,
                             cuda_device=cuda_device,
                             for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))
    for batch in generator_tqdm:
        #        print(batch)
        model(**batch)
        metrics = model.get_metrics()
        description = ', '.join(
            ["%s: %.2f" % (name, value)
             for name, value in metrics.items()]) + " ||"
        generator_tqdm.set_description(description, refresh=False)

    return model.get_metrics(reset=True)
Example #15
    def from_instances(
        cls,
        instances: Iterable["adi.Instance"],
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        tokens_to_add: Dict[str, List[str]] = None,
        min_pretrained_embeddings: Dict[str, int] = None,
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
    ) -> "Vocabulary":
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        padding_token = padding_token if padding_token is not None else DEFAULT_PADDING_TOKEN
        oov_token = oov_token if oov_token is not None else DEFAULT_OOV_TOKEN
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
            lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        return cls(
            counter=namespace_token_counts,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add,
            min_pretrained_embeddings=min_pretrained_embeddings,
            padding_token=padding_token,
            oov_token=oov_token,
        )
Example #16
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._model.eval()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self._iterator

        val_generator = val_iterator(self._validation_data,
                                     num_epochs=1,
                                     cuda_device=self._iterator_device)
        num_validation_batches = val_iterator.get_num_batches(self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:

            loss = self._batch_loss(batch, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = self._get_metrics(val_loss, batches_this_epoch)
            description = self._description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        return val_loss, batches_this_epoch
Example #18
    def _gather_train_instances_and_compute_gradients(self) -> None:
        logger.info(
            "Gathering training instances and computing gradients. "
            "The result will be cached so this only needs to be done once.")
        self._train_instances = []
        self.model.train()
        for instance in Tqdm.tqdm(self._train_loader.iter_instances(),
                                  desc="calculating training gradients"):
            batch = Batch([instance])
            batch.index_instances(self.vocab)
            tensor_dict = move_to_device(batch.as_tensor_dict(), self.device)

            self.model.zero_grad()

            # Compute loss with respect to the test instance.
            output_dict = self.model(**tensor_dict)
            loss = output_dict["loss"]

            if self._used_params is None or self._used_param_names is None:
                self._used_params = []
                self._used_param_names = []
                # We only know which parameters in the model require gradients after
                # the first .backward() call, so we store those used parameters.
                loss.backward(retain_graph=True)
                for name, param in self.model.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        self._used_params.append(param)
                        self._used_param_names.append(name)

            # Get gradients.
            grads = autograd.grad(loss, self._used_params)
            # Sanity check.
            assert len(grads) == len(self._used_params)

            self._train_instances.append(
                InstanceWithGrads(instance=instance,
                                  loss=loss.detach().item(),
                                  grads=grads))
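A self-contained sketch (a toy model, not the AllenNLP API) of the autograd.grad pattern above: gradients of a per-instance loss are taken with respect to a fixed list of "used" parameters.

import torch
from torch import autograd

model = torch.nn.Linear(3, 1)
used_params = [p for p in model.parameters() if p.requires_grad]

instances = [torch.randn(3) for _ in range(4)]
per_instance_grads = []
for x in instances:
    loss = (model(x).sum()) ** 2              # stand-in for output_dict["loss"]
    grads = autograd.grad(loss, used_params)  # one gradient tensor per parameter
    assert len(grads) == len(used_params)
    per_instance_grads.append([g.detach() for g in grads])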
Example #19
    def from_files_and_instances(
        cls,
        instances: Iterable["adi.Instance"],
        directory: str,
        padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
        oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
        min_count: Dict[str, int] = None,
        max_vocab_size: Union[int, Dict[str, int]] = None,
        non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
        pretrained_files: Optional[Dict[str, str]] = None,
        only_include_pretrained_words: bool = False,
        tokens_to_add: Dict[str, List[str]] = None,
        min_pretrained_embeddings: Dict[str, int] = None,
    ) -> "Vocabulary":
        """
        Extends an already generated vocabulary using a collection of instances.

        The `instances` parameter does not get an entry in a typical AllenNLP configuration file,
        but the other parameters do (if you want non-default parameters).  See `__init__` for a
        description of what the other parameters mean.
        """
        vocab = cls.from_files(directory, padding_token, oov_token)
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(
            lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)
        vocab._extend(
            counter=namespace_token_counts,
            min_count=min_count,
            max_vocab_size=max_vocab_size,
            non_padded_namespaces=non_padded_namespaces,
            pretrained_files=pretrained_files,
            only_include_pretrained_words=only_include_pretrained_words,
            tokens_to_add=tokens_to_add,
            min_pretrained_embeddings=min_pretrained_embeddings,
        )
        return vocab
Example #20
    def _read(self, file_path: str):
        self._debug_prints = 5
        cached_file_path = cached_path(file_path)

        if file_path.endswith('.gz'):
            data_file = gzip.open(cached_file_path, 'rb')
        else:
            data_file = open(cached_file_path, 'r')

        logger.info("Reading QA instances from jsonl dataset at: %s",
                    file_path)
        item_jsons = []
        for line in data_file:
            item_jsons.append(json.loads(line.strip()))

        if self._sample != -1:
            item_jsons = random.sample(item_jsons, self._sample)
            logger.info("Sampling %d examples", self._sample)

        for item_json in Tqdm.tqdm(item_jsons, total=len(item_jsons)):
            self._debug_prints -= 1
            if self._debug_prints >= 0:
                logger.info(f"====================================")
                logger.info(f"Input json: {item_json}")
            item_id = item_json["id"]

            statement_text = item_json["phrase"]
            metadata = {} if "metadata" not in item_json else item_json[
                "metadata"]
            context = item_json["context"] if "context" in item_json else None

            yield self.text_to_instance(item_id=item_id,
                                        question=statement_text,
                                        answer_id=item_json["answer"],
                                        context=context,
                                        org_metadata=metadata)

        data_file.close()
Example #21
    def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in Tqdm.tqdm(itertools.groupby(data_file, _is_divider)):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in zip(*fields)]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]
                    sequence = TextField(tokens, self._token_indexers)

                    instance_fields: Dict[str, Field] = {'tokens': sequence}

                    # Add "feature labels" to instance
                    if 'pos' in self.feature_labels:
                        instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
                    if 'chunk' in self.feature_labels:
                        instance_fields['chunk_tags'] = SequenceLabelField(chunk_tags, sequence, "chunk_tags")
                    if 'ner' in self.feature_labels:
                        instance_fields['ner_tags'] = SequenceLabelField(ner_tags, sequence, "ner_tags")

                    # Add "tag label" to instance
                    if self.tag_label == 'ner':
                        instance_fields['tags'] = SequenceLabelField(ner_tags, sequence)
                    elif self.tag_label == 'pos':
                        instance_fields['tags'] = SequenceLabelField(pos_tags, sequence)
                    elif self.tag_label == 'chunk':
                        instance_fields['tags'] = SequenceLabelField(chunk_tags, sequence)

                    yield Instance(instance_fields)
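A minimal, self-contained sketch of the groupby pattern used above, assuming CoNLL-2003-style input where each line holds "token POS chunk NER" and blank lines (or -DOCSTART- lines) divide sentences.

import itertools

def is_divider(line: str) -> bool:
    return line.strip() == "" or line.startswith("-DOCSTART-")

lines = [
    "EU NNP B-NP B-ORG",
    "rejects VBZ B-VP O",
    "",
    "Peter NNP B-NP B-PER",
    "Blackburn NNP I-NP I-PER",
]

for divider, chunk in itertools.groupby(lines, is_divider):
    if not divider:
        fields = [line.split() for line in chunk]
        tokens, pos_tags, chunk_tags, ner_tags = [list(col) for col in zip(*fields)]
        print(tokens, ner_tags)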
Example #22
    def evaluate(self, model: Model):
        model.eval()

        generator_tqdm = Tqdm.tqdm(self.dataloader, total=len(self.dataloader))
        model_outputs = {}

        for batch in generator_tqdm:
            with torch.no_grad():
                batch = util.move_to_device(batch, self.cuda_device)
                output_dict = model.predict(**batch)
                for key in output_dict:
                    if key not in model_outputs:
                        model_outputs[key] = output_dict[key]
                    else:
                        model_outputs[key] += output_dict[key]

        evaluation_results = self.corpus.evaluate(model_outputs, self.dataset)
        
        print(evaluation_results['logging'])
        
        model.train()

        return evaluation_results['score']
Example #23
    def embed_file(self,
                   input_path: str,
                   output_file_prefix: str,
                   is_propbank: bool = False,
                   batch_size: int = DEFAULT_BATCH_SIZE) -> None:
        def get_sentences():
            if not is_propbank:
                return self.get_qasrl_sentences(input_path)
            else:
                return self.get_propbank_sentences(input_path)

        with open(output_file_prefix + "_ids.jsonl", "w") as f_ids:
            with open(output_file_prefix + "_emb.bin", "wb") as f_emb:
                for sentence_batch in lazy_groups_of(
                        Tqdm.tqdm(get_sentences()),
                        batch_size):
                    batch_sentences, batch_metas = map(list,
                                                       zip(*sentence_batch))
                    for verb_id, emb in self.embed_batch(
                            batch_sentences, batch_metas):
                        f_ids.write(json.dumps(verb_id) + "\n")
                        bs = emb.numpy().tobytes()
                        f_emb.write(bs)
Example #24
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = " ".join([x.replace("\n", " ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info("Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({'input_tokens': input_field,
                            'output_tokens': output_field})
Example #25
    def _read(self, file_path):
        for filename in os.listdir(file_path):
            with open(os.path.join(file_path, filename), "r") as data_file:
                logger.info("Reading instances from lines in file at: %s",
                            filename)
                filename_splitted = filename.split('_')
                task_name = filename_splitted[-3]
                domain_name = filename_splitted[-2]
                for line_num, line in enumerate(Tqdm.tqdm(data_file)):
                    line = line.strip("\n")

                    if not line:
                        continue

                    line_parts = line.split('\t')
                    if len(line_parts) != 2:
                        raise ConfigurationError(
                            "Invalid line format: %s (line number %d)" %
                            (line, line_num + 1))
                    source_sequence, target_sequence = line_parts
                    yield self.text_to_instance(task_name, domain_name,
                                                source_sequence,
                                                target_sequence)
Example #26
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in Tqdm.tqdm(
                ontonotes_reader.dataset_document_iterator(file_path)):
            clusters: DefaultDict[int, List[Tuple[
                int, int]]] = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append(
                        (start + total_tokens, end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences],
                                        canonical_clusters)
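A tiny sketch of the offset adjustment described in the comment above: per-sentence coref spans are shifted by the number of tokens already seen, so they index into the whole document.

sentences = [["John", "went", "home", "."], ["He", "slept", "."]]
per_sentence_spans = [[(0, 0)], [(0, 0)]]  # "John" and "He", inclusive spans

document_spans = []
total_tokens = 0
for words, spans in zip(sentences, per_sentence_spans):
    for start, end in spans:
        document_spans.append((start + total_tokens, end + total_tokens))
    total_tokens += len(words)

print(document_spans)  # [(0, 0), (4, 4)]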
Example #27
    def from_instances(cls,
                       instances: Iterable['adi.Instance'],
                       min_count: Dict[str, int] = None,
                       max_vocab_size: Union[int, Dict[str, int]] = None,
                       non_padded_namespaces: Sequence[str] = DEFAULT_NON_PADDED_NAMESPACES,
                       pretrained_files: Optional[Dict[str, str]] = None,
                       only_include_pretrained_words: bool = False) -> 'Vocabulary':
        """
        Constructs a vocabulary given a collection of `Instances` and some parameters.
        We count all of the vocabulary items in the instances, then pass those counts
        and the other parameters to :func:`__init__`.  See that method for a description
        of what the other parameters do.
        """
        logger.info("Fitting token dictionary from dataset.")
        namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
        for instance in Tqdm.tqdm(instances):
            instance.count_vocab_items(namespace_token_counts)

        return Vocabulary(counter=namespace_token_counts,
                          min_count=min_count,
                          max_vocab_size=max_vocab_size,
                          non_padded_namespaces=non_padded_namespaces,
                          pretrained_files=pretrained_files,
                          only_include_pretrained_words=only_include_pretrained_words)
Example #28
    def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        # file_path = cached_path(file_path)
        for filename in os.listdir(file_path):
            filename_splitted = filename.split('_')
            task_name = filename_splitted[-3]
            domain_name = filename_splitted[-2]
            if task_name not in self._tasks or domain_name not in self._domains:
                continue
            with open(os.path.join(file_path, filename), "r") as data_file:
                logger.info("Reading instances from lines in file at: %s",
                            filename)
                for line in Tqdm.tqdm(data_file):
                    line = line.strip("\n")
                    # skip blank lines
                    if not line:
                        continue
                    tokens_and_tags = [
                        pair.rsplit(self._word_tag_delimiter, 1)
                        for pair in line.split(self._token_delimiter)
                    ]
                    tokens_and_tags = ([['<<' + task_name + '>>', 'O'],
                                        ['<<' + domain_name + '>>', 'O']] +
                                       tokens_and_tags)
                    tokens = [Token(token) for token, tag in tokens_and_tags]
                    tags = [tag for token, tag in tokens_and_tags]
                    task_field = LabelField(task_name,
                                            label_namespace="task_labels")
                    sequence = TextField(tokens, self._token_indexers)
                    sequence_tags = SequenceLabelField(
                        tags, sequence, label_namespace='labels')
                    yield Instance({
                        'task_token': task_field,
                        'tokens': sequence,
                        'tags': sequence_tags
                    })
Example #29
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        train_loss = 0.0
        self.model.train()

        num_gpus = len(self._cuda_devices)

        if getattr(self, "train_dataset", None) is None:
            self.train_dataset = DMDataSet(data=self.train_data[0],
                                           batch_size=self.batch_size,
                                           num_gpus=num_gpus,
                                           shuffle=True,
                                           distributed=True,
                                           data_slice=True)
        self.train_dataset.set_epoch(epoch)
        num_training_batches = math.ceil(
            len(self.train_dataset) / self.batch_size / num_gpus)
        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(self.train_dataset,
                                         total=num_training_batches)

        for batch_group in train_generator_tqdm:
            # print('batch_size: ', len(batch_group["source_tokens"]["tokens"]))
            # gpu_data = batch_group
            # src_data = gpu_data["source_tokens"]["tokens"]
            # tgt_data = gpu_data["target_tokens"]["tokens"]
            # for sdata, tdata in zip(src_data, tgt_data):
            #     s = ''.join([self.get_model().vocab.get_token_from_index(x, "source_tokens") if x != 0 else '' for x in
            #                  sdata.numpy()])
            #     t = ''.join([self.get_model().vocab.get_token_from_index(x, "target_tokens") if x != 0 else '' for x in
            #                  tdata.numpy()])
            #     print(s)
            #     print(t)
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")
            loss.backward()

            train_loss += loss.item()

            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            self.optimizer.step()
            metrics = training_util.get_metrics(self.get_model(), train_loss,
                                                batches_this_epoch)
            description = self.get_desc_from_metrics(metrics, epoch)
            train_generator_tqdm.set_description(description, refresh=False)

        metrics = training_util.get_metrics(self.get_model(),
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        return metrics
Example #30
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            # Make sure Variable is on the cpu before converting to numpy.
            # .cpu() is a no-op if you aren't using GPUs.
            train_loss += loss.data.cpu().numpy()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().data.cpu().clone()
                                 for name, param in self._model.named_parameters()}
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().data.cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, ))
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
Example #31
    if single_file_path_cached.endswith('.gz'):
        f = gzip.open(single_file_path_cached, 'rb')
    else:
        f = open(single_file_path_cached)
    for example in f:
        context = json.loads(example)
        if 'header' in context:
            continue
        contexts.append(context)
    f.close()

    s = socket.socket()
    for i in range(600):  # Try for 10 minutes
        try:
            s.connect(('127.0.0.1', args.port))
        except socket.error as e:
            if e.errno != errno.ECONNREFUSED:
                # Something other than Connection refused means server is running
                break
        time.sleep(1)
    else:
        raise Exception('Could not connect to server')
    s.close()

    for context in Tqdm.tqdm(contexts, total=len(contexts)):
        pred = requests.post('http://127.0.0.1:%d' % args.port, json=context)
        all_predictions.update(pred.json())

    with open(args.output_file, 'w') as f:
        json.dump(all_predictions, f)
Example #32
def get_inverse_hvp_lissa(
    vs: Sequence[torch.Tensor],
    model: Model,
    used_params: Sequence[torch.Tensor],
    lissa_data_loader: DataLoader,
    damping: float,
    num_samples: int,
    scale: float,
) -> torch.Tensor:
    """
    This function approximates the product of the inverse of the Hessian and
    the vectors `vs` using LiSSA.

    Adapted from [github.com/kohpangwei/influence-release]
    (https://github.com/kohpangwei/influence-release/blob/0f656964867da6ddcca16c14b3e4f0eef38a7472/influence/genericNeuralNet.py#L475),
    the repo for [Koh, P.W., & Liang, P. (2017)](https://api.semanticscholar.org/CorpusID:13193974),
    and [github.com/xhan77/influence-function-analysis]
    (https://github.com/xhan77/influence-function-analysis/blob/78d5a967aba885f690d34e88d68da8678aee41f1/bert_util.py#L336),
    the repo for [Han, Xiaochuang et al. (2020)](https://api.semanticscholar.org/CorpusID:218628619).
    """
    inverse_hvps = [torch.tensor(0) for _ in vs]
    for _ in Tqdm.tqdm(range(num_samples),
                       desc="LiSSA samples",
                       total=num_samples):
        # See an explanation in the "Stochastic estimation" paragraph of [https://arxiv.org/pdf/1703.04730.pdf]
        # initialize \tilde{H}^{−1}_0 v = v
        cur_estimates = vs
        recursion_iter = Tqdm.tqdm(lissa_data_loader,
                                   desc="LiSSA depth",
                                   total=len(lissa_data_loader))
        for j, training_batch in enumerate(recursion_iter):
            # TODO (epwalsh): should we make sure `model` is in "train" or "eval" mode here?
            model.zero_grad()
            train_output_dict = model(**training_batch)
            # Hessian of loss @ \tilde{H}^{−1}_{j - 1} v
            hvps = get_hvp(train_output_dict["loss"], used_params,
                           cur_estimates)

            # This is the recursive step:
            # cur_estimate = \tilde{H}^{−1}_{j - 1} v
            # (i.e. Hessian-Vector Product estimate from last iteration)
            # Updating for \tilde{H}^{−1}_j v, the new current estimate becomes:
            # v + (I - (Hessian_at_x + damping)) * cur_estimate
            # = v + (1 - damping) * cur_estimate - Hessian_at_x * cur_estimate
            # We divide `hvp / scale` here (or, equivalently `Hessian_at_x / scale`)
            # so that we're effectively dividing the loss by `scale`.
            cur_estimates = [
                v + (1 - damping) * cur_estimate - hvp / scale
                for v, cur_estimate, hvp in zip(vs, cur_estimates, hvps)
            ]

            # Update the Tqdm progress bar with the current norm so the user can
            # see it converge.
            if (j % 50 == 0) or (j == len(lissa_data_loader) - 1):
                norm = np.linalg.norm(
                    _flatten_tensors(cur_estimates).cpu().numpy())
                recursion_iter.set_description(
                    desc=f"calculating inverse HVP, norm = {norm:.5f}")

        # Accumulating X_{[i,S_2]} (notation from the LiSSA paper, Algorithm 1: [https://arxiv.org/pdf/1602.03943.pdf])
        # Need to divide by `scale` again here because the `vs` represent gradients
        # that haven't been scaled yet.
        inverse_hvps = [
            inverse_hvp + cur_estimate / scale
            for inverse_hvp, cur_estimate in zip(inverse_hvps, cur_estimates)
        ]
    return_ihvp = _flatten_tensors(inverse_hvps)
    return_ihvp /= num_samples
    return return_ihvp
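A self-contained numeric sketch (NumPy only, not the AllenNLP API) of the recursion above: with damping = 0 the fixed point of cur <- v + cur - (H @ cur) / scale is scale * H^{-1} v, so dividing the accumulated estimate by scale recovers the inverse-Hessian-vector product.

import numpy as np

H = np.array([[2.0, 0.3], [0.3, 0.5]])  # small positive-definite stand-in "Hessian"
v = np.array([1.0, -1.0])
scale, damping = 10.0, 0.0

cur = v.copy()
for _ in range(5000):
    cur = v + (1 - damping) * cur - (H @ cur) / scale

print(cur / scale)            # LiSSA-style estimate of H^{-1} v
print(np.linalg.solve(H, v))  # exact inverse-Hessian-vector product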
Example #33
def evaluate(
    model: Model, data_loader: DataLoader, cuda_device: int, batch_weight_key: str,
) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=len(data_loader))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " + "produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Example #34
    def embed_file(self,
                   input_file,
                   output_file_path,
                   output_format=u"all",
                   batch_size=DEFAULT_BATCH_SIZE,
                   forget_sentences=False,
                   use_sentence_keys=False):
        u"""
        Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
        The ELMo embeddings are written out in HDF5 format, where each sentence embedding
        is saved in a dataset with the line number in the original file as the key.

        Parameters
        ----------
        input_file : ``IO``, required
            A file with one tokenized sentence per line.
        output_file_path : ``str``, required
            A path to the output hdf5 file.
        output_format : ``str``, optional, (default = "all")
            The embeddings to output.  Must be one of "all", "top", or "average".
        batch_size : ``int``, optional, (default = 64)
            The number of sentences to process in ELMo at one time.
        forget_sentences : ``bool``, optional, (default = False).
            If use_sentence_keys is False, whether or not to include a string
            serialized JSON dictionary that associates sentences with their
            line number (its HDF5 key). The mapping is placed in the
            "sentence_to_index" HDF5 key. This is useful if
            you want to use the embeddings without keeping the original file
            of sentences around.
        use_sentence_keys : ``bool``, optional, (default = False).
            Whether or not to use full sentences as keys. By default,
            the line numbers of the input file are used as ids, which is more robust.
        """

        assert output_format in [u"all", u"top", u"average"]

        # Tokenizes the sentences.
        sentences = [line.strip() for line in input_file]

        blank_lines = [i for (i, line) in enumerate(sentences) if line == u""]
        if blank_lines:
            raise ConfigurationError(
                u"Your input file contains empty lines at indexes "
                u"{}. Please remove them.".format(blank_lines))
        split_sentences = [sentence.split() for sentence in sentences]
        # Uses the sentence index as the key.

        if use_sentence_keys:
            logger.warning(
                u"Using sentences as keys can fail if sentences "
                u"contain forward slashes or colons. Use with caution.")
            embedded_sentences = izip(
                sentences, self.embed_sentences(split_sentences, batch_size))
        else:
            embedded_sentences = ((unicode(i), x) for i, x in enumerate(
                self.embed_sentences(split_sentences, batch_size)))

        sentence_to_index = {}
        logger.info(u"Processing sentences.")
        with h5py.File(output_file_path, u'w') as fout:
            for key, embeddings in Tqdm.tqdm(embedded_sentences):
                if use_sentence_keys and key in list(fout.keys()):
                    raise ConfigurationError(
                        u"Key already exists in {}. "
                        u"To encode duplicate sentences, do not pass "
                        u"the --use-sentence-keys flag.".format(output_file_path))

                if not forget_sentences and not use_sentence_keys:
                    sentence = sentences[int(key)]
                    sentence_to_index[sentence] = key

                if output_format == u"all":
                    output = embeddings
                elif output_format == u"top":
                    output = embeddings[-1]
                elif output_format == u"average":
                    output = numpy.average(embeddings, axis=0)

                fout.create_dataset(unicode(key),
                                    output.shape,
                                    dtype=u'float32',
                                    data=output)
            if not forget_sentences and not use_sentence_keys:
                sentence_index_dataset = fout.create_dataset(
                    u"sentence_to_index", (1, ),
                    dtype=h5py.special_dtype(vlen=unicode))
                sentence_index_dataset[0] = json.dumps(sentence_to_index)

        input_file.close()
Ejemplo n.º 35
0
def prepare_global_logging(serialization_dir: str,
                           file_friendly_logging: bool,
                           rank: int = 0,
                           world_size: int = 1) -> None:
    # If we don't have a terminal as stdout,
    # force tqdm to be nicer.
    if not sys.stdout.isatty():
        file_friendly_logging = True

    Tqdm.set_slower_interval(file_friendly_logging)

    # Handlers for stdout/err logging
    output_stream_log_handler = logging.StreamHandler(sys.stdout)
    error_stream_log_handler = logging.StreamHandler(sys.stderr)

    if world_size == 1:
        # This case is not distributed training and hence will stick to the older
        # log file names
        output_file_log_handler = logging.FileHandler(
            filename=os.path.join(serialization_dir, "stdout.log"))
        error_file_log_handler = logging.FileHandler(
            filename=os.path.join(serialization_dir, "stderr.log"))
    else:
        # Create log files with worker ids
        output_file_log_handler = logging.FileHandler(filename=os.path.join(
            serialization_dir, f"stdout_worker{rank}.log"))
        error_file_log_handler = logging.FileHandler(filename=os.path.join(
            serialization_dir, f"stderr_worker{rank}.log"))

        # This adds the worker's rank to messages being logged to files.
        # This will help when combining multiple worker log files using `less` command.
        worker_filter = WorkerLogFilter(rank)
        output_file_log_handler.addFilter(worker_filter)
        error_file_log_handler.addFilter(worker_filter)

    formatter = logging.Formatter(
        "%(asctime)s - %(levelname)s - %(name)s - %(message)s")

    root_logger = logging.getLogger()

    # Remove the already set stream handler in root logger.
    # Not doing this will result in duplicate log messages
    # printed in the console
    if len(root_logger.handlers) > 0:
        for handler in root_logger.handlers:
            root_logger.removeHandler(handler)

    # file handlers need to be handled for tqdm's \r char
    file_friendly_log_filter = FileFriendlyLogFilter()

    if os.environ.get("ALLENNLP_DEBUG"):
        LEVEL = logging.DEBUG
    else:
        LEVEL = logging.INFO

    if rank == 0:
        # stdout/stderr handlers are added only for the
        # master worker. This is to avoid cluttering the console
        # screen with too many log messages from all workers.
        output_stream_log_handler.setFormatter(formatter)
        error_stream_log_handler.setFormatter(formatter)

        output_stream_log_handler.setLevel(LEVEL)
        error_stream_log_handler.setLevel(logging.ERROR)

        if file_friendly_logging:
            output_stream_log_handler.addFilter(file_friendly_log_filter)
            error_stream_log_handler.addFilter(file_friendly_log_filter)

        root_logger.addHandler(output_stream_log_handler)
        root_logger.addHandler(error_stream_log_handler)

    output_file_log_handler.addFilter(file_friendly_log_filter)
    error_file_log_handler.addFilter(file_friendly_log_filter)

    output_file_log_handler.setFormatter(formatter)
    error_file_log_handler.setFormatter(formatter)

    output_file_log_handler.setLevel(LEVEL)
    error_file_log_handler.setLevel(logging.ERROR)

    root_logger.addHandler(output_file_log_handler)
    root_logger.addHandler(error_file_log_handler)

    root_logger.setLevel(LEVEL)
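A hedged sketch of how the function above might be called once per worker in distributed training; the directory and the RANK/WORLD_SIZE environment variables are illustrative assumptions, not something the function itself requires.

import os

serialization_dir = "/tmp/my_run"                  # hypothetical output directory
os.makedirs(serialization_dir, exist_ok=True)
rank = int(os.environ.get("RANK", "0"))            # assumed to be set by the launcher
world_size = int(os.environ.get("WORLD_SIZE", "1"))

# Rank 0 logs to the console as well as to its log files; with world_size > 1
# every worker writes stdout_worker{rank}.log / stderr_worker{rank}.log.
prepare_global_logging(serialization_dir,
                       file_friendly_logging=False,
                       rank=rank,
                       world_size=world_size)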
Ejemplo n.º 36
0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    _warned_tqdm_ignores_underscores = False
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances,
                                 num_epochs=1,
                                 shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not _warned_tqdm_ignores_underscores and
                        any(metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                _warned_tqdm_ignores_underscores = True
            description = ', '.join(["%s: %.2f" % (name, value) for name, value
                                     in metrics.items() if not name.startswith("_")]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                   "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Ejemplo n.º 37
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        logger.info(f"Peak CPU memory usage MB: {peak_memory_mb()}")
        for gpu, memory in gpu_memory_mb().items():
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._model.train()

        # Get tqdm for the training batches
        train_generator = self._iterator(self._train_data,
                                         num_epochs=1,
                                         cuda_device=self._iterator_device)
        num_training_batches = self._iterator.get_num_batches(self._train_data)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        if self._histogram_interval is not None:
            histogram_parameters = set(self._model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        for batch in train_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self._log_histograms_this_batch = self._histogram_interval is not None and (
                    batch_num_total % self._histogram_interval == 0)

            self._optimizer.zero_grad()

            loss = self._batch_loss(batch, for_training=True)
            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self._rescale_gradients()

            # This does nothing if batch_num_total is None or you are using an
            # LRScheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)

            if self._log_histograms_this_batch:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self._model.named_parameters()}
                self._optimizer.step()
                for name, param in self._model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7),
                                                       batch_num_total)
            else:
                self._optimizer.step()

            # Update the description with the latest metrics
            metrics = self._get_metrics(train_loss, batches_this_epoch)
            description = self._description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if batch_num_total % self._summary_interval == 0:
                self._parameter_and_gradient_statistics_to_tensorboard(batch_num_total, batch_grad_norm)
                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"], batch_num_total)
                self._metrics_to_tensorboard(batch_num_total,
                                             {"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._log_histograms_this_batch:
                self._histograms_to_tensorboard(batch_num_total, histogram_parameters)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, time_to_str(int(last_save_time))), [], is_best=False
                )

        return self._get_metrics(train_loss, batches_this_epoch, reset=True)
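A self-contained sketch of the update-magnitude logging idea in _train_epoch above, using a toy model instead of the trainer: snapshot the parameters before optimizer.step(), then compare them with the stepped values.

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()

# Copy the current parameters to CPU before stepping, as the trainer does.
param_snapshot = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
optimizer.step()
for name, p in model.named_parameters():
    update = param_snapshot[name] - p.detach().cpu()
    ratio = torch.norm(update.view(-1)) / (torch.norm(p.view(-1)).cpu() + 1e-7)
    print(name, "relative update magnitude:", ratio.item())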
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)

    with torch.autograd.no_grad():
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = [x.text for x in fields["tokens"].tokens]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
Ejemplo n.º 39
0
    def embed_file(self,
                   input_file: IO,
                   output_file_path: str,
                   output_format: str = "all",
                   batch_size: int = DEFAULT_BATCH_SIZE,
                   forget_sentences: bool = False,
                   use_sentence_keys: bool = False) -> None:
        """
        Computes ELMo embeddings from an input_file where each line contains a sentence tokenized by whitespace.
        The ELMo embeddings are written out in HDF5 format, where each sentence embedding
        is saved in a dataset with the line number in the original file as the key.

        Parameters
        ----------
        input_file : ``IO``, required
            A file with one tokenized sentence per line.
        output_file_path : ``str``, required
            A path to the output hdf5 file.
        output_format : ``str``, optional, (default = "all")
            The embeddings to output.  Must be one of "all", "top", or "average".
        batch_size : ``int``, optional, (default = 64)
            The number of sentences to process in ELMo at one time.
        forget_sentences : ``bool``, optional, (default = False).
            If use_sentence_keys is False, whether or not to include a string
            serialized JSON dictionary that associates sentences with their
            line number (its HDF5 key). The mapping is placed in the
            "sentence_to_index" HDF5 key. This is useful if
            you want to use the embeddings without keeping the original file
            of sentences around.
        use_sentence_keys : ``bool``, optional, (default = False).
            Whether or not to use full sentences as keys. By default,
            the line numbers of the input file are used as ids, which is more robust.
        """

        assert output_format in ["all", "top", "average"]

        # Tokenizes the sentences.
        sentences = [line.strip() for line in input_file]

        blank_lines = [i for (i, line) in enumerate(sentences) if line == ""]
        if blank_lines:
            raise ConfigurationError(f"Your input file contains empty lines at indexes "
                                     f"{blank_lines}. Please remove them.")
        split_sentences = [sentence.split() for sentence in sentences]
        # Uses the sentence index as the key.

        if use_sentence_keys:
            logger.warning("Using sentences as keys can fail if sentences "
                           "contain forward slashes or colons. Use with caution.")
            embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))
        else:
            embedded_sentences = ((str(i), x) for i, x in
                                  enumerate(self.embed_sentences(split_sentences, batch_size)))

        sentence_to_index = {}
        logger.info("Processing sentences.")
        with h5py.File(output_file_path, 'w') as fout:
            for key, embeddings in Tqdm.tqdm(embedded_sentences):
                if use_sentence_keys and key in fout.keys():
                    raise ConfigurationError(f"Key already exists in {output_file_path}. "
                                             f"To encode duplicate sentences, do not pass "
                                             f"the --use-sentence-keys flag.")

                if not forget_sentences and not use_sentence_keys:
                    sentence = sentences[int(key)]
                    sentence_to_index[sentence] = key

                if output_format == "all":
                    output = embeddings
                elif output_format == "top":
                    output = embeddings[-1]
                elif output_format == "average":
                    output = numpy.average(embeddings, axis=0)

                fout.create_dataset(
                        str(key),
                        output.shape, dtype='float32',
                        data=output
                )
            if not forget_sentences and not use_sentence_keys:
                sentence_index_dataset = fout.create_dataset(
                        "sentence_to_index",
                        (1,),
                        dtype=h5py.special_dtype(vlen=str))
                sentence_index_dataset[0] = json.dumps(sentence_to_index)

        input_file.close()
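A hedged sketch of reading the HDF5 file written by embed_file back in; the path is a placeholder, and it assumes the default line-number keys with forget_sentences=False.

import json

import h5py

output_file_path = "elmo_embeddings.hdf5"      # hypothetical path
with h5py.File(output_file_path, "r") as fin:
    sentence_to_index = json.loads(fin["sentence_to_index"][0])
    # Look up the embedding for one sentence via its line-number key.
    first_sentence, first_key = next(iter(sentence_to_index.items()))
    embeddings = fin[first_key][...]           # shape depends on output_format
    print(first_sentence, embeddings.shape)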
Ejemplo n.º 40
0
def explore_embedding_space(
	embedding_fn: str,
	out_fn: str,
	num_samples: int = 1000) -> None:
	"""
	Calculate the following statistics for each layer of the model:
	1. mean cosine similarity between a sentence and its words
	2. mean dot product between a sentence and its words
	3. mean word embedding norm
	4. mean cosine similarity between randomly sampled words
	5. mean dot product between randomly sampled words
	6. mean variance explained by first principal component for a random sample of words

	num_samples sentences/words are used to estimate each of these metrics. We randomly sample words
	by first uniformly randomly sampling sentences and then uniformly randomly sampling a single word
	from each sampled sentence. This is because:
		- 	When we say we are interested in the similarity between random words, what we really 
			mean is the similarity between random _word occurrences_ (since each word has a unique 
			vector based on its context).
		- 	By explicitly sampling from different contexts, we avoid running into cases where two
			words are similar due to sharing the same context.

	Create a dictionary mapping each layer to a dictionary of these statistics and write it to out_fn.
	"""
	f = h5py.File(embedding_fn, 'r')
	num_layers = f["0"].shape[0]
	num_sentences = len(f)

	sentence_indices = random.sample(list(range(num_sentences)), num_samples)

	mean_cos_sim_between_sent_and_words = { f'layer_{layer}' : [] for layer in range(num_layers) }
	mean_cos_sim_across_words = { f'layer_{layer}' : -1 for layer in range(num_layers) }
	word_norm_std = { f'layer_{layer}' : -1 for layer in range(num_layers) }
	word_norm_mean = { f'layer_{layer}' : -1 for layer in range(num_layers) }
	variance_explained_random = { f'layer_{layer}' : -1 for layer in range(num_layers) }

	for layer in Tqdm.tqdm(range(num_layers)):
		word_vectors = []
		word_norms = []
		mean_cos_sims = []
		mean_dot_products = []

		for sent_index in sentence_indices:
			# average word vectors to get sentence vector
			sentence_vector = f[str(sent_index)][layer].mean(axis=0)
			num_words = f[str(sent_index)].shape[1]

			# randomly add a word vector (not all of them, because that would bias towards longer sentences)
			word_vectors.append(f[str(sent_index)][layer, random.choice(list(range(num_words)))])

			# what is the mean cosine similarity between the sentence and its words?
			mean_cos_sim = np.nanmean([ 1 - cosine(f[str(sent_index)][layer,i], sentence_vector) 
				for i in range(num_words) if f[str(sent_index)][layer, i].shape != () ])
			mean_cos_sims.append(round(mean_cos_sim, 3))

			# what is the mean embedding norm across words?
			word_norms.extend([np.linalg.norm(f[str(sent_index)][layer,i]) for i in range(num_words)])

		mean_cos_sim_between_sent_and_words[f'layer_{layer}'] = round(float(np.mean(mean_cos_sims)), 3)
		mean_cos_sim_across_words[f'layer_{layer}'] = round(np.nanmean([ 1 - cosine(random.choice(word_vectors), 
			random.choice(word_vectors)) for _ in range(num_samples)]), 3)
		word_norm_std[f'layer_{layer}'] = round(float(np.std(word_norms)), 3)
		word_norm_mean[f'layer_{layer}'] = round(float(np.mean(word_norms)), 3)

		# how much of the variance in randomly chosen words can be explained by their first principal component?
		pca = TruncatedSVD(n_components=100)
		pca.fit(word_vectors)
		variance_explained_random[f'layer_{layer}'] = min(1.0, round(float(pca.explained_variance_ratio_[0]), 3))

	json.dump({
		'mean cosine similarity between sentence and words' : mean_cos_sim_between_sent_and_words,
		'mean cosine similarity across words' : mean_cos_sim_across_words,
		'word norm std' : word_norm_std,
		'word norm mean' : word_norm_mean,
		'variance explained for random words' : variance_explained_random
		}, open(out_fn, 'w'), indent=1)
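A small sketch, using random vectors rather than ELMo output, of the per-layer sentence-vs-word cosine similarity statistic computed above.

import numpy as np
from scipy.spatial.distance import cosine

rng = np.random.default_rng(0)
word_vectors = rng.normal(size=(5, 10))        # a toy "sentence" of 5 word vectors
sentence_vector = word_vectors.mean(axis=0)    # sentence vector = mean of its words

cos_sims = [1 - cosine(w, sentence_vector) for w in word_vectors]
print("mean cosine similarity between sentence and words:",
      round(float(np.mean(cos_sims)), 3))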
Ejemplo n.º 41
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        for batch_group in batch_group_generator_tqdm:
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch(
            ) and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
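A minimal, self-contained sketch of the gradient-accumulation pattern used above: each loss in a batch group is divided by the group size before backward(), so the accumulated gradient matches one larger batch. The model and data are toys.

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
num_gradient_accumulation_steps = 2

batches = [torch.randn(8, 4) for _ in range(4)]
for start in range(0, len(batches), num_gradient_accumulation_steps):
    batch_group = batches[start:start + num_gradient_accumulation_steps]
    optimizer.zero_grad()
    for batch in batch_group:
        loss = model(batch).pow(2).mean()
        (loss / len(batch_group)).backward()   # scale before accumulating gradients
    optimizer.step()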
Ejemplo n.º 42
0
def evaluate_perplexity(model: Model, sampler: Model, num_samples: int,
                        instances: Iterator[Instance],
                        data_iterator: DataIterator,
                        cuda_device: int) -> Dict[str, Any]:

    check_for_gpu(cuda_device)

    logger.info('Iterating over dataset')

    with torch.no_grad():

        summands = []
        penalized_summands = []

        for i in range(num_samples):
            iterator = data_iterator(instances, num_epochs=1, shuffle=False)
            generator_tqdm = Tqdm.tqdm(iterator, total=0)

            model.eval()
            sampler.eval()

            summand = 0.0
            penalized_summand = 0.0
            denom = 0
            for batch, _ in generator_tqdm:

                batch = util.move_to_device(batch, cuda_device)

                # We need sequence length to help compute perplexity
                n_tokens = util.get_text_field_mask(
                    batch['source']).float().sum().item()
                denom += n_tokens

                # Draw a sample
                sampler_output = sampler.sample(**batch)
                sample_logp = sampler_output['logp']
                sample = sampler_output['sample']

                # Evaluate on sample
                model_output = model(**sample)
                model_logp = model_output['logp']
                model_penalized_logp = model_output['penalized_logp']
                summand += (model_logp - sample_logp).item()
                penalized_summand += (model_penalized_logp -
                                      sample_logp).item()

            summands.append(summand)
            penalized_summands.append(penalized_summand)
            t = torch.tensor(summands)
            p = torch.tensor(penalized_summands)
            t_sum = torch.logsumexp(t, dim=0)
            p_sum = torch.logsumexp(p, dim=0)
            sum_logp = (t_sum - math.log(i + 1)).item()
            sum_logp_penalized = (p_sum - math.log(i + 1)).item()
            ppl = math.exp(-sum_logp / denom)
            upp = math.exp(-sum_logp_penalized / denom)

            print('PPL: %f' % ppl)
            print('UPP: %f' % upp)

    metrics = {'ppl': ppl, 'upp': upp}
    return metrics
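A sketch of the averaging step above with made-up numbers: per-sample log importance weights (model log-probability minus sampler log-probability) are combined with logsumexp, normalized by the number of samples, and turned into a perplexity over denom tokens.

import math

import torch

summands = [-250.0, -245.0, -260.0]   # made-up per-sample log importance weights
denom = 100                           # made-up total token count

t = torch.tensor(summands)
sum_logp = (torch.logsumexp(t, dim=0) - math.log(len(summands))).item()
ppl = math.exp(-sum_logp / denom)
print("estimated perplexity:", round(ppl, 2))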
Ejemplo n.º 43
0
def evaluate(
    model: Model, data_loader: DataLoader, cuda_device: int = -1, batch_weight_key: str = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                metric_name.startswith("_") for metric_name in metrics
            ):
                logger.warning(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar."
                )
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (
                ", ".join(
                    [
                        "%s: %.2f" % (name, value)
                        for name, value in metrics.items()
                        if not name.startswith("_")
                    ]
                )
                + " ||"
            )
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " + "produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        return final_metrics
Ejemplo n.º 44
0
def evaluate(model: Model, instances: Iterable[Instance],
             data_iterator: DataIterator, cuda_device: int,
             batch_weight_key: str) -> Dict[str, Any]:
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0
        # Additionally collect per-example probabilities and example ids.
        total_probs, all_example_ids = [], []

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight
                # Collect this batch's per-example probabilities and example ids.
                if 'probs' in output_dict:
                    total_probs.extend(output_dict['probs'])
                    all_example_ids.extend([
                        batch['metadata'][batch_index]['example_ids']
                        for batch_index in range(len(batch['metadata']))
                    ])

            if (not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = ', '.join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes " +
                    "produced a loss!")
            final_metrics["loss"] = total_loss / total_weight
            # Attach the per-example probabilities and ids collected above, if any.
            if total_probs:
                final_metrics["probs"] = total_probs
                final_metrics["example_ids"] = all_example_ids

        return final_metrics
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, default = -1
        The device to run the evaluation on.
    data: str, default = None
        The data to evaluate on. By default, we use the validation data from
        the original experiment.
    prefix: str, default=""
        The prefix to prepend to the generated gold and prediction files, to distinguish
        different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
    """
    config = Params.from_file(
        os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory,
                                        "model.tar.gz"),
                           cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory,
                                        prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory,
                                  prefix + "_gold.txt")
    prediction_file = open(prediction_file_path, "w+")
    gold_file = open(gold_file_path, "w+")

    # Load the evaluation data and index it.
    print("reading evaluation data from {}".format(evaluation_data_path))
    dataset = list(dataset_reader.read(evaluation_data_path))

    with torch.autograd.no_grad():
        loader = SimpleDataLoader(dataset, 32)
        model_predictions: List[List[str]] = []
        for batch in Tqdm.tqdm(loader):
            batch = move_to_device(batch, device)
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(dataset, model_predictions):
            fields = instance.fields
            verb_index = fields["metadata"]["verb_index"]
            gold_tags = fields["metadata"]["gold_tags"]
            sentence = fields["metadata"]["words"]
            write_to_conll_eval_file(prediction_file, gold_file, verb_index,
                                     sentence, prediction, gold_tags)
        prediction_file.close()
        gold_file.close()
Ejemplo n.º 46
0
        dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    instances = dataset_reader.read(evaluation_data_path)

    config['iterator']['type'] = 'basic'
    del config['iterator']['sorting_keys']
    data_iterator = DataIterator.from_params(config.pop("iterator"))
    data_iterator.index_with(model.vocab)

    cuda_device = args.cuda_device

    #### EVALUATION HERE


    model.eval()
    iterator = data_iterator(instances, num_epochs=1, shuffle=False, cuda_device=cuda_device, for_training=False)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))

    label_probs = []
    for batch in generator_tqdm:
        output_dict = model(**batch)
        label_probs.append(output_dict['label_probs'].data.cpu().numpy())
    label_probs = np.concatenate(label_probs)
    my_preds = pd.DataFrame(label_probs, columns=['ending0','ending1','ending2','ending3'])
    my_preds['pred'] = label_probs.argmax(1)
    my_preds.to_csv(args.output_file)


Ejemplo n.º 47
0
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.model.train()

        num_gpus = len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data,
                                            num_epochs=1,
                                            shuffle=self.shuffle)
        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())


        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(train_generator,
                                         total=num_training_batches)
        cumulative_batch_size = 0
        for batch_group in train_generator_tqdm:
            self.model.train()
            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            loss = self.batch_loss(batch_group, for_training=True)

            if torch.isnan(loss):
                raise ValueError("nan loss encountered")

            loss.backward()

            train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch():
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {name: param.detach().cpu().clone()
                                 for name, param in self.model.named_parameters()}
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1, ))
                    param_norm = torch.norm(param.view(-1, )).cpu()
                    self._tensorboard.add_train_scalar("gradient_update/" + name,
                                                       update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch)
            description = training_util.description_from_metrics(metrics)

            train_generator_tqdm.set_description(description, refresh=False)

            # Log parameter values to Tensorboard
            if self._tensorboard.should_log_this_batch():
                self._tensorboard.log_parameter_and_gradient_statistics(self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model, self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"])
                self._tensorboard.log_metrics({"epoch_metrics/" + k: v for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch():
                self._tensorboard.log_histograms(self.model, histogram_parameters)

            if self._log_batch_size_period:
                cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
                cumulative_batch_size += cur_batch
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_size/batches_this_epoch
                    logger.info(f"current batch size: {cur_batch} mean batch size: {average}")
                    self._tensorboard.add_train_scalar("current_batch_size", cur_batch)
                    self._tensorboard.add_train_scalar("mean_batch_size", average)

            # Save model if needed.
            if self._model_save_interval is not None and (
                    time.time() - last_save_time > self._model_save_interval
            ):
                last_save_time = time.time()
                self._save_checkpoint(
                        '{0}.{1}'.format(epoch, training_util.time_to_str(int(last_save_time)))
                )
            if self._early_stopping_by_batch and self._batch_num_total % 10 == 0:
                if self._validation_data is not None:
                    with torch.no_grad():
                        # We have a validation set, so compute all the metrics on it.
                        val_loss, num_batches = self._validation_loss()
                        val_metrics = training_util.get_metrics(self.model, val_loss, num_batches, reset=True)

                        # Check validation metric for early stopping
                        this_epoch_val_metric = val_metrics[self._validation_metric]
                        self._metric_tracker.add_metric(this_epoch_val_metric)

                        if self._metric_tracker.is_best_so_far():
                            metrics['best_batch'] = self._batch_num_total
                            for key, value in val_metrics.items():
                                metrics["best_validation_" + key] = value
                            self._metric_tracker.best_epoch_metrics = val_metrics

                        self._save_checkpoint(self._batch_num_total)

                        if self.callbacks is not None:
                            for callback in self.callbacks:
                                callback.on_batch_end(self._batch_num_total)

        metrics = training_util.get_metrics(self.model, train_loss, batches_this_epoch, reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_'+str(gpu_num)+'_memory_MB'] = memory
        return metrics
Ejemplo n.º 48
0
    def semi_train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self.trainer._num_epochs - 1)
        peak_cpu_usage = peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self.trainer.model.train()

        num_gpus = len(self.trainer._cuda_devices)

        self.trainer._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self.trainer._batch_num_total is None:
            self.trainer._batch_num_total = 0

        histogram_parameters = set(
            self.trainer.model.
            get_parameters_for_histogram_tensorboard_logging())
        #Pdb().set_trace()
        mixed_generator, num_training_batches = get_mixer(
            self.trainer.iterator, self.trainer.train_data,
            self.trainer.iterator, self.unlabelled_dataset, num_gpus,
            self.labelled_id, self.which_mixer, self.min_pct_of_unlabelled)
        #mixed_generator, num_training_batches = get_mixer(self.trainer.iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset,num_gpus, self.labelled_id, self.which_mixer)

        #generator for lambda update
        mixed_generator_for_lambda, _ = get_mixer(self.trainer.iterator,
                                                  self.trainer.train_data,
                                                  self.trainer.iterator,
                                                  self.unlabelled_dataset,
                                                  num_gpus, self.labelled_id,
                                                  'cm', 1.0)
        #mixed_generator_for_lambda, _ = get_mixer(self.trainer._validation_iterator, self.trainer.train_data, self.trainer._validation_iterator,  self.unlabelled_dataset, num_gpus, self.labelled_id, 'cm')

        logger.info("Training")
        train_generator_tqdm = Tqdm.tqdm(mixed_generator,
                                         total=num_training_batches)
        #train_generator_tqdm = Tqdm.tqdm(zip(train_generator,unlabelled_train_generator),
        #                                 total=num_training_batches)
        cumulative_batch_size = 0
        unlabelled_loss = 0
        unlabelled_batches_this_epoch = 0

        batches_since_last_step = 0
        agg_loss = 0.0
        flag = False
        batch_grad_norm = None
        for batch_group, group_id in train_generator_tqdm:
            #print(batch_group[0]['sentence']['tokens'].shape)
            if self.total_supervised_iters < self.dd_semi_warmup_iters and group_id != self.labelled_id:
                continue
            output_dict = self.batch_loss(
                batch_group,
                for_training=True,
                eval_metric=(group_id == self.labelled_id))
            penalties = defaultdict(float)

            if self.constraints_model is not None:
                penalties = self.constraints_model(
                    output_dict['task1_tag_logits'],
                    output_dict['task2_tag_logits'], output_dict['mask'])

            loss = 0.0
            if 'loss' in output_dict:
                loss = output_dict['loss']
                train_loss += loss.item()
            loss += output_dict.get('regularization_penalty', 0.0)

            loss += self.constraints_wt * penalties['loss']

            unlabelled_loss += penalties['loss'].item() if torch.is_tensor(
                penalties['loss']) else penalties['loss']

            agg_loss += loss
            batches_since_last_step += 1

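            # Take an optimization step on the aggregated loss once
            # `backprop_after_xbatches` batches have been accumulated.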
            if batches_since_last_step == self.backprop_after_xbatches:
                #print("STEP THROUGH! : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))
                batch_grad_norm = self.step(agg_loss)
                batches_since_last_step = 0
                agg_loss = 0.0
                flag = False
            else:
                flag = True
                #print("skipp : {}. loss: {} agg_loss: {}".format(group_id, loss, agg_loss))

            if group_id != self.labelled_id:
                # Unlabelled batch: just count it here; the optimizer, LR-scheduler
                # and Tensorboard bookkeeping below only runs for labelled batches.
                unlabelled_batches_this_epoch += 1
            else:
                self.total_supervised_iters += 1.0
                batches_this_epoch += 1
                self.trainer._batch_num_total += 1
                batch_num_total = self.trainer._batch_num_total

                # This does nothing if batch_num_total is None or you are using an
                # LRScheduler which doesn't update per batch.
                if self.trainer._learning_rate_scheduler:
                    self.trainer._learning_rate_scheduler.step_batch(
                        batch_num_total)

                if self.trainer._tensorboard.should_log_histograms_this_batch():
                    # get the magnitude of parameter updates for logging
                    # We need a copy of current parameters to compute magnitude of updates,
                    # and copy them to CPU so large models won't go OOM on the GPU.
                    param_updates = {
                        name: param.detach().cpu().clone()
                        for name, param in
                        self.trainer.model.named_parameters()
                    }
                    for name, param in self.trainer.model.named_parameters():
                        param_updates[name].sub_(param.detach().cpu())
                        update_norm = torch.norm(param_updates[name].view(-1))
                        param_norm = torch.norm(param.view(-1)).cpu()
                        self.trainer._tensorboard.add_train_scalar(
                            "gradient_update/" + name,
                            update_norm / (param_norm + 1e-7))

                # Update moving averages
                if self.trainer._moving_average is not None:
                    self.trainer._moving_average.apply(batch_num_total)
                metrics = training_util.get_metrics(self.trainer.model,
                                                    train_loss,
                                                    batches_this_epoch)
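                # "uloss" reports the average constraint penalty per batch so far.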
                metrics["uloss"] = float(
                    unlabelled_loss /
                    (batches_this_epoch + unlabelled_batches_this_epoch))
                # Update the description with the latest metrics
                description = training_util.description_from_metrics(metrics)
                train_generator_tqdm.set_description(description,
                                                     refresh=False)

                # Log parameter values to Tensorboard
                if (self.trainer._tensorboard.should_log_this_batch()
                        and batch_grad_norm is not None):
                    self.trainer._tensorboard.log_parameter_and_gradient_statistics(
                        self.trainer.model, batch_grad_norm)
                    self.trainer._tensorboard.log_learning_rates(
                        self.trainer.model, self.trainer.optimizer)

                    self.trainer._tensorboard.add_train_scalar(
                        "loss/loss_train", metrics["loss"])
                    self.trainer._tensorboard.log_metrics(
                        {"epoch_metrics/" + k: v
                         for k, v in metrics.items()})

                if self.trainer._tensorboard.should_log_histograms_this_batch():
                    self.trainer._tensorboard.log_histograms(
                        self.trainer.model, histogram_parameters)

                if self.trainer._log_batch_size_period:
                    cur_batch = sum([
                        training_util.get_batch_size(batch)
                        for batch in batch_group
                    ])
                    cumulative_batch_size += cur_batch
                    if (batches_this_epoch -
                            1) % self.trainer._log_batch_size_period == 0:
                        average = cumulative_batch_size / batches_this_epoch
                        logger.info(
                            f"current batch size: {cur_batch} mean batch size: {average}"
                        )
                        self.trainer._tensorboard.add_train_scalar(
                            "current_batch_size", cur_batch)
                        self.trainer._tensorboard.add_train_scalar(
                            "mean_batch_size", average)

                # Save model if needed.
                if self.trainer._model_save_interval is not None and (
                        time.time() - last_save_time >
                        self.trainer._model_save_interval):
                    last_save_time = time.time()
                    self.trainer._save_checkpoint('{0}.{1}'.format(
                        epoch, training_util.time_to_str(int(last_save_time))))

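            # Periodic lambda update: once past the warm-up and at least
            # dd_update_freq supervised iterations since the last update, run
            # self.lambda_update on a single mixed batch group.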
            if (self.constraints_model is not None
                    and self.dd_optimizer is not None
                    and self.total_supervised_iters >= self.dd_warmup_iters
                    and self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq):
                for batch_group, group_id in mixed_generator_for_lambda:
                    self.lambda_update(batch_group)
                    self.last_lambda_update = self.total_supervised_iters
                    break

                self.count_lambda_updates += 1
                # Optionally make lambda updates progressively less frequent.
                if (self.dd_increase_freq_after is not None
                        and self.count_lambda_updates % self.dd_increase_freq_after == 0):
                    self.dd_update_freq += self.dd_increase_freq_by
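
        # Flush any aggregated loss that has not been stepped yet.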
        if flag:
            batch_grad_norm = self.step(agg_loss)
            batches_since_last_step = 0
            agg_loss = 0.0
            flag = False

        # Final lambda update at the end of the epoch, if one is due.
        if (self.constraints_model is not None
                and self.dd_optimizer is not None
                and self.total_supervised_iters >= self.dd_warmup_iters
                and self.total_supervised_iters - self.last_lambda_update >= self.dd_update_freq):
            for batch_group, group_id in mixed_generator_for_lambda:
                self.lambda_update(batch_group)
                self.last_lambda_update = self.total_supervised_iters
                break

            self.count_lambda_updates += 1
            # Optionally make lambda updates progressively less frequent.
            if (self.dd_increase_freq_after is not None
                    and self.count_lambda_updates % self.dd_increase_freq_after == 0):
                self.dd_update_freq += self.dd_increase_freq_by

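        # Collect end-of-epoch metrics; 'lb' and 'ub' are the labelled and
        # unlabelled batch counts for this epoch.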
        metrics = training_util.get_metrics(self.trainer.model,
                                            train_loss,
                                            batches_this_epoch,
                                            reset=True)
        metrics['cpu_memory_MB'] = peak_cpu_usage
        metrics['lb'] = batches_this_epoch
        metrics['ub'] = unlabelled_batches_this_epoch
        metrics["uloss"] = float(
            unlabelled_loss /
            (batches_this_epoch + unlabelled_batches_this_epoch))
        if self.constraints_model is not None:
            lambda_stats_dict = self.constraints_model.lambda_stats()
            metrics.update(lambda_stats_dict)
        for (gpu_num, memory) in gpu_usage:
            metrics['gpu_' + str(gpu_num) + '_memory_MB'] = memory
        return metrics