def prepare_global_logging(params) -> None:
    """
    This function configures 3 global logging attributes - streaming stdout and stderr
    to a file as well as the terminal, setting the formatting for the python logging
    library and setting the interval frequency for the Tqdm progress bar.

    Note that this function does not set the logging level, which is set in ``allennlp/run.py``.

    Parameters
    ----------
    serialization_dir : ``str``, required.
        The directory to stream logs to.
    file_friendly_logging : ``bool``, required.
        Whether logs should clean the output to prevent carriage returns
        (used to update progress bars on a single terminal line).
    """
    serialization_dir = params['serialization_dir']
    file_friendly_logging = params['file_friendly_logging']
    Tqdm.set_slower_interval(file_friendly_logging)
    std_out_file = os.path.join(serialization_dir, "stdout.log")
    sys.stdout = TeeLogger(std_out_file,  # type: ignore
                           sys.stdout,
                           file_friendly_logging)
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),  # type: ignore
                           sys.stderr,
                           file_friendly_logging)
    logging.init_logger(log_file=std_out_file)
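A minimal usage sketch (the directory name is hypothetical): ``prepare_global_logging`` only indexes into ``params``, so any dict-like object carrying the two keys documented above will do.

params = {
    'serialization_dir': 'experiments/run1',  # hypothetical output directory; must already exist
    'file_friendly_logging': True,            # strip carriage returns so log files stay readable
}
prepare_global_logging(params)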
def extend_from_instances(self,
                          params: Params,
                          instances: Iterable['adi.Instance'] = ()) -> None:
    """
    Extends an already generated vocabulary using a collection of instances.
    """
    min_count = params.pop("min_count", None)
    max_vocab_size = pop_max_vocab_size(params)
    non_padded_namespaces = params.pop("non_padded_namespaces", DEFAULT_NON_PADDED_NAMESPACES)
    pretrained_files = params.pop("pretrained_files", {})
    min_pretrained_embeddings = params.pop("min_pretrained_embeddings", None)
    only_include_pretrained_words = params.pop_bool("only_include_pretrained_words", False)
    tokens_to_add = params.pop("tokens_to_add", None)
    params.assert_empty("Vocabulary - from dataset")

    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    self._extend(counter=namespace_token_counts,
                 min_count=min_count,
                 max_vocab_size=max_vocab_size,
                 non_padded_namespaces=non_padded_namespaces,
                 pretrained_files=pretrained_files,
                 only_include_pretrained_words=only_include_pretrained_words,
                 tokens_to_add=tokens_to_add,
                 min_pretrained_embeddings=min_pretrained_embeddings)
def read(self, file_path: str) -> Iterable[Instance]:
    """
    Returns an ``Iterable`` containing all the instances in the specified dataset.

    If ``self.lazy`` is False, this calls ``self._read()``, ensures that the result is a list,
    then returns the resulting list.

    If ``self.lazy`` is True, this returns an object whose ``__iter__`` method calls
    ``self._read()`` each iteration. In this case your implementation of ``_read()`` must
    also be lazy (that is, not load all instances into memory at once), otherwise you will
    get a ``ConfigurationError``.

    In either case, the returned ``Iterable`` can be iterated over multiple times. It's
    unlikely you want to override this function, but if you do your result should likewise
    be repeatedly iterable.
    """
    lazy = getattr(self, 'lazy', None)
    if lazy is None:
        logger.warning("DatasetReader.lazy is not set, "
                       "did you forget to call the superclass constructor?")
    if lazy:
        return _LazyInstances(lambda: iter(self._read(file_path)))
    else:
        instances = self._read(file_path)
        if not isinstance(instances, list):
            instances = [instance for instance in Tqdm.tqdm(instances)]
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return instances
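For reference, a minimal sketch (not this project's actual class) of what a repeatedly iterable lazy wrapper such as ``_LazyInstances`` could look like: each call to ``__iter__`` re-invokes the stored zero-argument factory, so the dataset can be traversed any number of times without ever being held in memory all at once.

from typing import Callable, Iterator


class LazyInstancesSketch:
    """Illustrative stand-in for ``_LazyInstances``; only the stdlib is assumed."""

    def __init__(self, instance_generator: Callable[[], Iterator]) -> None:
        self.instance_generator = instance_generator

    def __iter__(self) -> Iterator:
        # Re-run the factory on every pass, e.g. re-reading the file lazily each time.
        return self.instance_generator()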
def from_instances(cls,
                   instances: Iterable['adi.Instance'],
                   min_count: Dict[str, int] = None,
                   max_vocab_size: Union[int, Dict[str, int]] = None,
                   non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
                   pretrained_files: Optional[Dict[str, str]] = None,
                   only_include_pretrained_words: bool = False,
                   tokens_to_add: Dict[str, List[str]] = None,
                   min_pretrained_embeddings: Dict[str, int] = None) -> 'Vocabulary':
    """
    Constructs a vocabulary given a collection of `Instances` and some parameters.
    We count all of the vocabulary items in the instances, then pass those counts,
    along with the other parameters, to :func:`__init__`. See that method for a
    description of what the other parameters do.
    """
    logger.info("Fitting token dictionary from dataset.")
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)

    return cls(counter=namespace_token_counts,
               min_count=min_count,
               max_vocab_size=max_vocab_size,
               non_padded_namespaces=non_padded_namespaces,
               pretrained_files=pretrained_files,
               only_include_pretrained_words=only_include_pretrained_words,
               tokens_to_add=tokens_to_add,
               min_pretrained_embeddings=min_pretrained_embeddings)
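A hedged usage sketch: build a vocabulary from an iterable of instances produced by a dataset reader. The thresholds below are invented for illustration.

vocab = Vocabulary.from_instances(instances,
                                  min_count={'tokens': 3},           # drop tokens seen fewer than 3 times
                                  max_vocab_size={'tokens': 50000})  # cap the "tokens" namespace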
def http_get(url: str, temp_file: IO) -> None:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = Tqdm.tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
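A usage sketch (the URL is a placeholder): stream a remote file into a temporary file while the Tqdm bar tracks the bytes written.

import tempfile

with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    http_get('https://example.com/model.tar.gz', temp_file)  # placeholder URL
    downloaded_path = temp_file.name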
def _validate_dev(self, epoch):
    """
    Computes the dev loss and other validation metrics, and returns the metrics.
    """
    logger.info("Validating on dev")

    self._model.eval()

    # TODO: edge loss is wrong when _dev_iterator is used.
    if False:  # self._dev_iterator is not None:
        dev_iterator = self._dev_iterator
    else:
        dev_iterator = self._iterator

    dev_generator = dev_iterator(instances=self._dev_dataset,
                                 shuffle=False,
                                 num_epochs=1)

    num_dev_batches = dev_iterator.get_num_batches(self._dev_dataset)
    dev_generator_tqdm = Tqdm.tqdm(dev_generator, total=num_dev_batches)
    batches_this_epoch = 0
    dev_loss = 0
    for batch in dev_generator_tqdm:
        batches_this_epoch += 1
        loss = self._batch_loss(batch, for_training=True)
        if loss is not None:
            # You shouldn't necessarily have to compute a loss for validation, so we allow for
            # `loss` to be None. We need to be careful, though - `batches_this_epoch` is
            # currently only used as the divisor for the loss function, so we can safely only
            # count those batches for which we actually have a loss. If this variable ever
            # gets used for something else, we might need to change things around a bit.
            dev_loss += loss.item()

        # Update the description with the latest metrics
        if self._n_gpus > 1:
            dev_metrics = self._model.module.get_metrics()
        else:
            dev_metrics = self._model.get_metrics()
        description = self._description_from_metrics(dev_metrics)
        dev_generator_tqdm.set_description(description, refresh=False)

    if self._n_gpus > 1:
        return self._model.module.get_metrics(reset=True,
                                              mimick_test=epoch > 50,
                                              validation=True)
    else:
        return self._model.get_metrics(reset=True,
                                       mimick_test=epoch > 50,
                                       validation=True)
def _read_pretrained_tokens(embeddings_file_uri: str) -> List[str]:
    # Moving this import to the top breaks everything (circular import, I guess)
    from stog.modules.token_embedders import EmbeddingsTextFile

    logger.info('Reading pretrained tokens from: %s', embeddings_file_uri)
    tokens: List[str] = []
    with EmbeddingsTextFile(embeddings_file_uri) as embeddings_file:
        for line_number, line in enumerate(Tqdm.tqdm(embeddings_file), start=1):
            token_end = line.find(' ')
            if token_end >= 0:
                token = line[:token_end]
                tokens.append(token)
            else:
                line_begin = line[:20] + '...' if len(line) > 20 else line
                logger.warning('Skipping line number %d: %s', line_number, line_begin)
    return tokens
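To make the expected input concrete: each line of the embeddings file starts with the token, followed by space-separated floats, and only the leading token is kept. The example line below is invented.

line = 'dog 0.41 -0.12 0.73'   # invented example line
token = line[:line.find(' ')]  # -> 'dog'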
def evaluate(model, instances, iterator, device):
    with torch.no_grad():
        model.eval()
        model.decode_type = 'mst'
        test_generator = iterator(instances=instances,
                                  shuffle=False,
                                  num_epochs=1)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(test_generator,
                                   total=iterator.get_num_batches(instances))
        for batch in generator_tqdm:
            batch = move_to_device(batch, device)
            model(batch, for_training=True)
            metrics = model.get_metrics()
            description = ', '.join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)
        return model.get_metrics(reset=True)
def _train_epoch(self, epoch):
    logger.info('Epoch {}/{}'.format(epoch, self._num_epochs - 1))
    logger.info(f'Peak CPU memory usage MB: {peak_memory_mb()}')
    for gpu, memory in gpu_memory_mb().items():
        logger.info(f"GPU {gpu} memory usage MB: {memory}")

    training_loss = 0.0
    # Set the model to "train" mode.
    self._model.train()

    # Get tqdm for the training batches
    # TODO: How to deal with the cuda device. Typically I set CUDA_VISIBLE_DEVICES
    # before executing the script, so it's always 0.
    train_generator = self._iterator(instances=self._training_dataset,
                                     shuffle=self._shuffle,
                                     num_epochs=1)

    num_training_batches = self._iterator.get_num_batches(self._training_dataset)

    logger.info('Training...')
    last_save_time = time.time()
    batches_this_epoch = 0

    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)

    for batch in train_generator_tqdm:
        batches_this_epoch += 1
        self._num_trained_batches += 1

        self._optimizer.zero_grad()
        loss = self._batch_loss(batch, for_training=True)
        loss.backward()
        training_loss += loss.item()
        self._optimizer.step()

        # Update the description with the latest metrics
        if self._n_gpus > 1:
            metrics = self._model.module.get_metrics()
        else:
            metrics = self._model.get_metrics()
        description = self._description_from_metrics(metrics)
        train_generator_tqdm.set_description(description, refresh=False)

        # Log parameter values to Tensorboard
        if self._num_trained_batches % self._summary_interval == 0:
            self._tensorboard.add_train_scalar("loss/loss_train",
                                               metrics[self._dev_metric],
                                               self._num_trained_batches)
            self._metrics_to_tensorboard(
                self._num_trained_batches,
                {"epoch_metrics/" + k: v for k, v in metrics.items()})

        # Save model if needed.
        if self._model_save_interval is not None and (
                time.time() - last_save_time > self._model_save_interval):
            last_save_time = time.time()
            self._save_checkpoint('{0}.{1}'.format(
                epoch, time_to_str(int(last_save_time))), [], is_best=False)

    logger.info('Finished one epoch.')
    if self._n_gpus > 1:
        return self._model.module.get_metrics(reset=True)
    else:
        return self._model.get_metrics(reset=True)
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens",
                                    amr: bool = False) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a possibly compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match ``embedding_dim`` raise a warning
    and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set()
    for token in vocab.get_token_to_index_vocabulary(namespace):
        # TODO: Is there a better way to do this? Currently we have a very specific 'amr' param.
        if amr:
            token = re.sub(r'-\d\d$', '', token)
        tokens_to_keep.add(token)
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            if amr:
                normalized_token = re.sub(r'-\d\d$', '', token)
                if normalized_token in embeddings:
                    embedding_matrix[i] = torch.FloatTensor(embeddings[normalized_token])
                    num_tokens_found += 1
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.",
                         token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
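A quick, self-contained illustration of the AMR-specific normalization used above: PropBank-style sense suffixes such as ``-01`` are stripped before the embedding lookup, so a vocabulary entry like ``want-01`` can reuse the pretrained vector stored under ``want``.

import re

for token in ['want-01', 'go-02', 'dog']:
    print(token, '->', re.sub(r'-\d\d$', '', token))
# want-01 -> want
# go-02 -> go
# dog -> dog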