def read(self, file_path: str) -> Iterable[Instance]: """ Returns an ``Iterable`` containing all the instances in the specified dataset. If ``self.lazy`` is False, this calls ``self._read()``, ensures that the result is a list, then returns the resulting list. If ``self.lazy`` is True, this returns an object whose ``__iter__`` method calls ``self._read()`` each iteration. In this case your implementation of ``_read()`` must also be lazy (that is, not load all instances into memory at once), otherwise you will get a ``ConfigurationError``. In either case, the returned ``Iterable`` can be iterated over multiple times. It's unlikely you want to override this function, but if you do your result should likewise be repeatedly iterable. """ lazy = getattr(self, 'lazy', None) if lazy is None: logger.warning( "DatasetReader.lazy is not set, " "did you forget to call the superclass constructor?") if self._cache_directory: cache_file = self._get_cache_location_for_file_path(file_path) else: cache_file = None if lazy: return _LazyInstances(lambda: self._read(file_path), cache_file, self.deserialize_instance, self.serialize_instance) else: # First we read the instances, either from a cache or from the original file. if cache_file and os.path.exists(cache_file): instances = self._instances_from_cache_file(cache_file) else: instances = self._read(file_path) # Then some validation. if not isinstance(instances, list): instances = [instance for instance in Tqdm.tqdm(instances)] if not instances: raise ConfigurationError( "No instances were read from the given filepath {}. " "Is the path correct?".format(file_path)) # And finally we write to the cache if we need to. if cache_file and not os.path.exists(cache_file): logger.info(f"Caching instances to {cache_file}") with open(cache_file, 'w') as cache: for instance in Tqdm.tqdm(instances): cache.write(self.serialize_instance(instance) + '\n') return instances
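# The lazy branch above hands the work off to a small iterable wrapper. As an illustration of
# the idea (a hedged sketch, not AllenNLP's actual ``_LazyInstances``, which also handles the
# cache file and instance (de)serialization), such a wrapper can look like this:
from typing import Callable, Iterable, Iterator


class LazyInstancesSketch(Iterable):
    """Re-invokes an instance generator on every ``__iter__`` call, so the result of ``read()``
    stays repeatedly iterable without holding all instances in memory."""

    def __init__(self, instance_generator: Callable[[], Iterator]) -> None:
        self.instance_generator = instance_generator

    def __iter__(self) -> Iterator:
        instances = self.instance_generator()
        if isinstance(instances, list):
            # The real code raises ConfigurationError here; TypeError keeps the sketch self-contained.
            raise TypeError("For a lazy dataset reader, _read() must return a generator, not a list.")
        return instances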
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, output_file: str = None) -> Dict[str, Any]: model.eval() iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) with ExitStack() as stack: if output_file is None: file_handle = None else: file_handle = stack.enter_context(open(output_file, 'w')) for batch in generator_tqdm: model_output = model(**batch) metrics = model.get_metrics() if file_handle: id2label = model.vocab.get_index_to_token_vocabulary("labels") _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label) description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||" generator_tqdm.set_description(description) return model.get_metrics()
def re_read_embeddings_from_text_file(file_uri, embedding_dim, vocab, namespace):
    """
    Re-reads a pretrained embeddings file and returns the indices of vocabulary rows for which a
    pretrained vector of the right dimension was found (i.e. the rows that should not be optimized).
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Skip lines whose dimensionality does not match (e.g. header lines or tokens
                    # containing unicode spaces).
                    continue
                vector = np.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    rows_not_to_optimize = []
    for i in range(vocab_size):
        token = index_to_token[i]
        if token in embeddings:
            rows_not_to_optimize.append(i)
    return rows_not_to_optimize
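# The row indices returned above are presumably used to keep the pretrained vectors fixed while
# the remaining (randomly initialised) rows keep training. A minimal, illustrative sketch of one
# way to do that with a gradient hook -- ``embedding`` and ``rows_not_to_optimize`` are assumed
# to exist and the helper is not part of the original code:
import torch


def freeze_pretrained_rows(embedding: torch.nn.Embedding, rows_not_to_optimize) -> None:
    frozen_mask = torch.zeros(embedding.weight.shape[0], 1)
    frozen_mask[rows_not_to_optimize] = 1.0

    def zero_out_frozen_rows(grad: torch.Tensor) -> torch.Tensor:
        # Rows flagged as frozen receive a zero gradient; all other rows are untouched.
        return grad * (1.0 - frozen_mask).to(grad.device)

    embedding.weight.register_hook(zero_out_frozen_rows)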
def __init__(self, glove_path: str, embedding_dim: int, trainable: bool = False) -> None: super(GloveContextualizer, self).__init__() self.embedding_dim = embedding_dim self.trainable = trainable # Read the GloVe file, and produce a dictionary of tokens to indices, a dictionary # of indices to tokens, and a PyTorch Embedding object. self.token_to_idx = {DEFAULT_OOV_TOKEN: 0} self.idx_to_token = {0: DEFAULT_OOV_TOKEN} # First we read the embeddings from the file, only keeping vectors for the words we need. logger.info("Reading pretrained embeddings from file") embeddings = {} with EmbeddingsTextFile(glove_path) as embeddings_file: for line in Tqdm.tqdm(embeddings_file): token = line.split(' ', 1)[0] fields = line.rstrip().split(' ') if len(fields) - 1 != self.embedding_dim: # Sometimes there are funny unicode parsing problems that lead to different # fields lengths (e.g., a word with a unicode space character that splits # into more than one column). We skip those lines. Note that if you have # some kind of long header, this could result in all of your lines getting # skipped. It's hard to check for that here; you just have to look in the # embedding_misses_file and at the model summary to make sure things look # like they are supposed to. logger.warning( "Found line with wrong number of dimensions (expected: %d; actual: %d): %s", self.embedding_dim, len(fields) - 1, line) continue vector = numpy.asarray(fields[1:], dtype='float32') embeddings[token] = vector self.token_to_idx[token] = len(self.token_to_idx) self.idx_to_token[len(self.idx_to_token)] = token if not embeddings: raise ConfigurationError( "No embeddings of correct dimension found; you probably " "misspecified your embedding_dim parameter, or didn't " "pre-populate your Vocabulary") all_embeddings = numpy.asarray(list(embeddings.values())) embeddings_mean = float(numpy.mean(all_embeddings)) embeddings_std = float(numpy.std(all_embeddings)) # Now we initialize the weight matrix for an embedding layer, starting with random vectors, # then filling in the word vectors we just read. vocab_size = len(self.token_to_idx) logger.info("Initializing pre-trained embedding layer") embedding_matrix = torch.FloatTensor(vocab_size, self.embedding_dim).normal_( embeddings_mean, embeddings_std) # Start at 1, since the 0th token is OOV, and fill in the embedding matrix for i in range(1, vocab_size): embedding_matrix[i] = torch.FloatTensor( embeddings[self.idx_to_token[i]]) self.weight = torch.nn.Parameter(embedding_matrix, requires_grad=self.trainable)
def read(self, file_path: str) -> Iterable[Instance]: """ Returns an ``Iterable`` containing all the instances in the specified dataset. If ``self.lazy`` is False, this calls ``self._read()``, ensures that the result is a list, then returns the resulting list. If ``self.lazy`` is True, this returns an object whose ``__iter__`` method calls ``self._read()`` each iteration. In this case your implementation of ``_read()`` must also be lazy (that is, not load all instances into memory at once), otherwise you will get a ``ConfigurationError``. In either case, the returned ``Iterable`` can be iterated over multiple times. It's unlikely you want to override this function, but if you do your result should likewise be repeatedly iterable. """ lazy = getattr(self, 'lazy', None) if lazy is None: logger.warning("DatasetReader.lazy is not set, " "did you forget to call the superclass constructor?") if lazy: return _LazyInstances(lambda: iter(self._read(file_path))) else: instances = self._read(file_path) if not isinstance(instances, list): instances = [instance for instance in Tqdm.tqdm(instances)] if not instances: raise ConfigurationError("No instances were read from the given filepath {}. " "Is the path correct?".format(file_path)) return instances
def train_one_epoch(self) -> None: """ Trains the model for a single epoch. Fires off the events EPOCH_START and EPOCH_END, and repeatedly calls self.train_one_batch(). """ self.handler.fire_event(Events.EPOCH_START) self.train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() self.last_log = time.time() logger.info("Training") self.batches_this_epoch = 0 batches_tqdm = Tqdm.tqdm(self.training_batches, total=self.num_training_batches) for self.batch in batches_tqdm: description = self.train_one_batch(self.batch) batches_tqdm.set_description(description, refresh=False) self.handler.fire_event(Events.VALIDATE) self.handler.fire_event(Events.EPOCH_END)
def read(self, url: str, query: str, *args) -> Iterable[Instance]:
    """
    Returns an ``Iterable`` containing all the instances in the specified dataset.

    If ``self.lazy`` is False, this calls ``self._read()``, ensures that the result is a list,
    then returns the resulting list.

    If ``self.lazy`` is True, this returns an object whose ``__iter__`` method calls
    ``self._read()`` each iteration. In this case your implementation of ``_read()`` must also
    be lazy (that is, not load all instances into memory at once), otherwise you will get a
    ``ConfigurationError``.

    In either case, the returned ``Iterable`` can be iterated over multiple times. It's
    unlikely you want to override this function, but if you do your result should likewise be
    repeatedly iterable.
    """
    lazy = getattr(self, 'lazy', None)
    if lazy is None:
        logger.warning("DatasetReader.lazy is not set, "
                       "did you forget to call the superclass constructor?")
    if lazy:
        return _LazyInstances(lambda: iter(self._read(url=url, query=query, *args)))
    else:
        instances = self._read(url=url, query=query, *args)
        if not isinstance(instances, list):
            instances = [instance for instance in Tqdm.tqdm(instances)]
        if not instances:
            raise ConfigurationError("No instances were read for query {} against {}. "
                                     "Are the URL and query correct?".format(query, url))
        return instances
def evaluate(model: Model, instances: Iterable[Instance], data_iterator: DataIterator, cuda_device: int, output_file: str = None, eval_type: str = None) -> Dict[str, Any]: model.eval() iterator = data_iterator(instances, num_epochs=1) logger.info("Iterating over dataset") generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances)) with ExitStack() as stack: if output_file is None: file_handle = None else: file_handle = stack.enter_context(open(output_file, 'w')) for batch in generator_tqdm: ## made cuda compatible (if needed) batch = move_to_device(batch, cuda_device) model_output = model(**batch) metrics = model.get_metrics() if file_handle: _persist_data(file_handle, batch.get("metadata"), model_output, eval_type) description = ', '.join([ "%s: %.2f" % (name, value) for name, value in metrics.items() ]) + " ||" generator_tqdm.set_description(description) return model.get_metrics(reset=True)
def main(_): process_flags() if FLAGS.manual_seed: set_manual_seeds(FLAGS.manual_seed) # Create folders and files to store results and configs run_dir = Path(FLAGS.output_folder, FLAGS.run_name) if not os.path.exists(run_dir): os.makedirs(run_dir) # Logging log_fh = log.FileHandler(Path(run_dir, 'log.log')) log_fmt = log.Formatter("%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p") log_fh.setFormatter(log_fmt) log.getLogger().addHandler(log_fh) #Store the run description, if any if FLAGS.description: with open(Path(run_dir,'description.txt'),'w') as f: f.write(FLAGS.description) log.info(f'DESCRIPTION: {FLAGS.description}') # Store configuration in same folder as logs and model flagfile = Path(run_dir, 'flagfile.txt') if os.path.exists(flagfile): os.remove(flagfile) open(flagfile, "x") FLAGS.append_flags_into_file(flagfile) if FLAGS.old_pretrain_data: data_dict = get_data_dict_old() else: data_dict = get_data_dict() train_dataset, test_dataset, val_dataset = (data_dict[key] for key in ('train', 'test', 'val')) model = MLMModelWrapper(MODEL_MAPPING[FLAGS.model]) distributed_wrapper(train,model, run_dir, train_dataset, val_dataset) model.cuda(FLAGS.device_idxs[0]) log.info("Evaluating pretraining performance on test split") test_loader = get_loader(test_dataset) model.eval() batch_generator = iter(test_loader) batch_generator = Tqdm.tqdm( batch_generator) total_metrics = {} with torch.no_grad(): for i, batch in enumerate(batch_generator): batch = move_to_device(batch, FLAGS.device_idxs[0]) if isinstance(batch, torch.Tensor): model(batch) else: model(**batch) if i == 0: total_metrics = model.get_metrics() else: total_metrics = {m: total_metrics[m] + model.get_metrics()[m] for m in total_metrics.keys()} average_metrics = {k: v/(i+1) for k,v in total_metrics.items()} log.info(f"Average test metrics:{average_metrics}")
def _instances_to_cache_file(self, cache_filename, instances) -> None: # We serialize to a temp file first in case anything goes wrong while # writing to cache (e.g., the computer shuts down unexpectedly). # Then we just copy the file over to `cache_filename`. with CacheFile(cache_filename, mode="w+") as cache_handle: logger.info("Caching instances to temp file %s", cache_handle.name) for instance in Tqdm.tqdm(instances, desc="caching instances"): cache_handle.write(self.serialize_instance(instance) + "\n")
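# ``CacheFile`` above writes to a temporary file first and only moves it into place when writing
# finished without errors. A self-contained sketch of such a context manager (illustrative only;
# the real AllenNLP ``CacheFile`` has more behaviour than this):
import os
import tempfile


class AtomicCacheFile:
    """Write to a temp file; on a clean exit, atomically replace ``cache_filename`` with it."""

    def __init__(self, cache_filename: str, mode: str = "w+") -> None:
        self.cache_filename = cache_filename
        self.temp_file = tempfile.NamedTemporaryFile(
            mode, dir=os.path.dirname(cache_filename) or ".", delete=False
        )
        self.name = self.temp_file.name

    def __enter__(self):
        return self.temp_file

    def __exit__(self, exc_type, exc_value, traceback):
        self.temp_file.close()
        if exc_type is None:
            os.replace(self.temp_file.name, self.cache_filename)
        else:
            # Something went wrong: leave any existing cache untouched and clean up the temp file.
            os.remove(self.temp_file.name)
        return False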
def _compute_metrics(self, data_loader: DataLoader) -> Iterable[Tuple[str, Any]]:
    with torch.no_grad():
        # We iterate in batches because the dataset may not fit in memory (the iterable was set
        # up to yield batches) and the model estimator may support `partial_fit`. If the
        # estimator does not support it, it will raise an exception that the user can see.
        for tensor_dict_batch in Tqdm.tqdm(data_loader):
            self.model(**tensor_dict_batch)
    return self.model.get_metrics(reset=True).items()  # noqa
def _validation_loss(self) -> Tuple[float, int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating") self._pytorch_model.eval() # Replace parameter values with the shadow values from the moving averages. if self._moving_average is not None: self._moving_average.assign_average_value() if self._validation_iterator is not None: val_iterator = self._validation_iterator else: val_iterator = self.iterator val_generator = val_iterator(self._validation_data, num_epochs=1, shuffle=False) num_validation_batches = val_iterator.get_num_batches( self._validation_data) val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 val_loss = 0 for batch in val_generator_tqdm: loss = self.batch_loss(batch, for_training=False) if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is # currently only used as the divisor for the loss function, so we can safely only # count those batches for which we actually have a loss. If this variable ever # gets used for something else, we might need to change things around a bit. batches_this_epoch += 1 val_loss += loss.detach().cpu().numpy() # Update the description with the latest metrics val_metrics = training_util.get_metrics( self.model, val_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) description = training_util.description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) # Now restore the original parameter values. if self._moving_average is not None: self._moving_average.restore() return val_loss, batches_this_epoch
def _calculate_uncertainty_batch(self, batch: InstanceBatch, progress_bar: Tqdm = None) -> Dict[str, List[Any]]:
    uncertainty_df = defaultdict(list)
    ids, predictions, labels = batch
    for idx, prediction, label in zip(ids, predictions, labels):
        for w, word in enumerate(prediction['words']):
            for model in self.predictor._model.all_model_keys:
                tag_mean_probability = prediction[f'{model}_class_probabilities'][w]
                tag_std_probability = prediction[f'{model}_class_prob_std'][w]
                actual_label_idx = label[w]
                predicted_label_idx = np.argmax(tag_mean_probability)
                uncertainty_df['instance_id'].append(idx)
                uncertainty_df['word_id'].append(w)
                uncertainty_df['model'].append(model)
                uncertainty_df['word'].append(word)
                uncertainty_df['actual_tag'].append(
                    self.predictor._model.vocab.get_token_from_index(
                        actual_label_idx, namespace=self.predictor._model.label_namespace
                    )
                )
                uncertainty_df['predicted_tag'].append(
                    self.predictor._model.vocab.get_token_from_index(
                        predicted_label_idx, namespace=self.predictor._model.label_namespace
                    )
                )
                uncertainty_df['actual_confidence_mean'].append(tag_mean_probability[actual_label_idx])
                uncertainty_df['actual_confidence_std'].append(tag_std_probability[actual_label_idx])
                uncertainty_df['predicted_confidence_mean'].append(tag_mean_probability[predicted_label_idx])
                uncertainty_df['predicted_confidence_std'].append(tag_std_probability[predicted_label_idx])
                uncertainty_df['mean_probability_distribution'].append(tag_mean_probability)
        progress_bar.update(1)
    return uncertainty_df
def read(self, file_path, max_length=None):
    """
    Reads the data from a json file and converts it into ``Instances`` of tokens and a
    ``LabelField``. See the ``_read()`` method for details.
    """
    instances = self._read(file_path, max_length)
    if not isinstance(instances, list):
        instances = [instance for instance in Tqdm.tqdm(instances)]
    return instances
def _calculate_feature_importance_batch(self, batch: InstanceBatch, progress_bar: Tqdm = None) -> Dict[str, List[Any]]:
    feature_importance_df = defaultdict(list)
    ids, labeled_batch, actual_labels = batch
    batch_text = [[li[fn].tokens for fn in self.field_names] for li in labeled_batch]
    fields = [list(self.field_names) for _ in range(len(labeled_batch))]
    predicted_labels = [li['label'].label for li in labeled_batch]
    seed = [self.seed for _ in range(len(labeled_batch))]
    for interpreter in self.feature_importance_interpreters + self.attention_interpreters:
        if progress_bar:
            progress_bar.set_description(f"{interpreter.id}: interpreting {len(labeled_batch)} instances")
        # Some feature importance measures are too memory-intensive to run with larger batch sizes.
        # These numbers are based on empirical tests with a standard 16GB GPU.
        if 'shap' in interpreter.id or 'deep' in interpreter.id or 'intgrad' in interpreter.id:
            batch_scores = []
            for sub_batch in utils.batch(labeled_batch, 2):
                batch_scores.extend(interpreter.saliency_interpret_instances(sub_batch).values())
        else:
            batch_scores = interpreter.saliency_interpret_instances(labeled_batch).values()
        # There can be more than one array of scores for an instance (e.g. in the pair sequence case).
        scores = [[np.asarray(scoreset) for scoreset in v.values()] for v in batch_scores]
        feature_importance_df['scores'].extend(scores)
        feature_importance_df['seed'].extend(seed)
        feature_importance_df['instance_id'].extend(ids)
        feature_importance_df['instance_text'].extend(batch_text)
        feature_importance_df['instance_fields'].extend(fields)
        feature_importance_df['feature_importance_measure'].extend([interpreter.id for _ in range(len(labeled_batch))])
        feature_importance_df['predicted'].extend(predicted_labels)
        feature_importance_df['actual'].extend(actual_labels)
        if progress_bar:
            progress_bar.update(1)
    return feature_importance_df
def calculate_correlation(self, force: bool = False) -> None: pkl_exists = os.path.isfile(os.path.join(self.serialization_dir, 'correlation.pkl')) if pkl_exists and not force: self.logger.info("Correlations exist and force was not specified. Loading from disk...") self.correlation_results = pd.read_pickle(os.path.join(self.serialization_dir, 'correlation.pkl')) else: correlation_df = defaultdict(list) self.logger.info('Calculating correlations...') progress_bar = Tqdm.tqdm(total=len(self.correlation_combos)) # We need to compare combinations with at least one attention interpreter first to get the k_values # for an apples to apples comparison with combinations where both interpreters are # feature importance measures unfair_k = defaultdict(lambda: defaultdict(list)) for (key1, key2) in self.correlation_combos: if 'attn' in key1 or 'attn' in key2: correlations, unfair_k_values = self._calculate_correlation_combo(key1, key2) for key, values in correlations.items(): correlation_df[key].extend(values) for measure, k in unfair_k_values.items(): unfair_k[key1][measure].extend(k) unfair_k[key2][measure].extend(k) progress_bar.update(1) # Now we can compare the feature importance measures to each other for (key1, key2) in self.correlation_combos: if 'attn' not in key1 and 'attn' not in key2: correlation_kwargs = defaultdict(list) # Unfair k strategy: take the average k used for each key for name, k_values in unfair_k.get(key1, {}).items(): correlation_kwargs[name].extend(k_values) for name, k_values in unfair_k.get(key2, {}).items(): correlation_kwargs[name].extend(k_values) for name, k_values in correlation_kwargs.items(): correlation_kwargs[name] = {"k": math.floor(statistics.mean(k_values))} correlations, _ = self._calculate_correlation_combo(key1, key2, correlation_kwargs=correlation_kwargs) for k, v in correlations.items(): correlation_df[k].extend(v) progress_bar.update(1) self.correlation_results = pd.DataFrame(correlation_df) utils.write_frame(self.correlation_results, self.serialization_dir, 'correlation')
def preprocess(filename, name): output_file = open(os.path.join(args.output, name + '.pickle'), 'wb') with open(filename) as file: docs = [] summs = [] for line in Tqdm.tqdm(file): doc, summ = line.strip().split('\t') docs.append(nlp(doc)) summs.append(nlp(summ)) docs_spacy = docs summs_spacy = summs dataset = {'docs': docs_spacy, 'summs': summs_spacy} pickle.dump(dataset, output_file) output_file.close()
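# Loading the pickled dataset back is symmetric with the writer above. A small helper
# (illustrative; the path argument is an assumption, not part of the original script):
import pickle


def load_preprocessed(path: str):
    with open(path, "rb") as handle:
        dataset = pickle.load(handle)
    return dataset["docs"], dataset["summs"]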
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> Dict[str, numpy.ndarray]:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical tokens does not match ``embedding_dim`` raise a warning and
    are skipped. This variant returns the dictionary mapping each kept token to its vector,
    rather than a full embedding matrix. The remainder of the docstring is identical to
    ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue
                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    return embeddings
def calculate_uncertainty(self, force: bool = False) -> None: pkl_exists = os.path.isfile(os.path.join(self.serialization_dir, 'uncertainty.pkl')) if pkl_exists and not force: self.logger.info("Uncertainty data exists and force was not specified. Loading from disk...") self.results = pd.read_pickle(os.path.join(self.serialization_dir, 'uncertainty.pkl')) else: uncertainty_df = defaultdict(list) self.logger.info('Calculating uncertainty...') progress_bar = Tqdm.tqdm(total=self.num_instances) for batch in self.dataset: uncertainty_scores = self._calculate_uncertainty_batch(batch, progress_bar) for k, v in uncertainty_scores.items(): uncertainty_df[k].extend(v) self.results = pd.DataFrame(uncertainty_df) utils.write_frame(self.results, self.serialization_dir, 'uncertainty')
def get_iter_norm_mean_eval(
    model: Model, data_loader: DataLoader, mean: torch.Tensor, cuda_device: int = -1
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    # Parameters

    model : `Model`
        The model to evaluate.
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    mean : `torch.Tensor`
        The current mean embedding, passed through to `model.forward_embeddings`.
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation. The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.

    # Returns

    A tuple of the mean embedding over the whole dataset and the concatenated per-batch embeddings.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        embeddings = []
        for batch in generator_tqdm:
            batch = nn_util.move_to_device(batch, cuda_device)
            batch_embeddings = model.forward_embeddings(batch['words'], mean)
            embeddings.append(batch_embeddings)

        embeddings = torch.cat(embeddings, dim=0)
        return embeddings.mean(dim=0), embeddings
def calculate_feature_importance(self, force: bool = False) -> None: pkl_exists = os.path.isfile(os.path.join(self.serialization_dir, 'feature_importance.pkl')) if pkl_exists and not force: self.logger.info("Feature importance scores exist and force was not specified. Loading from disk...") self.feature_importance_results = pd.read_pickle(os.path.join(self.serialization_dir, 'feature_importance.pkl')) else: feature_importance_df = defaultdict(list) self.logger.info('Calculating feature importance scores...') num_interpreters = len(self.feature_importance_interpreters) + len(self.attention_interpreters) progress_bar = Tqdm.tqdm(total=self.num_batches * num_interpreters) for batch in self.dataset: importance_scores = self._calculate_feature_importance_batch(batch, progress_bar) for k, v in importance_scores.items(): feature_importance_df[k].extend(v) self.feature_importance_results = pd.DataFrame(feature_importance_df) utils.write_frame(self.feature_importance_results, self.serialization_dir, 'feature_importance')
def _read_embeddings_from_text_file( file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens") -> torch.FloatTensor: """ Read pre-trained word vectors from an eventually compressed text file, possibly contained inside an archive with multiple files. The text file is assumed to be utf-8 encoded with space-separated fields: [word] [dim 1] [dim 2] ... Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped. The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``. """ tokens_to_keep = set( vocab.get_index_to_token_vocabulary(namespace).values()) vocab_size = vocab.get_vocab_size(namespace) embeddings = {} # First we read the embeddings from the file, only keeping vectors for the words we need. logger.info("Reading pretrained embeddings from file") with EmbeddingsTextFile(file_uri) as embeddings_file: for line in Tqdm.tqdm(embeddings_file): token = line.split(' ', 1)[0] if token in tokens_to_keep: fields = line.rstrip().split(' ') if len(fields) - 1 != embedding_dim: # Sometimes there are funny unicode parsing problems that lead to different # fields lengths (e.g., a word with a unicode space character that splits # into more than one column). We skip those lines. Note that if you have # some kind of long header, this could result in all of your lines getting # skipped. It's hard to check for that here; you just have to look in the # embedding_misses_file and at the model summary to make sure things look # like they are supposed to. logger.warning( "Found line with wrong number of dimensions (expected: %d; actual: %d): %s", embedding_dim, len(fields) - 1, line) continue vector = numpy.asarray(fields[1:], dtype='float32') embeddings[token] = vector if not embeddings: raise ConfigurationError( "No embeddings of correct dimension found; you probably " "misspecified your embedding_dim parameter, or didn't " "pre-populate your Vocabulary") all_embeddings = numpy.asarray(list(embeddings.values())) embeddings_mean = float(numpy.mean(all_embeddings)) embeddings_std = float(numpy.std(all_embeddings)) # Now we initialize the weight matrix for an embedding layer, starting with random vectors, # then filling in the word vectors we just read. logger.info("Initializing pre-trained embedding layer") embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_( embeddings_mean, embeddings_std) num_tokens_found = 0 index_to_token = vocab.get_index_to_token_vocabulary(namespace) for i in range(vocab_size): token = index_to_token[i] # If we don't have a pre-trained vector for this word, we'll just leave this row alone, # so the word has a random initialization. if token in embeddings: embedding_matrix[i] = torch.FloatTensor(embeddings[token]) num_tokens_found += 1 else: logger.debug( "Token %s was not found in the embedding file. Initialising randomly.", token) logger.info("Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size) return embedding_matrix
def _validation_loss(self, epoch: int) -> Tuple[float, float, int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating") self._pytorch_model.eval() # Replace parameter values with the shadow values from the moving averages. if self._moving_average is not None: self._moving_average.assign_average_value() if self._validation_data_loader is not None: validation_data_loader = self._validation_data_loader else: raise ConfigurationError( "Validation results cannot be calculated without a validation_data_loader" ) val_generator_tqdm = Tqdm.tqdm(validation_data_loader) batches_this_epoch = 0 val_loss = 0 val_reg_loss = 0 done_early = False for batch in val_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing validation early! " "This implies that there is an imbalance in your validation " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batch_outputs = self.batch_outputs(batch, for_training=False) loss = batch_outputs.get("loss") reg_loss = batch_outputs.get("reg_loss") if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is # currently only used as the divisor for the loss function, so we can safely only # count those batches for which we actually have a loss. If this variable ever # gets used for something else, we might need to change things around a bit. batches_this_epoch += 1 val_loss += loss.detach().cpu().numpy() if reg_loss is not None: val_reg_loss += reg_loss.detach().cpu().numpy() # Update the description with the latest metrics val_metrics = training_util.get_metrics( self.model, val_loss, val_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) description = training_util.description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) if self._master: for callback in self._batch_callbacks: callback( self, [batch], [batch_outputs], epoch, batches_this_epoch, is_training=False, ) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)." ) # Indicate that we're done so that any workers that have remaining data stop validation early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Now restore the original parameter values. if self._moving_average is not None: self._moving_average.restore() return val_loss, val_reg_loss, batches_this_epoch
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 train_reg_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = iter(self.data_loader) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) logger.info("Training") num_training_batches = math.ceil( len(self.data_loader) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 done_early = False for batch_group in batch_group_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() batch_group_outputs = [] for batch in batch_group: batch_outputs = self.batch_outputs(batch, for_training=True) batch_group_outputs.append(batch_outputs) loss = batch_outputs["loss"] reg_loss = batch_outputs["reg_loss"] if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) reg_loss = reg_loss / len(batch_group) if self._opt_level is not None: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() train_reg_loss += reg_loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) param_updates = None if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # Get the magnitude of parameter updates for logging. We need to do some # computation before and after the optimizer step, and it's expensive because of # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so # we don't do this every batch, only when it's requested. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) self._tensorboard.log_batch(self.model, self.optimizer, batch_grad_norm, metrics, batch_group, param_updates) if self._master: self._checkpointer.maybe_save_checkpoint( self, epoch, batches_this_epoch) for callback in self._batch_callbacks: callback( self, batch_group, batch_group_outputs, epoch, batches_this_epoch, is_training=True, ) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
def _validation_loss_n_step(self, step: int) -> Tuple[float, Optional[float], int]: """ Computes the validation loss. Returns it and the number of batches. """ logger.info("Validating on %d steps" % step) self._pytorch_model.eval() # Replace parameter values with the shadow values from the moving averages. if self._moving_average is not None: self._moving_average.assign_average_value() if self._validation_data_loader is not None: validation_data_loader = self._validation_data_loader else: raise ConfigurationError( "Validation results cannot be calculated without a validation_data_loader" ) regularization_penalty = self.model.get_regularization_penalty() # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's # progress is shown val_batch_generator = iter(validation_data_loader) val_batch_group_generator = common_util.lazy_groups_of( val_batch_generator, 5 ) num_training_batches: Union[int, float] try: len_data_loader = len(validation_data_loader) num_training_batches = math.ceil( len_data_loader / 5 ) except TypeError: num_training_batches = float("inf") # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's # progress is shown if self._primary: val_generator_tqdm = Tqdm.tqdm( val_batch_group_generator, total=num_training_batches ) else: val_generator_tqdm = val_batch_group_generator batches_this_epoch = 0 val_loss = 0.0 val_reg_loss = None if regularization_penalty is None else 0.0 for val_batch_group in val_generator_tqdm: for val_batch in val_batch_group: with amp.autocast(self._use_amp): batches_this_epoch += 1 batch_outputs = self.batch_outputs(val_batch, for_training=False) loss = batch_outputs.get("loss") reg_loss = batch_outputs.get("reg_loss") if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is # currently only used as the divisor for the loss function, so we can safely only # count those batches for which we actually have a loss. If this variable ever # gets used for something else, we might need to change things around a bit. val_batch_loss = loss.item() val_loss += val_batch_loss if reg_loss is not None: val_batch_reg_loss = reg_loss.item() val_reg_loss += val_batch_reg_loss # type: ignore return val_loss, val_reg_loss, batches_this_epoch
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) cpu_memory_usage = [] for worker, memory in common_util.peak_cpu_memory().items(): cpu_memory_usage.append((worker, memory)) logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}") gpu_memory_usage = [] for gpu, memory in common_util.peak_gpu_memory().items(): gpu_memory_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}") regularization_penalty = self.model.get_regularization_penalty() train_loss = 0.0 batch_loss = 0.0 train_reg_loss = None if regularization_penalty is None else 0.0 batch_reg_loss = None if regularization_penalty is None else 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = iter(self.data_loader) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps ) logger.info("Training") num_training_batches: Union[int, float] try: len_data_loader = len(self.data_loader) num_training_batches = math.ceil( len_data_loader / self._num_gradient_accumulation_steps ) except TypeError: num_training_batches = float("inf") # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's # progress is shown if self._primary: batch_group_generator_tqdm = Tqdm.tqdm( batch_group_generator, total=num_training_batches ) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 done_early = False for batch_group in batch_group_generator_tqdm: if done_early: break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total # Zero gradients. # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()` # because it avoids a read op when the gradients are first updated below. for param_group in self.optimizer.param_groups: for p in param_group["params"]: p.grad = None batch_loss = 0.0 batch_group_outputs = [] for batch in batch_group: with amp.autocast(self._use_amp): batch_outputs = self.batch_outputs(batch, for_training=True) batch_group_outputs.append(batch_outputs) loss = batch_outputs["loss"] reg_loss = batch_outputs.get("reg_loss") if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) batch_loss += loss.item() if reg_loss is not None: reg_loss = reg_loss / len(batch_group) batch_reg_loss = reg_loss.item() train_reg_loss += batch_reg_loss # type: ignore if self._scaler is not None: self._scaler.scale(loss).backward() else: loss.backward() if len(batch_group_outputs) <= 0: continue train_loss += batch_loss batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. 
if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._scaler is not None: self._scaler.step(self.optimizer) self._scaler.update() else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batch_loss, batch_reg_loss, batches_this_epoch, world_size=self._world_size, cuda_device=self.cuda_device, ) if batch_num_total % self.val_loss_steps == 0: logger.info("%s: %.4f" % ('train_loss', train_loss / batches_this_epoch)) if self._validation_data_loader is not None: with torch.no_grad(): # We have a validation set, so compute all the metrics on it. val_loss, val_reg_loss, num_batches = self._validation_loss_n_step(batch_num_total) val_metrics = training_util.get_metrics( self.model, val_loss, val_reg_loss, num_batches=num_batches, batch_loss=None, batch_reg_loss=None, reset=True, world_size=self._world_size, cuda_device=self.cuda_device, ) # description = training_util.description_from_metrics(val_metrics) logger.info("%s: %.4f" % ('val_loss', val_loss / num_batches)) # batch_group_generator_tqdm.set_description(description, refresh=False) self._pytorch_model.train() if self._primary: # Updating tqdm only for the primary as the trainers wouldn't have one description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) if self._checkpointer is not None: self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) for callback in self._callbacks: callback.on_batch( self, batch_group, batch_group_outputs, metrics, epoch, batches_this_epoch, is_training=True, is_primary=self._primary, batch_grad_norm=batch_grad_norm, ) metrics = training_util.get_metrics( self.model, train_loss, train_reg_loss, batch_loss=None, batch_reg_loss=None, num_batches=batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=self.cuda_device, ) for (worker, memory) in cpu_memory_usage: metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024) for (gpu_num, memory) in gpu_memory_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024) return metrics
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs a training loop on the model using :class:`~allennlp.training.trainer.Trainer`,
    increasing the learning rate from ``start_lr`` to ``end_lr`` and recording the losses.

    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate at which to start the search.
    end_lr: ``float``
        The learning rate up to which the search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded so far by this
        factor. If ``None``, the search proceeds all the way to ``end_lr``.

    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Returns a list of learning rates and the corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate.
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    num_gpus = len(trainer._cuda_devices)  # pylint: disable=protected-access

    raw_train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    for i, batch_group in enumerate(train_generator_tqdm):
        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor ** i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch_group, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best):
            logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs a training loop on the model using :class:`~allennlp.training.trainer.Trainer`,
    increasing the learning rate from ``start_lr`` to ``end_lr`` and recording the losses.

    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate at which to start the search.
    end_lr: ``float``
        The learning rate up to which the search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        If ``True``, increase the learning rate linearly; otherwise increase it exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded so far by this
        factor. If ``None``, the search proceeds all the way to ``end_lr``.

    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Returns a list of learning rates and the corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate.
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle)
    train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    for i, batch in enumerate(train_generator_tqdm):
        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor ** i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best):
            logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
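# Both variants of ``search_learning_rate`` above return parallel lists of learning rates and
# losses. A common way to use them is to plot loss against learning rate on a log axis and pick
# a value just before the loss starts to diverge. A minimal sketch (assumes matplotlib is
# installed; the function and file names are illustrative, not part of the original code):
import matplotlib.pyplot as plt


def plot_learning_rate_search(learning_rates, losses, out_path: str = "lr_search.png") -> None:
    plt.figure()
    plt.plot(learning_rates, losses)
    plt.xscale("log")
    plt.xlabel("learning rate")
    plt.ylabel("loss")
    plt.savefig(out_path)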
parser.add_argument('--dataset-files', nargs='+', action='store', dest='dataset_files', help='file format <id>\t<sequence text>', required=True) args = parser.parse_args() # # load data & create vocab # ------------------------------- # loader = IrTupleDatasetReader(lazy=True,source_tokenizer=BlingFireTokenizer(),target_tokenizer=BlingFireTokenizer(),lowercase=args.lowercase) total_documents=0 all_tokens={} for file in args.dataset_files: for instance in Tqdm.tqdm(loader.read(file)): token_set = set([tok.text.lower() for tok in instance["target_tokens"].tokens]) for token_text in token_set: if token_text not in all_tokens: all_tokens[token_text]=0 all_tokens[token_text]+=1 total_documents += 1 with open(args.out_dir,"w",encoding="utf8") as out: for token,count in all_tokens.items(): out.write(token+" "+f'{math.log(total_documents/count):1.20f}'+"\n")
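# The script above writes one "<token> <idf>" pair per line, where idf = log(N / document_frequency).
# A small helper for loading that file back into a dictionary might look like this (illustrative
# only; the function name is an assumption):
def load_idf_file(path: str) -> dict:
    idf = {}
    with open(path, "r", encoding="utf8") as in_file:
        for line in in_file:
            token, value = line.rstrip("\n").rsplit(" ", 1)
            idf[token] = float(value)
    return idf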
def _train_epoch(self, epoch: int) -> Dict[str, float]: """ Trains one epoch and returns metrics. """ logger.info("Epoch %d/%d", epoch, self._num_epochs - 1) peak_cpu_usage = common_util.peak_memory_mb() logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}") gpu_usage = [] for gpu, memory in common_util.gpu_memory_mb().items(): gpu_usage.append((gpu, memory)) logger.info(f"GPU {gpu} memory usage MB: {memory}") train_loss = 0.0 # Set the model to "train" mode. self._pytorch_model.train() # Get tqdm for the training batches batch_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) batch_group_generator = common_util.lazy_groups_of( batch_generator, self._num_gradient_accumulation_steps) num_training_batches = math.ceil( self.iterator.get_num_batches(self.train_data) / self._num_gradient_accumulation_steps) # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's # progress is shown if self._master: batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator, total=num_training_batches) else: batch_group_generator_tqdm = batch_group_generator self._last_log = time.time() last_save_time = time.time() batches_this_epoch = 0 if self._batch_num_total is None: self._batch_num_total = 0 histogram_parameters = set( self.model.get_parameters_for_histogram_tensorboard_logging()) logger.info("Training") cumulative_batch_group_size = 0 done_early = False for batch_group in batch_group_generator_tqdm: if self._distributed: # Check whether the other workers have stopped already (due to differing amounts of # data in each). If so, we can't proceed because we would hang when we hit the # barrier implicit in Model.forward. We use a IntTensor instead a BoolTensor # here because NCCL process groups apparently don't support BoolTensor. done = torch.tensor(0, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) if done.item() > 0: done_early = True logger.warning( f"Worker {torch.distributed.get_rank()} finishing training early! " "This implies that there is an imbalance in your training " "data across the workers and that some amount of it will be " "ignored. A small amount of this is fine, but a major imbalance " "should be avoided. Note: This warning will appear unless your " "data is perfectly balanced.") break batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() for batch in batch_group: loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") loss = loss / len(batch_group) loss.backward() train_loss += loss.item() batch_grad_norm = self.rescale_gradients() # This does nothing if batch_num_total is None or you are using a # scheduler which doesn't update per batch. if self._learning_rate_scheduler: self._learning_rate_scheduler.step_batch(batch_num_total) if self._momentum_scheduler: self._momentum_scheduler.step_batch(batch_num_total) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: # get the magnitude of parameter updates for logging # We need a copy of current parameters to compute magnitude of updates, # and copy them to CPU so large models won't go OOM on the GPU. 
param_updates = { name: param.detach().cpu().clone() for name, param in self.model.named_parameters() } self.optimizer.step() for name, param in self.model.named_parameters(): param_updates[name].sub_(param.detach().cpu()) update_norm = torch.norm(param_updates[name].view(-1)) param_norm = torch.norm(param.view(-1)).cpu() self._tensorboard.add_train_scalar( "gradient_update/" + name, update_norm / (param_norm + 1e-7)) else: self.optimizer.step() # Update moving averages if self._moving_average is not None: self._moving_average.apply(batch_num_total) # Update the description with the latest metrics metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, world_size=self._world_size, cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one if self._master: description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) # Log parameter values to Tensorboard (only from the master) if self._tensorboard.should_log_this_batch() and self._master: self._tensorboard.log_parameter_and_gradient_statistics( self.model, batch_grad_norm) self._tensorboard.log_learning_rates(self.model, self.optimizer) self._tensorboard.add_train_scalar("loss/loss_train", metrics["loss"]) self._tensorboard.log_metrics( {"epoch_metrics/" + k: v for k, v in metrics.items()}) if self._tensorboard.should_log_histograms_this_batch( ) and self._master: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: batch_group_size = sum( training_util.get_batch_size(batch) for batch in batch_group) cumulative_batch_group_size += batch_group_size if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_group_size / batches_this_epoch logger.info( f"current batch size: {batch_group_size} mean batch size: {average}" ) self._tensorboard.add_train_scalar("current_batch_size", batch_group_size) self._tensorboard.add_train_scalar("mean_batch_size", average) # Save model if needed. if (self._model_save_interval is not None and (time.time() - last_save_time > self._model_save_interval) and self._master): last_save_time = time.time() self._save_checkpoint("{0}.{1}".format( epoch, training_util.time_to_str(int(last_save_time)))) if self._distributed and not done_early: logger.warning( f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)." ) # Indicate that we're done so that any workers that have remaining data stop the epoch early. done = torch.tensor(1, device=self.cuda_device) torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM) assert done.item() # Let all workers finish their epoch before computing # the final statistics for the epoch. if self._distributed: dist.barrier() metrics = training_util.get_metrics( self.model, train_loss, batches_this_epoch, reset=True, world_size=self._world_size, cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory return metrics
loader = IrTupleDatasetReader(lazy=True, lowercase=True)
vocab = Vocabulary.from_files(args.vocab)

if args.qrel:
    qrels = load_reference(args.qrel)

not_judged = 0
oov_queries = 0
non_oov_queries = 0
oov_count_list = []

instances = loader.read(args.query)
with open(args.out_file_oov, "w", encoding="utf8") as out_file_oov:
    with open(args.out_file_no_oov, "w", encoding="utf8") as out_file_non_oov:
        for i in Tqdm.tqdm(instances):
            id_str = i["source_tokens"].tokens[0].text
            if args.qrel and int(id_str) not in qrels:
                not_judged += 1
                continue

            i.index_fields(vocab)
            indexes = i["target_tokens"]._indexed_tokens["tokens"]
            if 1 in indexes:
                # We have an OOV query.
                oov_queries += 1
                oov_count_list.append(sum(1 for t in indexes if t == 1))
                out_file_oov.write(id_str + "\t" + " ".join(
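# A toy illustration (deliberately not the AllenNLP API) of the OOV check in the snippet
# above: tokens are mapped to indices with a plain dict, unknown tokens fall back to
# index 1, and a query is flagged as OOV if any of its tokens received that index.
# The vocabulary and queries below are made-up placeholders.
token_to_index = {"@@PADDING@@": 0, "@@UNKNOWN@@": 1, "neural": 2, "ranking": 3, "model": 4}

def count_oov(query_tokens):
    # Any token missing from the vocabulary maps to the unknown index (1 here).
    indexes = [token_to_index.get(t, 1) for t in query_tokens]
    return sum(1 for idx in indexes if idx == 1)

queries = {"q1": ["neural", "ranking"], "q2": ["bm25", "baseline"]}
for query_id, tokens in queries.items():
    print(query_id, "OOV tokens:", count_oov(tokens))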
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we
        slow down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during
        the middle of a run. For continuing training a model on new data, see the
        ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(
        os.path.join(serialization_dir, "stdout.log"),  # type: ignore
        sys.stdout,
        file_friendly_logging)
    sys.stderr = TeeLogger(
        os.path.join(serialization_dir, "stderr.log"),  # type: ignore
        sys.stderr,
        file_friendly_logging)
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up the results.
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model, test_data, iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info("To evaluate on the test set after training, pass the "
                    "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a text file that may be compressed, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical fields does not match ``embedding_dim`` raise a warning
    and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))

    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.",
                         token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
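# A minimal sketch of the initialization strategy used above: rows for tokens found in
# the pretrained file are copied in, every other row keeps a random draw from a normal
# distribution with the mean and std of the pretrained vectors. The vocabulary and the
# single pretrained vector below are toy values, not read from a real embedding file.
import numpy
import torch

embedding_dim = 3
index_to_token = {0: "@@PADDING@@", 1: "@@UNKNOWN@@", 2: "cat", 3: "dog"}
pretrained = {"cat": numpy.array([0.1, 0.2, 0.3], dtype="float32")}

all_vectors = numpy.asarray(list(pretrained.values()))
mean, std = float(all_vectors.mean()), float(all_vectors.std())

# Start from random rows, then overwrite the rows we have pretrained vectors for.
embedding_matrix = torch.FloatTensor(len(index_to_token), embedding_dim).normal_(mean, std)
for i, token in index_to_token.items():
    if token in pretrained:
        embedding_matrix[i] = torch.FloatTensor(pretrained[token])

print(embedding_matrix.shape)  # torch.Size([4, 3])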