Code example #1
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)

        if lazy is None:
            logger.warning(
                "DatasetReader.lazy is not set, "
                "did you forget to call the superclass constructor?")

        if self._cache_directory:
            cache_file = self._get_cache_location_for_file_path(file_path)
        else:
            cache_file = None

        if lazy:
            return _LazyInstances(lambda: self._read(file_path), cache_file,
                                  self.deserialize_instance,
                                  self.serialize_instance)
        else:
            # First we read the instances, either from a cache or from the original file.
            if cache_file and os.path.exists(cache_file):
                instances = self._instances_from_cache_file(cache_file)
            else:
                instances = self._read(file_path)

            # Then some validation.
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError(
                    "No instances were read from the given filepath {}. "
                    "Is the path correct?".format(file_path))

            # And finally we write to the cache if we need to.
            if cache_file and not os.path.exists(cache_file):
                logger.info(f"Caching instances to {cache_file}")
                with open(cache_file, 'w') as cache:
                    for instance in Tqdm.tqdm(instances):
                        cache.write(self.serialize_instance(instance) + '\n')
            return instances
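A minimal sketch of a DatasetReader subclass that this read() method would drive. The AllenNLP 0.8/0.9-era imports are assumptions based on that API generation, and the JSON-lines data format (a "text" and a "label" field per line) is hypothetical:

import json
from typing import Iterable

from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer


class JsonLinesReader(DatasetReader):
    def __init__(self, lazy: bool = False) -> None:
        super().__init__(lazy=lazy)  # sets self.lazy, which read() checks
        self._tokenizer = WordTokenizer()
        self._indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str) -> Iterable[Instance]:
        # Yielding (rather than building a list) keeps this lazy-compatible.
        with open(file_path) as f:
            for line in f:
                example = json.loads(line)
                tokens = self._tokenizer.tokenize(example["text"])
                yield Instance({
                    "tokens": TextField(tokens, self._indexers),
                    "label": LabelField(example["label"]),
                })


# read() returns a list when lazy=False, and a re-iterable
# _LazyInstances wrapper when lazy=True.
reader = JsonLinesReader(lazy=True)
instances = reader.read("data/train.jsonl")  # hypothetical path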
Code example #2
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1, cuda_device=cuda_device)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
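A hedged usage sketch for this evaluate() signature, assuming the AllenNLP 0.8/0.9-era BasicIterator; the model, validation_instances, and output file name are placeholders:

from allennlp.data.iterators import BasicIterator

iterator = BasicIterator(batch_size=32)
iterator.index_with(model.vocab)  # the iterator needs the vocabulary to tensorize instances
metrics = evaluate(model, validation_instances, iterator,
                   cuda_device=0, output_file="predictions.txt")
print(metrics)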
Code example #3
def re_read_embeddings_from_text_file(file_uri, embedding_dim, vocab, namespace):
    """Re-scan a pretrained embeddings file and return the indices of vocabulary
    rows that have a pretrained vector (i.e., the rows that should not be optimized)."""

    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    continue

                vector = np.asarray(fields[1:], dtype="float32")
                embeddings[token] = vector

    index_to_token = vocab.get_index_to_token_vocabulary(namespace)

    rows_not_to_optimize = []
    for i in range(vocab_size):
        token = index_to_token[i]

        if token in embeddings:
            rows_not_to_optimize.append(i)

    return rows_not_to_optimize
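One hedged way to consume the returned indices: register a gradient hook that zeroes updates for rows that already have pretrained vectors, so only the remaining rows get optimized. Here embedding_layer is a hypothetical torch.nn.Embedding built over the same vocabulary:

import torch

frozen_rows = torch.tensor(
    re_read_embeddings_from_text_file(file_uri, embedding_dim, vocab, "tokens"),
    dtype=torch.long)

def freeze_pretrained_rows(grad: torch.Tensor) -> torch.Tensor:
    # Zero the gradient rows that already have pretrained vectors.
    grad = grad.clone()
    grad[frozen_rows] = 0.0
    return grad

embedding_layer.weight.register_hook(freeze_pretrained_rows)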
Code example #4
    def __init__(self,
                 glove_path: str,
                 embedding_dim: int,
                 trainable: bool = False) -> None:
        super(GloveContextualizer, self).__init__()
        self.embedding_dim = embedding_dim
        self.trainable = trainable
        # Read the GloVe file, and produce a dictionary of tokens to indices, a dictionary
        # of indices to tokens, and a PyTorch Embedding object.
        self.token_to_idx = {DEFAULT_OOV_TOKEN: 0}
        self.idx_to_token = {0: DEFAULT_OOV_TOKEN}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading pretrained embeddings from file")
        embeddings = {}
        with EmbeddingsTextFile(glove_path) as embeddings_file:
            for line in Tqdm.tqdm(embeddings_file):
                token = line.split(' ', 1)[0]
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != self.embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        self.embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
                self.token_to_idx[token] = len(self.token_to_idx)
                self.idx_to_token[len(self.idx_to_token)] = token
        if not embeddings:
            raise ConfigurationError(
                "No embeddings of correct dimension found; you probably "
                "misspecified your embedding_dim parameter, or didn't "
                "pre-populate your Vocabulary")

        all_embeddings = numpy.asarray(list(embeddings.values()))
        embeddings_mean = float(numpy.mean(all_embeddings))
        embeddings_std = float(numpy.std(all_embeddings))
        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        vocab_size = len(self.token_to_idx)
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size,
                                             self.embedding_dim).normal_(
                                                 embeddings_mean,
                                                 embeddings_std)
        # Start at 1, since the 0th token is OOV, and fill in the embedding matrix
        for i in range(1, vocab_size):
            embedding_matrix[i] = torch.FloatTensor(
                embeddings[self.idx_to_token[i]])
        self.weight = torch.nn.Parameter(embedding_matrix,
                                         requires_grad=self.trainable)
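A short usage sketch of the structures built above; contextualizer stands for an instance of the class whose __init__ is shown, and the token is arbitrary:

idx = contextualizer.token_to_idx.get("cat", 0)  # index 0 is the OOV row
vector = contextualizer.weight[idx]              # shape: (embedding_dim,)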
Code example #5
    def read(self, file_path: str) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning("DatasetReader.lazy is not set, "
                           "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(file_path)))
        else:
            instances = self._read(file_path)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError("No instances were read from the given filepath {}. "
                                         "Is the path correct?".format(file_path))
            return instances
Code example #6
    def train_one_epoch(self) -> None:
        """
        Trains the model for a single epoch.
        Fires off the events EPOCH_START and EPOCH_END,
        and repeatedly calls self.train_one_batch().
        """
        self.handler.fire_event(Events.EPOCH_START)

        self.train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        self.last_log = time.time()

        logger.info("Training")
        self.batches_this_epoch = 0

        batches_tqdm = Tqdm.tqdm(self.training_batches,
                                 total=self.num_training_batches)

        for self.batch in batches_tqdm:
            description = self.train_one_batch(self.batch)
            batches_tqdm.set_description(description, refresh=False)

        self.handler.fire_event(Events.VALIDATE)
        self.handler.fire_event(Events.EPOCH_END)
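A self-contained sketch of the event pattern train_one_epoch() relies on. The handler/Events names are taken from the snippet; this minimal implementation is an assumption for illustration, not the project's actual code:

from collections import defaultdict

class Events:
    EPOCH_START = "epoch_start"
    VALIDATE = "validate"
    EPOCH_END = "epoch_end"

class EventHandler:
    def __init__(self):
        self._callbacks = defaultdict(list)

    def register(self, event, callback):
        self._callbacks[event].append(callback)

    def fire_event(self, event):
        # Invoke every callback registered for this event, in order.
        for callback in self._callbacks[event]:
            callback()

handler = EventHandler()
handler.register(Events.EPOCH_END, lambda: print("epoch finished"))
handler.fire_event(Events.EPOCH_END)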
Code example #7
    def read(self, url: str, query: str, *args) -> Iterable[Instance]:
        """
        Returns an ``Iterable`` containing all the instances
        in the specified dataset.

        If ``self.lazy`` is False, this calls ``self._read()``,
        ensures that the result is a list, then returns the resulting list.

        If ``self.lazy`` is True, this returns an object whose
        ``__iter__`` method calls ``self._read()`` each iteration.
        In this case your implementation of ``_read()`` must also be lazy
        (that is, not load all instances into memory at once), otherwise
        you will get a ``ConfigurationError``.

        In either case, the returned ``Iterable`` can be iterated
        over multiple times. It's unlikely you want to override this function,
        but if you do your result should likewise be repeatedly iterable.
        """
        lazy = getattr(self, 'lazy', None)
        if lazy is None:
            logger.warning("DatasetReader.lazy is not set, "
                           "did you forget to call the superclass constructor?")

        if lazy:
            return _LazyInstances(lambda: iter(self._read(url, query, *args)))
        else:
            instances = self._read(url, query, *args)
            if not isinstance(instances, list):
                instances = [instance for instance in Tqdm.tqdm(instances)]
            if not instances:
                raise ConfigurationError("No instances were read from the given URL {} "
                                         "and query {}. Are they correct?".format(url, query))
            return instances
Code example #8
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator,
                               total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            # Move the batch to the requested CUDA device, if needed.
            batch = move_to_device(batch, cuda_device)

            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output,
                              eval_type)
            description = ', '.join([
                "%s: %.2f" % (name, value) for name, value in metrics.items()
            ]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
Code example #9
def main(_):
    process_flags()

    if FLAGS.manual_seed:
        set_manual_seeds(FLAGS.manual_seed)

    # Create folders and files to store results and configs
    run_dir = Path(FLAGS.output_folder, FLAGS.run_name)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    # Logging
    log_fh = log.FileHandler(Path(run_dir, 'log.log'))
    log_fmt = log.Formatter("%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p")
    log_fh.setFormatter(log_fmt)
    log.getLogger().addHandler(log_fh)

    # Store the run description, if any
    if FLAGS.description:
        with open(Path(run_dir, 'description.txt'), 'w') as f:
            f.write(FLAGS.description)
        log.info(f'DESCRIPTION: {FLAGS.description}')
    # Store configuration in same folder as logs and model
    flagfile = Path(run_dir, 'flagfile.txt')
    if os.path.exists(flagfile):
        os.remove(flagfile)
    flagfile.touch()  # create the empty flag file without leaking a file handle
    FLAGS.append_flags_into_file(flagfile)

    if FLAGS.old_pretrain_data:
        data_dict = get_data_dict_old()
    else:
        data_dict = get_data_dict()
    train_dataset, test_dataset, val_dataset = (data_dict[key] for key in
                                                ('train', 'test', 'val'))
    model = MLMModelWrapper(MODEL_MAPPING[FLAGS.model])
    distributed_wrapper(train, model, run_dir, train_dataset, val_dataset)
    model.cuda(FLAGS.device_idxs[0])

    log.info("Evaluating pretraining performance on test split")
    test_loader = get_loader(test_dataset)
    model.eval()
    batch_generator = iter(test_loader)
    batch_generator = Tqdm.tqdm(batch_generator)
    total_metrics = {}
    with torch.no_grad():
        for i, batch in enumerate(batch_generator):
            batch = move_to_device(batch, FLAGS.device_idxs[0])
            if isinstance(batch, torch.Tensor):
                model(batch)
            else:
                model(**batch)
            if i == 0:
                total_metrics = model.get_metrics()
            else:
                total_metrics = {m: total_metrics[m] + model.get_metrics()[m] for m in total_metrics.keys()}
        average_metrics = {k: v / (i + 1) for k, v in total_metrics.items()}
        log.info(f"Average test metrics:{average_metrics}")
Code example #10
    def _instances_to_cache_file(self, cache_filename, instances) -> None:
        # We serialize to a temp file first in case anything goes wrong while
        # writing to cache (e.g., the computer shuts down unexpectedly).
        # Then we just copy the file over to `cache_filename`.
        with CacheFile(cache_filename, mode="w+") as cache_handle:
            logger.info("Caching instances to temp file %s", cache_handle.name)
            for instance in Tqdm.tqdm(instances, desc="caching instances"):
                cache_handle.write(self.serialize_instance(instance) + "\n")
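A minimal sketch of what a CacheFile-style context manager presumably does (an assumption based on the comment above, not AllenNLP's actual implementation): write to a temporary file and promote it only if writing succeeded.

import os
import tempfile

class AtomicCacheFile:
    def __init__(self, final_path: str, mode: str = "w+"):
        self.final_path = final_path
        # Create the temp file next to the target so os.replace stays on one filesystem.
        self._tmp = tempfile.NamedTemporaryFile(
            mode, delete=False, dir=os.path.dirname(final_path) or ".")
        self.name = self._tmp.name

    def write(self, data: str) -> None:
        self._tmp.write(data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._tmp.close()
        if exc_type is None:
            os.replace(self.name, self.final_path)  # promote the finished temp file
        else:
            os.remove(self.name)  # discard the partial cache on error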
Code example #11
    def _compute_metrics(self,
                         data_loader: DataLoader) -> Iterable[Tuple[str, Any]]:
        with torch.no_grad():
            # We use batches because the dataset may not fit in memory (the data loader
            # was set up to yield batches) and the model estimator may support `partial_fit`.
            # If the estimator doesn't support it, it will raise an exception that the user can see.
            for tensor_dict_batch in Tqdm.tqdm(data_loader):
                self.model(**tensor_dict_batch)
            return self.model.get_metrics(reset=True).items()  # noqa
Code example #12
    def _validation_loss(self) -> Tuple[float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_iterator is not None:
            val_iterator = self._validation_iterator
        else:
            val_iterator = self.iterator

        val_generator = val_iterator(self._validation_data,
                                     num_epochs=1,
                                     shuffle=False)
        num_validation_batches = val_iterator.get_num_batches(
            self._validation_data)
        val_generator_tqdm = Tqdm.tqdm(val_generator,
                                       total=num_validation_batches)
        batches_this_epoch = 0
        val_loss = 0
        for batch in val_generator_tqdm:

            loss = self.batch_loss(batch, for_training=False)
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch
Code example #13
    def _calculate_uncertainty_batch(self, batch: InstanceBatch, progress_bar: Tqdm = None) -> Dict[str, list]:
        uncertainty_df = defaultdict(list)
        ids, predictions, labels = batch
        for idx, prediction, label in zip(ids, predictions, labels):
            for w, word in enumerate(prediction['words']):
                for model in self.predictor._model.all_model_keys:

                    tag_mean_probability = prediction[f'{model}_class_probabilities'][w]
                    tag_std_probability  = prediction[f'{model}_class_prob_std'][w]
                    actual_label_idx = label[w]
                    predicted_label_idx = np.argmax(tag_mean_probability)

                    uncertainty_df['instance_id'].append(idx)
                    uncertainty_df['word_id'].append(w)
                    uncertainty_df['model'].append(model)
                    uncertainty_df['word'].append(word)

                    uncertainty_df['actual_tag'].append(
                        self.predictor._model.vocab.get_token_from_index(
                            actual_label_idx,
                            namespace=self.predictor._model.label_namespace
                        )
                    )

                    uncertainty_df['predicted_tag'].append(
                        self.predictor._model.vocab.get_token_from_index(
                            predicted_label_idx,
                            namespace=self.predictor._model.label_namespace
                        )
                    )

                    uncertainty_df['actual_confidence_mean'].append(tag_mean_probability[actual_label_idx])
                    uncertainty_df['actual_confidence_std'].append(tag_std_probability[actual_label_idx])
                    uncertainty_df['predicted_confidence_mean'].append(tag_mean_probability[predicted_label_idx])
                    uncertainty_df['predicted_confidence_std'].append(tag_std_probability[predicted_label_idx])

                    uncertainty_df['mean_probability_distribution'].append(tag_mean_probability)

            if progress_bar:
                progress_bar.update(1)
        return uncertainty_df
Code example #14
    def read(self, file_path, max_length=None):
        """Reads the data from a json file.

        Converts the data into ``Instance``s of
        tokens and ``LabelField``s.

        See the ``_read()`` method for more details.
        """
        instances = self._read(file_path, max_length)
        if not isinstance(instances, list):
            instances = [instance for instance in Tqdm.tqdm(instances)]
        return instances
Code example #15
    def _calculate_feature_importance_batch(self, batch: InstanceBatch, progress_bar: Tqdm = None) -> Dict[str, list]:
        feature_importance_df = defaultdict(list)

        ids, labeled_batch, actual_labels = batch
        batch_text = [[li[fn].tokens for fn in self.field_names] for li in labeled_batch]
        fields = [list(self.field_names) for _ in range(len(labeled_batch))]
        predicted_labels = [li['label'].label for li in labeled_batch]
        seed = [self.seed for _ in range(len(labeled_batch))]

        for interpreter in self.feature_importance_interpreters + self.attention_interpreters:

            if progress_bar:
                progress_bar.set_description(f"{interpreter.id}: interpreting {len(labeled_batch)} instances")

            # Some feature importance measures are too memory-intensive to run with larger batch sizes
            # These numbers are based on empirical tests with a standard 16GB gpu
            if 'shap' in interpreter.id or 'deep' in interpreter.id or 'intgrad' in interpreter.id:
                batch_scores = []
                for sub_batch in utils.batch(labeled_batch, 2):
                    batch_scores.extend(interpreter.saliency_interpret_instances(sub_batch).values())
            else:
                batch_scores = interpreter.saliency_interpret_instances(labeled_batch).values()

            # There can be more than one array of scores for an instance (e.g. in the pair sequence case)
            scores = [[np.asarray(scoreset) for scoreset in v.values()] for v in batch_scores]

            feature_importance_df['scores'].extend(scores)
            feature_importance_df['seed'].extend(seed)
            feature_importance_df['instance_id'].extend(ids)
            feature_importance_df['instance_text'].extend(batch_text)
            feature_importance_df['instance_fields'].extend(fields)
            feature_importance_df['feature_importance_measure'].extend([interpreter.id for _ in range(len(labeled_batch))])
            feature_importance_df['predicted'].extend(predicted_labels)
            feature_importance_df['actual'].extend(actual_labels)

            if progress_bar:
                progress_bar.update(1)

        return feature_importance_df
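The sub-batching above depends on a chunking helper; a plausible standalone sketch of what utils.batch does, inferred from how it is called here (its exact semantics are an assumption):

def batch(iterable, size):
    """Yield successive chunks of `size` items from `iterable`."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk

print(list(batch([1, 2, 3, 4, 5], 2)))  # [[1, 2], [3, 4], [5]]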
Code example #16
    def calculate_correlation(self, force: bool = False) -> None:
        pkl_exists = os.path.isfile(os.path.join(self.serialization_dir, 'correlation.pkl'))

        if pkl_exists and not force:
            self.logger.info("Correlations exist and force was not specified. Loading from disk...")
            self.correlation_results = pd.read_pickle(os.path.join(self.serialization_dir, 'correlation.pkl'))
        else:
            correlation_df = defaultdict(list)
            self.logger.info('Calculating correlations...')

            progress_bar = Tqdm.tqdm(total=len(self.correlation_combos))

            # We need to compare combinations with at least one attention interpreter first to get the k_values
            # for an apples-to-apples comparison with combinations where both interpreters are
            # feature importance measures
            unfair_k = defaultdict(lambda: defaultdict(list))
            for (key1, key2) in self.correlation_combos:
                if 'attn' in key1 or 'attn' in key2:
                    correlations, unfair_k_values = self._calculate_correlation_combo(key1, key2)

                    for key, values in correlations.items():
                        correlation_df[key].extend(values)

                    for measure, k in unfair_k_values.items():
                        unfair_k[key1][measure].extend(k)
                        unfair_k[key2][measure].extend(k)

                    progress_bar.update(1)

            # Now we can compare the feature importance measures to each other
            for (key1, key2) in self.correlation_combos:
                if 'attn' not in key1 and 'attn' not in key2:
                    correlation_kwargs = defaultdict(list)

                    # Unfair k strategy: take the average k used for each key
                    for name, k_values in unfair_k.get(key1, {}).items():
                        correlation_kwargs[name].extend(k_values)
                    for name, k_values in unfair_k.get(key2, {}).items():
                        correlation_kwargs[name].extend(k_values)
                    for name, k_values in correlation_kwargs.items():
                        correlation_kwargs[name] = {"k": math.floor(statistics.mean(k_values))}

                    correlations, _ = self._calculate_correlation_combo(key1, key2, correlation_kwargs=correlation_kwargs)

                    for k, v in correlations.items():
                        correlation_df[k].extend(v)
                    progress_bar.update(1)

            self.correlation_results = pd.DataFrame(correlation_df)
            utils.write_frame(self.correlation_results, self.serialization_dir, 'correlation')
Code example #17
def preprocess(filename, name):
    with open(filename) as file, \
         open(os.path.join(args.output, name + '.pickle'), 'wb') as output_file:
        docs = []
        summs = []
        for line in Tqdm.tqdm(file):
            doc, summ = line.strip().split('\t')
            docs.append(nlp(doc))
            summs.append(nlp(summ))
        dataset = {'docs': docs, 'summs': summs}
        pickle.dump(dataset, output_file)
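A small counterpart sketch for reading back what preprocess() wrote; the path layout follows the code above, and args.output plus the 'train' split name are assumptions:

import os
import pickle

with open(os.path.join(args.output, 'train.pickle'), 'rb') as f:
    dataset = pickle.load(f)
docs, summs = dataset['docs'], dataset['summs']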
Code example #18
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> Dict[str, numpy.ndarray]:
    """
    Read pre-trained word vectors from a possibly compressed text file, which may be contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical tokens does not match ``embedding_dim`` raise a warning
    and are skipped. Unlike ``_read_pretrained_embeddings_file``, this variant returns the
    raw token-to-vector dictionary rather than an embedding matrix.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())

    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    return embeddings
Code example #19
    def calculate_uncertainty(self, force: bool = False) -> None:
        pkl_exists = os.path.isfile(os.path.join(self.serialization_dir, 'uncertainty.pkl'))

        if pkl_exists and not force:
            self.logger.info("Uncertainty data exists and force was not specified. Loading from disk...")
            self.results = pd.read_pickle(os.path.join(self.serialization_dir, 'uncertainty.pkl'))
        else:
            uncertainty_df = defaultdict(list)
            self.logger.info('Calculating uncertainty...')

            progress_bar = Tqdm.tqdm(total=self.num_instances)

            for batch in self.dataset:
                uncertainty_scores = self._calculate_uncertainty_batch(batch, progress_bar)
                for k, v in uncertainty_scores.items():
                    uncertainty_df[k].extend(v)

            self.results = pd.DataFrame(uncertainty_df)
            utils.write_frame(self.results, self.serialization_dir, 'uncertainty')
Code example #20
def get_iter_norm_mean_eval(
    model: Model,
    data_loader: DataLoader,
    mean: torch.Tensor,
    cuda_device: int = -1
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    # Parameters

    model : `Model`
        The model to evaluate.
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    mean : `torch.Tensor`
        The running mean passed through to `model.forward_embeddings`.
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.

    Returns the mean embedding over the dataset together with the full embedding matrix.
    """
    check_for_gpu(cuda_device)
    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        embeddings = []
        for batch in generator_tqdm:
            batch = nn_util.move_to_device(batch, cuda_device)
            batch_embeddings = model.forward_embeddings(batch['words'], mean)
            embeddings.append(batch_embeddings)

        embeddings = torch.cat(embeddings, dim=0)

    return embeddings.mean(dim=0), embeddings
Code example #21
    def calculate_feature_importance(self, force: bool = False) -> None:
        pkl_exists = os.path.isfile(os.path.join(self.serialization_dir, 'feature_importance.pkl'))

        if pkl_exists and not force:
            self.logger.info("Feature importance scores exist and force was not specified. Loading from disk...")
            self.feature_importance_results = pd.read_pickle(os.path.join(self.serialization_dir, 'feature_importance.pkl'))
        else:
            feature_importance_df = defaultdict(list)
            self.logger.info('Calculating feature importance scores...')

            num_interpreters = len(self.feature_importance_interpreters) + len(self.attention_interpreters)
            progress_bar = Tqdm.tqdm(total=self.num_batches * num_interpreters)

            for batch in self.dataset:
                importance_scores = self._calculate_feature_importance_batch(batch, progress_bar)
                for k, v in importance_scores.items():
                    feature_importance_df[k].extend(v)

            self.feature_importance_results = pd.DataFrame(feature_importance_df)
            utils.write_frame(self.feature_importance_results, self.serialization_dir, 'feature_importance')
Code example #22
def _read_embeddings_from_text_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from a possibly compressed text file, which may be contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines whose number of numerical tokens does not match ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
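A hedged usage sketch: the returned matrix can be loaded into a torch.nn.Embedding module. The file name and dimension are placeholders; freeze=True keeps the pretrained vectors fixed during training:

weight = _read_embeddings_from_text_file("glove.6B.100d.txt.gz", 100, vocab)
embedding = torch.nn.Embedding.from_pretrained(weight, freeze=True)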
Code example #23
File: trainer.py Project: sun-xiaoyu/allennlp
    def _validation_loss(self, epoch: int) -> Tuple[float, float, int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating")

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        val_generator_tqdm = Tqdm.tqdm(validation_data_loader)
        batches_this_epoch = 0
        val_loss = 0
        val_reg_loss = 0
        done_early = False
        for batch in val_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing validation early! "
                        "This implies that there is an imbalance in your validation "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batch_outputs = self.batch_outputs(batch, for_training=False)
            loss = batch_outputs.get("loss")
            reg_loss = batch_outputs.get("reg_loss")
            if loss is not None:
                # You shouldn't necessarily have to compute a loss for validation, so we allow for
                # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                # currently only used as the divisor for the loss function, so we can safely only
                # count those batches for which we actually have a loss.  If this variable ever
                # gets used for something else, we might need to change things around a bit.
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()
                if reg_loss is not None:
                    val_reg_loss += reg_loss.detach().cpu().numpy()

            # Update the description with the latest metrics
            val_metrics = training_util.get_metrics(
                self.model,
                val_loss,
                val_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )
            description = training_util.description_from_metrics(val_metrics)
            val_generator_tqdm.set_description(description, refresh=False)

            if self._master:
                for callback in self._batch_callbacks:
                    callback(
                        self,
                        [batch],
                        [batch_outputs],
                        epoch,
                        batches_this_epoch,
                        is_training=False,
                    )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (validation)."
            )
            # Indicate that we're done so that any workers that have remaining data stop validation early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Now restore the original parameter values.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, val_reg_loss, batches_this_epoch
Code example #24
File: trainer.py Project: sun-xiaoyu/allennlp
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        train_reg_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)

        logger.info("Training")

        num_training_batches = math.ceil(
            len(self.data_loader) / self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's
        # progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            batch_group_outputs = []
            for batch in batch_group:
                batch_outputs = self.batch_outputs(batch, for_training=True)
                batch_group_outputs.append(batch_outputs)
                loss = batch_outputs["loss"]
                reg_loss = batch_outputs["reg_loss"]
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                reg_loss = reg_loss / len(batch_group)
                if self._opt_level is not None:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                train_loss += loss.item()
                train_reg_loss += reg_loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            param_updates = None
            if self._master and self._tensorboard.should_log_histograms_this_batch():
                # Get the magnitude of parameter updates for logging.  We need to do some
                # computation before and after the optimizer step, and it's expensive because of
                # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so
                # we don't do this every batch, only when it's requested.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master as the trainers wouldn't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)
                self._tensorboard.log_batch(self.model, self.optimizer,
                                            batch_grad_norm, metrics,
                                            batch_group, param_updates)

            if self._master:
                self._checkpointer.maybe_save_checkpoint(
                    self, epoch, batches_this_epoch)
                for callback in self._batch_callbacks:
                    callback(
                        self,
                        batch_group,
                        batch_group_outputs,
                        epoch,
                        batches_this_epoch,
                        is_training=True,
                    )

        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
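For reference, a minimal re-implementation sketch of the lazy_groups_of grouping used for gradient accumulation above; AllenNLP's common_util version is assumed to have the same semantics:

from itertools import islice

def lazy_groups_of(iterator, group_size):
    """Yield lists of up to `group_size` items, without materializing the stream."""
    iterator = iter(iterator)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group

print(list(lazy_groups_of(range(5), 2)))  # [[0, 1], [2, 3], [4]]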
Code example #25
    def _validation_loss_n_step(self, step: int) -> Tuple[float, Optional[float], int]:
        """
        Computes the validation loss. Returns it and the number of batches.
        """
        logger.info("Validating on %d steps" % step)

        self._pytorch_model.eval()

        # Replace parameter values with the shadow values from the moving averages.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        if self._validation_data_loader is not None:
            validation_data_loader = self._validation_data_loader
        else:
            raise ConfigurationError(
                "Validation results cannot be calculated without a validation_data_loader"
            )

        regularization_penalty = self.model.get_regularization_penalty()

        # Group validation batches in fives to mirror the training loop's batch groups.
        val_batch_generator = iter(validation_data_loader)
        val_batch_group_generator = common_util.lazy_groups_of(
            val_batch_generator, 5
        )
        num_validation_batches: Union[int, float]
        try:
            num_validation_batches = math.ceil(len(validation_data_loader) / 5)
        except TypeError:
            num_validation_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only
        # the primary's progress is shown.
        if self._primary:
            val_generator_tqdm = Tqdm.tqdm(
                val_batch_group_generator, total=num_validation_batches
            )
        else:
            val_generator_tqdm = val_batch_group_generator

        batches_this_epoch = 0
        val_loss = 0.0
        val_reg_loss = None if regularization_penalty is None else 0.0
        for val_batch_group in val_generator_tqdm:
            for val_batch in val_batch_group:
                with amp.autocast(self._use_amp):
                    batches_this_epoch += 1
                    batch_outputs = self.batch_outputs(val_batch, for_training=False)
                    loss = batch_outputs.get("loss")
                    reg_loss = batch_outputs.get("reg_loss")
                    if loss is not None:
                        # You shouldn't necessarily have to compute a loss for validation, so we allow for
                        # `loss` to be None.  We need to be careful, though - `batches_this_epoch` is
                        # currently only used as the divisor for the loss function, so we can safely only
                        # count those batches for which we actually have a loss.  If this variable ever
                        # gets used for something else, we might need to change things around a bit.
                        val_batch_loss = loss.item()
                        val_loss += val_batch_loss
                        if reg_loss is not None:
                            val_batch_reg_loss = reg_loss.item()
                            val_reg_loss += val_batch_reg_loss  # type: ignore

        return val_loss, val_reg_loss, batches_this_epoch
Code example #26
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        cpu_memory_usage = []
        for worker, memory in common_util.peak_cpu_memory().items():
            cpu_memory_usage.append((worker, memory))
            logger.info(f"Worker {worker} memory usage: {common_util.format_size(memory)}")
        gpu_memory_usage = []
        for gpu, memory in common_util.peak_gpu_memory().items():
            gpu_memory_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage: {common_util.format_size(memory)}")

        regularization_penalty = self.model.get_regularization_penalty()

        train_loss = 0.0
        batch_loss = 0.0
        train_reg_loss = None if regularization_penalty is None else 0.0
        batch_reg_loss = None if regularization_penalty is None else 0.0

        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = iter(self.data_loader)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps
        )

        logger.info("Training")

        num_training_batches: Union[int, float]
        try:
            len_data_loader = len(self.data_loader)
            num_training_batches = math.ceil(
                len_data_loader / self._num_gradient_accumulation_steps
            )
        except TypeError:
            num_training_batches = float("inf")

        # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's
        # progress is shown
        if self._primary:
            batch_group_generator_tqdm = Tqdm.tqdm(
                batch_group_generator, total=num_training_batches
            )
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if done_early:
                break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            # Zero gradients.
            # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
            # because it avoids a read op when the gradients are first updated below.
            for param_group in self.optimizer.param_groups:
                for p in param_group["params"]:
                    p.grad = None

            batch_loss = 0.0
            batch_group_outputs = []
            for batch in batch_group:
                with amp.autocast(self._use_amp):
                    batch_outputs = self.batch_outputs(batch, for_training=True)
                    batch_group_outputs.append(batch_outputs)
                    loss = batch_outputs["loss"]
                    reg_loss = batch_outputs.get("reg_loss")
                    if torch.isnan(loss):
                        raise ValueError("nan loss encountered")
                    loss = loss / len(batch_group)

                    batch_loss += loss.item()
                    if reg_loss is not None:
                        reg_loss = reg_loss / len(batch_group)
                        batch_reg_loss = reg_loss.item()
                        train_reg_loss += batch_reg_loss  # type: ignore

                if self._scaler is not None:
                    self._scaler.scale(loss).backward()
                else:
                    loss.backward()
            if not batch_group_outputs:
                continue

            train_loss += batch_loss

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._scaler is not None:
                self._scaler.step(self.optimizer)
                self._scaler.update()
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                train_reg_loss,
                batch_loss,
                batch_reg_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=self.cuda_device,
            )

            if batch_num_total % self.val_loss_steps == 0:
                logger.info("%s: %.4f" % ('train_loss', train_loss / batches_this_epoch))
                if self._validation_data_loader is not None:
                    with torch.no_grad():
                        # We have a validation set, so compute all the metrics on it.
                        val_loss, val_reg_loss, num_batches = self._validation_loss_n_step(batch_num_total)

                    val_metrics = training_util.get_metrics(
                        self.model,
                        val_loss,
                        val_reg_loss,
                        num_batches=num_batches,
                        batch_loss=None,
                        batch_reg_loss=None,
                        reset=True,
                        world_size=self._world_size,
                        cuda_device=self.cuda_device,
                    )
                    # `reset=True` clears the model's accumulated metrics after the
                    # validation pass; `val_metrics` itself is not used further here.
                    logger.info("%s: %.4f" % ('val_loss', val_loss / num_batches))

                    # Validation put the model into "eval" mode, so switch back to "train".
                    self._pytorch_model.train()

            if self._primary:
                # Updating tqdm only for the primary, as the other workers don't have one
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description, refresh=False)

                if self._checkpointer is not None:
                    self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch)

            for callback in self._callbacks:
                callback.on_batch(
                    self,
                    batch_group,
                    batch_group_outputs,
                    metrics,
                    epoch,
                    batches_this_epoch,
                    is_training=True,
                    is_primary=self._primary,
                    batch_grad_norm=batch_grad_norm,
                )

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            train_reg_loss,
            batch_loss=None,
            batch_reg_loss=None,
            num_batches=batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=self.cuda_device,
        )

        for (worker, memory) in cpu_memory_usage:
            metrics["worker_" + str(worker) + "_memory_MB"] = memory / (1024 * 1024)
        for (gpu_num, memory) in gpu_memory_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory / (1024 * 1024)
        return metrics
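The ``p.grad = None`` idiom in the example above has since become a one-liner in PyTorch itself. As a minimal sketch (assuming PyTorch 1.7+, where ``Optimizer.zero_grad`` accepts ``set_to_none``; the model and data here are placeholders):

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

loss = model(torch.randn(8, 4)).sum()
loss.backward()

# Manual variant, as in the trainer above: drop the gradient tensors entirely,
# avoiding the read-modify-write that zeroing in place would cost.
for param_group in optimizer.param_groups:
    for p in param_group["params"]:
        p.grad = None

# Built-in equivalent (recent PyTorch versions even default to set_to_none=True).
optimizer.zero_grad(set_to_none=True)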
Code example #27
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs training loop on the model using :class:`~allennlp.training.trainer.Trainer`
    increasing learning rate from ``start_lr`` to ``end_lr`` recording the losses.
    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate to start the search.
    end_lr: ``float``
        The learning rate upto which search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Returns list of learning rates and corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    num_gpus = len(trainer._cuda_devices) # pylint: disable=protected-access

    raw_train_generator = trainer.iterator(trainer.train_data,
                                           shuffle=trainer.shuffle)
    train_generator = lazy_groups_of(raw_train_generator, num_gpus)
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    for i, batch_group in enumerate(train_generator_tqdm):

        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor ** i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch_group, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best):
            logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
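Example #27 relies on ``lazy_groups_of`` to chunk the batch iterator into groups of ``num_gpus`` batches. A minimal sketch of what such a helper does (not AllenNLP's exact implementation):

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def lazy_groups_of(iterable: Iterable[T], group_size: int) -> Iterator[List[T]]:
    """Lazily yield lists of up to ``group_size`` items from ``iterable``."""
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group

# list(lazy_groups_of(range(5), 2)) -> [[0, 1], [2, 3], [4]]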
Code example #28
def search_learning_rate(trainer: Trainer,
                         start_lr: float = 1e-5,
                         end_lr: float = 10,
                         num_batches: int = 100,
                         linear_steps: bool = False,
                         stopping_factor: float = None) -> Tuple[List[float], List[float]]:
    """
    Runs training loop on the model using :class:`~allennlp.training.trainer.Trainer`
    increasing learning rate from ``start_lr`` to ``end_lr`` recording the losses.

    Parameters
    ----------
    trainer: :class:`~allennlp.training.trainer.Trainer`
    start_lr: ``float``
        The learning rate to start the search.
    end_lr: ``float``
        The learning rate upto which search is done.
    num_batches: ``int``
        Number of batches to run the learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``

    Returns
    -------
    (learning_rates, losses): ``Tuple[List[float], List[float]]``
        Returns list of learning rates and corresponding losses.
        Note: The losses are recorded before applying the corresponding learning rate
    """
    if num_batches <= 10:
        raise ConfigurationError('The number of iterations for learning rate finder should be greater than 10.')

    trainer.model.train()

    train_generator = trainer.iterator(trainer.train_data,
                                       shuffle=trainer.shuffle)
    train_generator_tqdm = Tqdm.tqdm(train_generator,
                                     total=num_batches)

    learning_rates = []
    losses = []
    best = 1e9
    if linear_steps:
        lr_update_factor = (end_lr - start_lr) / num_batches
    else:
        lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches)

    for i, batch in enumerate(train_generator_tqdm):

        if linear_steps:
            current_lr = start_lr + (lr_update_factor * i)
        else:
            current_lr = start_lr * (lr_update_factor ** i)

        for param_group in trainer.optimizer.param_groups:
            param_group['lr'] = current_lr

        trainer.optimizer.zero_grad()
        loss = trainer.batch_loss(batch, for_training=True)
        loss.backward()
        loss = loss.detach().cpu().item()

        if stopping_factor is not None and (math.isnan(loss) or loss > stopping_factor * best):
            logger.info(f'Loss ({loss}) exceeds stopping_factor * lowest recorded loss.')
            break

        trainer.rescale_gradients()
        trainer.optimizer.step()

        learning_rates.append(current_lr)
        losses.append(loss)

        if loss < best and i > 10:
            best = loss

        if i == num_batches:
            break

    return learning_rates, losses
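The linear and exponential stepping rules from the docstring are easy to check in isolation. The sketch below uses the same formulas as the functions above to precompute the learning rate the finder would apply at each step:

import math

start_lr, end_lr, num_batches = 1e-5, 10.0, 100

# Exponential: a constant multiplicative factor, so
# lr_i = start_lr * (end_lr / start_lr) ** (i / num_batches).
factor = (end_lr / start_lr) ** (1.0 / num_batches)
exponential = [start_lr * factor ** i for i in range(num_batches + 1)]

# Linear: a constant additive increment.
step = (end_lr - start_lr) / num_batches
linear = [start_lr + step * i for i in range(num_batches + 1)]

assert math.isclose(exponential[-1], end_lr)
assert math.isclose(linear[-1], end_lr)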
Code example #29
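# Note: this snippet begins mid-script; `parser` (an argparse.ArgumentParser), the
# remaining arguments (--lowercase, --out-dir), and the imports (IrTupleDatasetReader,
# BlingFireTokenizer, Tqdm, math) are presumably defined earlier in the original file.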
parser.add_argument('--dataset-files', nargs='+', action='store', dest='dataset_files',
                    help='file format <id>\t<sequence text>', required=True)

args = parser.parse_args()


#
# load data & create vocab
# -------------------------------
#  

loader = IrTupleDatasetReader(lazy=True,
                              source_tokenizer=BlingFireTokenizer(),
                              target_tokenizer=BlingFireTokenizer(),
                              lowercase=args.lowercase)

total_documents = 0
all_tokens = {}

for file in args.dataset_files:
    for instance in Tqdm.tqdm(loader.read(file)):

        token_set = {tok.text.lower() for tok in instance["target_tokens"].tokens}
        for token_text in token_set:
            if token_text not in all_tokens:
                all_tokens[token_text] = 0
            all_tokens[token_text] += 1

        total_documents += 1

# NOTE: despite its name, out_dir is used as the output file path here.
with open(args.out_dir, "w", encoding="utf8") as out:
    for token, count in all_tokens.items():
        out.write(f"{token} {math.log(total_documents / count):1.20f}\n")
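The weight written for each token is an unsmoothed inverse document frequency, log(N / df). A quick worked example of the formula used above:

import math

total_documents = 100

# A token appearing in 10 of 100 documents gets a larger weight than one
# appearing in 90 of them: rarer tokens are more discriminative.
print(math.log(total_documents / 10))   # ~2.303
print(math.log(total_documents / 90))   # ~0.105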
Code example #30
    def _train_epoch(self, epoch: int) -> Dict[str, float]:
        """
        Trains one epoch and returns metrics.
        """
        logger.info("Epoch %d/%d", epoch, self._num_epochs - 1)
        peak_cpu_usage = common_util.peak_memory_mb()
        logger.info(f"Peak CPU memory usage MB: {peak_cpu_usage}")
        gpu_usage = []
        for gpu, memory in common_util.gpu_memory_mb().items():
            gpu_usage.append((gpu, memory))
            logger.info(f"GPU {gpu} memory usage MB: {memory}")

        train_loss = 0.0
        # Set the model to "train" mode.
        self._pytorch_model.train()

        # Get tqdm for the training batches
        batch_generator = self.iterator(self.train_data,
                                        num_epochs=1,
                                        shuffle=self.shuffle)
        batch_group_generator = common_util.lazy_groups_of(
            batch_generator, self._num_gradient_accumulation_steps)
        num_training_batches = math.ceil(
            self.iterator.get_num_batches(self.train_data) /
            self._num_gradient_accumulation_steps)
        # Having multiple tqdm bars in the distributed case would be a mess, so only the
        # master's progress is shown
        if self._master:
            batch_group_generator_tqdm = Tqdm.tqdm(batch_group_generator,
                                                   total=num_training_batches)
        else:
            batch_group_generator_tqdm = batch_group_generator

        self._last_log = time.time()
        last_save_time = time.time()

        batches_this_epoch = 0
        if self._batch_num_total is None:
            self._batch_num_total = 0

        histogram_parameters = set(
            self.model.get_parameters_for_histogram_tensorboard_logging())

        logger.info("Training")

        cumulative_batch_group_size = 0
        done_early = False
        for batch_group in batch_group_generator_tqdm:
            if self._distributed:
                # Check whether the other workers have stopped already (due to differing amounts of
                # data in each). If so, we can't proceed because we would hang when we hit the
                # barrier implicit in Model.forward. We use an IntTensor instead of a BoolTensor
                # here because NCCL process groups apparently don't support BoolTensor.
                done = torch.tensor(0, device=self.cuda_device)
                torch.distributed.all_reduce(done,
                                             torch.distributed.ReduceOp.SUM)
                if done.item() > 0:
                    done_early = True
                    logger.warning(
                        f"Worker {torch.distributed.get_rank()} finishing training early! "
                        "This implies that there is an imbalance in your training "
                        "data across the workers and that some amount of it will be "
                        "ignored. A small amount of this is fine, but a major imbalance "
                        "should be avoided. Note: This warning will appear unless your "
                        "data is perfectly balanced.")
                    break

            batches_this_epoch += 1
            self._batch_num_total += 1
            batch_num_total = self._batch_num_total

            self.optimizer.zero_grad()

            for batch in batch_group:
                loss = self.batch_loss(batch, for_training=True)
                if torch.isnan(loss):
                    raise ValueError("nan loss encountered")
                loss = loss / len(batch_group)
                loss.backward()
                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

            # This does nothing if batch_num_total is None or you are using a
            # scheduler which doesn't update per batch.
            if self._learning_rate_scheduler:
                self._learning_rate_scheduler.step_batch(batch_num_total)
            if self._momentum_scheduler:
                self._momentum_scheduler.step_batch(batch_num_total)

            if self._tensorboard.should_log_histograms_this_batch() and self._master:
                # get the magnitude of parameter updates for logging
                # We need a copy of current parameters to compute magnitude of updates,
                # and copy them to CPU so large models won't go OOM on the GPU.
                param_updates = {
                    name: param.detach().cpu().clone()
                    for name, param in self.model.named_parameters()
                }
                self.optimizer.step()
                for name, param in self.model.named_parameters():
                    param_updates[name].sub_(param.detach().cpu())
                    update_norm = torch.norm(param_updates[name].view(-1))
                    param_norm = torch.norm(param.view(-1)).cpu()
                    self._tensorboard.add_train_scalar(
                        "gradient_update/" + name,
                        update_norm / (param_norm + 1e-7))
            else:
                self.optimizer.step()

            # Update moving averages
            if self._moving_average is not None:
                self._moving_average.apply(batch_num_total)

            # Update the description with the latest metrics
            metrics = training_util.get_metrics(
                self.model,
                train_loss,
                batches_this_epoch,
                world_size=self._world_size,
                cuda_device=[self.cuda_device],
            )

            # Updating tqdm only for the master, as the other workers don't have one
            if self._master:
                description = training_util.description_from_metrics(metrics)
                batch_group_generator_tqdm.set_description(description,
                                                           refresh=False)

            # Log parameter values to Tensorboard (only from the master)
            if self._tensorboard.should_log_this_batch() and self._master:
                self._tensorboard.log_parameter_and_gradient_statistics(
                    self.model, batch_grad_norm)
                self._tensorboard.log_learning_rates(self.model,
                                                     self.optimizer)

                self._tensorboard.add_train_scalar("loss/loss_train",
                                                   metrics["loss"])
                self._tensorboard.log_metrics(
                    {"epoch_metrics/" + k: v
                     for k, v in metrics.items()})

            if self._tensorboard.should_log_histograms_this_batch() and self._master:
                self._tensorboard.log_histograms(self.model,
                                                 histogram_parameters)

            if self._log_batch_size_period:
                batch_group_size = sum(
                    training_util.get_batch_size(batch)
                    for batch in batch_group)
                cumulative_batch_group_size += batch_group_size
                if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
                    average = cumulative_batch_group_size / batches_this_epoch
                    logger.info(
                        f"current batch size: {batch_group_size} mean batch size: {average}"
                    )
                    self._tensorboard.add_train_scalar("current_batch_size",
                                                       batch_group_size)
                    self._tensorboard.add_train_scalar("mean_batch_size",
                                                       average)

            # Save model if needed.
            if (self._model_save_interval is not None and
                (time.time() - last_save_time > self._model_save_interval)
                    and self._master):
                last_save_time = time.time()
                self._save_checkpoint("{0}.{1}".format(
                    epoch, training_util.time_to_str(int(last_save_time))))
        if self._distributed and not done_early:
            logger.warning(
                f"Worker {torch.distributed.get_rank()} completed its entire epoch (training)."
            )
            # Indicate that we're done so that any workers that have remaining data stop the epoch early.
            done = torch.tensor(1, device=self.cuda_device)
            torch.distributed.all_reduce(done, torch.distributed.ReduceOp.SUM)
            assert done.item()

        # Let all workers finish their epoch before computing
        # the final statistics for the epoch.
        if self._distributed:
            dist.barrier()

        metrics = training_util.get_metrics(
            self.model,
            train_loss,
            batches_this_epoch,
            reset=True,
            world_size=self._world_size,
            cuda_device=[self.cuda_device],
        )
        metrics["cpu_memory_MB"] = peak_cpu_usage
        for (gpu_num, memory) in gpu_usage:
            metrics["gpu_" + str(gpu_num) + "_memory_MB"] = memory
        return metrics
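The histogram-logging branch in example #30 measures the size of each optimizer step relative to the parameter itself, ||update|| / (||param|| + 1e-7). A self-contained sketch of that measurement outside any trainer (the model and data are placeholders):

import torch

model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = model(torch.randn(4, 10)).sum()
loss.backward()

# Snapshot parameters on CPU before stepping, so large models don't OOM the GPU.
snapshots = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}
optimizer.step()

for name, p in model.named_parameters():
    update = snapshots[name] - p.detach().cpu()
    ratio = torch.norm(update.view(-1)) / (torch.norm(p.view(-1)).cpu() + 1e-7)
    print(f"gradient_update/{name}: {ratio.item():.2e}")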
Code example #31
loader = IrTupleDatasetReader(lazy=True, lowercase=True)
vocab = Vocabulary.from_files(args.vocab)
if args.qrel:
    qrels = load_reference(args.qrel)

not_judged = 0
oov_queries = 0
non_oov_queries = 0
oov_count_list = []
instances = loader.read(args.query)

with open(args.out_file_oov, "w", encoding="utf8") as out_file_oov:
    with open(args.out_file_no_oov, "w", encoding="utf8") as out_file_non_oov:

        for i in Tqdm.tqdm(instances):
            id_str = i["source_tokens"].tokens[0].text
            if args.qrel and int(id_str) not in qrels:
                not_judged += 1
                continue

            i.index_fields(vocab)

            indexes = i["target_tokens"]._indexed_tokens["tokens"]

            if 1 in indexes:
                # index 1 is AllenNLP's default @@UNKNOWN@@ id, so this query contains an OOV token
                oov_queries += 1
                oov_count_list.append(sum(1 for t in indexes if t == 1))

                out_file_oov.write(id_str + "\t" + " ".join(
Code example #32
File: train.py Project: sebastianGehrmann/allennlp
def train_model(params: Params,
                serialization_dir: str,
                file_friendly_logging: bool = False,
                recover: bool = False) -> Model:
    """
    Trains the model specified in the given :class:`Params` object, using the data and training
    parameters also specified in that object, and saves the results in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results and logs.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    recover : ``bool``, optional (default=False)
        If ``True``, we will try to recover a training run from an existing serialization
        directory.  This is only intended for use when something actually crashed during the middle
        of a run.  For continuing training a model on new data, see the ``fine-tune`` command.
    """
    prepare_environment(params)

    create_serialization_dir(params, serialization_dir, recover)

    # TODO(mattg): pull this block out into a separate function (maybe just add this to
    # `prepare_environment`?)
    Tqdm.set_slower_interval(file_friendly_logging)
    sys.stdout = TeeLogger(
        os.path.join(serialization_dir, "stdout.log"),  # type: ignore
        sys.stdout,
        file_friendly_logging)
    sys.stderr = TeeLogger(
        os.path.join(serialization_dir, "stderr.log"),  # type: ignore
        sys.stderr,
        file_friendly_logging)
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset if key in datasets_for_vocab_creation))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    params.assert_empty('base train command')
    metrics = trainer.train()

    # Now tar up results
    archive_model(serialization_dir, files_to_archive=params.files_to_archive)

    if test_data and evaluate_on_test:
        test_metrics = evaluate(model,
                                test_data,
                                iterator,
                                cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
        for key, value in test_metrics.items():
            metrics["test_" + key] = value

    elif test_data:
        logger.info(
            "To evaluate on the test set after training, pass the "
            "'evaluate_on_test' flag, or use the 'allennlp evaluate' command.")

    metrics_json = json.dumps(metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"),
              "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
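``train_model`` drives everything from the ``Params`` object, which is typically deserialized from a JSON experiment file. A hypothetical minimal config with the keys the function pops above (the specific reader/model types and hyperparameters here are illustrative assumptions, not taken from the source):

# Illustrative only: the key layout mirrors what train_model consumes above;
# the concrete types and values are assumptions.
experiment_config = {
    "dataset_reader": {"type": "sequence_tagging"},
    "train_data_path": "/path/to/train.tsv",
    "validation_data_path": "/path/to/dev.tsv",
    "model": {
        "type": "simple_tagger",
        "text_field_embedder": {"tokens": {"type": "embedding", "embedding_dim": 50}},
        "encoder": {"type": "lstm", "input_size": 50, "hidden_size": 100},
    },
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
    "evaluate_on_test": False,
}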
Code example #33
File: embedding.py Project: pyknife/allennlp
def _read_embeddings_from_text_file(file_uri: str,
                                    embedding_dim: int,
                                    vocab: Vocabulary,
                                    namespace: str = "tokens") -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning("Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                                   embedding_dim, len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError("No embeddings of correct dimension found; you probably "
                                 "misspecified your embedding_dim parameter, or didn't "
                                 "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(embeddings_mean,
                                                                            embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug("Token %s was not found in the embedding file. Initialising randomly.", token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
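The initialization at the end of example #33 draws random rows from the pretrained vectors' own mean and standard deviation, then overwrites the rows for which a pretrained vector exists. A minimal sketch of the same idea with toy data:

import numpy
import torch

embedding_dim = 3
vocab_tokens = ["the", "cat", "unseen-token"]
pretrained = {
    "the": numpy.array([0.1, 0.2, 0.3], dtype="float32"),
    "cat": numpy.array([0.4, 0.5, 0.6], dtype="float32"),
}

all_vectors = numpy.asarray(list(pretrained.values()))
mean, std = float(all_vectors.mean()), float(all_vectors.std())

# Unseen tokens get rows that look statistically similar to the pretrained
# vectors; known tokens are then overwritten with their actual vectors.
matrix = torch.FloatTensor(len(vocab_tokens), embedding_dim).normal_(mean, std)
for i, token in enumerate(vocab_tokens):
    if token in pretrained:
        matrix[i] = torch.FloatTensor(pretrained[token])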