    def evaluation_loop(
            self,
            loader,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Dict[str, Any], Meter]:
        meter = Meter()

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_master()
            combined_report = None

            for batch in tqdm.tqdm(loader, disable=disable_tqdm):
                report = self._forward(batch)
                self.update_meter(report, meter)

                # accumulate necessary params for metric calculation
                if combined_report is None:
                    combined_report = report
                else:
                    combined_report.accumulate_tensor_fields_and_loss(
                        report, self.metrics.required_params)
                    combined_report.batch_size += report.batch_size

                if single_batch:
                    break

            combined_report.metrics = self.metrics(combined_report,
                                                   combined_report)
            self.update_meter(combined_report, meter, eval_mode=True)

            # enable train mode again
            self.model.train()

        return combined_report, meter
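
# A minimal, self-contained sketch (not MMF code) of the eval-mode discipline
# the loop above follows: gradients off, model in eval mode, and train mode
# restored on the way out even if evaluation raises.
import torch


def run_eval(model: torch.nn.Module, eval_fn):
    model.eval()
    try:
        with torch.no_grad():
            return eval_fn()
    finally:
        model.train()
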
    def _try_download(self):
        _is_master = is_master()

        if self._already_downloaded:
            return

        needs_download = False

        if not hasattr(self.config, "model_file"):
            if _is_master:
                warnings.warn("'model_file' key is required but missing "
                              "from FastTextProcessor's config.")
            needs_download = True

        # getattr avoids an AttributeError when "model_file" is missing
        model_file = getattr(self.config, "model_file", "")
        # If model_file is already an existing path don't join to cache dir
        if not PathManager.exists(model_file):
            model_file = os.path.join(get_mmf_cache_dir(), model_file)

        if not PathManager.exists(model_file):
            if _is_master:
                warnings.warn(f"No model file present at {model_file}.")
            needs_download = True

        if needs_download:
            logger.info("Downloading FastText bin")
            model_file = self._download_model()

        self.model_file = model_file
        self._already_downloaded = True
        synchronize()
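
# A hedged, self-contained sketch (not MMF code) of the pattern _try_download
# relies on: only the master rank runs the expensive fetch, then every rank
# meets at a barrier before reading the finished file. `ensure_file` and the
# caller-supplied `fetch` callable are illustrative names.
import os

import torch.distributed as dist


def ensure_file(path: str, fetch) -> str:
    is_rank0 = not dist.is_initialized() or dist.get_rank() == 0
    if is_rank0 and not os.path.exists(path):
        fetch(path)  # expensive download, executed exactly once
    if dist.is_initialized():
        dist.barrier()  # mirrors mmf's synchronize()
    return path
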
    def flush_report(self):
        if not is_master():
            return

        name = self.current_dataset.dataset_name
        time_format = "%Y-%m-%dT%H:%M:%S"
        time = self.timer.get_time_hhmmss(None, format=time_format)

        filename = name + "_"

        if len(self.experiment_name) > 0:
            filename += self.experiment_name + "_"

        filename += self.task_type + "_"
        filename += time

        if self.config.evaluation.predict_file_format == "csv":
            filepath = os.path.join(self.report_folder, filename + ".csv")
            self.csv_dump(filepath)
        else:
            filepath = os.path.join(self.report_folder, filename + ".json")
            self.json_dump(filepath)

        logger.info(
            f"Wrote predictions for {name} to {os.path.abspath(filepath)}")
        self.report = []
    def _threaded_read(self):
        elements = list(range(1, len(self.annotation_db)))
        pool = ThreadPool(processes=4)

        with tqdm.tqdm(total=len(elements), disable=not is_master()) as pbar:
            for i, _ in enumerate(
                    pool.imap_unordered(self._fill_cache, elements)):
                if i % 100 == 0:
                    pbar.update(100)
        pool.close()
        pool.join()
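
# A self-contained sketch (no MMF dependency) of the prefetch pattern above:
# imap_unordered yields results as workers finish, so progress can be tracked
# independently of submission order.
from multiprocessing.pool import ThreadPool


def _square(x):
    return x * x


with ThreadPool(processes=4) as pool:
    results = sorted(pool.imap_unordered(_square, range(8)))
assert results == [x * x for x in range(8)]
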
    def __init__(self, embedding_name, *args, **kwargs):
        """Use this if you want to use pretrained embedding. See description
        of IntersectedVocab to get a list of the embedding available from
        torchtext

        Parameters
        ----------
        embedding_name : str
            Name of the pretrained alias for the embedding to used
        """
        self.type = "pretrained"

        if embedding_name not in vocab.pretrained_aliases:
            raise RuntimeError(f"Unknown embedding type: {embedding_name}")

        vector_cache = get_mmf_cache_dir()

        # First test loading the vectors in master so that everybody doesn't
        # download it in case it doesn't exist
        if is_master():
            vocab.pretrained_aliases[embedding_name](cache=vector_cache)
        synchronize()

        embedding = vocab.pretrained_aliases[embedding_name](
            cache=vector_cache)

        self.UNK_INDEX = 3
        self.stoi = defaultdict(lambda: self.UNK_INDEX)
        self.itos = {}

        self.itos[self.PAD_INDEX] = self.PAD_TOKEN
        self.itos[self.SOS_INDEX] = self.SOS_TOKEN
        self.itos[self.EOS_INDEX] = self.EOS_TOKEN
        self.itos[self.UNK_INDEX] = self.UNK_TOKEN

        self.stoi[self.SOS_TOKEN] = self.SOS_INDEX
        self.stoi[self.EOS_TOKEN] = self.EOS_INDEX
        self.stoi[self.PAD_TOKEN] = self.PAD_INDEX
        self.stoi[self.UNK_TOKEN] = self.UNK_INDEX

        self.vectors = torch.FloatTensor(
            len(self.itos) + len(embedding.itos),
            len(embedding.vectors[0]))

        for i in range(4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        index = 4
        for word in embedding.stoi:
            self.itos[index] = word
            self.stoi[word] = index
            actual_index = embedding.stoi[word]
            self.vectors[index] = embedding.vectors[actual_index]
            index += 1
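
# A self-contained illustration (assumed token names, no MMF dependency) of
# the stoi/itos bookkeeping above: a defaultdict sends every unseen word to
# UNK_INDEX, while itos maps indices back to tokens.
from collections import defaultdict

UNK_INDEX = 3
stoi = defaultdict(lambda: UNK_INDEX)
stoi.update({"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "hello": 4})
itos = {index: word for word, index in stoi.items()}

assert stoi["never-seen-word"] == UNK_INDEX
assert itos[4] == "hello"
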
    def __init__(self, dataset_type="train"):
        self._dataset_type = dataset_type
        self._is_master = is_master()

        self._datasets = []
        self._loaders = []
        self._samplers = []
        self._iterators = []

        self._total_length = 0
        self._per_dataset_lengths = []
        self._num_datasets = 0
        self._finished_iterators = {}
def download_pretrained_model(model_name, *args, **kwargs):
    import omegaconf
    from omegaconf import OmegaConf

    from VisualBERT.mmf.utils.configuration import get_mmf_env, load_yaml

    model_zoo = load_yaml(get_mmf_env(key="model_zoo"))
    OmegaConf.set_struct(model_zoo, True)
    OmegaConf.set_readonly(model_zoo, True)

    data_dir = get_absolute_path(get_mmf_env("data_dir"))
    model_data_dir = os.path.join(data_dir, "models")
    download_path = os.path.join(model_data_dir, model_name)

    try:
        model_config = OmegaConf.select(model_zoo, model_name)
    except omegaconf.errors.OmegaConfBaseException as e:
        print(f"No such model name {model_name} defined in mmf zoo")
        raise e

    if "version" not in model_config or "resources" not in model_config:
        # Version and resources are not present; fall back to the defaults
        try:
            model_config = model_config.defaults
            download_path = os.path.join(model_data_dir,
                                         model_name + ".defaults")
        except omegaconf.errors.OmegaConfBaseException as e:
            print(
                f"Model name {model_name} doesn't specify 'resources' and "
                "'version', and no defaults have been provided")
            raise e

    # Download requirements if any specified by "zoo_requirements" field
    # This can either be a list or a string
    if "zoo_requirements" in model_config:
        requirements = model_config.zoo_requirements
        if isinstance(requirements, str):
            requirements = [requirements]
        for item in requirements:
            download_pretrained_model(item, *args, **kwargs)

    version = model_config.version
    resources = model_config.resources

    if is_master():
        download_resources(resources, download_path, version)
    synchronize()

    return download_path
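
# A hedged, self-contained illustration (not MMF code) of the zoo lookup and
# ".defaults" fallback above; the zoo layout shown here is an assumption.
from omegaconf import OmegaConf

zoo = OmegaConf.create(
    {"visual_bert": {"defaults": {"version": "1.0", "resources": ["model.pth"]}}}
)
entry = OmegaConf.select(zoo, "visual_bert")
if "version" not in entry or "resources" not in entry:
    entry = entry.defaults  # fall back to the "<name>.defaults" variant
assert entry.version == "1.0"
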
    def try_fast_read(self):
        # Don't fast read in case of test set.
        if self._dataset_type == "test":
            return

        if getattr(self, "_should_fast_read", False):
            logger.info(
                f"Starting to fast read {self.dataset_name} {self.dataset_type} "
                + "dataset"
            )
            self.cache = {}
            for idx in tqdm.tqdm(
                range(len(self.annotation_db)), miniters=100, disable=not is_master()
            ):
                self.cache[idx] = self.load_item(idx)
    def __init__(self, log_folder="./logs", iteration=0):
        # Importing here handles the warning for a missing tensorboard install
        from torch.utils.tensorboard import SummaryWriter

        self.summary_writer = None
        self._is_master = is_master()
        self.timer = Timer()
        self.log_folder = log_folder
        self.time_format = "%Y-%m-%dT%H:%M:%S"

        if self._is_master:
            current_time = self.timer.get_time_hhmmss(None,
                                                      format=self.time_format)
            tensorboard_folder = os.path.join(self.log_folder,
                                              f"tensorboard_{current_time}")
            self.summary_writer = SummaryWriter(tensorboard_folder)
    def __call__(self, update, iteration, meter):
        """
        Method to be called everytime you need to check whether to
        early stop or not
        Arguments:
            update {number}: Current update number
            iteration {number}: Current iteration number
        Returns:
            bool -- Tells whether early stopping occurred or not
        """
        if not is_master():
            return False

        value = meter.meters.get(self.early_stop_criteria, None)
        if value is None:
            raise ValueError(
                "Criteria used for early stopping ({}) is not "
                "present in meter.".format(self.early_stop_criteria)
            )

        value = value.global_avg

        if isinstance(value, torch.Tensor):
            value = value.item()

        if (self.minimize and value < self.best_monitored_value) or (
            not self.minimize and value > self.best_monitored_value
        ):
            self.best_monitored_value = value
            self.best_monitored_iteration = iteration
            self.best_monitored_update = update
            self.checkpoint.save(update, iteration, update_best=True)

        elif self.best_monitored_update + self.patience < update:
            self.activated = True
            if self.should_stop is True:
                self.checkpoint.restore()
                self.checkpoint.finalize()
                return True
            else:
                return False
        else:
            self.checkpoint.save(update, iteration, update_best=False)

        return False
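
# A self-contained restatement (not MMF code) of the stopping condition used
# above: improvement resets the window; otherwise stop once more than
# `patience` updates pass without a new best.
def patience_exceeded(update: int, best_update: int, patience: int) -> bool:
    return best_update + patience < update


assert patience_exceeded(update=1100, best_update=1000, patience=50)
assert not patience_exceeded(update=1040, best_update=1000, patience=50)
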
    def add_to_report(self, report, model):
        keys = [
            "id", "question_id", "image_id", "context_tokens", "captions",
            "scores"
        ]
        for key in keys:
            report = self.reshape_and_gather(report, key)

        if not is_master():
            return

        results = self.current_dataset.format_for_prediction(report)

        if hasattr(model, "format_for_prediction"):
            results = model.format_for_prediction(results, report)
        elif hasattr(model, "module") and hasattr(model.module,
                                                  "format_for_prediction"):
            results = model.module.format_for_prediction(results, report)

        self.report = self.report + results
    def load(self):
        self.image_path = os.path.join(
            self._data_folder, _CONSTANTS["images_folder"], self._dataset_type
        )

        with open(
            os.path.join(
                self._data_folder,
                _CONSTANTS["questions_folder"],
                _TEMPLATES["question_json_file"].format(self._dataset_type),
            )
        ) as f:
            self.questions = json.load(f)[_CONSTANTS["questions_key"]]

            # Vocab should only be built in the main process, since every
            # other process would just repeat the same work
            if is_master():
                self._build_vocab(self.questions, _CONSTANTS["question_key"])
                self._build_vocab(self.questions, _CONSTANTS["answer_key"])
            synchronize()
    def build_dataset(self, config, dataset_type="train", *args, **kwargs):
        """
        Similar to load function, used by MMF to build a dataset for first
        time when it is not available. This internally calls 'build' function.
        Override that function in your child class.

        Args:
            config (DictConfig): Configuration of this dataset loaded from
                                 config.
            dataset_type (str): Type of dataset, train|val|test

        .. warning::

            DO NOT OVERRIDE in child class. Instead override ``build``.
        """
        # Only build in main process, so none of the others have to build
        if is_master():
            self.build(config, dataset_type, *args, **kwargs)
        synchronize()
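
# A hedged sketch of the override contract stated in the docstring above: a
# child class implements build() and leaves build_dataset() untouched. The
# class and body here are illustrative only; in MMF the parent would be
# BaseDatasetBuilder.
class MyDatasetBuilder:
    def build(self, config, dataset_type="train", *args, **kwargs):
        """One-time, master-only setup such as downloading annotations."""
        pass
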
    def _download_model(self):
        _is_master = is_master()

        model_file_path = os.path.join(get_mmf_cache_dir(), "wiki.en.bin")

        if not _is_master:
            return model_file_path

        if PathManager.exists(model_file_path):
            logger.info(f"Vectors already present at {model_file_path}.")
            return model_file_path

        import requests
        from tqdm import tqdm

        from VisualBERT.mmf.common.constants import FASTTEXT_WIKI_URL

        PathManager.mkdirs(os.path.dirname(model_file_path))
        response = requests.get(FASTTEXT_WIKI_URL, stream=True)

        with PathManager.open(model_file_path, "wb") as f:
            # total is in bytes so it matches pbar.update(len(data)) below;
            # miniters keeps the bar from refreshing on every 4 KiB chunk
            pbar = tqdm(
                total=int(response.headers["Content-Length"]),
                miniters=50,
                disable=not _is_master,
            )

            for data in response.iter_content(chunk_size=4096):
                if data:
                    pbar.update(len(data))
                    f.write(data)

            pbar.close()

        logger.info(f"fastText bin downloaded at {model_file_path}.")

        return model_file_path
    def _summarize_report(self, meter, should_print=True, extra=None):
        if extra is None:
            extra = {}
        if not is_master():
            return

        if self.training_config.tensorboard:
            scalar_dict = meter.get_scalar_dict()
            self.tb_writer.add_scalars(scalar_dict,
                                       self.trainer.current_iteration)

        if not should_print:
            return
        log_dict = {}
        if hasattr(self.trainer, "num_updates") and hasattr(
                self.trainer, "max_updates"):
            log_dict["progress"] = (
                f"{self.trainer.num_updates}/{self.trainer.max_updates}")
        log_dict.update(meter.get_log_dict())
        log_dict.update(extra)

        log_progress(log_dict)
    def save(self, update, iteration=None, update_best=False):
        # Only save in main process
        if not is_master():
            return

        if iteration is None:
            iteration = update

        ckpt_filepath = os.path.join(self.models_foldername,
                                     "model_%d.ckpt" % update)
        best_ckpt_filepath = os.path.join(self.ckpt_foldername,
                                          self.ckpt_prefix + "best.ckpt")
        current_ckpt_filepath = os.path.join(self.ckpt_foldername,
                                             self.ckpt_prefix + "current.ckpt")

        early_stopping = self.trainer.early_stop_callback.early_stopping
        best_iteration = early_stopping.best_monitored_iteration
        best_update = early_stopping.best_monitored_update
        best_metric = early_stopping.best_monitored_value
        model = self.trainer.model
        data_parallel = registry.get("data_parallel") or registry.get(
            "distributed")
        fp16_scaler = getattr(self.trainer, "scaler", None)
        fp16_scaler_dict = None

        if fp16_scaler is not None:
            fp16_scaler_dict = fp16_scaler.state_dict()

        if data_parallel is True:
            model = model.module

        ckpt = {
            "model": model.state_dict(),
            "optimizer": self.trainer.optimizer.state_dict(),
            "best_iteration": best_iteration,
            "current_iteration": iteration,
            "current_epoch": self.trainer.current_epoch,
            "num_updates": update,
            "best_update": best_update,
            "best_metric_value": best_metric,
            "fp16_scaler": fp16_scaler_dict,
            # Convert to container to avoid any dependencies
            "config": OmegaConf.to_container(self.config, resolve=True),
        }

        lr_scheduler = self.trainer.lr_scheduler_callback._scheduler
        if lr_scheduler is not None:
            ckpt["lr_scheduler"] = lr_scheduler.state_dict()

        if self.git_repo:
            git_metadata_dict = self._get_vcs_fields()
            ckpt.update(git_metadata_dict)

        with PathManager.open(ckpt_filepath, "wb") as f:
            torch.save(ckpt, f)

        if update_best:
            with PathManager.open(best_ckpt_filepath, "wb") as f:
                torch.save(ckpt, f)

        # Save current always
        with PathManager.open(current_ckpt_filepath, "wb") as f:
            torch.save(ckpt, f)

        # Remove old checkpoints if max_to_keep is set
        if self.max_to_keep > 0:
            if len(self.saved_iterations) >= self.max_to_keep:
                self.remove(self.saved_iterations.pop(0))
            self.saved_iterations.append(update)
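
# A self-contained sketch of the max_to_keep rotation at the end of save():
# a bounded FIFO of checkpoint ids in which the oldest entry is evicted.
saved, max_to_keep = [], 2
for update in (100, 200, 300):
    if len(saved) >= max_to_keep:
        saved.pop(0)  # in save() this also deletes the file via self.remove
    saved.append(update)
assert saved == [200, 300]
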
    def __init__(self, vocab_file, embedding_name, *args, **kwargs):
        """Use this vocab class when you have a custom vocabulary class but you
        want to use pretrained embedding vectors for it. This will only load
        the vectors which intersect with your vocabulary. Use the
        embedding_name specified in torchtext's pretrained aliases:
        ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
         'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
         'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
         'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
         'glove.6B.200d', 'glove.6B.300d']

        Parameters
        ----------
        vocab_file : str
            Vocabulary file containing list of words with one word per line
            which will be used to collect vectors
        embedding_name : str
            Embedding name picked up from the list of the pretrained aliases
            mentioned above
        """
        super().__init__(vocab_file, *args, **kwargs)

        self.type = "intersected"

        name = embedding_name.split(".")[0]
        dim = embedding_name.split(".")[2][:-1]
        middle = embedding_name.split(".")[1]

        class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

        if not hasattr(vocab, class_name):
            raise RuntimeError(f"Unknown embedding type: {name}")

        params = [middle]

        if name == "glove":
            params.append(int(dim))

        vector_cache = get_mmf_cache_dir()

        # First test loading the vectors in master so that everybody doesn't
        # download it in case it doesn't exist
        if is_master():
            vocab.pretrained_aliases[embedding_name](cache=vector_cache)
        synchronize()

        embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

        self.vectors = torch.empty(
            (self.get_size(), len(embedding.vectors[0])), dtype=torch.float)

        self.embedding_dim = len(embedding.vectors[0])

        for i in range(0, 4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        for i in range(4, self.get_size()):
            word = self.itos[i]
            embedding_index = embedding.stoi.get(word, None)

            if embedding_index is None:
                self.vectors[i] = self.vectors[self.UNK_INDEX]
            else:
                self.vectors[i] = embedding.vectors[embedding_index]
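
# A self-contained check of the alias parsing above: "glove.6B.300d" splits
# into name "glove", middle "6B", and dim "300" (the trailing "d" stripped).
alias = "glove.6B.300d"
parts = alias.split(".")
assert (parts[0], parts[1], parts[2][:-1]) == ("glove", "6B", "300")
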
    def evaluation_loop(self, loader, on_test_end, use_tqdm: bool = False):
        self.model.eval()
        expl = ExplanationGenerator.SelfAttentionGenerator(self.model)

        method = perturbation_arguments.args.method
        pert_type = "pos" if perturbation_arguments.args.is_positive_pert else "neg"
        modality = "text" if perturbation_arguments.args.is_text_pert else "image"
        num_samples = perturbation_arguments.args.num_samples
        method_expl = {
            "transformer_attribution": expl.generate_transformer_att,
            "ours_no_lrp": expl.generate_ours,
            "partial_lrp": expl.generate_partial_lrp,
            "raw_attn": expl.generate_raw_attn,
            "attn_gradcam": expl.generate_attn_gradcam,
            "rollout": expl.generate_rollout
        }

        i = 0
        # saving cams per method for all the samples
        disable_tqdm = not use_tqdm or not is_master()
        disable_tqdm = not use_tqdm or not is_master()
        if modality == "image":
            steps = [0, 0.5, 0.75, 0.95, 0.96, 0.97, 0.98, 0.99, 1]
        else:
            steps = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
        step_acc = [0] * len(steps)
        print("test type {0} pert type {1} expl type {2}".format(
            modality, pert_type, method))
        for batch in tqdm.tqdm(loader, disable=disable_tqdm):
            method_cam = method_expl[method](batch)

            if pert_type == "pos":
                method_cam *= -1
            if modality == "image":
                input_mask = batch['input_mask']
                bbox_scores = method_cam[0, input_mask.sum(1):]
                image_boxes_len = len(bbox_scores)
                image_features = batch['image_feature_0'].clone()
                image_bboxes = batch['image_info_0']['bbox'][0].copy()
                for step_idx, step in enumerate(steps):
                    curr_num_tokens = int((1 - step) * image_boxes_len)
                    # find top step boxes
                    _, top_bboxes_indices = bbox_scores.topk(k=curr_num_tokens,
                                                             dim=-1)
                    top_bboxes_indices = top_bboxes_indices.cpu().data.numpy()

                    # remove the top step boxes from the batch info
                    batch['image_feature_0'] = image_features[
                        :, top_bboxes_indices, :]
                    batch['image_info_0']['bbox'][0] = image_bboxes[
                        top_bboxes_indices]
                    batch['image_info_0']['max_features'] = torch.tensor(
                        curr_num_tokens).to(
                            batch['image_feature_0'].device).view(1)
                    batch['image_info_0']['num_boxes'][0] = curr_num_tokens

                    report = self._forward(batch)
                    step_acc[step_idx] += report["targets"][
                        0, report["scores"].argmax()].item()

                i += 1
                if i > num_samples:
                    break
            else:
                input_mask = batch['input_mask'].clone()
                # the last real token, '?', acts as the CLS token here
                cls_index = (input_mask.sum(1) - 2).item()
                seg_ids = batch["segment_ids"].clone()
                # don't perturb the '?' token: it is the CLS equivalent here
                # and we want to keep the CLS intact
                text_scores = method_cam[0, 1:cls_index]
                text_len = len(text_scores)
                input_ids = batch['input_ids'].clone()
                tokens = batch['tokens'].copy()
                for step_idx, step in enumerate(steps):
                    curr_num_tokens = int((1 - step) * text_len)
                    # find top step tokens
                    _, top_bboxes_indices = text_scores.topk(k=curr_num_tokens,
                                                             dim=-1)
                    top_bboxes_indices = top_bboxes_indices.cpu().data.numpy()
                    # sorting for positional embedding
                    top_bboxes_indices = list(top_bboxes_indices)
                    # always keep [CLS] (index 0), the '?' token, and [SEP];
                    # shift kept scores by 1 since text_scores skips [CLS]
                    top_bboxes_indices = [0, cls_index, cls_index + 1] + [
                        idx + 1 for idx in top_bboxes_indices]
                    top_bboxes_indices = sorted(top_bboxes_indices)

                    # keep the mask entries for the selected tokens plus the
                    # padded tail positions
                    input_mask_indices = top_bboxes_indices + list(
                        range(input_mask.sum(1).item(), input_mask.shape[1]))

                    # remove the top step boxes from the batch info
                    batch['input_ids'] = input_ids[:, top_bboxes_indices]
                    batch['tokens'] = [[
                        tokens[0][i] for i in top_bboxes_indices
                    ]]
                    batch['input_mask'] = input_mask[:, input_mask_indices]
                    batch["segment_ids"] = seg_ids[:, input_mask_indices]

                    report = self._forward(batch)
                    step_acc[step_idx] += report["targets"][
                        0, report["scores"].argmax()].item()

                i += 1
                if i > num_samples:
                    break
        print("pert type {0}".format(pert_type))
        step_acc = [acc / num_samples * 100 for acc in step_acc]
        print(step_acc)
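
# A self-contained sketch of the per-step pruning above: keep only the
# (1 - step) fraction of items with the highest relevance scores.
import torch

scores = torch.tensor([0.9, 0.1, 0.5, 0.7])
step = 0.5
keep = int((1 - step) * len(scores))
_, top_indices = scores.topk(k=keep, dim=-1)
assert sorted(top_indices.tolist()) == [0, 3]
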
    def finalize(self):
        if is_master():
            with PathManager.open(self.pth_filepath, "wb") as f:
                torch.save(self.trainer.model.state_dict(), f)