def evaluation_loop(
    self, loader, use_tqdm: bool = False, single_batch: bool = False
) -> Tuple[Dict[str, Any], Type[Meter]]:
    """Run the model over ``loader`` with gradients disabled and score it.

    Each batch is passed through ``self._forward`` and folded into a fresh
    ``Meter``. Per-batch reports are merged into a single report, on which
    ``self.metrics`` is evaluated once after the loop. The model is switched
    to eval mode at the start and back to train mode at the end.

    Args:
        loader: Iterable of batches accepted by ``self._forward``.
        use_tqdm: Request a progress bar; it is only shown on the master
            process.
        single_batch: Stop after the first batch when this is exactly
            ``True``.

    Returns:
        The merged report and the meter, as a ``(report, meter)`` pair.
    """
    meter = Meter()

    with torch.no_grad():
        self.model.eval()
        hide_progress = not (use_tqdm and is_master())
        merged_report = None

        for batch in tqdm.tqdm(loader, disable=hide_progress):
            batch_report = self._forward(batch)
            self.update_meter(batch_report, meter)

            # The first report becomes the accumulator; every later report
            # is folded into it so metrics can be computed over all batches.
            if merged_report is not None:
                merged_report.accumulate_tensor_fields_and_loss(
                    batch_report, self.metrics.required_params
                )
                merged_report.batch_size += batch_report.batch_size
            else:
                merged_report = batch_report

            if single_batch is True:
                break

        merged_report.metrics = self.metrics(merged_report, merged_report)
        self.update_meter(merged_report, meter, eval_mode=True)

        # Hand the model back in training mode.
        self.model.train()

    return merged_report, meter
def _try_download(self):
    """Resolve the FastText model file, downloading it when unavailable.

    The configured ``model_file`` is used as-is when it exists; otherwise it
    is looked up relative to the MMF cache directory. If the key is missing
    from the config, or no file exists at the resolved location, the model
    is obtained through ``self._download_model()``.

    On completion ``self.model_file`` holds the resolved path,
    ``self._already_downloaded`` is set, and ``synchronize()`` is called.
    Calling the method again is a no-op.
    """
    if self._already_downloaded:
        return

    _is_master = is_master()
    needs_download = False

    # Read the key defensively: the original code dereferenced
    # ``self.config.model_file`` even after detecting it was missing, which
    # failed before the download fallback could run.
    model_file = getattr(self.config, "model_file", None)

    if model_file is None:
        if _is_master:
            warnings.warn(
                "'model_file' key is required but missing "
                "from FastTextProcessor's config."
            )
        needs_download = True
    else:
        # If model_file is already an existing path don't join to cache dir
        if not PathManager.exists(model_file):
            model_file = os.path.join(get_mmf_cache_dir(), model_file)

        if not PathManager.exists(model_file):
            if _is_master:
                warnings.warn(f"No model file present at {model_file}.")
            needs_download = True

    if needs_download:
        logger.info("Downloading FastText bin")
        model_file = self._download_model()

    self.model_file = model_file
    self._already_downloaded = True
    synchronize()
def flush_report(self):
    """Write the accumulated predictions to disk and clear the buffer.

    Does nothing on non-master processes. The file name is built from the
    current dataset name, the experiment name (when non-empty), the task
    type and a timestamp. The output is CSV when
    ``config.evaluation.predict_file_format`` equals ``"csv"`` and JSON
    otherwise. ``self.report`` is reset to an empty list afterwards.
    """
    if not is_master():
        return

    name = self.current_dataset.dataset_name
    time_format = "%Y-%m-%dT%H:%M:%S"
    timestamp = self.timer.get_time_hhmmss(None, format=time_format)

    # <dataset>_[<experiment>_]<task_type>_<timestamp>
    name_parts = [name]
    if len(self.experiment_name) > 0:
        name_parts.append(self.experiment_name)
    name_parts.extend([self.task_type, timestamp])
    filename = "_".join(name_parts)

    wants_csv = self.config.evaluation.predict_file_format == "csv"
    if wants_csv:
        extension, dump = ".csv", self.csv_dump
    else:
        extension, dump = ".json", self.json_dump

    filepath = os.path.join(self.report_folder, filename + extension)
    dump(filepath)

    logger.info(
        f"Wrote predictions for {name} to {os.path.abspath(filepath)}")
    self.report = []
def _threaded_read(self):
    """Fill the cache for annotation indices ``1..len-1`` using 4 threads.

    ``self._fill_cache`` is invoked once per index through a thread pool.
    Index 0 is not visited (as in the original code — presumably it is not a
    regular sample; verify against ``annotation_db``). A progress bar is
    shown on the master process only.
    """
    elements = list(range(1, len(self.annotation_db)))

    # The context manager guarantees the pool is torn down even when a
    # worker raises; by the time the loop finishes every task has completed.
    with ThreadPool(processes=4) as pool:
        with tqdm.tqdm(
            total=len(elements), miniters=100, disable=not is_master()
        ) as pbar:
            # Advance by one per finished item so the bar ends exactly at
            # ``total``. The previous scheme added 100 at i == 0 and every
            # 100th item, which overshot the total.
            for _ in pool.imap_unordered(self._fill_cache, elements):
                pbar.update(1)
def __init__(self, embedding_name, *args, **kwargs):
    """Build a vocabulary straight from a pretrained torchtext embedding.

    The four special tokens (PAD, SOS, EOS, UNK) occupy the indices given by
    the corresponding ``*_INDEX`` attributes, with ``UNK_INDEX`` forced to 3.
    Every word of the pretrained embedding is appended starting at index 4,
    and its vector is copied into ``self.vectors``.

    Parameters
    ----------
    embedding_name : str
        Key into ``vocab.pretrained_aliases`` selecting the embedding.

    Raises
    ------
    RuntimeError
        If ``embedding_name`` is not a known pretrained alias.
    """
    self.type = "pretrained"

    if embedding_name not in vocab.pretrained_aliases:
        raise RuntimeError(f"Unknown embedding type: {embedding_name}")

    vector_cache = get_mmf_cache_dir()
    load_embedding = vocab.pretrained_aliases[embedding_name]

    # Let the master process load (and, if needed, download) the vectors
    # first so the other processes find them in the cache.
    if is_master():
        load_embedding(cache=vector_cache)
    synchronize()

    embedding = load_embedding(cache=vector_cache)

    self.UNK_INDEX = 3
    # Unknown words resolve to the UNK index.
    self.stoi = defaultdict(lambda: self.UNK_INDEX)
    self.itos = {
        self.PAD_INDEX: self.PAD_TOKEN,
        self.SOS_INDEX: self.SOS_TOKEN,
        self.EOS_INDEX: self.EOS_TOKEN,
        self.UNK_INDEX: self.UNK_TOKEN,
    }
    self.stoi.update(
        {
            self.SOS_TOKEN: self.SOS_INDEX,
            self.EOS_TOKEN: self.EOS_INDEX,
            self.PAD_TOKEN: self.PAD_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
    )

    total_rows = len(self.itos) + len(embedding.itos)
    embedding_dim = len(embedding.vectors[0])
    self.vectors = torch.FloatTensor(total_rows, embedding_dim)

    # Special tokens get constant vectors: 0.0, 0.1, 0.2 and 0.3.
    for i in range(4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    # Pretrained words follow the four special tokens.
    for index, word in enumerate(embedding.stoi, start=4):
        self.itos[index] = word
        self.stoi[word] = index
        self.vectors[index] = embedding.vectors[embedding.stoi[word]]
def __init__(self, dataset_type="train"):
    """Set up empty bookkeeping for a collection of datasets.

    Args:
        dataset_type: Stored as ``self._dataset_type``; defaults to
            ``"train"``.
    """
    self._dataset_type = dataset_type
    self._is_master = is_master()

    # Containers start empty; this constructor does not populate them.
    self._datasets, self._loaders = [], []
    self._samplers, self._iterators = [], []
    self._per_dataset_lengths = []
    self._finished_iterators = {}

    # Aggregate counters.
    self._total_length = 0
    self._num_datasets = 0
def download_pretrained_model(model_name, *args, **kwargs):
    """Download a model-zoo entry (and its requirements) and return its path.

    The zoo config is loaded from the ``model_zoo`` MMF env entry and made
    struct/read-only. If the selected entry lacks ``version`` or
    ``resources``, its ``defaults`` child is used instead and the download
    directory gets a ``.defaults`` suffix. Entries listed under
    ``zoo_requirements`` (a string or a list) are downloaded recursively
    first. Only the master process downloads; all processes synchronize
    before returning.

    Args:
        model_name: Key of the model inside the zoo config.
        *args, **kwargs: Forwarded unchanged to recursive calls.

    Returns:
        Directory under ``<data_dir>/models`` the resources are stored in.

    Raises:
        omegaconf.errors.OmegaConfBaseException: Re-raised when selecting
            the entry or its ``defaults`` fails.
    """
    import omegaconf
    from omegaconf import OmegaConf
    from VisualBERT.mmf.utils.configuration import get_mmf_env, load_yaml

    model_zoo = load_yaml(get_mmf_env(key="model_zoo"))
    OmegaConf.set_struct(model_zoo, True)
    OmegaConf.set_readonly(model_zoo, True)

    models_root = os.path.join(
        get_absolute_path(get_mmf_env("data_dir")), "models"
    )
    download_path = os.path.join(models_root, model_name)

    # NOTE(review): depending on the OmegaConf version, ``select`` may return
    # None for an unknown key instead of raising — confirm.
    try:
        model_config = OmegaConf.select(model_zoo, model_name)
    except omegaconf.errors.OmegaConfBaseException:
        print(f"No such model name {model_name} defined in mmf zoo")
        raise

    fully_specified = "version" in model_config and "resources" in model_config
    if not fully_specified:
        # Version and resources are absent: fall back to the defaults entry.
        try:
            model_config = model_config.defaults
            download_path = os.path.join(models_root, model_name + ".defaults")
        except omegaconf.errors.OmegaConfBaseException:
            print(
                f"Model name {model_name} doesn't specify 'resources' and 'version' "
                "while no defaults have been provided")
            raise

    # "zoo_requirements" may be a single name or a list of names.
    if "zoo_requirements" in model_config:
        requirements = model_config.zoo_requirements
        if isinstance(requirements, str):
            requirements = [requirements]
        for requirement in requirements:
            download_pretrained_model(requirement, *args, **kwargs)

    version = model_config.version
    resources = model_config.resources

    if is_master():
        download_resources(resources, download_path, version)
    synchronize()

    return download_path
def try_fast_read(self):
    """Preload every item into ``self.cache`` when fast reading is enabled.

    Nothing happens for the test split, or unless ``self._should_fast_read``
    exists and is exactly ``True``. Otherwise ``self.cache`` is reset and
    filled with ``load_item(idx)`` for every index of ``annotation_db``. The
    progress bar is only shown on the master process.
    """
    # The test set is never fast read.
    if self._dataset_type == "test":
        return

    if getattr(self, "_should_fast_read", None) is not True:
        return

    logger.info(
        f"Starting to fast read {self.dataset_name} {self.dataset_type} dataset"
    )

    self.cache = {}
    num_items = len(self.annotation_db)
    progress = tqdm.tqdm(
        range(num_items), miniters=100, disable=not is_master()
    )
    for idx in progress:
        self.cache[idx] = self.load_item(idx)
def __init__(self, log_folder="./logs", iteration=0):
    """Create the tensorboard writer on the master process.

    Args:
        log_folder: Directory under which a timestamped
            ``tensorboard_<time>`` folder is created.
        iteration: Accepted but not used by this constructor.

    On non-master processes ``self.summary_writer`` stays ``None``.
    """
    # Imported inside the constructor rather than at module level; the
    # original note says this handles the warning for a missing tensorboard.
    from torch.utils.tensorboard import SummaryWriter

    self.summary_writer = None
    self.log_folder = log_folder
    self.time_format = "%Y-%m-%dT%H:%M:%S"
    self._is_master = is_master()
    self.timer = Timer()

    if not self._is_master:
        return

    current_time = self.timer.get_time_hhmmss(None, format=self.time_format)
    self.summary_writer = SummaryWriter(
        os.path.join(self.log_folder, f"tensorboard_{current_time}")
    )
def __call__(self, update, iteration, meter):
    """Check the monitored metric and decide whether training should stop.

    Non-master processes always get ``False``. On the master process:
    an improved value is recorded and saved as the best checkpoint; if no
    improvement was seen within ``self.patience`` updates, early stopping is
    marked as activated and, when ``self.should_stop`` is ``True``, the
    checkpoint is restored and finalized; otherwise a regular checkpoint is
    saved.

    Arguments:
        update {number}: Current update number
        iteration {number}: Current iteration number
        meter: Object whose ``meters`` mapping holds the monitored criteria

    Raises:
        ValueError: If the early-stop criteria is missing from ``meter``.

    Returns:
        bool -- Tells whether early stopping occurred or not
    """
    if not is_master():
        return False

    tracked = meter.meters.get(self.early_stop_criteria, None)
    if tracked is None:
        raise ValueError(
            "Criteria used for early stopping ({}) is not "
            "present in meter.".format(self.early_stop_criteria)
        )

    value = tracked.global_avg
    if isinstance(value, torch.Tensor):
        value = value.item()

    # "Better" means lower when minimizing and higher when maximizing.
    if self.minimize:
        improved = value < self.best_monitored_value
    else:
        improved = value > self.best_monitored_value

    if improved:
        self.best_monitored_value = value
        self.best_monitored_iteration = iteration
        self.best_monitored_update = update
        self.checkpoint.save(update, iteration, update_best=True)
        return False

    patience_exhausted = self.best_monitored_update + self.patience < update
    if patience_exhausted:
        self.activated = True
        if self.should_stop is True:
            self.checkpoint.restore()
            self.checkpoint.finalize()
            return True
        return False

    self.checkpoint.save(update, iteration, update_best=False)
    return False
def add_to_report(self, report, model):
    """Gather prediction fields and append formatted results to the report.

    A fixed set of report fields is passed through
    ``self.reshape_and_gather`` on every process. Only the master process
    then formats the report with the current dataset, optionally
    post-processes it with the model's ``format_for_prediction`` (looked up
    on the model itself or on its wrapped ``module``), and appends the
    results to ``self.report``.

    Args:
        report: Report object produced for the current batch.
        model: The model, possibly wrapped so that the real model is
            exposed as ``model.module``.
    """
    keys = (
        "id",
        "question_id",
        "image_id",
        "context_tokens",
        "captions",
        "scores",
    )
    for key in keys:
        report = self.reshape_and_gather(report, key)

    if not is_master():
        return

    results = self.current_dataset.format_for_prediction(report)

    if hasattr(model, "format_for_prediction"):
        results = model.format_for_prediction(results, report)
    # Check for ``module`` first: an unwrapped model without
    # ``format_for_prediction`` has no such attribute, and touching it
    # directly raised AttributeError.
    elif hasattr(model, "module") and hasattr(
        model.module, "format_for_prediction"
    ):
        results = model.module.format_for_prediction(results, report)

    self.report = self.report + results
def load(self):
    """Load the questions for this split and build the vocabularies.

    Sets ``self.image_path`` to the image folder of the split and
    ``self.questions`` to the question list read from the split's JSON file.
    The question and answer vocabularies are built on the master process
    only, after which all processes synchronize.
    """
    self.image_path = os.path.join(
        self._data_folder, _CONSTANTS["images_folder"], self._dataset_type
    )

    questions_filename = _TEMPLATES["question_json_file"].format(
        self._dataset_type
    )
    questions_path = os.path.join(
        self._data_folder, _CONSTANTS["questions_folder"], questions_filename
    )
    with open(questions_path) as f:
        self.questions = json.load(f)[_CONSTANTS["questions_key"]]

    # Build the vocab in the main process only; doing it in every process
    # would just repeat the same work.
    if is_master():
        for attribute in (_CONSTANTS["question_key"], _CONSTANTS["answer_key"]):
            self._build_vocab(self.questions, attribute)
    synchronize()
def build_dataset(self, config, dataset_type="train", *args, **kwargs):
    """
    Build the dataset once, on the master process, then synchronize.

    Delegates the actual work to ``build``, which child classes are
    expected to override. All processes call ``synchronize()`` afterwards
    so that none proceeds before the master has finished building.

    Args:
        config (DictConfig): Configuration of this dataset loaded from
            config.
        dataset_type (str): Type of dataset, train|val|test

    .. warning::

        DO NOT OVERRIDE in child class. Instead override ``build``.
    """
    should_build = is_master()
    if should_build:
        # A single process builds, so the others do not repeat the work.
        self.build(config, dataset_type, *args, **kwargs)

    synchronize()
def _download_model(self):
    """Download the FastText ``wiki.en.bin`` model into the MMF cache.

    Non-master processes return the expected path immediately without
    downloading. The master process skips the download when the file is
    already present; otherwise it streams the file from
    ``FASTTEXT_WIKI_URL`` in 4096-byte chunks.

    Returns:
        Path of the model file inside the MMF cache directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Previously the error body would have been written to disk and
            later treated as a valid cached model.
    """
    model_file_path = os.path.join(get_mmf_cache_dir(), "wiki.en.bin")

    if not is_master():
        return model_file_path

    if PathManager.exists(model_file_path):
        logger.info(f"Vectors already present at {model_file_path}.")
        return model_file_path

    import requests
    from tqdm import tqdm

    from VisualBERT.mmf.common.constants import FASTTEXT_WIKI_URL

    PathManager.mkdirs(os.path.dirname(model_file_path))

    with requests.get(FASTTEXT_WIKI_URL, stream=True) as response:
        # Fail before creating the file so an error page is never cached.
        response.raise_for_status()

        # Content-Length may be absent; tqdm accepts total=None then.
        total_bytes = int(response.headers.get("Content-Length", 0)) or None

        # Track progress in bytes on both sides. The old code set the total
        # in chunks but advanced by byte counts on every 50th chunk only.
        with PathManager.open(model_file_path, "wb") as f, tqdm(
            total=total_bytes, unit="B", unit_scale=True
        ) as pbar:
            for data in response.iter_content(chunk_size=4096):
                if data:
                    f.write(data)
                    pbar.update(len(data))

    logger.info(f"fastText bin downloaded at {model_file_path}.")
    return model_file_path
def _summarize_report(self, meter, should_print=True, extra=None):
    """Send meter values to tensorboard and, optionally, to the log.

    Only the master process does anything. When tensorboard is enabled in
    the training config, the meter's scalars are written for the trainer's
    current iteration. When ``should_print`` is true, a log line is emitted
    containing the update progress (if the trainer exposes ``num_updates``
    and ``max_updates``), the meter's log dict and ``extra``.

    Args:
        meter: Object providing ``get_scalar_dict`` and ``get_log_dict``.
        should_print: Whether to emit the log line.
        extra: Additional key/value pairs for the log line.
    """
    extra = {} if extra is None else extra

    if not is_master():
        return

    if self.training_config.tensorboard:
        self.tb_writer.add_scalars(
            meter.get_scalar_dict(), self.trainer.current_iteration
        )

    if not should_print:
        return

    trainer = self.trainer
    log_dict = {}
    if hasattr(trainer, "num_updates") and hasattr(trainer, "max_updates"):
        log_dict["progress"] = f"{trainer.num_updates}/{trainer.max_updates}"

    # Later sources override earlier ones on key clashes.
    for source in (meter.get_log_dict(), extra):
        log_dict.update(source)

    log_progress(log_dict)
def save(self, update, iteration=None, update_best=False):
    """Write a checkpoint for the given update (master process only).

    The checkpoint is written to ``model_<update>.ckpt`` and to the
    ``current`` checkpoint file, and additionally to the ``best`` checkpoint
    file when ``update_best`` is true. When ``self.max_to_keep`` is
    positive, the oldest saved checkpoint is removed once that many are
    being tracked.

    Args:
        update: Update number; also used in the checkpoint file name.
        iteration: Iteration number; falls back to ``update`` when falsy.
        update_best: Also overwrite the best checkpoint.
    """
    # Only save in main process
    if not is_master():
        return

    if not iteration:
        iteration = update

    ckpt_filepath = os.path.join(
        self.models_foldername, "model_%d.ckpt" % update
    )
    best_ckpt_filepath = os.path.join(
        self.ckpt_foldername, self.ckpt_prefix + "best.ckpt"
    )
    current_ckpt_filepath = os.path.join(
        self.ckpt_foldername, self.ckpt_prefix + "current.ckpt"
    )

    early_stopping = self.trainer.early_stop_callback.early_stopping
    best_iteration = early_stopping.best_monitored_iteration
    best_update = early_stopping.best_monitored_update
    best_metric = early_stopping.best_monitored_value

    model = self.trainer.model
    data_parallel = registry.get("data_parallel") or registry.get(
        "distributed")

    fp16_scaler = getattr(self.trainer, "scaler", None)
    fp16_scaler_dict = (
        None if fp16_scaler is None else fp16_scaler.state_dict()
    )

    # Save the wrapped model's weights when running data parallel/distributed.
    if data_parallel is True:
        model = model.module

    ckpt = {
        "model": model.state_dict(),
        "optimizer": self.trainer.optimizer.state_dict(),
        "best_iteration": best_iteration,
        "current_iteration": iteration,
        "current_epoch": self.trainer.current_epoch,
        "num_updates": update,
        "best_update": best_update,
        "best_metric_value": best_metric,
        "fp16_scaler": fp16_scaler_dict,
        # Convert to container to avoid any dependencies
        "config": OmegaConf.to_container(self.config, resolve=True),
    }

    lr_scheduler = self.trainer.lr_scheduler_callback._scheduler
    if lr_scheduler is not None:
        ckpt["lr_scheduler"] = lr_scheduler.state_dict()

    if self.git_repo:
        ckpt.update(self._get_vcs_fields())

    # Per-update file first, then (optionally) best, then current — always.
    destinations = [ckpt_filepath]
    if update_best:
        destinations.append(best_ckpt_filepath)
    destinations.append(current_ckpt_filepath)

    for destination in destinations:
        with PathManager.open(destination, "wb") as f:
            torch.save(ckpt, f)

    # Remove old checkpoints if max_to_keep is set
    if self.max_to_keep > 0:
        if len(self.saved_iterations) == self.max_to_keep:
            oldest = self.saved_iterations.pop(0)
            self.remove(oldest)
        self.saved_iterations.append(update)
def __init__(self, vocab_file, embedding_name, *args, **kwargs):
    """Use this vocab class when you have a custom vocabulary but want to
    use pretrained embedding vectors for it. Only the vectors of words that
    appear in your vocabulary are loaded; words missing from the pretrained
    embedding receive the UNK vector.

    Use the embedding_name specified in torchtext's pretrained aliases:
    ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
     'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
     'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
     'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
     'glove.6B.200d', 'glove.6B.300d']

    Parameters
    ----------
    vocab_file : str
        Vocabulary file containing list of words with one word per line
        which will be used to collect vectors
    embedding_name : str
        Embedding name picked up from the list of the pretrained aliases
        mentioned above

    Raises
    ------
    RuntimeError
        If the embedding class for the alias is not available in
        ``vocab``.
    """
    super().__init__(vocab_file, *args, **kwargs)

    self.type = "intersected"

    # Alias layout is "<family>.<variant>.<dim>d", where <variant> may itself
    # contain dots (e.g. "glove.twitter.27B.25d"). Take the family from the
    # front and the dimension from the back; the old fixed-position parsing
    # produced variant "twitter" and dim 27 for the twitter aliases.
    alias_parts = embedding_name.split(".")
    name = alias_parts[0]
    dim = alias_parts[-1][:-1]
    middle = ".".join(alias_parts[1:-1])

    class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

    if not hasattr(vocab, class_name):
        raise RuntimeError(f"Unknown embedding type: {name}")

    params = [middle]

    if name == "glove":
        params.append(int(dim))

    vector_cache = get_mmf_cache_dir()

    # First test loading the vectors in master so that everybody doesn't
    # download it in case it doesn't exist
    if is_master():
        vocab.pretrained_aliases[embedding_name](cache=vector_cache)
    synchronize()

    embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

    self.vectors = torch.empty(
        (self.get_size(), len(embedding.vectors[0])), dtype=torch.float)

    self.embedding_dim = len(embedding.vectors[0])

    # The first four rows get constant vectors: 0.0, 0.1, 0.2 and 0.3.
    for i in range(0, 4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    for i in range(4, self.get_size()):
        word = self.itos[i]
        embedding_index = embedding.stoi.get(word, None)

        if embedding_index is None:
            # Word not covered by the pretrained embedding.
            self.vectors[i] = self.vectors[self.UNK_INDEX]
        else:
            self.vectors[i] = embedding.vectors[embedding_index]
def evaluation_loop(self, loader, on_test_end, use_tqdm: bool = False):
    """Run a perturbation test and print the accuracy at each removal step.

    For every sample, the explanation method selected through
    ``perturbation_arguments.args`` scores the input tokens (text) or image
    boxes (image). For each entry of ``steps`` only the top-scoring
    ``(1 - step)`` fraction of tokens/boxes is kept, the model is re-run on
    the reduced input, and the target value at the arg-max prediction is
    added to that step's accumulator. For positive perturbation the scores
    are negated before ranking.

    At most ``num_samples`` samples are evaluated and the accumulated values
    are averaged over the number of samples actually processed. The old code
    evaluated ``num_samples + 1`` samples but divided by ``num_samples``.

    NOTE(review): the indexing (``[0]``, ``.item()`` on per-row sums)
    assumes a batch size of 1 — confirm against the loader configuration.

    Args:
        loader: Iterable of batches.
        on_test_end: Accepted for interface compatibility; unused here.
        use_tqdm: Request a progress bar (shown on the master process only).
    """
    self.model.eval()
    expl = ExplanationGenerator.SelfAttentionGenerator(self.model)

    method = perturbation_arguments.args.method
    pert_type = "pos" if perturbation_arguments.args.is_positive_pert else "neg"
    modality = "text" if perturbation_arguments.args.is_text_pert else "image"
    num_samples = perturbation_arguments.args.num_samples

    method_expl = {
        "transformer_attribution": expl.generate_transformer_att,
        "ours_no_lrp": expl.generate_ours,
        "partial_lrp": expl.generate_partial_lrp,
        "raw_attn": expl.generate_raw_attn,
        "attn_gradcam": expl.generate_attn_gradcam,
        "rollout": expl.generate_rollout,
    }

    # Kept from the original: eval mode is set again once the explanation
    # generator has been constructed.
    self.model.eval()
    disable_tqdm = not use_tqdm or not is_master()

    # Fractions of tokens/boxes to remove at each step.
    if modality == "image":
        steps = [0, 0.5, 0.75, 0.95, 0.96, 0.97, 0.98, 0.99, 1]
    else:
        steps = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    step_acc = [0] * len(steps)

    print("test type {0} pert type {1} expl type {2}".format(
        modality, pert_type, method))

    processed = 0
    for batch in tqdm.tqdm(loader, disable=disable_tqdm):
        # Stop once exactly ``num_samples`` samples have been evaluated.
        if processed >= num_samples:
            break

        method_cam = method_expl[method](batch)

        if pert_type == "pos":
            method_cam *= -1

        if modality == "image":
            input_mask = batch['input_mask']
            # Scores after the text positions are treated as box scores.
            bbox_scores = method_cam[0, input_mask.sum(1):]
            image_boxes_len = len(bbox_scores)
            image_features = batch['image_feature_0'].clone()
            image_bboxes = batch['image_info_0']['bbox'][0].copy()

            for step_idx, step in enumerate(steps):
                curr_num_tokens = int((1 - step) * image_boxes_len)
                # find top step boxes
                _, top_bboxes_indices = bbox_scores.topk(
                    k=curr_num_tokens, dim=-1)
                top_bboxes_indices = top_bboxes_indices.cpu().data.numpy()

                # keep only the top step boxes in the batch info
                batch['image_feature_0'] = \
                    image_features[:, top_bboxes_indices, :]
                batch['image_info_0']['bbox'][0] = \
                    image_bboxes[top_bboxes_indices]
                batch['image_info_0']['max_features'] = torch.tensor(
                    curr_num_tokens).to(
                        batch['image_feature_0'].device).view(1)
                batch['image_info_0']['num_boxes'][0] = curr_num_tokens

                report = self._forward(batch)
                step_acc[step_idx] += report["targets"][
                    0, report["scores"].argmax()].item()
        else:
            input_mask = batch['input_mask'].clone()
            # the CLS here is ?
            cls_index = (input_mask.sum(1) - 2).item()
            seg_ids = batch["segment_ids"].clone()
            # we don't count the ? token since it's the equivalent to CLS
            # here and we want to keep the CLS intact
            text_scores = method_cam[0, 1:cls_index]
            text_len = len(text_scores)
            input_ids = batch['input_ids'].clone()
            tokens = batch['tokens'].copy()

            for step_idx, step in enumerate(steps):
                curr_num_tokens = int((1 - step) * text_len)
                # find top step tokens
                _, top_bboxes_indices = text_scores.topk(
                    k=curr_num_tokens, dim=-1)
                top_bboxes_indices = list(
                    top_bboxes_indices.cpu().data.numpy())

                # Shift by one (scores start at position 1), always keep
                # position 0 plus the last 2 tokens (CLS+SEP), and sort so
                # the positional order is preserved.
                top_bboxes_indices = sorted(
                    [0, cls_index, cls_index + 1]
                    + [idx + 1 for idx in top_bboxes_indices]
                )

                # modify the first tokens of the input mask
                input_mask_indices = top_bboxes_indices + list(
                    range(input_mask.sum(1), input_mask.shape[1]))

                # keep only the selected tokens in the batch info
                batch['input_ids'] = input_ids[:, top_bboxes_indices]
                batch['tokens'] = [[
                    tokens[0][idx] for idx in top_bboxes_indices
                ]]
                batch['input_mask'] = input_mask[:, input_mask_indices]
                batch["segment_ids"] = seg_ids[:, input_mask_indices]

                report = self._forward(batch)
                step_acc[step_idx] += report["targets"][
                    0, report["scores"].argmax()].item()

        processed += 1

    print("pert type {0}".format(pert_type))
    # Average over what was actually evaluated; guard against an empty run.
    denominator = max(processed, 1)
    step_acc = [acc / denominator * 100 for acc in step_acc]
    print(step_acc)
def finalize(self):
    """Write the trainer model's ``state_dict`` to ``self.pth_filepath``.

    Only the master process writes; other processes return immediately.
    """
    if not is_master():
        return

    state = self.trainer.model.state_dict()
    with PathManager.open(self.pth_filepath, "wb") as f:
        torch.save(state, f)