def download_from_google_drive(gd_id, destination, redownload=True):
    """
    Use the requests package to download a file from Google Drive.
    """
    download = not PathManager.isfile(destination) or redownload

    URL = "https://docs.google.com/uc?export=download"

    if not download:
        return download
    else:
        # Check first if link is live
        check_header(gd_id, from_google=True)

    with requests.Session() as session:
        response = session.get(URL, params={"id": gd_id}, stream=True)
        token = _get_confirm_token(response)

        if token:
            response.close()
            params = {"id": gd_id, "confirm": token}
            response = session.get(URL, params=params, stream=True)

        CHUNK_SIZE = 32768
        with PathManager.open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

        response.close()

    return download
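# A minimal usage sketch for the function above. The Drive id and destination
# path are placeholders (hypothetical), not files shipped with the repository.
def _example_download_from_google_drive():
    destination = "/tmp/example.tar.gz"
    # Returns False without touching the network when the file already exists
    # and redownload is False; otherwise streams the file in 32 KiB chunks.
    return download_from_google_drive(
        "PLACEHOLDER_DRIVE_ID", destination, redownload=False
    )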
def __init__(self, config: Config, *args, **kwargs):
    super().__init__()
    model_data_dir = get_absolute_path(config.model_data_dir)

    weights_file = config.weights_file
    bias_file = config.bias_file
    if not os.path.isabs(weights_file):
        weights_file = os.path.join(model_data_dir, weights_file)
    if not os.path.isabs(bias_file):
        bias_file = os.path.join(model_data_dir, bias_file)

    # Fall back to the pretrained detectron weights if either file is missing
    if not PathManager.exists(bias_file) or not PathManager.exists(weights_file):
        download_path = download_pretrained_model("detectron.vmb_weights")
        weights_file = get_absolute_path(os.path.join(download_path, "fc7_w.pkl"))
        bias_file = get_absolute_path(os.path.join(download_path, "fc7_b.pkl"))

    with PathManager.open(weights_file, "rb") as w:
        weights = pickle.load(w)
    with PathManager.open(bias_file, "rb") as b:
        bias = pickle.load(b)

    out_dim = bias.shape[0]
    self.lc = nn.Linear(config.in_dim, out_dim)
    self.lc.weight.data.copy_(torch.from_numpy(weights))
    self.lc.bias.data.copy_(torch.from_numpy(bias))
    self.out_dim = out_dim
def assert_files(self, folder):
    files_needed = self.JSONL_PHASE_ONE_FILES
    phase_one = True

    for file in files_needed:
        try:
            assert PathManager.exists(
                os.path.join(folder, "data", file)
            ), f"{file} doesn't exist in {folder}"
        except AssertionError:
            phase_one = False

    if not phase_one:
        files_needed = self.JSONL_PHASE_TWO_FILES
        for file in files_needed:
            assert PathManager.exists(
                os.path.join(folder, "data", file)
            ), f"{file} doesn't exist in {folder}"
    else:
        warnings.warn(
            "You are on Phase 1 of the Hateful Memes Challenge. "
            "Please update to Phase 2"
        )

    files_needed = self.IMAGE_FILES
    exists = False
    for file in files_needed:
        exists = exists or PathManager.exists(os.path.join(folder, "data", file))

    if not exists:
        raise AssertionError("Neither img nor img.tar.gz exists in the current zip")

    return phase_one
def _try_download(self):
    _is_master = is_master()

    if self._already_downloaded:
        return

    needs_download = False
    model_file = None

    if not hasattr(self.config, "model_file"):
        if _is_master:
            warnings.warn(
                "'model_file' key is required but missing "
                "from FastTextProcessor's config."
            )
        needs_download = True
    else:
        model_file = self.config.model_file
        # If model_file is already an existing path, don't join it to the cache dir
        if not PathManager.exists(model_file):
            model_file = os.path.join(get_multimodelity_cache_dir(), model_file)

        if not PathManager.exists(model_file):
            if _is_master:
                warnings.warn(f"No model file present at {model_file}.")
            needs_download = True

    if needs_download:
        logger.info("Downloading FastText bin")
        model_file = self._download_model()

    self.model_file = model_file
    self._already_downloaded = True
    synchronize()
def __init__(self, multi_task_instance):
    self.test_task = multi_task_instance
    self.task_type = multi_task_instance.dataset_type
    self.config = registry.get("config")
    self.report = []
    self.timer = Timer()
    self.training_config = self.config.training
    self.num_workers = self.training_config.num_workers
    self.batch_size = self.training_config.batch_size
    self.report_folder_arg = get_multimodelity_env(key="report_dir")
    self.experiment_name = self.training_config.experiment_name

    self.datasets = []

    for dataset in self.test_task.get_datasets():
        self.datasets.append(dataset)

    self.current_dataset_idx = -1
    self.current_dataset = self.datasets[self.current_dataset_idx]

    self.save_dir = get_multimodelity_env(key="save_dir")
    self.report_folder = ckpt_name_from_core_args(self.config)
    self.report_folder += foldername_from_config_override(self.config)

    self.report_folder = os.path.join(self.save_dir, self.report_folder)
    self.report_folder = os.path.join(self.report_folder, "reports")

    if self.report_folder_arg:
        self.report_folder = self.report_folder_arg

    PathManager.mkdirs(self.report_folder)
def make_dir(path):
    """
    Make the directory and any nonexistent parent directories (`mkdir -p`).
    """
    # the current working directory is a fine path
    if path != "":
        PathManager.mkdirs(path)
def resolve_cache_dir(env_variable="multimodelity_CACHE_DIR", default="multimodelity"):
    # Some of this follows what "transformers" does for its cache resolution
    try:
        from torch.hub import _get_torch_home

        torch_cache_home = _get_torch_home()
    except ImportError:
        torch_cache_home = os.path.expanduser(
            os.getenv(
                "TORCH_HOME",
                os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"),
            )
        )
    default_cache_path = os.path.join(torch_cache_home, default)

    cache_path = os.getenv(env_variable, default_cache_path)

    if not PathManager.exists(cache_path):
        try:
            PathManager.mkdirs(cache_path)
        except PermissionError:
            cache_path = os.path.join(get_multimodelity_root(), ".multimodelity_cache")
            PathManager.mkdirs(cache_path)

    return cache_path
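# A small illustration of the resolution order implemented above (the expanded
# paths in the comments are typical defaults, not guarantees):
def _example_resolve_cache_dir():
    # 1. $multimodelity_CACHE_DIR if set,
    # 2. otherwise <torch cache home>/multimodelity (e.g. ~/.cache/torch/multimodelity),
    # 3. falling back to <multimodelity root>/.multimodelity_cache on PermissionError.
    return resolve_cache_dir(env_variable="multimodelity_CACHE_DIR")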
def setup_output_folder(folder_only: bool = False):
    """Sets up and returns the output file where the logs will be placed
    based on the configuration passed. Usually "save_dir/logs/train_<timestamp>.log".
    If env.log_dir is passed, logs will be directly saved in this folder.

    Args:
        folder_only (bool, optional): If folder should be returned and not the file.
            Defaults to False.

    Returns:
        str: folder or file path depending on folder_only flag
    """
    save_dir = get_multimodelity_env(key="save_dir")
    time_format = "%Y_%m_%dT%H_%M_%S"
    log_filename = "train_"
    log_filename += Timer().get_time_hhmmss(None, format=time_format)
    log_filename += ".log"

    log_folder = os.path.join(save_dir, "logs")

    env_log_dir = get_multimodelity_env(key="log_dir")
    if env_log_dir:
        log_folder = env_log_dir

    if not PathManager.exists(log_folder):
        PathManager.mkdirs(log_folder)

    if folder_only:
        return log_folder

    log_filename = os.path.join(log_folder, log_filename)

    return log_filename
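# A minimal usage sketch (assumes "save_dir" is configured in the environment;
# the expanded paths in the comments are illustrative):
def _example_setup_output_folder():
    log_file = setup_output_folder()  # e.g. <save_dir>/logs/train_2024_01_01T00_00_00.log
    log_dir = setup_output_folder(folder_only=True)  # e.g. <save_dir>/logs
    return log_file, log_dir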
def test_logger_files(self) -> None:
    self.assertTrue(
        PathManager.exists(
            glob.glob(os.path.join(self._tmpdir, "logs", "train*"))[0]
        )
    )
    self.assertTrue(PathManager.exists(os.path.join(self._tmpdir, "train.log")))
    self.assertTrue(PathManager.exists(os.path.join(self._tmpdir, "logs")))
def load_annotation_db(self, path):
    # Expect two paths, one to questions and one to annotations
    assert (
        len(path) == 2
    ), "VQACPv2 requires 2 paths; one to questions and one to annotations"

    with PathManager.open(path[0]) as f:
        path_0 = json.load(f)
    with PathManager.open(path[1]) as f:
        path_1 = json.load(f)

    if "annotations" in path[0]:
        annotations = path_0
        questions = path_1
    else:
        annotations = path_1
        questions = path_0

    # Convert to linear format
    data = []
    question_dict = {}
    for question in questions:
        question_dict[question["question_id"]] = question["question"]

    for annotation in annotations:
        annotation["question"] = question_dict[annotation["question_id"]]
        answers = []
        for answer in annotation["answers"]:
            answers.append(answer["answer"])
        annotation["answers"] = answers
        data.append(copy.deepcopy(annotation))

    self.data = data
def checksum(self, download_path):
    """
    Checksum on a given file.

    Args:
        download_path (string): path to the downloaded file.
    """
    if self._hashcode is None:
        print(f"[ Checksum not provided, skipping for {self._file_name}]")
        return

    sha256_hash = hashlib.sha256()
    destination = os.path.join(download_path, self._file_name)

    if not PathManager.isfile(destination):
        # File is not present, nothing to checksum
        return

    with PathManager.open(destination, "rb") as f:
        print(f"[ Starting checksum for {self._file_name}]")
        for byte_block in iter(lambda: f.read(65536), b""):
            sha256_hash.update(byte_block)
        if sha256_hash.hexdigest() != self._hashcode:
            # remove_dir(download_path)
            raise AssertionError(
                f"[ Checksum for {self._file_name} from \n{self._url}\n"
                "does not match the expected checksum. Please try again. ]"
            )
        else:
            print(f"[ Checksum successful for {self._file_name}]")
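# A usage sketch for the method above, assuming a resource object that carries
# `_url`, `_file_name` and `_hashcode` as in the surrounding class (the values
# themselves are hypothetical):
def _example_checksum(resource, download_folder):
    # Raises AssertionError if the SHA-256 of <download_folder>/<_file_name>
    # doesn't match `_hashcode`; silently returns if the file is missing or
    # no hashcode was provided.
    resource.checksum(download_folder)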
def resolve_dir(env_variable, default="data"):
    default_dir = os.path.join(resolve_cache_dir(), default)
    dir_path = os.getenv(env_variable, default_dir)

    if not PathManager.exists(dir_path):
        PathManager.mkdirs(dir_path)

    return dir_path
def test_on_update_end(self):
    self.cb.on_train_start()
    self.cb.on_update_end(meter=self.trainer.meter, should_log=False)
    f = PathManager.open(os.path.join(self.tmpdir, "train.log"))
    self.assertFalse(any("time_since_start" in line for line in f.readlines()))

    self.cb.on_update_end(meter=self.trainer.meter, should_log=True)
    f = PathManager.open(os.path.join(self.tmpdir, "train.log"))
    self.assertTrue(any("time_since_start" in line for line in f.readlines()))
def test_log_writer(self) -> None:
    self.writer.info(self._tmpfile_write_contents)
    f = PathManager.open(glob.glob(os.path.join(self._tmpdir, "logs", "train*"))[0])
    self.assertTrue(
        any(self._tmpfile_write_contents in line for line in f.readlines())
    )
    f = PathManager.open(os.path.join(self._tmpdir, "train.log"))
    self.assertTrue(
        any(self._tmpfile_write_contents in line for line in f.readlines())
    )
def __init__(self, in_dim, weights_file, bias_file):
    super().__init__()
    with PathManager.open(weights_file, "rb") as w:
        weights = pickle.load(w)
    with PathManager.open(bias_file, "rb") as b:
        bias = pickle.load(b)
    out_dim = bias.shape[0]

    self.lc = nn.Linear(in_dim, out_dim)
    self.lc.weight.data.copy_(torch.from_numpy(weights))
    self.lc.bias.data.copy_(torch.from_numpy(bias))
    self.out_dim = out_dim
def _load_pretrained_checkpoint(checkpoint_path, *args, **kwargs):
    assert (
        os.path.splitext(checkpoint_path)[1] in ALLOWED_CHECKPOINT_EXTS
    ), f"Checkpoint must have one of the following extensions: {ALLOWED_CHECKPOINT_EXTS}"

    _hack_imports()

    with PathManager.open(checkpoint_path, "rb") as f:
        ckpt = torch.load(f, map_location=lambda storage, loc: storage)
    assert "config" in ckpt, (
        "No configs provided with pretrained model"
        " while checkpoint also doesn't have configuration."
    )
    config = ckpt.pop("config", None)
    model_config = config.get("model_config", config)

    ckpt = ckpt.get("model", ckpt)

    if "model_name" in kwargs:
        model_name = kwargs["model_name"]
    else:
        assert len(model_config.keys()) == 1, "Only one model type should be specified."
        model_name = list(model_config.keys())[0]

    model_config = model_config.get(model_name)
    return {"config": model_config, "checkpoint": ckpt, "full_config": config}
def _load_jsonl(self, path):
    with PathManager.open(path, "r") as f:
        db = f.readlines()
        for idx, line in enumerate(db):
            db[idx] = json.loads(line.strip("\n"))
        self.data = db
        self.start_idx = 0
def test_finalize_and_resume_file(self):
    with mock_env_with_temp() as d:
        checkpoint = Checkpoint(self.trainer)
        self._init_early_stopping(checkpoint)
        self._do_a_pass()
        checkpoint.finalize()
        original = deepcopy(self.trainer.model)

        pth_path = os.path.join(d, "simple_final.pth")
        self.assertTrue(PathManager.exists(pth_path))

        self._do_a_pass()

        after_a_pass = deepcopy(self.trainer.model)
        original_optimizer = deepcopy(self.trainer.optimizer)
        self.trainer.config.checkpoint.resume_file = pth_path

        with contextlib.redirect_stdout(StringIO()):
            checkpoint.load_state_dict()

        self.assertTrue(
            compare_state_dicts(
                self.trainer.model.state_dict(), original.state_dict()
            )
        )
        self.assertFalse(
            compare_state_dicts(
                self.trainer.model.state_dict(), after_a_pass.state_dict()
            )
        )
        self.assertTrue(
            self._compare_optimizers(self.trainer.optimizer, original_optimizer)
        )
def load_yaml(f):
    # Convert to absolute path for loading includes
    abs_f = get_absolute_path(f)

    try:
        mapping = OmegaConf.load(abs_f)
        f = abs_f
    except FileNotFoundError as e:
        # Check if this file might be relative to root?
        # TODO: Later test if this can be removed
        relative = os.path.abspath(os.path.join(get_multimodelity_root(), f))
        if not PathManager.isfile(relative):
            raise e
        else:
            f = relative
            mapping = OmegaConf.load(f)

    if mapping is None:
        mapping = OmegaConf.create()

    includes = mapping.get("includes", [])

    if not isinstance(includes, collections.abc.Sequence):
        raise AttributeError(
            "Includes must be a list, {} provided".format(type(includes))
        )

    include_mapping = OmegaConf.create()

    multimodelity_root_dir = get_multimodelity_root()

    for include in includes:
        original_include_path = include
        include = os.path.join(multimodelity_root_dir, include)

        # If the path doesn't exist relative to the multimodelity root,
        # try it relative to the current file
        if not PathManager.exists(include):
            include = os.path.join(os.path.dirname(f), original_include_path)

        current_include_mapping = load_yaml(include)
        include_mapping = OmegaConf.merge(include_mapping, current_include_mapping)

    mapping.pop("includes", None)

    mapping = OmegaConf.merge(include_mapping, mapping)

    return mapping
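# A minimal sketch of the "includes" behaviour implemented above. The file
# names are illustrative and not part of the repository:
#
#   # configs/experiments/example.yaml
#   includes:
#     - configs/defaults.yaml   # resolved against the multimodelity root first,
#                               # then against this file's own directory
#   training:
#     batch_size: 32
#
def _example_load_yaml():
    # Keys from the included files are merged first (in order), then overridden
    # by keys defined in the including file itself.
    return load_yaml("configs/experiments/example.yaml")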
def load_pretrained_model(model_name_or_path_or_checkpoint, *args, **kwargs):
    # If this is a file, load it directly; else download and load
    if PathManager.isfile(model_name_or_path_or_checkpoint):
        return _load_pretrained_checkpoint(
            model_name_or_path_or_checkpoint, *args, **kwargs
        )
    else:
        return _load_pretrained_model(model_name_or_path_or_checkpoint, *args, **kwargs)
def _load_json(self, path):
    with PathManager.open(path, "r") as f:
        data = json.load(f)
        self.metadata = data.get("metadata", {})
        self.data = data.get("data", [])

    if len(self.data) == 0:
        raise RuntimeError("Dataset is empty")
def restore(self):
    synchronize()
    logger.info("Restoring checkpoint")
    best_path = os.path.join(self.ckpt_foldername, self.ckpt_prefix + "best.ckpt")

    if PathManager.exists(best_path):
        self._load(best_path, force=True)
def _torch_load(self, file):
    # Backwards compatibility with Pythia
    _hack_imports()

    with PathManager.open(file, "rb") as f:
        if "cuda" in str(self.device):
            return torch.load(f, map_location=self.device)
        else:
            return torch.load(f, map_location=lambda storage, loc: storage)
def load_state_dict(self):
    ckpt_config = self.config.checkpoint

    suffix = "best.ckpt" if ckpt_config.resume_best else "current.ckpt"
    reverse_suffix = "best.ckpt" if not ckpt_config.resume_best else "current.ckpt"
    ckpt_filepath = os.path.join(self.ckpt_foldername, self.ckpt_prefix + suffix)

    # In case of interrupts and resume, ckpt_config.resume_file would be set.
    # But if the checkpoints already exist in the save dir and resume is true
    # (signifying an interrupt resume), we should skip loading the resume file.
    if (
        ckpt_config.resume_file is not None or ckpt_config.resume_zoo is not None
    ) and (not ckpt_config.resume or not PathManager.exists(ckpt_filepath)):
        if ckpt_config.resume_file and PathManager.exists(ckpt_config.resume_file):
            self._load(
                ckpt_config.resume_file,
                load_pretrained=ckpt_config.resume_pretrained,
            )
            return
        # resume_file doesn't exist, try from zoo now
        elif ckpt_config.resume_zoo is not None:
            self._load(
                ckpt_config.resume_zoo,
                load_zoo=True,
                load_pretrained=ckpt_config.resume_pretrained,
            )
            return
        else:
            raise RuntimeError(f"{ckpt_config.resume_file} doesn't exist")

    if ckpt_config.resume:
        if PathManager.exists(ckpt_filepath):
            self._load(ckpt_filepath)
        else:
            warnings.warn(
                "Tried to resume but checkpoint filepath {} "
                "is not present. Trying {}, otherwise skipping.".format(
                    ckpt_filepath, reverse_suffix
                )
            )
            ckpt_filepath = ckpt_filepath.replace(suffix, reverse_suffix)
            if PathManager.exists(ckpt_filepath):
                self._load(ckpt_filepath)
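# A sketch of the resume precedence encoded above, expressed as the checkpoint
# config keys the method already reads (values shown are illustrative only):
#
#   checkpoint:
#     resume: true            # load <ckpt_foldername>/<prefix>{current,best}.ckpt
#     resume_best: false      # prefer current.ckpt over best.ckpt
#     resume_file: null       # explicit file path, used when no local checkpoint exists
#     resume_zoo: null        # zoo key, used when resume_file is absent or missing
#     resume_pretrained: false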
def __init__(self, vocab_file, data_dir=None):
    if not os.path.isabs(vocab_file) and data_dir is not None:
        vocab_file = get_absolute_path(os.path.join(data_dir, vocab_file))

    if not PathManager.exists(vocab_file):
        raise RuntimeError(f"Vocab file {vocab_file} for vocab dict doesn't exist")

    self.word_list = load_str_list(vocab_file)
    self._build()
def get_default_config_path():
    directory = os.path.dirname(os.path.abspath(__file__))
    configs_dir = os.path.join(directory, "..", "configs")

    # Check for fb defaults
    fb_defaults = os.path.join(configs_dir, "fb_defaults.yaml")
    if PathManager.exists(fb_defaults):
        return fb_defaults
    else:
        return os.path.join(configs_dir, "defaults.yaml")
def _load_pretrained_model(model_name_or_path, *args, **kwargs):
    if PathManager.exists(model_name_or_path):
        download_path = model_name_or_path
        model_name = model_name_or_path
    else:
        download_path = download_pretrained_model(model_name_or_path, *args, **kwargs)
        model_name = model_name_or_path

    configs = glob.glob(os.path.join(download_path, "*.yaml"))
    assert len(configs) <= 1, (
        "Multiple yaml files with the pretrained model. "
        + "multimodelity doesn't know what to do."
    )

    ckpts = []
    allowed_ckpt_types = [f"*{ext}" for ext in ALLOWED_CHECKPOINT_EXTS]
    for ckpt_type in allowed_ckpt_types:
        ckpts.extend(glob.glob(os.path.join(download_path, ckpt_type)))

    assert (
        len(ckpts) == 1
    ), "None or multiple checkpoint files. multimodelity doesn't know what to do."

    _hack_imports()

    with PathManager.open(ckpts[0], "rb") as f:
        ckpt = torch.load(f, map_location=lambda storage, loc: storage)
    # If configs are not present, will ckpt provide the config?
    if len(configs) == 0:
        assert "config" in ckpt, (
            "No configs provided with pretrained model"
            " while checkpoint also doesn't have configuration."
        )
        config = ckpt["config"]
    else:
        config = load_yaml(configs[0])

    model_config = config.get("model_config", config)
    ckpt = ckpt.get("model", ckpt)
    # Also handle the case where model_name is a path
    model_config = model_config.get(model_name.split(os.path.sep)[-1].split(".")[0])

    return {"config": model_config, "checkpoint": ckpt, "full_config": config}
def get_possible_image_paths(path):
    image_path = path.split(".")
    # The image path might contain a file extension (e.g. .jpg);
    # in that case, we want the path without the extension
    image_path = image_path if len(image_path) == 1 else image_path[:-1]
    for ext in tv_helpers.IMG_EXTENSIONS:
        image_ext = ".".join(image_path) + ext
        if PathManager.isfile(image_ext):
            path = image_ext
            break
    return path
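# Behaviour sketch for the helper above (the file names are hypothetical):
def _example_get_possible_image_paths():
    # If "images/0001.png" doesn't exist on disk but "images/0001.jpg" does,
    # the ".jpg" variant is returned; if no known extension matches an existing
    # file, the input path is returned unchanged.
    return get_possible_image_paths("images/0001.png")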
def _download_model(self):
    _is_master = is_master()

    model_file_path = os.path.join(get_multimodelity_cache_dir(), "wiki.en.bin")

    if not _is_master:
        return model_file_path

    if PathManager.exists(model_file_path):
        logger.info(f"Vectors already present at {model_file_path}.")
        return model_file_path

    import requests
    from tqdm import tqdm

    from multimodelity.common.constants import FASTTEXT_WIKI_URL

    PathManager.mkdirs(os.path.dirname(model_file_path))
    response = requests.get(FASTTEXT_WIKI_URL, stream=True)

    with PathManager.open(model_file_path, "wb") as f:
        pbar = tqdm(
            total=int(response.headers["Content-Length"]) / 4096,
            miniters=50,
            disable=not _is_master,
        )

        idx = 0
        for data in response.iter_content(chunk_size=4096):
            if data:
                if idx % 50 == 0:
                    pbar.update(len(data))
                f.write(data)
                idx += 1

        pbar.close()

    logger.info(f"fastText bin downloaded at {model_file_path}.")

    return model_file_path
def _create_checkpoint_file(self, path):
    home = str(Path.home())
    data_dir = get_multimodelity_env(key="data_dir")
    model_folder = os.path.join(home, data_dir, "models", "mmbt.hateful_memes.images")
    model_file = os.path.join(model_folder, "model.pth")
    config_file = os.path.join(model_folder, "config.yaml")
    config = load_yaml(config_file)

    with PathManager.open(model_file, "rb") as f:
        ckpt = torch.load(f)

    ckpt["config"] = config
    torch.save(ckpt, path)