Example #1
    def _try_download(self):
        _is_master = is_master()

        if self._already_downloaded:
            return

        needs_download = False

        if not hasattr(self.config, "model_file"):
            if _is_master:
                warnings.warn("'model_file' key is required but missing "
                              "from FastTextProcessor's config.")
            needs_download = True

        if not needs_download:
            model_file = self.config.model_file
            # If model_file is already an existing path don't join to cache dir
            if not PathManager.exists(model_file):
                model_file = os.path.join(get_multimodelity_cache_dir(),
                                          model_file)

            if not PathManager.exists(model_file):
                if _is_master:
                    warnings.warn(f"No model file present at {model_file}.")
                needs_download = True

        if needs_download:
            logger.info("Downloading FastText bin")
            model_file = self._download_model()

        self.model_file = model_file
        self._already_downloaded = True
        synchronize()
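
The pattern above recurs throughout these examples: only the master (rank 0) process does one-time work such as downloads, and synchronize() acts as a barrier so the remaining ranks wait for it. A minimal sketch of such helpers on top of torch.distributed follows; the library's actual implementations may differ.

    import torch.distributed as dist

    def is_master() -> bool:
        # Rank 0 is treated as the master; a single-process run
        # (no process group initialized) is its own master.
        return (not dist.is_available() or not dist.is_initialized()
                or dist.get_rank() == 0)

    def synchronize() -> None:
        # Barrier so non-master ranks wait until the master has finished
        # its one-time work before everyone proceeds.
        if dist.is_available() and dist.is_initialized():
            dist.barrier()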
Example #2
    def evaluation_loop(
            self,
            loader,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Dict[str, Any], Meter]:
        meter = Meter()

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_master()
            combined_report = None

            for batch in tqdm.tqdm(loader, disable=disable_tqdm):
                report = self._forward(batch)
                self.update_meter(report, meter)

                # accumulate necessary params for metric calculation
                if combined_report is None:
                    combined_report = report
                else:
                    combined_report.accumulate_tensor_fields(
                        report, self.metrics.required_params)
                    combined_report.batch_size += report.batch_size

                if single_batch:
                    break

            combined_report.metrics = self.metrics(combined_report,
                                                   combined_report)
            self.update_meter(combined_report, meter, eval_mode=True)

            # enable train mode again
            self.model.train()

        return combined_report, meter
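
A hypothetical call site for the loop above (trainer and val_loader are illustrative names): run one evaluation pass and read the results off the returned report and meter.

    report, meter = trainer.evaluation_loop(val_loader, use_tqdm=True)
    print(report.metrics)  # metrics computed once over the accumulated report
    print(meter)           # running averages tracked batch by batch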
Example #3
    def flush_report(self):
        if not is_master():
            return

        name = self.current_dataset.dataset_name
        time_format = "%Y-%m-%dT%H:%M:%S"
        timestamp = self.timer.get_time_hhmmss(None, format=time_format)

        filename = name + "_"

        if len(self.experiment_name) > 0:
            filename += self.experiment_name + "_"

        filename += self.task_type + "_"
        filename += timestamp

        if self.config.evaluation.predict_file_format == "csv":
            filepath = os.path.join(self.report_folder, filename + ".csv")
            self.csv_dump(filepath)
        else:
            filepath = os.path.join(self.report_folder, filename + ".json")
            self.json_dump(filepath)

        logger.info(f"Wrote predictions for {name} to {os.path.abspath(filepath)}")
        self.report = []
Example #4
    def _threaded_read(self):
        elements = list(range(1, len(self.annotation_db)))
        pool = ThreadPool(processes=4)

        with tqdm.tqdm(total=len(elements), disable=not is_master()) as pbar:
            for i, _ in enumerate(
                    pool.imap_unordered(self._fill_cache, elements)):
                if i % 100 == 0:
                    pbar.update(100)
        pool.close()
Example #5
    def __init__(self, embedding_name, *args, **kwargs):
        """Use this if you want to use pretrained embedding. See description
        of IntersectedVocab to get a list of the embedding available from
        torchtext

        Parameters
        ----------
        embedding_name : str
            Name of the pretrained alias for the embedding to used
        """
        self.type = "pretrained"

        if embedding_name not in vocab.pretrained_aliases:
            raise RuntimeError(f"Unknown embedding type: {embedding_name}")

        vector_cache = get_multimodelity_cache_dir()

        # First test loading the vectors in master so that everybody doesn't
        # download it in case it doesn't exist
        if is_master():
            vocab.pretrained_aliases[embedding_name](cache=vector_cache)
        synchronize()

        embedding = vocab.pretrained_aliases[embedding_name](
            cache=vector_cache)

        self.UNK_INDEX = 3
        self.stoi = defaultdict(lambda: self.UNK_INDEX)
        self.itos = {}

        self.itos[self.PAD_INDEX] = self.PAD_TOKEN
        self.itos[self.SOS_INDEX] = self.SOS_TOKEN
        self.itos[self.EOS_INDEX] = self.EOS_TOKEN
        self.itos[self.UNK_INDEX] = self.UNK_TOKEN

        self.stoi[self.SOS_TOKEN] = self.SOS_INDEX
        self.stoi[self.EOS_TOKEN] = self.EOS_INDEX
        self.stoi[self.PAD_TOKEN] = self.PAD_INDEX
        self.stoi[self.UNK_TOKEN] = self.UNK_INDEX

        self.vectors = torch.FloatTensor(
            len(self.itos) + len(embedding.itos),
            len(embedding.vectors[0]))

        for i in range(4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        index = 4
        for word in embedding.stoi:
            self.itos[index] = word
            self.stoi[word] = index
            actual_index = embedding.stoi[word]
            self.vectors[index] = embedding.vectors[actual_index]
            index += 1
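
The first four rows of self.vectors are reserved for the special tokens. A self-contained sketch of that initialisation, assuming PAD/SOS/EOS/UNK occupy indices 0-3 and a 300-dimensional embedding:

    import torch

    dim = 300
    vectors = torch.empty(4, dim)
    for i in range(4):
        # Row i is the constant 0.1 * i: PAD -> 0.0, SOS -> 0.1, and so on
        vectors[i] = torch.full((dim,), 0.1 * i)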
Example #6
    def __init__(self, dataset_type="train"):
        self._dataset_type = dataset_type
        self._is_master = is_master()

        self._datasets = []
        self._loaders = []
        self._samplers = []
        self._iterators = []

        self._total_length = 0
        self._per_dataset_lengths = []
        self._num_datasets = 0
        self._finished_iterators = {}
Example #7
    def try_fast_read(self):
        # Don't fast read in case of test set.
        if self._dataset_type == "test":
            return

        if getattr(self, "_should_fast_read", False):
            logger.info(
                f"Starting to fast read {self.dataset_name} {self.dataset_type} "
                + "dataset")
            self.cache = {}
            for idx in tqdm.tqdm(range(len(self.annotation_db)),
                                 miniters=100,
                                 disable=not is_master()):
                self.cache[idx] = self.load_item(idx)
Example #8
def download_pretrained_model(model_name, *args, **kwargs):
    import omegaconf
    from omegaconf import OmegaConf
    from multimodelity.utils.configuration import get_multimodelity_env, load_yaml

    model_zoo = load_yaml(get_multimodelity_env(key="model_zoo"))
    OmegaConf.set_struct(model_zoo, True)
    OmegaConf.set_readonly(model_zoo, True)

    data_dir = get_absolute_path(get_multimodelity_env("data_dir"))
    model_data_dir = os.path.join(data_dir, "models")
    download_path = os.path.join(model_data_dir, model_name)

    try:
        model_config = OmegaConf.select(model_zoo, model_name)
    except omegaconf.errors.OmegaConfBaseException as e:
        print(f"No such model name {model_name} defined in multimodelity zoo")
        raise e

    if "version" not in model_config or "resources" not in model_config:
        # Version and resources are not present, so try the defaults
        try:
            model_config = model_config.defaults
            download_path = os.path.join(model_data_dir,
                                         model_name + ".defaults")
        except omegaconf.errors.OmegaConfBaseException as e:
            print(
                f"Model name {model_name} doesn't specify 'resources' and 'version' "
                "and no defaults have been provided")
            raise e

    # Download requirements if any specified by "zoo_requirements" field
    # This can either be a list or a string
    if "zoo_requirements" in model_config:
        requirements = model_config.zoo_requirements
        if isinstance(requirements, str):
            requirements = [requirements]
        for item in requirements:
            download_pretrained_model(item, *args, **kwargs)

    version = model_config.version
    resources = model_config.resources

    if is_master():
        download_resources(resources, download_path, version)
    synchronize()

    return download_path
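
Hypothetical usage (the zoo key below is illustrative): the master rank downloads the resources, every other rank blocks at synchronize(), and all ranks receive the same local path.

    path = download_pretrained_model("visual_bert.pretrained.coco")
    # path is e.g. <data_dir>/models/visual_bert.pretrained.coco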
Example #9
    def __init__(self, log_folder="./logs", iteration=0):
        # Importing here surfaces a clear error if tensorboard is not installed
        from torch.utils.tensorboard import SummaryWriter

        self.summary_writer = None
        self._is_master = is_master()
        self.timer = Timer()
        self.log_folder = log_folder
        self.time_format = "%Y-%m-%dT%H:%M:%S"

        if self._is_master:
            current_time = self.timer.get_time_hhmmss(None, format=self.time_format)
            tensorboard_folder = os.path.join(
                self.log_folder, f"tensorboard_{current_time}"
            )
            self.summary_writer = SummaryWriter(tensorboard_folder)
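
Only the master rank owns a SummaryWriter; on every other rank summary_writer stays None, so callers should guard on it. A hypothetical usage, with TensorboardLogger as an assumed name for the class above:

    tb_logger = TensorboardLogger(log_folder="./logs")
    if tb_logger.summary_writer is not None:
        tb_logger.summary_writer.add_scalar("train/loss", 0.5, global_step=0)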
Example #10
    def add_to_report(self, report, model):
        keys = ["id", "question_id", "image_id", "context_tokens", "captions", "scores"]
        for key in keys:
            report = self.reshape_and_gather(report, key)

        if not is_master():
            return

        results = self.current_dataset.format_for_prediction(report)

        if hasattr(model, "format_for_prediction"):
            results = model.format_for_prediction(results, report)
        elif hasattr(model, "module") and hasattr(model.module,
                                                  "format_for_prediction"):
            results = model.module.format_for_prediction(results, report)

        self.report = self.report + results
Example #11
    def build_dataset(self, config, dataset_type="train", *args, **kwargs):
        """
        Similar to the load function, used by multimodelity to build a dataset
        for the first time when it is not available. This internally calls the
        ``build`` function. Override that function in your child class.

        Args:
            config (DictConfig): Configuration of this dataset loaded from
                                 config.
            dataset_type (str): Type of dataset, train|val|test

        .. warning::

            DO NOT OVERRIDE in child class. Instead override ``build``.
        """
        # Only build in main process, so none of the others have to build
        if is_master():
            self.build(config, dataset_type, *args, **kwargs)
        synchronize()
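
A minimal sketch of the intended extension point, using hypothetical names (MyBuilder, BaseDatasetBuilder, download_and_extract): child classes override build, and build_dataset keeps the master-only guard.

    class MyBuilder(BaseDatasetBuilder):
        def build(self, config, dataset_type="train", *args, **kwargs):
            # One-time, master-only work: fetch archives, write processed
            # files, etc. (download_and_extract is a placeholder)
            download_and_extract(config.data_url, config.data_dir)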
Example #12
    def load(self):
        self.image_path = os.path.join(self._data_folder,
                                       _CONSTANTS["images_folder"],
                                       self._dataset_type)

        with open(
                os.path.join(
                    self._data_folder,
                    _CONSTANTS["questions_folder"],
                    _TEMPLATES["question_json_file"].format(
                        self._dataset_type),
                )) as f:
            self.questions = json.load(f)[_CONSTANTS["questions_key"]]

            # Vocab should only be built in the main process, since building
            # it in every process would repeat the same work
            if is_master():
                self._build_vocab(self.questions, _CONSTANTS["question_key"])
                self._build_vocab(self.questions, _CONSTANTS["answer_key"])
            synchronize()
Example #13
    def __call__(self, update, iteration, meter):
        """
        Method to be called every time you need to check whether to
        early stop or not
        Arguments:
            update {number}: Current update number
            iteration {number}: Current iteration number
        Returns:
            bool -- Tells whether early stopping occurred or not
        """
        if not is_master():
            return False

        value = meter.meters.get(self.early_stop_criteria, None)
        if value is None:
            raise ValueError("Criteria used for early stopping ({}) is not "
                             "present in meter.".format(
                                 self.early_stop_criteria))

        value = value.global_avg

        if isinstance(value, torch.Tensor):
            value = value.item()

        if (self.minimize and value < self.best_monitored_value) or (
                not self.minimize and value > self.best_monitored_value):
            self.best_monitored_value = value
            self.best_monitored_iteration = iteration
            self.best_monitored_update = update
            self.checkpoint.save(update, iteration, update_best=True)

        elif self.best_monitored_update + self.patience < update:
            self.activated = True
            if self.should_stop:
                self.checkpoint.restore()
                self.checkpoint.finalize()
                return True
            else:
                return False
        else:
            self.checkpoint.save(update, iteration, update_best=False)

        return False
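
Hypothetical integration in a training loop (names are illustrative): call the early-stopping object after each update and stop when it returns True.

    for update in range(max_updates):
        iteration = train_one_update()
        if early_stopping(update, iteration, meter):
            # Best checkpoint has been restored and finalized; stop training
            break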
Example #14
    def _download_model(self):
        _is_master = is_master()

        model_file_path = os.path.join(get_multimodelity_cache_dir(),
                                       "wiki.en.bin")

        if not _is_master:
            return model_file_path

        if PathManager.exists(model_file_path):
            logger.info(f"Vectors already present at {model_file_path}.")
            return model_file_path

        import requests
        from tqdm import tqdm

        from multimodelity.common.constants import FASTTEXT_WIKI_URL

        PathManager.mkdirs(os.path.dirname(model_file_path))
        response = requests.get(FASTTEXT_WIKI_URL, stream=True)

        with PathManager.open(model_file_path, "wb") as f:
            # Track progress in bytes; Content-Length gives the total size
            pbar = tqdm(
                total=int(response.headers["Content-Length"]),
                miniters=50,
                disable=not _is_master,
            )

            for data in response.iter_content(chunk_size=4096):
                if data:
                    pbar.update(len(data))
                    f.write(data)

            pbar.close()

        logger.info(f"fastText bin downloaded at {model_file_path}.")

        return model_file_path
Example #15
    def finalize(self):
        if is_master():
            with PathManager.open(self.pth_filepath, "wb") as f:
                torch.save(self.trainer.model.state_dict(), f)
Example #16
    def save(self, update, iteration=None, update_best=False):
        # Only save in main process
        if not is_master():
            return

        if not iteration:
            iteration = update

        ckpt_filepath = os.path.join(self.models_foldername,
                                     f"model_{update}.ckpt")
        best_ckpt_filepath = os.path.join(self.ckpt_foldername,
                                          self.ckpt_prefix + "best.ckpt")
        current_ckpt_filepath = os.path.join(self.ckpt_foldername,
                                             self.ckpt_prefix + "current.ckpt")

        early_stopping = self.trainer.early_stop_callback.early_stopping
        best_iteration = early_stopping.best_monitored_iteration
        best_update = early_stopping.best_monitored_update
        best_metric = early_stopping.best_monitored_value
        model = self.trainer.model
        data_parallel = registry.get("data_parallel") or registry.get(
            "distributed")
        fp16_scaler = getattr(self.trainer, "scaler", None)
        fp16_scaler_dict = None

        if fp16_scaler is not None:
            fp16_scaler_dict = fp16_scaler.state_dict()

        if data_parallel is True:
            model = model.module

        ckpt = {
            "model": model.state_dict(),
            "optimizer": self.trainer.optimizer.state_dict(),
            "best_iteration": best_iteration,
            "current_iteration": iteration,
            "current_epoch": self.trainer.current_epoch,
            "num_updates": update,
            "best_update": best_update,
            "best_metric_value": best_metric,
            "fp16_scaler": fp16_scaler_dict,
            # Convert to container to avoid any dependencies
            "config": OmegaConf.to_container(self.config, resolve=True),
        }

        lr_scheduler = self.trainer.lr_scheduler_callback._scheduler
        if lr_scheduler is not None:
            ckpt["lr_scheduler"] = lr_scheduler.state_dict()

        if self.git_repo:
            git_metadata_dict = self._get_vcs_fields()
            ckpt.update(git_metadata_dict)

        with PathManager.open(ckpt_filepath, "wb") as f:
            torch.save(ckpt, f)

        if update_best:
            with PathManager.open(best_ckpt_filepath, "wb") as f:
                torch.save(ckpt, f)

        # Save current always
        with PathManager.open(current_ckpt_filepath, "wb") as f:
            torch.save(ckpt, f)

        # Remove old checkpoints if max_to_keep is set
        if self.max_to_keep > 0:
            if len(self.saved_iterations) == self.max_to_keep:
                self.remove(self.saved_iterations.pop(0))
            self.saved_iterations.append(update)
Example #17
    def __init__(self, vocab_file, embedding_name, *args, **kwargs):
        """Use this vocab class when you have a custom vocabulary class but you
        want to use pretrained embedding vectors for it. This will only load
        the vectors which intersect with your vocabulary. Use the
        embedding_name specified in torchtext's pretrained aliases:
        ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
         'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
         'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
         'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
         'glove.6B.200d', 'glove.6B.300d']

        Parameters
        ----------
        vocab_file : str
            Vocabulary file containing list of words with one word per line
            which will be used to collect vectors
        embedding_name : str
            Embedding name picked up from the list of the pretrained aliases
            mentioned above
        """
        super().__init__(vocab_file, *args, **kwargs)

        self.type = "intersected"

        parts = embedding_name.split(".")
        name = parts[0]
        middle = parts[1]
        dim = parts[2][:-1]

        class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

        if not hasattr(vocab, class_name):
            raise RuntimeError(f"Unknown embedding type: {name}")

        params = [middle]

        if name == "glove":
            params.append(int(dim))

        vector_cache = get_multimodelity_cache_dir()

        # First test loading the vectors in master so that everybody doesn't
        # download it in case it doesn't exist
        if is_master():
            vocab.pretrained_aliases[embedding_name](cache=vector_cache)
        synchronize()

        embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

        self.vectors = torch.empty(
            (self.get_size(), len(embedding.vectors[0])), dtype=torch.float)

        self.embedding_dim = len(embedding.vectors[0])

        for i in range(4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        for i in range(4, self.get_size()):
            word = self.itos[i]
            embedding_index = embedding.stoi.get(word, None)

            if embedding_index is None:
                self.vectors[i] = self.vectors[self.UNK_INDEX]
            else:
                self.vectors[i] = embedding.vectors[embedding_index]
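
Hypothetical usage (the vocabulary file name is illustrative): back a custom word list with GloVe vectors, filling rows only for words that appear in the file and mapping everything else to the UNK row.

    iv = IntersectedVocab("vocab.txt", "glove.6B.300d")
    print(iv.get_size(), iv.embedding_dim)  # rows 0-3 are special tokens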