Example #1
    def _try_download(self):
        is_main_process = self._is_main_process()

        if self._already_downloaded:
            return

        if is_main_process:
            self.writer.write("Fetching fastText model for OCR processing")

        needs_download = False

        model_file = getattr(self.config, "model_file", None)

        if model_file is None:
            if is_main_process:
                warnings.warn("'model_file' key is required but missing "
                              "from FastTextProcessor's config.")
            needs_download = True
        else:
            model_file = os.path.join(get_pythia_root(), model_file)

            if not os.path.exists(model_file):
                if is_main_process:
                    warnings.warn(
                        "No model file present at {}.".format(model_file))
                needs_download = True

        if needs_download:
            if is_main_process:
                self.writer.write("Downloading FastText bin", "info")
            model_file = self._download_model()

        synchronize()

        self._load_fasttext_model(model_file)
        self._already_downloaded = True
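The method above follows the usual distributed-download idiom: only the main process reports progress and triggers the download, every worker then waits at a barrier, and all processes load the resulting file. A minimal sketch of what the synchronize() call is assumed to wrap (pythia's actual helper may differ):

import torch.distributed as dist

def synchronize():
    # Barrier so that non-main workers block until the main process
    # has finished downloading the fastText model.
    if dist.is_available() and dist.is_initialized():
        dist.barrier()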
Example #2
    def __init__(self,
                 dataset_type,
                 config,
                 data_folder=None,
                 *args,
                 **kwargs):
        super().__init__(_CONSTANTS["dataset_key"], dataset_type, config)
        self._data_folder = data_folder
        self._data_root_dir = os.path.join(get_pythia_root(),
                                           config.data_root_dir)

        if not self._data_folder:
            self._data_folder = os.path.join(self._data_root_dir,
                                             config.data_folder)

        if not os.path.exists(self._data_folder):
            raise RuntimeError(_TEMPLATES["data_folder_missing_error"].format(
                self._data_folder))

        # Check if the archive was actually extracted into a nested subfolder
        if config.data_folder in os.listdir(self._data_folder):
            self._data_folder = os.path.join(self._data_folder,
                                             config.data_folder)

        if len(os.listdir(self._data_folder)) == 0:
            raise FileNotFoundError(_CONSTANTS["empty_folder_error"])

        self._load()
Example #3
    def _build(self, dataset_type, config):
        download_folder = os.path.join(get_pythia_root(), config.data_root_dir, config.data_folder)

        file_name = CLEVR_DOWNLOAD_URL.split("/")[-1]
        local_filename = os.path.join(download_folder, file_name)

        extraction_folder = os.path.join(download_folder, ".".join(file_name.split(".")[:-1]))
        self.data_folder = extraction_folder

        # Skip the download if the zip file is already present or if the
        # extraction folder already contains files
        if os.path.exists(local_filename):
            self.writer.write("CLEVR dataset is already present. Skipping download.")
            return

        if os.path.exists(extraction_folder) and \
            len(os.listdir(extraction_folder)) != 0:
            return

        self.writer.write("Downloading the CLEVR dataset now")
        download_file(CLEVR_DOWNLOAD_URL, output_dir=download_folder)

        self.writer.write("Downloaded. Extracting now. This can take time.")
        with zipfile.ZipFile(local_filename, "r") as zip_ref:
            zip_ref.extractall(download_folder)
Example #4
    def load_yaml(self, file):
        with open(file, "r") as stream:
            mapping = yaml.safe_load(stream)

            if mapping is None:
                mapping = {}

            includes = mapping.get("includes", [])

            if not isinstance(includes, list):
                raise AttributeError(
                    "Includes must be a list, {} provided".format(
                        type(includes)))
            include_mapping = {}

            pythia_root_dir = get_pythia_root()

            for include in includes:
                include = os.path.join(pythia_root_dir, include)
                current_include_mapping = self.load_yaml(include)
                include_mapping = self.nested_dict_update(
                    include_mapping, current_include_mapping)

            mapping.pop("includes", None)

            mapping = self.nested_dict_update(include_mapping, mapping)

            return mapping
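load_yaml merges each included file into a single mapping with nested_dict_update, and the including file's own keys are applied last so they take precedence. A minimal sketch of the recursive merge that helper is assumed to perform:

def nested_dict_update(base, update):
    # Keys from `update` win, but nested dicts are merged key by key
    # instead of being replaced wholesale.
    for key, value in update.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            base[key] = nested_dict_update(base[key], value)
        else:
            base[key] = value
    return base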
Example #5
    def __init__(self,
                 vocab_file=None,
                 embedding_dim=300,
                 data_root_dir=None,
                 *args,
                 **kwargs):
        """Vocab class to be used when you want to train word embeddings from
        scratch based on a custom vocab. This will initialize the random
        vectors for the vocabulary you pass. Get the vectors using the
        `get_vectors` function. This will also create random embeddings for
        some predefined words like PAD - <pad>, SOS - <s>, EOS - </s>,
        UNK - <unk>.

        Parameters
        ----------
        vocab_file : str
            Path of the vocabulary file containing one word per line
        embedding_dim : int
            Size of the embedding

        """
        self.type = "base"
        self.word_dict = {}
        self.itos = {}

        self.itos[self.PAD_INDEX] = self.PAD_TOKEN
        self.itos[self.SOS_INDEX] = self.SOS_TOKEN
        self.itos[self.EOS_INDEX] = self.EOS_TOKEN
        self.itos[self.UNK_INDEX] = self.UNK_TOKEN

        self.word_dict[self.SOS_TOKEN] = self.SOS_INDEX
        self.word_dict[self.EOS_TOKEN] = self.EOS_INDEX
        self.word_dict[self.PAD_TOKEN] = self.PAD_INDEX
        self.word_dict[self.UNK_TOKEN] = self.UNK_INDEX

        index = len(self.itos.keys())

        self.total_predefined = len(self.itos.keys())

        if vocab_file is not None:
            if not os.path.isabs(vocab_file) and data_root_dir is not None:
                pythia_root = get_pythia_root()
                vocab_file = os.path.join(pythia_root, data_root_dir,
                                          vocab_file)
            if not os.path.exists(vocab_file):
                raise RuntimeError("Vocab not found at " + vocab_file)

            with open(vocab_file, "r") as f:
                for line in f:
                    self.itos[index] = line.strip()
                    self.word_dict[line.strip()] = index
                    index += 1

        self.word_dict[self.UNK_TOKEN] = self.UNK_INDEX
        # Return unk index by default
        self.stoi = defaultdict(lambda: self.UNK_INDEX)
        self.stoi.update(self.word_dict)

        self.vectors = torch.FloatTensor(self.get_size(), embedding_dim)
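A hedged usage sketch for the vocab above; the class name BaseVocab and the vocab file path are assumptions, and the docstring only guarantees a get_vectors method returning the randomly initialized matrix:

import torch

vocab = BaseVocab(vocab_file="vocabs/my_vocab.txt", embedding_dim=300,
                  data_root_dir="data")
# Wrap the random vectors in an embedding layer that is trained from scratch.
embedding = torch.nn.Embedding.from_pretrained(vocab.get_vectors(), freeze=False)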
Example #6
 def __init__(self, config, *args, **kwargs):
     self.max_length = config.max_length
     pythia_root = get_pythia_root()
     VOCAB = 'bert-base-uncased-vocab.txt'
     self.bert_tokenizer = BertTokenizer.from_pretrained(
         os.path.join(pythia_root, config.model_data_dir, 'bert', VOCAB))
     assert self.bert_tokenizer.encode(self.bert_tokenizer.pad_token) == [0]
     self.get_qgen_inds = getattr(config, 'get_qgen_inds', False)
     if self.get_qgen_inds:
         print('computing question generation indices in bert tokenizer')
Example #7
    def _load_fasttext_model(self, model_file):
        from fasttext import load_model
        from pythia.common.registry import registry

        pythia_root = get_pythia_root()
        model_file = os.path.join(pythia_root, model_file)

        registry.get("writer").write("Loading fasttext model now from %s" % model_file)

        self.model = load_model(model_file)
        self.stoi = WordToVectorDict(self.model)
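WordToVectorDict is a thin dict-like wrapper over the loaded fastText model; a sketch of what such a wrapper plausibly does (the real class may handle multi-word tokens differently):

import numpy as np

class WordToVectorDict:
    def __init__(self, model):
        self.model = model

    def __getitem__(self, word):
        # Average the fastText vectors of the whitespace-split pieces so that
        # multi-word OCR tokens still map to a single vector.
        return np.mean(
            [self.model.get_word_vector(w) for w in word.split(" ")], axis=0
        )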
Example #8
    def __init__(self, embedding_name, *args, **kwargs):
        """Use this if you want to use pretrained embedding. See description
        of IntersectedVocab to get a list of the embedding available from
        torchtext

        Parameters
        ----------
        embedding_name : str
            Name of the pretrained alias for the embedding to be used
        """
        self.type = "pretrained"

        if embedding_name not in vocab.pretrained_aliases:
            from pythia.common.registry import registry

            writer = registry.get("writer")
            error = "Unknown embedding type: %s" % embedding_name, "error"
            if writer is not None:
                writer.write(error, "error")
            raise RuntimeError(error)

        vector_cache = os.path.join(get_pythia_root(), ".vector_cache")

        embedding = vocab.pretrained_aliases[embedding_name](
            cache=vector_cache)

        self.UNK_INDEX = 3
        self.stoi = defaultdict(lambda: self.UNK_INDEX)
        self.itos = {}

        self.itos[self.PAD_INDEX] = self.PAD_TOKEN
        self.itos[self.SOS_INDEX] = self.SOS_TOKEN
        self.itos[self.EOS_INDEX] = self.EOS_TOKEN
        self.itos[self.UNK_INDEX] = self.UNK_TOKEN

        self.stoi[self.SOS_TOKEN] = self.SOS_INDEX
        self.stoi[self.EOS_TOKEN] = self.EOS_INDEX
        self.stoi[self.PAD_TOKEN] = self.PAD_INDEX
        self.stoi[self.UNK_TOKEN] = self.UNK_INDEX

        self.vectors = torch.FloatTensor(
            len(self.itos.keys()) + len(embedding.itos),
            len(embedding.vectors[0]))

        for i in range(4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        index = 4
        for word in embedding.stoi:
            self.itos[index] = word
            self.stoi[word] = index
            actual_index = embedding.stoi[word]
            self.vectors[index] = embedding.vectors[actual_index]
            index += 1
Example #9
 def __init__(self, dataset_type, config):
     super().__init__('imagenet', dataset_type, config)
     self.feature_extractor = VQAMaskRCNNBenchmark()
     self.feature_extractor.to(device)
     self.config = config
     # directory to store annotations (captions)
     self.annotation_dir = os.path.join(get_pythia_root(),
                                        config.data_root_dir,
                                        config.annotation_dir)
     # directory to store images
     self.image_dir = os.path.join(get_pythia_root(), config.data_root_dir,
                                   config.image_dir)
     self.annotations = []
     for annotation_file in os.listdir(self.annotation_dir):
         with open(os.path.join(self.annotation_dir, annotation_file)) as f:
             annotation = json.load(f)
             for item in annotation.items():
                 # each item in annotations is a (image_id, caption) tuple
                 self.annotations.append(item)
     self.init_processors()
Example #10
 def _get_absolute_path(self, paths):
     if isinstance(paths, list):
         return [self._get_absolute_path(path) for path in paths]
     elif isinstance(paths, str):
         if not os.path.isabs(paths):
             pythia_root = get_pythia_root()
             paths = os.path.join(pythia_root, self.config.data_root_dir,
                                  paths)
         return paths
     else:
         raise TypeError("Paths passed to dataset should either be "
                         "string or list")
Example #11
    def __init__(self, vocab_file, data_root_dir=None):
        if not os.path.isabs(vocab_file) and data_root_dir is not None:
            pythia_root = get_pythia_root()
            vocab_file = os.path.abspath(os.path.join("data", vocab_file))

        if not os.path.exists(vocab_file):
            raise RuntimeError(
                "Vocab file {} for vocab dict doesn't exist".format(
                    vocab_file))

        self.word_list = load_str_list(vocab_file)
        self._build()
Example #12
 def setUp(self):
     torch.manual_seed(1234)
     registry.register("clevr_text_vocab_size", 80)
     registry.register("clevr_num_final_outputs", 32)
     config_path = os.path.join(get_pythia_root(), "..", "configs", "vqa",
                                "clevr", "cnn_lstm.yml")
     config_path = os.path.abspath(config_path)
     configuration = Configuration(config_path)
     configuration.config["datasets"] = "clevr"
     configuration.freeze()
     self.config = configuration.config
     registry.register("config", self.config)
Example #13
    def __init__(self,
                 vocab_file,
                 embedding_file,
                 data_root_dir=None,
                 *args,
                 **kwargs):
        """Use this vocab class when you have a custom vocab as well as a
        custom embeddings file.

        This will inherit the vocab class, so you will get the predefined
        tokens with this one as well.

        IMPORTANT: To init your embedding, get your vectors from this class's
        object by calling the `get_vectors` function.

        Parameters
        ----------
        vocab_file : str
            Path of custom vocabulary
        embedding_file : str
            Path to custom embedding initialization file
        data_root_dir : str
            Path to data directory if embedding file is not an absolute path.
            Default: None
        """
        super(CustomVocab, self).__init__(vocab_file)
        self.type = "custom"

        if not os.path.isabs(embedding_file) and data_root_dir is not None:
            pythia_root = get_pythia_root()
            embedding_file = os.path.join(pythia_root, data_root_dir,
                                          embedding_file)

        if not os.path.exists(embedding_file):
            from pythia.common.registry import registry

            writer = registry.get("writer")
            error = "Embedding file path %s doesn't exist" % embedding_file
            if writer is not None:
                writer.write(error, "error")
            raise RuntimeError(error)

        embedding_vectors = torch.from_numpy(np.load(embedding_file))

        self.vectors = torch.FloatTensor(self.get_size(),
                                         len(embedding_vectors[0]))

        for i in range(0, 4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        for i in range(4, self.get_size()):
            self.vectors[i] = embedding_vectors[i - 4]
Example #14
 def setUp(self):
     torch.manual_seed(1234)
     config_path = os.path.join(get_pythia_root(), "..", "configs",
                                "captioning", "coco",
                                "butd_nucleus_sampling.yml")
     config_path = os.path.abspath(config_path)
     configuration = Configuration(config_path)
     configuration.config["datasets"] = "coco"
     configuration.config["model_attributes"]["butd"]["inference"][
         "params"]["sum_threshold"] = 0.5
     configuration.freeze()
     self.config = configuration.config
     registry.register("config", self.config)
Example #15
    def _build(self, dataset_type, config):
        self._dataset_type = dataset_type
        self._config = config
        data_folder = os.path.join(get_pythia_root(),
                                   self._config.data_root_dir)

        # Since the imdb tar file contains all of the sets, we only download
        # it in the train case
        if self._dataset_type != "train":
            return

        self._download_and_extract_imdb(data_folder)
        self._download_and_extract_features(data_folder)
Example #16
    def __init__(self, vocab_file, data_root_dir=None):
        if not os.path.isabs(vocab_file) and data_root_dir is not None:
            pythia_root = get_pythia_root()
            vocab_file = os.path.abspath(
                os.path.join(pythia_root, data_root_dir, vocab_file))

        if not os.path.exists(vocab_file):
            raise RuntimeError(
                "Vocab file {} for vocab dict doesn't exist".format(
                    vocab_file))

        self.word_list = load_str_list(vocab_file)
        self.word2idx_dict = {w: n_w for n_w, w in enumerate(self.word_list)}
        self.num_vocab = len(self.word_list)
        self.UNK_INDEX = (self.word2idx_dict["<unk>"]
                          if "<unk>" in self.word2idx_dict else None)
Example #17
    def _download_model(self):
        is_main_process = self._is_main_process()

        model_file_path = os.path.join(
            get_pythia_root(), ".vector_cache", "wiki.en.bin"
        )

        if not is_main_process:
            return model_file_path

        if os.path.exists(model_file_path):
            if is_main_process:
                self.writer.write(
                    "Vectors already present at {}.".format(model_file_path), "info"
                )
            return model_file_path

        import requests
        from pythia.common.constants import FASTTEXT_WIKI_URL
        from tqdm import tqdm

        os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
        response = requests.get(FASTTEXT_WIKI_URL, stream=True)

        with open(model_file_path, "wb") as f:
            pbar = tqdm(
                total=int(response.headers["Content-Length"]) / 4096,
                miniters=50,
                disable=not is_main_process,
            )

            idx = 0
            for data in response.iter_content(chunk_size=4096):
                if data:
                    if idx % 50 == 0:
                        pbar.update(len(data))
                    f.write(data)
                    idx += 1

            pbar.close()

        if is_main_process:
            self.writer.write(
                "fastText bin downloaded at {}.".format(model_file_path), "info"
            )

        return model_file_path
Example #18
    def __init__(self, in_dim, weights_file, bias_file, model_data_dir):
        super(FinetuneFasterRcnnFpnFc7, self).__init__()
        pythia_root = get_pythia_root()
        model_data_dir = os.path.join(pythia_root, model_data_dir)

        if not os.path.isabs(weights_file):
            weights_file = os.path.join(model_data_dir, weights_file)
        if not os.path.isabs(bias_file):
            bias_file = os.path.join(model_data_dir, bias_file)
        with open(weights_file, "rb") as w:
            weights = pickle.load(w)
        with open(bias_file, "rb") as b:
            bias = pickle.load(b)
        out_dim = bias.shape[0]

        self.lc = nn.Linear(in_dim, out_dim)
        self.lc.weight.data.copy_(torch.from_numpy(weights))
        self.lc.bias.data.copy_(torch.from_numpy(bias))
        self.out_dim = out_dim
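The example only shows the constructor; a forward pass consistent with it would project the region features through the restored fc7 layer and apply a ReLU. A sketch, not necessarily the exact pythia implementation:

    def forward(self, image_features):
        # Project pooled region features through the restored fc7 weights.
        return nn.functional.relu(self.lc(image_features))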
Example #19
    def _build_txt_encoding(self):
        TEXT_BERT_HIDDEN_SIZE = 768

        self.text_bert_config = BertConfig(**self.config.text_bert)
        if self.config.text_bert_init_from_bert_base:
            pythia_root = get_pythia_root()
            self.text_bert = TextBert.from_pretrained(
                os.path.join(pythia_root, self.config.model_data_dir, 'bert'),
                config=self.text_bert_config)
            # Use a smaller learning rate on text bert when initializing
            # from BERT_BASE
            self.finetune_modules.append({
                'module': self.text_bert,
                'lr_scale': self.config.lr_scale_text_bert,
            })
        else:
            self.writer.write('NOT initializing text_bert from BERT_BASE')
            self.text_bert = TextBert(self.text_bert_config)
Example #20
    def __init__(self, vocab_file, embedding_name, *args, **kwargs):
        """Use this vocab class when you have a custom vocabulary class but you
        want to use pretrained embedding vectos for it. This will only load
        the vectors which intersect with your vocabulary. Use the
        embedding_name specified in torchtext's pretrained aliases:
        ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
         'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
         'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
         'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
         'glove.6B.200d', 'glove.6B.300d']

        Parameters
        ----------
        vocab_file : str
            Vocabulary file containing list of words with one word per line
            which will be used to collect vectors
        embedding_name : str
            Embedding name picked up from the list of the pretrained aliases
            mentioned above
        """
        super(IntersectedVocab, self).__init__(vocab_file, *args, **kwargs)

        self.type = "intersected"

        name = embedding_name.split(".")[0]
        dim = embedding_name.split(".")[2][:-1]
        middle = embedding_name.split(".")[1]

        class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

        if not hasattr(vocab, class_name):
            from pythia.common.registry import registry

            writer = registry.get("writer")
            error = "Unknown embedding type: %s" % name, "error"
            if writer is not None:
                writer.write(error, "error")
            raise RuntimeError(error)

        params = [middle]

        if name == "glove":
            params.append(int(dim))

        vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
        embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

        self.vectors = torch.empty(
            (self.get_size(), len(embedding.vectors[0])), dtype=torch.float
        )

        self.embedding_dim = len(embedding.vectors[0])

        for i in range(0, 4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        for i in range(4, self.get_size()):
            word = self.itos[i]
            embedding_index = embedding.stoi.get(word, None)

            if embedding_index is None:
                self.vectors[i] = self.vectors[self.UNK_INDEX].clone()
            else:
                self.vectors[i] = embedding.vectors[embedding_index]
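As a worked example of the alias parsing above, "glove.6B.300d" splits into name "glove", middle "6B", and dim "300" (trailing "d" stripped), so the generic getattr(vocab, class_name)(*params, cache=vector_cache) call resolves roughly as sketched below, assuming EMBEDDING_NAME_CLASS_MAPPING["glove"] is "GloVe":

import os

from torchtext import vocab

from pythia.utils.general import get_pythia_root

vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
name, middle, dim = "glove.6B.300d".split(".")   # "glove", "6B", "300d"
embedding = vocab.GloVe(middle, int(dim[:-1]), cache=vector_cache)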
Example #21
import os

import torch
from torch import nn

from pythia.common.registry import registry
from pythia.common.sample import Sample
from pythia.tasks.base_dataset import BaseDataset
from pythia.utils.general import get_pythia_root
from pythia.utils.text_utils import VocabFromText, tokenize

from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.utils.model_serialization import load_state_dict
from maskrcnn_benchmark.layers import nms

device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')

maskrcnn_checkpoint = os.path.join(get_pythia_root(), '../data',
                                   'model_data/detectron_model.pth')

cfg.merge_from_file(
    os.path.join(get_pythia_root(), '../data',
                 'model_data/detectron_model.yaml'))
cfg.freeze()


class VQAMaskRCNNBenchmark(nn.Module):
    def __init__(self):
        super(VQAMaskRCNNBenchmark, self).__init__()
        # self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)

        self.model = build_detection_model(cfg)