def _try_download(self):
    is_main_process = self._is_main_process()

    if self._already_downloaded:
        return

    if is_main_process:
        self.writer.write("Fetching fastText model for OCR processing")

    needs_download = False

    if not hasattr(self.config, "model_file"):
        if is_main_process:
            warnings.warn(
                "'model_file' key is required but missing "
                "from FastTextProcessor's config."
            )
        needs_download = True

    model_file = self.config.model_file
    model_file = os.path.join(get_pythia_root(), model_file)

    if not os.path.exists(model_file):
        if is_main_process:
            warnings.warn("No model file present at {}.".format(model_file))
        needs_download = True

    if needs_download:
        if is_main_process:
            self.writer.write("Downloading FastText bin", "info")
        model_file = self._download_model()

    synchronize()

    self._load_fasttext_model(model_file)
    self._already_downloaded = True
def __init__(self, dataset_type, config, data_folder=None, *args, **kwargs):
    super().__init__(_CONSTANTS["dataset_key"], dataset_type, config)

    self._data_folder = data_folder
    self._data_root_dir = os.path.join(get_pythia_root(), config.data_root_dir)

    if not self._data_folder:
        self._data_folder = os.path.join(self._data_root_dir, config.data_folder)

    if not os.path.exists(self._data_folder):
        raise RuntimeError(
            _TEMPLATES["data_folder_missing_error"].format(self._data_folder)
        )

    # Check if the folder was actually extracted in the subfolder
    if config.data_folder in os.listdir(self._data_folder):
        self._data_folder = os.path.join(self._data_folder, config.data_folder)

    if len(os.listdir(self._data_folder)) == 0:
        raise FileNotFoundError(_CONSTANTS["empty_folder_error"])

    self._load()
def _build(self, dataset_type, config):
    download_folder = os.path.join(
        get_pythia_root(), config.data_root_dir, config.data_folder
    )

    file_name = CLEVR_DOWNLOAD_URL.split("/")[-1]
    local_filename = os.path.join(download_folder, file_name)

    extraction_folder = os.path.join(
        download_folder, ".".join(file_name.split(".")[:-1])
    )
    self.data_folder = extraction_folder

    # Skip the download if the zip file is already present or if the
    # extraction folder already contains files
    if os.path.exists(local_filename):
        self.writer.write("CLEVR dataset is already present. Skipping download.")
        return

    if os.path.exists(extraction_folder) and len(os.listdir(extraction_folder)) != 0:
        return

    self.writer.write("Downloading the CLEVR dataset now")
    download_file(CLEVR_DOWNLOAD_URL, output_dir=download_folder)

    self.writer.write("Downloaded. Extracting now. This can take time.")
    with zipfile.ZipFile(local_filename, "r") as zip_ref:
        zip_ref.extractall(download_folder)
def load_yaml(self, file):
    with open(file, "r") as stream:
        mapping = yaml.safe_load(stream)

        if mapping is None:
            mapping = {}

    includes = mapping.get("includes", [])

    if not isinstance(includes, list):
        raise AttributeError(
            "Includes must be a list, {} provided".format(type(includes))
        )

    include_mapping = {}

    pythia_root_dir = get_pythia_root()

    for include in includes:
        include = os.path.join(pythia_root_dir, include)
        current_include_mapping = self.load_yaml(include)
        include_mapping = self.nested_dict_update(
            include_mapping, current_include_mapping
        )

    mapping.pop("includes", None)

    mapping = self.nested_dict_update(include_mapping, mapping)

    return mapping
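A minimal, self-contained sketch of the merge order load_yaml relies on: included configs are merged first, then the including file's own keys override them. The _merge helper below only illustrates this recursive-update behaviour; it is not Pythia's nested_dict_update, and the example dicts are hypothetical.

def _merge(base, update):
    # Recursively update `base` with `update`; values in `update` win.
    for key, value in update.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            base[key] = _merge(base[key], value)
        else:
            base[key] = value
    return base

include_mapping = {"training_parameters": {"batch_size": 32, "lr": 0.01}}
own_mapping = {"training_parameters": {"lr": 0.001}}

# Keys from the including file win; nested keys coming only from includes survive
print(_merge(include_mapping, own_mapping))
# {'training_parameters': {'batch_size': 32, 'lr': 0.001}}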
def __init__(self, vocab_file=None, embedding_dim=300, data_root_dir=None,
             *args, **kwargs):
    """Vocab class to be used when you want to train word embeddings from
    scratch based on a custom vocab. This will initialize the random
    vectors for the vocabulary you pass. Get the vectors using
    `get_vectors` function. This will also create random embeddings for
    some predefined words like PAD - <pad>, SOS - <s>, EOS - </s>,
    UNK - <unk>.

    Parameters
    ----------
    vocab_file : str
        Path of the vocabulary file containing one word per line
    embedding_dim : int
        Size of the embedding
    """
    self.type = "base"
    self.word_dict = {}
    self.itos = {}

    self.itos[self.PAD_INDEX] = self.PAD_TOKEN
    self.itos[self.SOS_INDEX] = self.SOS_TOKEN
    self.itos[self.EOS_INDEX] = self.EOS_TOKEN
    self.itos[self.UNK_INDEX] = self.UNK_TOKEN

    self.word_dict[self.SOS_TOKEN] = self.SOS_INDEX
    self.word_dict[self.EOS_TOKEN] = self.EOS_INDEX
    self.word_dict[self.PAD_TOKEN] = self.PAD_INDEX
    self.word_dict[self.UNK_TOKEN] = self.UNK_INDEX

    index = len(self.itos.keys())
    self.total_predefined = len(self.itos.keys())

    if vocab_file is not None:
        if not os.path.isabs(vocab_file) and data_root_dir is not None:
            pythia_root = get_pythia_root()
            vocab_file = os.path.join(pythia_root, data_root_dir, vocab_file)
        if not os.path.exists(vocab_file):
            raise RuntimeError("Vocab not found at " + vocab_file)

        with open(vocab_file, "r") as f:
            for line in f:
                self.itos[index] = line.strip()
                self.word_dict[line.strip()] = index
                index += 1

    self.word_dict[self.UNK_TOKEN] = self.UNK_INDEX
    # Return unk index by default
    self.stoi = defaultdict(lambda: self.UNK_INDEX)
    self.stoi.update(self.word_dict)

    self.vectors = torch.FloatTensor(self.get_size(), embedding_dim)
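A minimal usage sketch, assuming this constructor belongs to Pythia's BaseVocab (pythia.utils.vocab) and that get_vectors returns the self.vectors tensor built above; the vocab file name is hypothetical.

import torch
import torch.nn as nn

from pythia.utils.vocab import BaseVocab

# Build a vocab from a one-word-per-line file and use its vectors to seed an embedding layer
vocab = BaseVocab(vocab_file="my_vocab.txt", embedding_dim=300)  # hypothetical file
embedding = nn.Embedding.from_pretrained(vocab.get_vectors(), freeze=False)

token_indices = torch.tensor([vocab.stoi["hello"]])  # words not in the vocab map to UNK_INDEX
word_vectors = embedding(token_indices)  # shape: (1, 300)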
def __init__(self, config, *args, **kwargs):
    self.max_length = config.max_length
    pythia_root = get_pythia_root()
    VOCAB = 'bert-base-uncased-vocab.txt'
    self.bert_tokenizer = BertTokenizer.from_pretrained(
        os.path.join(pythia_root, config.model_data_dir, 'bert', VOCAB)
    )
    assert self.bert_tokenizer.encode(self.bert_tokenizer.pad_token) == [0]
    self.get_qgen_inds = getattr(config, 'get_qgen_inds', False)
    if self.get_qgen_inds:
        print('computing question generation indices in bert tokenizer')
def _load_fasttext_model(self, model_file):
    from fasttext import load_model
    from pythia.common.registry import registry

    pythia_root = get_pythia_root()
    model_file = os.path.join(pythia_root, model_file)
    registry.get("writer").write("Loading fasttext model now from %s" % model_file)

    self.model = load_model(model_file)
    self.stoi = WordToVectorDict(self.model)
def __init__(self, embedding_name, *args, **kwargs):
    """Use this if you want to use a pretrained embedding. See the
    description of IntersectedVocab for the list of embeddings available
    from torchtext

    Parameters
    ----------
    embedding_name : str
        Name of the pretrained alias for the embedding to be used
    """
    self.type = "pretrained"

    if embedding_name not in vocab.pretrained_aliases:
        from pythia.common.registry import registry

        writer = registry.get("writer")
        error = "Unknown embedding type: %s" % embedding_name
        if writer is not None:
            writer.write(error, "error")
        raise RuntimeError(error)

    vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
    embedding = vocab.pretrained_aliases[embedding_name](cache=vector_cache)

    self.UNK_INDEX = 3
    self.stoi = defaultdict(lambda: self.UNK_INDEX)
    self.itos = {}

    self.itos[self.PAD_INDEX] = self.PAD_TOKEN
    self.itos[self.SOS_INDEX] = self.SOS_TOKEN
    self.itos[self.EOS_INDEX] = self.EOS_TOKEN
    self.itos[self.UNK_INDEX] = self.UNK_TOKEN

    self.stoi[self.SOS_TOKEN] = self.SOS_INDEX
    self.stoi[self.EOS_TOKEN] = self.EOS_INDEX
    self.stoi[self.PAD_TOKEN] = self.PAD_INDEX
    self.stoi[self.UNK_TOKEN] = self.UNK_INDEX

    self.vectors = torch.FloatTensor(
        len(self.itos.keys()) + len(embedding.itos), len(embedding.vectors[0])
    )

    # First four rows are reserved for the special tokens
    for i in range(4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    index = 4
    for word in embedding.stoi:
        self.itos[index] = word
        self.stoi[word] = index
        actual_index = embedding.stoi[word]
        self.vectors[index] = embedding.vectors[actual_index]
        index += 1
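A minimal usage sketch, assuming this is Pythia's PretrainedVocab and that it inherits get_vectors from the base vocab class; the first call downloads the vectors into <pythia_root>/.vector_cache.

# Load the full GloVe table keyed by a torchtext pretrained alias
vocab = PretrainedVocab("glove.6B.50d")
vectors = vocab.get_vectors()  # rows 0-3 are special tokens, the rest is the full GloVe table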
def __init__(self, dataset_type, config):
    super().__init__('imagenet', dataset_type, config)
    self.feature_extractor = VQAMaskRCNNBenchmark()
    self.feature_extractor.to(device)
    self.config = config

    # directory to store annotations (captions)
    self.annotation_dir = os.path.join(
        get_pythia_root(), config.data_root_dir, config.annotation_dir
    )
    # directory to store images
    self.image_dir = os.path.join(
        get_pythia_root(), config.data_root_dir, config.image_dir
    )

    self.annotations = []
    for annotation_file in os.listdir(self.annotation_dir):
        with open(os.path.join(self.annotation_dir, annotation_file)) as f:
            annotation = json.load(f)
            for item in annotation.items():
                # each item in annotations is an (image_id, caption) tuple
                self.annotations.append(item)

    self.init_processors()
def _get_absolute_path(self, paths):
    if isinstance(paths, list):
        return [self._get_absolute_path(path) for path in paths]
    elif isinstance(paths, str):
        if not os.path.isabs(paths):
            pythia_root = get_pythia_root()
            paths = os.path.join(pythia_root, self.config.data_root_dir, paths)
        return paths
    else:
        raise TypeError(
            "Paths passed to dataset should either be string or list"
        )
def __init__(self, vocab_file, data_root_dir=None):
    if not os.path.isabs(vocab_file) and data_root_dir is not None:
        pythia_root = get_pythia_root()
        vocab_file = os.path.abspath(
            os.path.join(pythia_root, data_root_dir, vocab_file)
        )

    if not os.path.exists(vocab_file):
        raise RuntimeError(
            "Vocab file {} for vocab dict doesn't exist".format(vocab_file)
        )

    self.word_list = load_str_list(vocab_file)
    self._build()
def setUp(self):
    torch.manual_seed(1234)
    registry.register("clevr_text_vocab_size", 80)
    registry.register("clevr_num_final_outputs", 32)
    config_path = os.path.join(
        get_pythia_root(), "..", "configs", "vqa", "clevr", "cnn_lstm.yml"
    )
    config_path = os.path.abspath(config_path)
    configuration = Configuration(config_path)
    configuration.config["datasets"] = "clevr"
    configuration.freeze()
    self.config = configuration.config
    registry.register("config", self.config)
def __init__(self, vocab_file, embedding_file, data_root_dir=None,
             *args, **kwargs):
    """Use this vocab class when you have a custom vocab as well as a
    custom embeddings file. This inherits the vocab class, so you will
    get the predefined tokens with this one.

    IMPORTANT: To init your embedding, get your vectors from this class's
    object by calling the `get_vectors` function

    Parameters
    ----------
    vocab_file : str
        Path of custom vocabulary
    embedding_file : str
        Path to custom embedding initialization file
    data_root_dir : str
        Path to data directory if embedding file is not an absolute path.
        Default: None
    """
    super(CustomVocab, self).__init__(vocab_file)
    self.type = "custom"

    if not os.path.isabs(embedding_file) and data_root_dir is not None:
        pythia_root = get_pythia_root()
        embedding_file = os.path.join(pythia_root, data_root_dir, embedding_file)

    if not os.path.exists(embedding_file):
        from pythia.common.registry import registry

        writer = registry.get("writer")
        error = "Embedding file path %s doesn't exist" % embedding_file
        if writer is not None:
            writer.write(error, "error")
        raise RuntimeError(error)

    embedding_vectors = torch.from_numpy(np.load(embedding_file))

    self.vectors = torch.FloatTensor(self.get_size(), len(embedding_vectors[0]))

    # First four rows are reserved for the special tokens
    for i in range(0, 4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    for i in range(4, self.get_size()):
        self.vectors[i] = embedding_vectors[i - 4]
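A minimal usage sketch, assuming this is Pythia's CustomVocab and that the rows of the .npy file follow the vocab file's word order; both file names below are hypothetical.

import numpy as np

# Write a tiny vocab file and a matching (num_words x 300) embedding matrix
words = ["red", "blue", "cube"]
with open("my_vocab.txt", "w") as f:
    f.write("\n".join(words) + "\n")
np.save("my_embeddings.npy", np.random.rand(len(words), 300).astype("float32"))

vocab = CustomVocab("my_vocab.txt", "my_embeddings.npy")
vectors = vocab.get_vectors()  # 4 special-token rows followed by the 3 custom rows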
def setUp(self):
    torch.manual_seed(1234)
    config_path = os.path.join(
        get_pythia_root(), "..", "configs", "captioning", "coco",
        "butd_nucleus_sampling.yml"
    )
    config_path = os.path.abspath(config_path)
    configuration = Configuration(config_path)
    configuration.config["datasets"] = "coco"
    configuration.config["model_attributes"]["butd"]["inference"]["params"][
        "sum_threshold"
    ] = 0.5
    configuration.freeze()
    self.config = configuration.config
    registry.register("config", self.config)
def _build(self, dataset_type, config):
    self._dataset_type = dataset_type
    self._config = config
    data_folder = os.path.join(get_pythia_root(), self._config.data_root_dir)

    # Since the imdb tar file contains all of the sets, we won't download them
    # except in case of train
    if self._dataset_type != "train":
        return

    self._download_and_extract_imdb(data_folder)
    self._download_and_extract_features(data_folder)
def __init__(self, vocab_file, data_root_dir=None):
    if not os.path.isabs(vocab_file) and data_root_dir is not None:
        pythia_root = get_pythia_root()
        vocab_file = os.path.abspath(
            os.path.join(pythia_root, data_root_dir, vocab_file)
        )

    if not os.path.exists(vocab_file):
        raise RuntimeError(
            "Vocab file {} for vocab dict doesn't exist".format(vocab_file)
        )

    self.word_list = load_str_list(vocab_file)
    self.word2idx_dict = {w: n_w for n_w, w in enumerate(self.word_list)}

    self.num_vocab = len(self.word_list)

    self.UNK_INDEX = (
        self.word2idx_dict["<unk>"] if "<unk>" in self.word2idx_dict else None
    )
def _download_model(self):
    is_main_process = self._is_main_process()

    model_file_path = os.path.join(
        get_pythia_root(), ".vector_cache", "wiki.en.bin"
    )

    # Only the main process downloads; other processes return the path directly
    if not is_main_process:
        return model_file_path

    if os.path.exists(model_file_path):
        if is_main_process:
            self.writer.write(
                "Vectors already present at {}.".format(model_file_path), "info"
            )
        return model_file_path

    import requests
    from pythia.common.constants import FASTTEXT_WIKI_URL
    from tqdm import tqdm

    os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
    response = requests.get(FASTTEXT_WIKI_URL, stream=True)

    with open(model_file_path, "wb") as f:
        pbar = tqdm(
            total=int(response.headers["Content-Length"]) / 4096,
            miniters=50,
            disable=not is_main_process,
        )

        idx = 0
        for data in response.iter_content(chunk_size=4096):
            if data:
                if idx % 50 == 0:
                    pbar.update(len(data))
                f.write(data)
                idx += 1

        pbar.close()

    if is_main_process:
        self.writer.write(
            "fastText bin downloaded at {}.".format(model_file_path), "info"
        )

    return model_file_path
def __init__(self, in_dim, weights_file, bias_file, model_data_dir):
    super(FinetuneFasterRcnnFpnFc7, self).__init__()
    pythia_root = get_pythia_root()
    model_data_dir = os.path.join(pythia_root, model_data_dir)

    if not os.path.isabs(weights_file):
        weights_file = os.path.join(model_data_dir, weights_file)
    if not os.path.isabs(bias_file):
        bias_file = os.path.join(model_data_dir, bias_file)

    with open(weights_file, "rb") as w:
        weights = pickle.load(w)
    with open(bias_file, "rb") as b:
        bias = pickle.load(b)

    out_dim = bias.shape[0]

    self.lc = nn.Linear(in_dim, out_dim)
    self.lc.weight.data.copy_(torch.from_numpy(weights))
    self.lc.bias.data.copy_(torch.from_numpy(bias))
    self.out_dim = out_dim
def _build_txt_encoding(self):
    TEXT_BERT_HIDDEN_SIZE = 768

    self.text_bert_config = BertConfig(**self.config.text_bert)
    if self.config.text_bert_init_from_bert_base:
        pythia_root = get_pythia_root()
        self.text_bert = TextBert.from_pretrained(
            os.path.join(pythia_root, self.config.model_data_dir, 'bert'),
            config=self.text_bert_config
        )
        # Use a smaller learning rate on text bert when initializing
        # from BERT_BASE
        self.finetune_modules.append({
            'module': self.text_bert,
            'lr_scale': self.config.lr_scale_text_bert,
        })
    else:
        self.writer.write('NOT initializing text_bert from BERT_BASE')
        self.text_bert = TextBert(self.text_bert_config)
def __init__(self, vocab_file, embedding_name, *args, **kwargs):
    """Use this vocab class when you have a custom vocabulary class but you
    want to use pretrained embedding vectors for it. This will only load
    the vectors which intersect with your vocabulary. Use the
    embedding_name specified in torchtext's pretrained aliases:
    ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
     'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
     'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
     'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
     'glove.6B.200d', 'glove.6B.300d']

    Parameters
    ----------
    vocab_file : str
        Vocabulary file containing list of words with one word per line
        which will be used to collect vectors
    embedding_name : str
        Embedding name picked up from the list of the pretrained aliases
        mentioned above
    """
    super(IntersectedVocab, self).__init__(vocab_file, *args, **kwargs)

    self.type = "intersected"

    name = embedding_name.split(".")[0]
    dim = embedding_name.split(".")[2][:-1]
    middle = embedding_name.split(".")[1]

    class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

    if not hasattr(vocab, class_name):
        from pythia.common.registry import registry

        writer = registry.get("writer")
        error = "Unknown embedding type: %s" % name
        if writer is not None:
            writer.write(error, "error")
        raise RuntimeError(error)

    params = [middle]

    if name == "glove":
        params.append(int(dim))

    vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
    embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

    self.vectors = torch.empty(
        (self.get_size(), len(embedding.vectors[0])), dtype=torch.float
    )

    self.embedding_dim = len(embedding.vectors[0])

    # First four rows are reserved for the special tokens
    for i in range(0, 4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    for i in range(4, self.get_size()):
        word = self.itos[i]
        embedding_index = embedding.stoi.get(word, None)

        if embedding_index is None:
            self.vectors[i] = self.vectors[self.UNK_INDEX].clone()
        else:
            self.vectors[i] = embedding.vectors[embedding_index]
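A minimal usage sketch, assuming this is Pythia's IntersectedVocab; the vocab file name is hypothetical, and the first call will download the GloVe vectors into <pythia_root>/.vector_cache.

import torch.nn as nn

# Keep only the GloVe rows that intersect with the custom vocabulary
vocab = IntersectedVocab("my_vocab.txt", "glove.6B.50d")
embedding = nn.Embedding.from_pretrained(vocab.get_vectors(), freeze=False)
# Words from my_vocab.txt that are missing from GloVe fall back to the UNK row.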
import os

import torch
import torch.nn as nn

from pythia.common.registry import registry
from pythia.common.sample import Sample
from pythia.tasks.base_dataset import BaseDataset
from pythia.utils.general import get_pythia_root
from pythia.utils.text_utils import VocabFromText, tokenize

from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.utils.model_serialization import load_state_dict
from maskrcnn_benchmark.layers import nms

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

maskrcnn_checkpoint = os.path.join(
    get_pythia_root(), '../data', 'model_data/detectron_model.pth'
)
cfg.merge_from_file(
    os.path.join(get_pythia_root(), '../data', 'model_data/detectron_model.yaml')
)
cfg.freeze()


class VQAMaskRCNNBenchmark(nn.Module):
    def __init__(self):
        super(VQAMaskRCNNBenchmark, self).__init__()
        # self.avgpool = nn.AdaptiveAvgPool2d(output_size=1)
        self.model = build_detection_model(cfg)