Example #1
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )
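
For reference, a minimal standalone use of the tokenizer constructed above; the file paths are placeholders, and add_prefix_space mirrors the default applied in __init__:

from tokenizers import ByteLevelBPETokenizer

# Placeholder paths; in the snippet above these come from --bpe-vocab/--bpe-merge.
tokenizer = ByteLevelBPETokenizer(
    'model-vocab.json', 'model-merges.txt', add_prefix_space=True
)
encoding = tokenizer.encode("Hello world")
print(encoding.tokens)  # byte-level BPE pieces
print(encoding.ids)     # their vocabulary ids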
Example #2
    def load(self, path):
        """
        Load from a given path.
        """
        mode = self.opt.get('image_mode', 'raw')
        if mode is None or mode == 'no_image_model':
            # don't need to load images
            return None
        elif mode == 'raw':
            return self._load_image(path)
        elif mode == 'ascii':
            # convert images to ascii ¯\_(ツ)_/¯
            return self._img_to_ascii(self._load_image(path))

        # otherwise, looks for preprocessed version under 'mode' directory
        prepath, imagefn = self._get_prepath(path)
        dpath = os.path.join(prepath, mode)
        if not PathManager.exists(dpath):
            build_data.make_dir(dpath)
        imagefn = imagefn.split('.')[0]
        new_path = os.path.join(prepath, mode, imagefn)
        if not PathManager.exists(new_path):
            return self.extract(self._load_image(path), new_path)
        else:
            with PathManager.open(new_path, 'rb') as f:
                return torch.load(f)
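
The tail of load() is a compute-or-load cache: extract features once, then reuse the serialized tensor. The same pattern in isolation, as a minimal sketch (cached_tensor and compute are illustrative names, not ParlAI API):

import os
import torch

def cached_tensor(cache_path, compute):
    # Reuse a previously saved tensor if the cache file exists;
    # otherwise compute it, persist it, and return it.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return torch.load(f)
    result = compute()
    torch.save(result, cache_path)
    return result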
Example #3
    def load_init(cls, optfile: str) -> Opt:
        """
        Like load, but also looks in opt_presets folders.

        optfile may also be a comma-separated list of multiple presets/files.
        """
        if "," in optfile:
            # load and combine each of the individual files
            new_opt = cls()
            for subopt in optfile.split(","):
                new_opt.update(cls.load_init(subopt))
            return new_opt

        oa_filename = os.path.join("opt_presets", optfile + ".opt")
        user_filename = os.path.join(os.path.expanduser("~/.parlai"),
                                     oa_filename)
        if PathManager.exists(optfile):
            return cls.load(optfile)
        elif PathManager.exists(user_filename):
            # use a user's custom opt preset
            return cls.load(user_filename)
        elif pkg_resources.resource_exists("parlai", oa_filename):
            # maybe a bundled opt preset
            return cls.load(
                pkg_resources.resource_filename("parlai", oa_filename))
        else:
            raise FileNotFoundError(
                f"Could not find filename '{optfile}' or opt preset '{optfile}.opt'. "
                "Please check https://parl.ai/docs/opt_presets.html for a list "
                "of available opt presets.")
Example #4
def set_defaults(opt):
    init_model = None
    # check first for 'init_model' for loading model from file
    if opt.get('init_model') and PathManager.exists(opt['init_model']):
        init_model = opt['init_model']
    # next check for 'model_file', this would override init_model
    if opt.get('model_file') and PathManager.exists(opt['model_file']):
        init_model = opt['model_file']

    if init_model is None:
        # Embeddings options
        opt['embedding_file'] = modelzoo_path(opt.get('datapath'),
                                              opt['embedding_file'])
        if opt.get('embedding_file'):
            if not PathManager.exists(opt['embedding_file']):
                raise IOError('No such file: %s' % opt['embedding_file'])
            with PathManager.open(opt['embedding_file']) as f:
                dim = len(f.readline().strip().split(' ')) - 1
                if dim == 1:
                    # first line was a dud
                    dim = len(f.readline().strip().split(' ')) - 1
            opt['embedding_dim'] = dim
        elif not opt.get('embedding_dim'):
            raise RuntimeError(('Either embedding_file or embedding_dim '
                                'needs to be specified.'))

        # Make sure tune_partial and fix_embeddings are consistent
        if opt['tune_partial'] > 0 and opt['fix_embeddings']:
            print('Setting fix_embeddings to False as tune_partial > 0.')
            opt['fix_embeddings'] = False

        # Make sure fix_embeddings and embedding_file are consistent
        if opt['fix_embeddings'] and not opt.get('embedding_file'):
            print('Setting fix_embeddings to False as embeddings are random.')
            opt['fix_embeddings'] = False
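
The dim == 1 branch presumably guards against word2vec-style text files, whose first line is a '<vocab_size> <dim>' header rather than a vector row; splitting that header yields two fields, so the computed dimension is 1 and the reader advances to the first real row. A worked check:

# GloVe-style row: token followed by its vector components.
row = "the 0.418 0.24968 -0.41242"
print(len(row.strip().split(' ')) - 1)  # -> 3, a plausible embedding dim

# word2vec-style text header: "<vocab_size> <dim>".
header = "400000 300"
print(len(header.strip().split(' ')) - 1)  # -> 1, detected as a dud line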
Example #5
def create_agent_from_opt_file_and_model_class(opt, model_class):
    model_file = opt['model_file']
    optfile = model_file + '.opt'

    if not PathManager.exists(optfile):
        return None

    opt_from_file = Opt.load(optfile)

    # delete args that we do not want to copy over when loading the model
    for arg in NOCOPY_ARGS:
        if arg in opt_from_file:
            del opt_from_file[arg]

    # only override opts specified in 'override' dict
    if opt.get('override'):
        for k, v in opt['override'].items():
            if k in opt_from_file and str(v) != str(opt_from_file.get(k)):
                logging.warning(
                    f'Overriding opt["{k}"] to {v} (previously: {opt_from_file.get(k)})'
                )
            opt_from_file[k] = v

    if hasattr(model_class, 'upgrade_opt'):
        opt_from_file = model_class.upgrade_opt(opt_from_file)

    # add model arguments to opt_from_file if they aren't in opt_from_file already
    for k, v in opt.items():
        if k not in opt_from_file:
            opt_from_file[k] = v

    # update model file path to the one set by opt
    opt_from_file['model_file'] = model_file
    # update init model path to the one set by opt
    # NOTE: this step is necessary when for example the 'init_model' is
    # set by the Train Loop (as is the case when loading from checkpoint)
    if opt.get('init_model') is not None:
        opt_from_file['init_model'] = opt['init_model']

    # update dict file path
    if not opt_from_file.get('dict_file'):
        old_dict_file = None
        opt_from_file['dict_file'] = model_file + '.dict'
    elif opt_from_file.get('dict_file') and not PathManager.exists(
            opt_from_file['dict_file']):
        old_dict_file = opt_from_file['dict_file']
        opt_from_file['dict_file'] = model_file + '.dict'
    if not PathManager.exists(opt_from_file['dict_file']):
        warn_once(
            'WARNING: Neither the specified dict file ({}) nor the '
            '`model_file`.dict file ({}) exists, check to make sure either '
            'is correct. This may manifest as a shape mismatch later '
            'on.'.format(old_dict_file, opt_from_file['dict_file']))

    # if we want to load weights from --init-model, compare opts with
    # loaded ones
    compare_init_model_opts(opt, opt_from_file)
    return model_class(opt_from_file)
Example #6
    def __init__(self, opt, shared=None):
        """
        Set up model if shared params not set, otherwise no work to do.
        """
        super().__init__(opt, shared)
        opt = self.opt
        self.reset_metrics()
        self.id = 'Starspace'
        self.NULL_IDX = 0
        self.cands = torch.LongTensor(1, 1, 1)
        self.ys_cache = []
        self.ys_cache_sz = opt['cache_size']
        self.truncate = opt['truncate'] if opt['truncate'] > 0 else None
        self.history = {}
        self.debugMode = False
        if shared:
            # set up shared properties
            self.dict = shared['dict']
            self.model = shared['model']
        else:
            print("[ creating StarspaceAgent ]")
            # this is not a shared instance of this class, so do full init
            if opt.get('model_file') and (
                PathManager.exists(opt.get('model_file') + '.dict')
                or (opt['dict_file'] is None)
            ):
                # set default dict-file if not set
                opt['dict_file'] = opt['model_file'] + '.dict'
            # load dictionary and basic tokens & vectors
            self.dict = DictionaryAgent(opt)

            self.model = Starspace(opt, len(self.dict), self.dict)
            if opt.get('model_file') and PathManager.exists(opt['model_file']):
                self.load(opt['model_file'])
            else:
                self._init_embeddings()
            self.model.share_memory()

        # set up modules
        self.criterion = torch.nn.CosineEmbeddingLoss(
            margin=opt['margin'], size_average=False
        )
        self.reset()
        self.fixedCands = False
        self.fixedX = None
        if self.opt.get('fixed_candidates_file'):
            self.fixedCands_txt = load_cands(self.opt.get('fixed_candidates_file'))
            fcs = []
            for c in self.fixedCands_txt:
                fcs.append(torch.LongTensor(self.parse(c)).unsqueeze(0))
            self.fixedCands = fcs
            print("[loaded candidates]")
Example #7
 def _build_model(self, path=None):
     init_model_path = None
     if self.opt.get('init_model') and PathManager.exists(self.opt['init_model']):
         init_model_path = self.opt['init_model']
     elif self.opt.get('model_file') and PathManager.exists(self.opt['model_file']):
         init_model_path = self.opt['model_file']
     elif path is not None:
         init_model_path = path
     print('Creating or loading model')
     self.model = TransresnetModel(self.opt, self.personalities_list, self.dict)
     if init_model_path is not None:
         self.load(init_model_path)
     if self.use_cuda:
         self.model.cuda()
Example #8
 def _setup_cands(self):
     self.fixed_cands = None
     self.fixed_cands_enc = None
     if self.fcp is not None:
         with PathManager.open(self.fcp) as f:
             self.fixed_cands = [c.replace('\n', '') for c in f.readlines()]
         cands_enc_file = '{}.cands_enc'.format(self.fcp)
         print('loading saved cand encodings')
         if PathManager.exists(cands_enc_file):
             with PathManager.open(cands_enc_file, 'rb') as f:
                 self.fixed_cands_enc = torch.load(
                     f, map_location=lambda cpu, _: cpu
                 )
         else:
             print('Extracting cand encodings')
             self.model.eval()
             pbar = tqdm.tqdm(
                 total=len(self.fixed_cands),
                 unit='cand',
                 unit_scale=True,
                 desc='Extracting candidate encodings',
             )
             fixed_cands_enc = []
             # step through all candidates in batches of 50, including the
             # final (possibly partial) batch
             for batch in [
                 self.fixed_cands[i : i + 50]
                 for i in range(0, len(self.fixed_cands), 50)
             ]:
                 embedding = self.model(None, None, batch)[1].detach()
                 fixed_cands_enc.append(embedding)
                 pbar.update(len(batch))
             self.fixed_cands_enc = torch.cat(fixed_cands_enc, 0)
             torch_utils.atomic_save(self.fixed_cands_enc, cands_enc_file)
Example #9
 def _build_model(self, path=None):
     init_model_path = None
     if self.opt.get("init_model") and PathManager.exists(self.opt["init_model"]):
         init_model_path = self.opt["init_model"]
     elif self.opt.get("model_file") and PathManager.exists(self.opt["model_file"]):
         init_model_path = self.opt["model_file"]
     elif path is not None:
         init_model_path = path
     print("Creating or loading model")
     self.model = TransresnetMultimodalModel(
         self.opt, self.personalities_list, self.dict
     )
     if init_model_path is not None:
         self.load(init_model_path)
     if self.use_cuda:
         self.model.cuda()
Example #10
 def _check_parent_dir_exists(datapath):
     parent_dir = os.path.dirname(datapath)
     if not parent_dir or PathManager.exists(parent_dir):
         return
     PathManager.mkdirs(parent_dir)
     logging.info(
         f'Parent directory ({parent_dir}) did not exist and was created.')
Example #11
 def _setup_cands(self):
     """
     Override for different call to model.
     """
     self.fixed_cands = None
     self.fixed_cands_enc = None
     if self.fcp is not None:
         with PathManager.open(self.fcp) as f:
             self.fixed_cands = [c.replace("\n", "") for c in f.readlines()]
         cands_enc_file = "{}.cands_enc".format(self.fcp)
         print("loading saved cand encodings")
         if PathManager.exists(cands_enc_file):
             with PathManager.open(cands_enc_file, 'rb') as f:
                 self.fixed_cands_enc = torch.load(
                     f, map_location=lambda cpu, _: cpu)
         else:
             print("Extracting cand encodings")
             self.model.eval()
             pbar = tqdm.tqdm(
                 total=len(self.fixed_cands),
                 unit="cand",
                 unit_scale=True,
                 desc="Extracting candidate encodings",
             )
             fixed_cands_enc = []
             # step through all candidates in batches of 50, including the
             # final (possibly partial) batch
             for batch in [
                 self.fixed_cands[i:i + 50]
                 for i in range(0, len(self.fixed_cands), 50)
             ]:
                 embedding = self.model.forward_text_encoder(batch).detach()
                 fixed_cands_enc.append(embedding)
                 pbar.update(len(batch))
             self.fixed_cands_enc = torch.cat(fixed_cands_enc, 0)
             torch_utils.atomic_save(self.fixed_cands_enc, cands_enc_file)
Example #12
    def _get_data(self):
        # useful constants
        # all of these colors are bolded
        RESET = '\033[0m'
        RED = '\033[1;91m'
        YELLOW = '\033[1;93m'
        GREEN = '\033[1;92m'
        BLUE = '\033[1;96m'
        CYAN = '\033[1;94m'
        MAGENTA = '\033[1;95m'

        # only use colors if we're outputting to a terminal
        USE_COLORS = _sys.stdout.isatty()
        if not USE_COLORS:
            RESET = RED = YELLOW = GREEN = BLUE = CYAN = MAGENTA = ''

        # generate the rainbow stars
        rainbow = [RED, YELLOW, GREEN, CYAN, BLUE, MAGENTA]
        size = 78 // len(rainbow)
        stars = ''.join([color + '*' * size for color in rainbow])
        stars += RESET

        if not os.path.exists(self.data_path):
            PathManager.mkdirs(self.data_path)
        if not PathManager.exists(os.path.join(self.data_path, 'train.csv')):
            raise RuntimeError(
                f'\n\n{stars}\nThis data must be downloaded from {self.DATA_SOURCE}'
                '\nIt cannot be automatically downloaded, as one must agree to '
                'the competition rules outlined on the website before '
                'gaining access to the data.\n\n'
                'Once downloaded, please put the data in the following '
                f'directory: \n{self.data_path}\n{stars}')
Example #13
    def __init__(self, opt, shared=None):
        opt = copy.deepcopy(opt)
        if 'subtask' not in opt:
            print('Warning: SelfFeedingTeacher should be assigned a subtask. '
                  'Defaulting to dialog')
            opt['subtask'] = 'dialog'

        # Use 'in' to also capture 'train:ordered:stream'
        if 'train' in opt['datatype']:
            # Use the filename explicitly given with the flag if available
            # Otherwise, use train_xx.txt where xx is inferred from the subtask
            train_file_flag = f"{opt['subtask'][:3]}_train"
            if opt.get(train_file_flag, None):
                path = _path(opt, opt[train_file_flag], add_suffix=False)
            else:
                path = _path(opt, "train", add_suffix=True)
        else:
            # Use the filename explicitly given with the flag if available
            # Otherwise, use the datatype (valid_xx.txt or test_xx.txt) where xx is
            # inferred from the subtask.
            eval_file_flag = f"{opt['subtask'][:3]}_{opt['datatype']}"
            if opt.get(eval_file_flag, None):
                path = _path(opt, opt[eval_file_flag], add_suffix=False)
            else:
                path = _path(opt,
                             opt['datatype'].split(':')[0],
                             add_suffix=True)

        if not PathManager.exists(path):
            raise ValueError("Unrecognized filepath: {}".format(path))

        opt['parlaidialogteacher_datafile'] = path
        opt['datafile'] = path
        super().__init__(opt, shared)
Example #14
    def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        # All agents keep track of the episode (for multiple questions)
        self.episode_done = True

        self.opt['cuda'] = not self.opt['no_cuda'] and torch.cuda.is_available()

        if shared is not None:
            # model has already been set up
            self.word_dict = shared['word_dict']
            self.model = shared['model']
            self.feature_dict = shared['feature_dict']
        else:
            # set up model
            self.word_dict = DrqaAgent.dictionary_class()(opt)
            if self.opt.get('model_file') and PathManager.exists(
                    opt['model_file']):
                self._init_from_saved(opt['model_file'])
            else:
                if self.opt.get('init_model'):
                    self._init_from_saved(opt['init_model'])
                else:
                    self._init_from_scratch()
            if self.opt['cuda']:
                print('[ Using CUDA (GPU %d) ]' % opt['gpu'])
                torch.cuda.set_device(opt['gpu'])
                self.model.cuda()

        # Set up params/logging/dicts
        self.id = self.__class__.__name__
        config.set_defaults(self.opt)
        self.n_examples = 0
Example #15
 def load_eli5(self, opt):
     """
     Load data based on data split.
     """
     dp = opt['datapath']
     dt = opt['datatype'].split(':')[0]
     eli_path = "eli5/processed_data/selected_15_1/explainlikeimfive_"
     fname = os.path.join(dp, eli_path + dt + ".json")
     if not PathManager.exists(fname):
         raise FileNotFoundError(
             f"{fname} not found. Please follow the instructions found at "
             "https://github.com/facebookresearch/ParlAI/tree/master/parlai/tasks/eli5/README.md"
             " to construct the dataset.")
     opt['datafile'] = fname
     with PathManager.open(fname) as json_file:
         data = json.load(json_file)
     ds = []
     for d in data:
         if self.opt['knowledge']:
             text = d['document'] + "\n" + d['question']
         else:
             text = d['question']
         act = {
             'id': 'eli5',
             'text': text,
             'labels': [d['answer']],
             'episode_done': True,
         }
         ds.append(act)
     return ds
Example #16
def compare_init_model_opts(opt: Opt, curr_opt: Opt):
    """
    Print loud warning when `init_model` opts differ from previous configuration.
    """
    if opt.get('init_model') is None:
        return
    opt['init_model'] = modelzoo_path(opt['datapath'], opt['init_model'])
    optfile = opt['init_model'] + '.opt'
    if not PathManager.exists(optfile):
        return
    init_model_opt = Opt.load(optfile)

    extra_opts = {}
    different_opts = {}
    exempt_opts = [
        'model_file',
        'dict_file',
        'override',
        'starttime',
        'init_model',
        'batchindex',
    ]

    # search through init model opts
    for k, v in init_model_opt.items():
        if (k not in exempt_opts and k in init_model_opt
                and init_model_opt[k] != curr_opt.get(k)):
            if isinstance(v, list):
                if init_model_opt[k] != list(curr_opt[k]):
                    different_opts[k] = ','.join([str(x) for x in v])
            else:
                different_opts[k] = v

    # search through opts to load
    for k, v in curr_opt.items():
        if k not in exempt_opts and k not in init_model_opt:
            if isinstance(v, list):
                extra_opts[k] = ','.join([str(x) for x in v])
            else:
                extra_opts[k] = v

    # print warnings
    extra_strs = ['{}: {}'.format(k, v) for k, v in extra_opts.items()]
    if extra_strs:
        logging.warning(
            'your model is being loaded with opts that do not '
            'exist in the model you are initializing the weights with: '
            '{}'.format(','.join(extra_strs)))

    different_strs = [
        '--{} {}'.format(k.replace('_', '-'), v)
        for k, v in different_opts.items()
    ]
    if different_strs:
        logging.warning(
            'your model is being loaded with opts that differ '
            'from the model you are initializing the weights with. Add the '
            'following args to your run command to change this: \n'
            '{}'.format(' '.join(different_strs)))
Example #17
def git_ls_files(root=None, skip_nonexisting=True):
    """
    List all files tracked by git.
    """
    filenames = git_.ls_files(root).split('\n')
    if skip_nonexisting:
        filenames = [fn for fn in filenames if PathManager.exists(fn)]
    return filenames
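
git_ here is presumably a thin wrapper around the git CLI; an equivalent without the wrapper, as a sketch:

import os
import subprocess

def git_ls_files_plain(root=None, skip_nonexisting=True):
    # List files tracked by git via the CLI directly.
    cmd = ['git', 'ls-files'] + ([root] if root else [])
    filenames = subprocess.check_output(cmd, text=True).splitlines()
    if skip_nonexisting:
        filenames = [fn for fn in filenames if os.path.exists(fn)]
    return filenames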
Example #18
def built(path, version_string=None):
    """
    Check if '.built' flag has been set for that task.

    If a version_string is provided, it has to match, or the task is regarded as
    not built.
    """
    if version_string:
        fname = os.path.join(path, '.built')
        if not PathManager.exists(fname):
            return False
        else:
            with PathManager.open(fname, 'r') as read:
                text = read.read().split('\n')
            return len(text) > 1 and text[1] == version_string
    else:
        return PathManager.exists(os.path.join(path, '.built'))
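
built() implies a layout for the marker file: optional metadata on the first line and the version string on the second. A minimal writer consistent with that layout (ParlAI's own helper may differ in details):

import datetime
import os

def mark_built(path, version_string=None):
    # Write a '.built' marker whose second line carries the version,
    # matching what built() reads back as text[1].
    with open(os.path.join(path, '.built'), 'w') as f:
        f.write(str(datetime.datetime.today()))
        if version_string:
            f.write('\n' + version_string)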
Example #19
    def set_vocab_candidates(self, shared):
        """
        Load the tokens from the vocab as candidates.

        self.vocab_candidates will contain a [num_cands] list of strings
        self.vocab_candidate_vecs will contain a [num_cands, 1] LongTensor
        """
        self.opt['encode_candidate_vecs'] = True
        if shared:
            self.vocab_candidates = shared['vocab_candidates']
            self.vocab_candidate_vecs = shared['vocab_candidate_vecs']
            self.vocab_candidate_encs = shared['vocab_candidate_encs']
        else:
            if 'vocab' in (self.opt['candidates'],
                           self.opt['eval_candidates']):
                cands = []
                vecs = []
                for ind in range(1, len(self.dict)):
                    txt = self.dict[ind]
                    cands.append(txt)
                    vecs.append(
                        self._vectorize_text(
                            txt,
                            add_start=True,
                            add_end=True,
                            truncate=self.label_truncate,
                        ))
                self.vocab_candidates = cands
                self.vocab_candidate_vecs = padded_3d([vecs]).squeeze(0)
                print("[ Loaded fixed candidate set (n = {}) from vocabulary ]"
                      "".format(len(self.vocab_candidates)))
                enc_path = self.opt.get('model_file') + '.vocab.encs'
                if PathManager.exists(enc_path):
                    self.vocab_candidate_encs = self.load_candidates(
                        enc_path, cand_type='vocab encodings')
                else:
                    cand_encs = []
                    vec_batches = [
                        self.vocab_candidate_vecs[i:i + 512]
                        for i in range(0, len(self.vocab_candidate_vecs), 512)
                    ]
                    print("[ Vectorizing vocab candidates ({} batch(es) of up "
                          "to 512) ]".format(len(vec_batches)))
                    for vec_batch in tqdm(vec_batches):
                        cand_encs.append(self.encode_candidates(vec_batch))
                    self.vocab_candidate_encs = torch.cat(cand_encs, 0)
                    self.save_candidates(self.vocab_candidate_encs,
                                         enc_path,
                                         cand_type='vocab encodings')
                if self.use_cuda:
                    self.vocab_candidate_vecs = self.vocab_candidate_vecs.cuda()
                    self.vocab_candidate_encs = self.vocab_candidate_encs.cuda()
            else:
                self.vocab_candidates = None
                self.vocab_candidate_vecs = None
                self.vocab_candidate_encs = None
Example #20
def modelzoo_path(datapath, path):
    """
    Map pretrain models filenames to their path on disk.

    If path starts with 'models:', then we remap it to the model zoo path within the
    data directory (default is ParlAI/data/models). We download models from the model
    zoo if they are not here yet.
    """
    if path is None:
        return None
    if (not path.startswith('models:') and not path.startswith('zoo:')
            and not path.startswith('izoo:')):
        return path
    elif path.startswith('models:') or path.startswith('zoo:'):
        zoo = path.split(':')[0]
        zoo_len = len(zoo) + 1
        model_path = path[zoo_len:]
        # Check if we need to download the model
        if "/" in path:
            animal = path[zoo_len:path.rfind('/')].replace('/', '.')
        else:
            animal = path[zoo_len:]
        if '.' not in animal:
            animal += '.build'
        module_name = 'parlai.zoo.{}'.format(animal)
        try:
            my_module = importlib.import_module(module_name)
            my_module.download(datapath)
        except (ImportError, AttributeError):
            try:
                # maybe we didn't find a specific model, let's try generic .build
                animal_ = '.'.join(animal.split(".")[:-1]) + '.build'
                module_name_ = 'parlai.zoo.{}'.format(animal_)
                my_module = importlib.import_module(module_name_)
                my_module.download(datapath)
            except (ImportError, AttributeError) as exc:
                # truly give up
                raise ImportError(
                    f'Could not find pretrained model in {module_name} or {module_name_}.'
                    ' Please check your spelling and make sure you\'ve pulled from master.'
                ) from exc

        return os.path.join(datapath, 'models', model_path)
    else:
        # Internal path (starts with "izoo:") -- useful for non-public
        # projects.  Save the path to your internal model zoo in
        # parlai_internal/.internal_zoo_path
        # TODO: test the internal zoo.
        zoo_path = 'parlai_internal/zoo/.internal_zoo_path'
        if not PathManager.exists('parlai_internal/zoo/.internal_zoo_path'):
            raise RuntimeError(
                'Please specify the path to your internal zoo in the '
                'file parlai_internal/zoo/.internal_zoo_path in your '
                'internal repository.')
        else:
            with PathManager.open(zoo_path, 'r') as f:
                zoo = f.read().split('\n')[0]
            return os.path.join(zoo, path[5:])
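
Tracing the zoo branch on a hypothetical path (the model name is illustrative only):

path = 'zoo:blender/blender_90M/model'
# zoo        -> 'zoo'; model_path -> 'blender/blender_90M/model'
# animal     -> 'blender.blender_90M'
# import     -> parlai.zoo.blender.blender_90M, falling back to
#               parlai.zoo.blender.build
# returns    -> os.path.join(datapath, 'models', 'blender/blender_90M/model')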
Example #21
    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        out_json_path = os.path.join(dir_name, file_name + "-vocab.json")
        out_merge_path = os.path.join(dir_name, file_name + "-merges.txt")
        # Possibly bad assumption: if the destination file already exists,
        # we don't need to copy it over again.
        if not PathManager.exists(out_json_path):
            logging.info(f"Copying {self.json_path} to {out_json_path}")
            PathManager.copy(self.json_path, out_json_path)
        if not PathManager.exists(out_merge_path):
            logging.info(f"Copying {self.merge_path} to {out_merge_path}")
            PathManager.copy(self.merge_path, out_merge_path)
Example #22
    def _build_data(self) -> Tuple[str, str, str]:
        """
        Build data.

        Maybe download the appropriate data.

        :return (bpe_data, json_path, vocab_path):
            bpe_data, path to the encoder json, and path to the vocab bpe file
        """
        data_path = os.path.join(self.opt['datapath'], 'gpt2')
        vocab_path = os.path.join(data_path, 'vocab.bpe')
        json_path = os.path.join(data_path, 'encoder.json')
        if not PathManager.exists(vocab_path) or not PathManager.exists(json_path):
            make_dir(data_path)
            download(self.DEFAULT_VOCAB_BPE, data_path, 'vocab.bpe')
            download(self.DEFAULT_ENCODER_JSON, data_path, 'encoder.json')
        with PathManager.open(vocab_path, 'r', encoding="utf-8") as f:
            bpe_data = f.read()

        return bpe_data, json_path, vocab_path
Example #23
def print_announcements(opt):
    """
    Output any announcements the ParlAI team wishes to make to users.

    Also gives the user the option to suppress the output.
    """
    # no announcements to make right now
    return

    noannounce_file = os.path.join(opt.get('datapath'), 'noannouncements')
    if PathManager.exists(noannounce_file):
        # user has suppressed announcements, don't do anything
        return

    # useful constants
    # all of these colors are bolded
    RESET = '\033[0m'
    BOLD = '\033[1m'
    RED = '\033[1;91m'
    YELLOW = '\033[1;93m'
    GREEN = '\033[1;92m'
    BLUE = '\033[1;96m'
    CYAN = '\033[1;94m'
    MAGENTA = '\033[1;95m'

    # only use colors if we're outputting to a terminal
    USE_COLORS = _sys.stdout.isatty()
    if not USE_COLORS:
        RESET = BOLD = RED = YELLOW = GREEN = BLUE = CYAN = MAGENTA = ''

    # generate the rainbow stars
    rainbow = [RED, YELLOW, GREEN, CYAN, BLUE, MAGENTA]
    size = 78 // len(rainbow)
    stars = ''.join([color + '*' * size for color in rainbow])
    stars += RESET

    # do the actual output
    print(
        '\n'.join(
            [
                '',
                stars,
                BOLD,
                'Announcements go here.',
                RESET,
                # don't bold the suppression command
                'To suppress this message (and future announcements), run\n`touch {}`'.format(
                    noannounce_file
                ),
                stars,
            ]
        )
    )
Example #24
def get_list_of_files(top_path):
    result = {}
    for path, _dirs, files in os.walk(top_path):
        for filename in files:
            if filename.endswith('.xml'):
                full_filename = os.path.realpath(os.path.join(path, filename))
                assert PathManager.exists(full_filename), 'Bad file ' + full_filename
                movie_id = get_movie_id(full_filename)
                if movie_id not in result:
                    result[movie_id] = []
                result[movie_id].append(full_filename)
    return result
Example #25
    def _load_conversations(self, datapath):
        if not PathManager.exists(datapath):
            raise RuntimeError(f'Conversations at path {datapath} not found. '
                               'Double check your path.')

        conversations = []
        with PathManager.open(datapath, 'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                conversations.append(Conversation(json.loads(line)))

        return conversations
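
The expected input is JSON lines: one serialized conversation per line. A sketch of writing such a file by hand (the dict schema is illustrative, not Conversation's exact contract):

import json

convos = [{"dialog": [[{"id": "human", "text": "hi"}]]},
          {"dialog": [[{"id": "human", "text": "bye"}]]}]
with open('convos.jsonl', 'w') as f:
    for convo in convos:
        f.write(json.dumps(convo) + '\n')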
Example #26
    def __init__(self, opt: Opt):
        try:
            # tensorboard is a very expensive thing to import. Wait until the
            # last second to import it.
            from tensorboardX import SummaryWriter
        except ImportError:
            raise ImportError('Please run `pip install tensorboard tensorboardX`.')

        tbpath = opt['model_file'] + '.tensorboard'
        logging.debug(f'Saving tensorboard logs to: {tbpath}')
        if not PathManager.exists(tbpath):
            PathManager.mkdirs(tbpath)
        self.writer = SummaryWriter(tbpath, comment=json.dumps(opt))
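
Once constructed, the writer is driven with the ordinary tensorboardX API, e.g. logging a scalar per training step:

from tensorboardX import SummaryWriter

writer = SummaryWriter('/tmp/tb_demo')  # stands in for tbpath above
for step, loss in enumerate([0.9, 0.7, 0.5]):
    writer.add_scalar('train/loss', loss, global_step=step)
writer.flush()  # view with: tensorboard --logdir /tmp/tb_demo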
Example #27
def git_changed_files(skip_nonexisting=True):
    """
    List all the changed files in the git repository.

    :param bool skip_nonexisting:
        If true, ignore files that don't exist on disk. This is useful for
        disregarding files that exist in master but have been deleted in HEAD.
    """
    fork_point = git_.merge_base('origin/master', 'HEAD').strip()
    filenames = git_.diff('--name-only', fork_point).split('\n')
    if skip_nonexisting:
        filenames = [fn for fn in filenames if PathManager.exists(fn)]
    return filenames
Example #28
    def _load_raw(self, datapath):
        """
        Load the data as a raw, unparsed file.

        Useful for fast IO stuff like random indexing.
        """
        if not PathManager.exists(datapath):
            raise RuntimeError(f'Conversations at path {datapath} not found. '
                               'Double check your path.')

        with PathManager.open(datapath, 'r') as f:
            lines = f.read().splitlines()
            for line in lines:
                yield line
Example #29
def create_agent(opt: Opt, requireModelExists=False):
    """
    Create an agent from the options ``model``, ``model_params`` and ``model_file``.

    The input is either of the form
    ``parlai.agents.ir_baseline.agents:IrBaselineAgent`` (i.e. the path
    followed by the class name) or else just ``ir_baseline`` which
    assumes the path above, and a class name suffixed with 'Agent'.

    If ``model-file`` is available in the options this function can also
    attempt to load the model from that location instead. This avoids having to
    specify all the other options necessary to set up the model including its
    name as they are all loaded from the options file if it exists (the file
    opt['model_file'] + '.opt' must exist and contain a pickled or json dict
    containing the model's options).
    """
    if opt.get('datapath', None) is None:
        add_datapath_and_model_args(opt)

    if opt.get('model_file'):
        opt['model_file'] = modelzoo_path(opt.get('datapath'),
                                          opt['model_file'])
        if requireModelExists and not PathManager.exists(opt['model_file']):
            raise RuntimeError(
                'WARNING: Model file does not exist, check to make '
                'sure it is correct: {}'.format(opt['model_file']))
        # Attempt to load the model from the model file first (this way we do
        # not even have to specify the model name as a parameter)
        model = create_agent_from_opt_file(opt)
        if model is not None:
            return model
        else:
            logging.info(
                f"No model with opt yet at: {opt['model_file']}(.opt)")

    if opt.get('model'):
        model_class = load_agent_module(opt['model'])
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, opt)
        model = model_class(opt)
        if requireModelExists and hasattr(
                model, 'load') and not opt.get('model_file'):
            # double check that we didn't forget to set model_file on loadable model
            logging.warning(
                'model_file unset but model has a `load` function.')
        return model
    else:
        raise RuntimeError('Need to set `model` argument to use create_agent.')
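
The docstring's two accepted spellings suggest how load_agent_module resolves a name; a sketch of that resolution under those assumptions (not ParlAI's actual implementation):

import importlib

def resolve_agent_class(model: str):
    if ':' in model:
        # full form: 'parlai.agents.ir_baseline.agents:IrBaselineAgent'
        module_name, class_name = model.split(':')
    else:
        # short form: 'ir_baseline' -> assumed module path plus a class
        # name suffixed with 'Agent'
        module_name = f'parlai.agents.{model}.agents'
        words = model.split('_')
        class_name = ''.join(w[:1].upper() + w[1:] for w in words) + 'Agent'
    return getattr(importlib.import_module(module_name), class_name)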
Example #30
def get_model_name(opt):
    """
    Get the model name from either `--model` or `--model-file`.
    """
    model = opt.get('model', None)
    if model is None:
        # try to get model name from model opt file
        model_file = opt.get('model_file', None)
        if model_file is not None:
            model_file = modelzoo_path(opt.get('datapath'), model_file)
            optfile = model_file + '.opt'
            if PathManager.exists(optfile):
                new_opt = Opt.load(optfile)
                model = new_opt.get('model', None)
    return model