Example #1
0
 def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
     """
     Initialize one sub-world per comma-separated task in ``opt['task']``.

     :param opt:
         ParlAI options; ``opt['task']`` is a comma-separated task list.
     :param agents:
         pre-created agents to place in each sub-world.
     :param shared:
         shared state from another copy; when set, sub-worlds are rebuilt
         from ``shared['worlds']`` instead of created from scratch.
     :param default_world:
         world class to fall back on when a task does not define one.
     """
     super().__init__(opt)
     self.worlds: List[World] = []
     for index, k in enumerate(opt['task'].split(',')):
         k = k.strip()
         if k:
             if shared:
                 # Create worlds based on shared data.
                 s = shared['worlds'][index]
                 self.worlds.append(s['world_class'](s['opt'], None, s))
             else:
                 # Agents are already specified. Only deep-copy opt on this
                 # path — the shared path above never uses the copy.
                 opt_singletask = copy.deepcopy(opt)
                 opt_singletask['task'] = k
                 self.worlds.append(
                     create_task_world(opt_singletask,
                                       agents,
                                       default_world=default_world))
     self.world_idx = -1
     self.new_world = True
     self.parleys = -1
     self.random = opt.get('datatype', None) == 'train'
     # Make multi-task task probabilities (cumulative weights for sampling).
     self.cum_task_weights = [1] * len(self.worlds)
     self.task_choices = range(len(self.worlds))
     weights = self.opt.get('multitask_weights', [1])
     # Renamed from ``sum`` to avoid shadowing the builtin.
     cumulative = 0
     for i in self.task_choices:
         # Tasks beyond the provided weights list default to weight 1.
         weight = weights[i] if len(weights) > i else 1
         self.cum_task_weights[i] = weight + cumulative
         cumulative += weight
Example #2
0
    def __init__(self, opt: Opt, shared=None):
        """
        Validate the vocab/merge options and build the HuggingFace tokenizer.

        Requires the ``tokenizers`` package plus pretrained ``--bpe-vocab``
        and ``--bpe-merge`` files on disk.
        """
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        # Both options must be present in opt at all.
        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        # ... and must be non-empty values.
        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        # ... and must point at existing pretrained files.
        for path, flag in ((self.vocab_path, '--bpe-vocab'),
                           (self.merge_path, '--bpe-merge')):
            if not os.path.isfile(path):
                raise IOError(
                    f'File {path} does not exist. {flag} must be pretrained.'
                )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)
Example #3
0
def retriever_factory(opt: Opt,
                      dictionary: DictionaryAgent,
                      shared=None) -> Optional[RagRetriever]:
    """
    Build retriever.

    Override to build special BB2 Search Retrievers, if necessary

    :param opt:
        ParlAI Opt
    :param dictionary:
        dictionary agent
    :param shared:
        shared objects.

    :return retriever:
        return a retriever for RAG.
    """
    if opt.get('converting'):
        # No retriever is needed while converting a model.
        return None
    retriever = RetrieverType(opt['rag_retriever_type'])
    # BB2-specific retriever types; anything else defers to the base factory.
    if retriever is RetrieverType.SEARCH_ENGINE:
        retriever_class = BB2SearchQuerySearchEngineRetriever
    elif retriever is RetrieverType.SEARCH_TERM_FAISS:
        retriever_class = BB2SearchQueryFaissIndexRetriever
    elif retriever is RetrieverType.OBSERVATION_ECHO_RETRIEVER:
        retriever_class = BB2ObservationEchoRetriever
    else:
        return rag_retriever_factory(opt, dictionary, shared=shared)
    return retriever_class(opt, dictionary, shared=shared)
Example #4
0
 def __init__(self, opt: Opt, shared=None):
     """
     Initialize the reranking agent.

     Reads response-separator/count and reranker-deactivation flags from
     opt, validates them, then defers to the superclass and builds the
     reranking models.

     :raises ValueError:
         if more than one response is requested but no separator is set.
     """
     # fp16 vocab-padding tokens; presumably stripped/ignored downstream —
     # TODO confirm against the consuming code.
     self._pad_tokens = [
         '__fp16_pad_0__',
         '__fp16_pad_1__',
         '__fp16_pad_2__',
         '__fp16_pad_3__',
     ]
     self._response_seperator = opt.get('response_sep', '')
     self._response_num = opt.get('response_num', 1)
     # Flags to switch off individual rerankers.
     self._deactivate_nli_reranking = opt.get('deactivate_nli_reranking', False)
     self._deactivate_dialogRPT_reranking = opt.get('deactivate_dialogRPT_reranking', False)
     # Multiple responses cannot be joined without a separator.
     if self._response_num > 1 and len(self._response_seperator) == 0:
         raise ValueError(
             'Response seperator empty while response number over 1.')
     super().__init__(opt, shared)
     self._init_reranking_models()
Example #5
0
def _path(opt: Opt) -> Tuple[str, str, str]:
    """
    Return appropriate datapaths.

    :param opt:
        options

    :return (data path, personalities path, image_path):
        path to data, personalities, and images

    :raises ValueError:
        if the datatype is not one of train/valid/test.
    """
    build(opt)
    dt = opt['datatype'].split(':')[0]
    if dt not in ('train', 'valid', 'test'):
        # Previously an unknown datatype left ``data_path`` unbound and
        # crashed below with UnboundLocalError; fail fast and clearly.
        raise ValueError(f'Unknown datatype: {dt}')
    data_path = os.path.join(opt['datapath'], 'image_chat/{}.json'.format(dt))

    personalities_data_path = os.path.join(opt['datapath'],
                                           'image_chat/personalities.json')
    # Prefer a user-supplied YFCC image directory; otherwise the default.
    if opt.get('yfcc_path'):
        image_path = opt['yfcc_path']
    else:
        image_path = os.path.join(opt['datapath'], 'yfcc_images')

    return data_path, personalities_data_path, image_path
Example #6
0
def retriever_factory(opt: Opt,
                      dictionary: DictionaryAgent,
                      shared=None) -> Optional[RagRetriever]:
    """
    Build retriever.

    :param opt:
        ParlAI Opt
    :param dictionary:
        dictionary agent
    :param shared:
        shared objects.

    :return retriever:
        return a retriever for RAG.
    """
    if opt.get('converting'):
        return None
    # only build retriever when not converting a BART model
    retriever = RetrieverType(opt['rag_retriever_type'])
    retriever_classes = {
        RetrieverType.DPR: DPRRetriever,
        RetrieverType.TFIDF: TFIDFRetriever,
        RetrieverType.DPR_THEN_POLY: DPRThenPolyRetriever,
        RetrieverType.POLY_FAISS: PolyFaissRetriever,
    }
    retriever_class = retriever_classes.get(retriever)
    if retriever_class is None:
        # Preserve the original implicit-None behavior for any retriever
        # type not handled here.
        return None
    return retriever_class(opt, dictionary, shared=shared)
Example #7
0
    def __init__(self, opt: Opt, shared=None):
        """
        Initialize a multi-task teacher from comma-separated ``opt['task']``.

        Builds one sub-teacher per task (or rebuilds them from ``shared``)
        and precomputes cumulative task weights for sampling.
        """
        self.tasks: List[Agent] = []
        self.opt = opt

        self.id = opt['task']
        if shared and 'tasks' in shared:
            # Rebuild sub-teachers from shared state.
            self.tasks = [create_agent_from_shared(t) for t in shared['tasks']]
        else:
            tasks = opt['task'].split(',')
            for k in tasks:
                k = k.strip()
                if k:
                    opt_singletask = copy.deepcopy(opt)
                    opt_singletask['task'] = k
                    self.tasks.extend(
                        create_task_agent_from_taskname(opt_singletask))
        self.task_idx = -1
        self.new_task = True
        self.random = opt.get('datatype') == 'train'
        # Make multi-task task probabilities (cumulative weights for sampling).
        self.cum_task_weights = [1] * len(self.tasks)
        self.task_choices = range(len(self.tasks))
        weights = self.opt.get('multitask_weights', [1])
        # Renamed from ``sum`` to avoid shadowing the builtin.
        cumulative = 0
        for i in self.task_choices:
            # Tasks beyond the provided weights list default to weight 1.
            weight = weights[i] if len(weights) > i else 1
            self.cum_task_weights[i] = weight + cumulative
            cumulative += weight
Example #8
0
 def __init__(self, opt: Opt, shared=None):
     """
     Initialize the agent and compute the episode range this worker serves.

     Honors ``num_episodes`` and ``batchindex`` opts; under distributed
     training, restricts this rank to its own contiguous chunk of episodes.
     """
     if not hasattr(self, "fold"):
         # A subclass may have set the fold already; derive it otherwise.
         self.fold = DatatypeHelper.fold(opt["datatype"])
     super().__init__(opt, shared)
     self.epochDone = False
     self.batchsize = opt.get("batchsize", 1)
     # self.episodes is expected to be populated by the superclass init.
     self.max_episodes = len(self.episodes)
     if opt.get("num_episodes", 0) > 0:
         self.max_episodes = min(self.max_episodes, opt.get("num_episodes"))
     self.episode_idx = opt.get("batchindex", 0)
     self._setup_next_episode()
     self.round_idx = 0  # for some downstream utt + sysUttAndApiCallAgents.
     if is_distributed():  # cause gotta manually handle
         # Each rank serves the contiguous slice [rank*chunk, (rank+1)*chunk).
         rank = get_rank()
         chunk_size = ceil(self.max_episodes / num_workers())
         self.episode_idx += rank * chunk_size
         self.max_episodes = min(self.max_episodes, (rank + 1) * chunk_size)
Example #9
0
def create_agent(opt: Opt, requireModelExists=False):
    """
    Create an agent from the options ``model``, ``model_params`` and ``model_file``.

    The input is either of the form
    ``parlai.agents.ir_baseline.agents:IrBaselineAgent`` (i.e. the path
    followed by the class name) or else just ``ir_baseline`` which
    assumes the path above, and a class name suffixed with 'Agent'.

    If ``model-file`` is available in the options this function can also
    attempt to load the model from that location instead. This avoids having to
    specify all the other options necessary to set up the model including its
    name as they are all loaded from the options file if it exists (the file
    opt['model_file'] + '.opt' must exist and contain a pickled or json dict
    containing the model's options).

    :param opt:
        ParlAI options; ``model_file`` takes precedence over ``model``.
    :param requireModelExists:
        if True, raise when ``model_file`` is set but missing on disk.

    :raises RuntimeError:
        if neither a loadable model file nor a ``model`` option is available.
    """
    if opt.get('datapath', None) is None:
        # Ensure datapath/model args exist before resolving zoo paths below.
        add_datapath_and_model_args(opt)

    if opt.get('model_file'):
        # Resolve any "zoo:" path to a concrete file location.
        opt['model_file'] = modelzoo_path(opt.get('datapath'), opt['model_file'])
        if requireModelExists and not PathManager.exists(opt['model_file']):
            raise RuntimeError(
                'WARNING: Model file does not exist, check to make '
                'sure it is correct: {}'.format(opt['model_file'])
            )
        # Attempt to load the model from the model file first (this way we do
        # not even have to specify the model name as a parameter)
        model = create_agent_from_opt_file(opt)
        if model is not None:
            return model
        else:
            # Fall through to building from --model below.
            logging.info(f"No model with opt yet at: {opt['model_file']}(.opt)")

    if opt.get('model'):
        model_class = load_agent_module(opt['model'])
        # if we want to load weights from --init-model, compare opts with
        # loaded ones
        compare_init_model_opts(opt, opt)
        model = model_class(opt)
        if requireModelExists and hasattr(model, 'load') and not opt.get('model_file'):
            # double check that we didn't forget to set model_file on loadable model
            logging.warn('model_file unset but model has a `load` function.')
        return model
    else:
        raise RuntimeError('Need to set `model` argument to use create_agent.')
Example #10
0
    def __init__(self, opt: Opt, shared=None):
        """
        Initialize the ranking agent.

        Builds model/criterion/optimizer on the primary instance; shared
        copies reuse state from ``shared`` instead. Also sets up fixed and
        vocab candidates and, under distributed training, wraps the model in
        DistributedDataParallel.
        """
        # Must call _get_init_model() first so that paths are updated if necessary
        # (e.g., a .dict file)
        init_model, is_finetune = self._get_init_model(opt, shared)
        opt['rank_candidates'] = True
        super().__init__(opt, shared)

        states: Dict[str, Any]
        if shared:
            # Shared copies never load weights themselves.
            states = {}
        else:
            # Note: we cannot change the type of metrics ahead of time, so you
            # should correctly initialize to floats or ints here
            self.criterion = self.build_criterion()
            self.model = self.build_model()
            if self.model is None or self.criterion is None:
                raise AttributeError(
                    'build_model() and build_criterion() need to return the model or criterion'
                )
            if self.use_cuda:
                self.model.cuda()
                self.criterion.cuda()

            print("Total parameters: {}".format(self._total_parameters()))
            print("Trainable parameters:  {}".format(
                self._trainable_parameters()))

            if self.fp16:
                # Halve weights before loading so dtypes line up with states.
                self.model = self.model.half()
            if init_model:
                print('Loading existing model parameters from ' + init_model)
                states = self.load(init_model)
            else:
                states = {}

        self.rank_top_k = opt.get('rank_top_k', -1)

        # Vectorize and save fixed/vocab candidates once upfront if applicable
        self.set_fixed_candidates(shared)
        self.set_vocab_candidates(shared)

        if shared:
            # We don't use get here because hasattr is used on optimizer later.
            if 'optimizer' in shared:
                self.optimizer = shared['optimizer']
        else:
            # Only optimize parameters that require gradients.
            optim_params = [
                p for p in self.model.parameters() if p.requires_grad
            ]
            self.init_optim(optim_params, states.get('optimizer'),
                            states.get('optimizer_type'))
            self.build_lr_scheduler(states, hard_reset=is_finetune)

        if shared is None and is_distributed():
            # Wrap for multi-GPU training; buffers are not broadcast.
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[self.opt['gpu']],
                broadcast_buffers=False)
Example #11
0
def create_task(opt: Opt, user_agents, default_world=None):
    """
    Create a world + task_agents (aka a task).

    Assuming ``opt['task']="task_dir:teacher_class:options"`` e.g. ``"babi:Task1k:1"``
    or ``"#babi-1k"`` or ``"#QA"``, see ``parlai/tasks/tasks.py`` and see
    ``parlai/tasks/task_list.py`` for list of tasks.

    :param opt:
        ParlAI options; ``opt['task']`` selects the task(s).
    :param user_agents:
        a single agent or list of agents to place in the world.
    :param default_world:
        optional world class to use when a task does not define one.

    :raises RuntimeError:
        if no task is specified.
    """
    task = opt.get('task')
    if not task:
        raise RuntimeError('No task specified. Please select a task with ' +
                           '--task {task_name}.')
    if type(user_agents) != list:
        user_agents = [user_agents]

    # Convert any hashtag task labels to task directory path names.
    # (e.g. "#QA" to the list of tasks that are QA tasks).
    opt = copy.deepcopy(opt)
    opt['task'] = ids_to_tasks(opt['task'])
    logging.info(f"creating task(s): {opt['task']}")

    if ',' not in opt['task']:
        # Single task
        world = create_task_world(opt,
                                  user_agents,
                                  default_world=default_world)
    else:
        # Multitask teacher/agent
        # TODO: remove and replace with multiteachers only?
        world = MultiWorld(opt, user_agents, default_world=default_world)

    # Wrap the base world depending on training/batching configuration;
    # order matters: background preprocessing takes precedence, then
    # dynamic batching, then plain batching.
    if DatatypeHelper.is_training(
            opt['datatype']) and opt.get('num_workers', 0) > 0:
        # note that we never use Background preprocessing in the valid/test
        # worlds, as we are unable to call Teacher.observe(model_act) in BG
        # preprocessing, so we are unable to compute Metrics or accurately
        # differentiate MultiWorld stats.
        world = BackgroundDriverWorld(opt, world)
    elif opt.get('batchsize', 1) > 1 and opt.get('dynamic_batching'):
        world = DynamicBatchWorld(opt, world)
    elif opt.get('batchsize', 1) > 1:
        # otherwise check if should use batchworld
        world = BatchWorld(opt, world)

    return world
Example #12
0
 def __init__(self, opt: Opt):
     """
     Initialize the search-query generator.

     Loads the query generator model from ``query_generator_model_file``
     (when the file exists) with generation overrides from opt, and creates
     additional shared copies of the agent so each batch row / RAG turn has
     its own instance.

     :raises ValueError:
         if the BB2 query generator is combined with search_only access.
     """
     self.opt = opt
     self.agents = []
     self.agent_dict = None
     self.generations = []
     self.input_type = 'Search'
     self.knowledge_access_method = KnowledgeAccessMethod(
         opt['knowledge_access_method'])
     model_file = modelzoo_path(opt['datapath'],
                                opt['query_generator_model_file'])
     if (self.knowledge_access_method is KnowledgeAccessMethod.SEARCH_ONLY
             and 'blenderbot2/query_generator/model' in model_file):
         raise ValueError(
             'You cannot use the blenderbot2 query generator with search_only. Please '
             'consider setting --query-generator-model-file zoo:sea/bart_sq_gen/model '
             'instead.')
     if model_file and os.path.exists(model_file):
         logging.info(f'Building Query Generator from file: {model_file}')
         # Silence logging while the sub-agent loads; re-enabled below.
         logging.disable()
         overrides: Dict[str, Any] = {'skip_generation': False}
         overrides['inference'] = opt['query_generator_inference']
         overrides['beam_size'] = opt.get('query_generator_beam_size', 3)
         overrides['beam_min_length'] = opt.get(
             'query_generator_beam_min_length', 2)
         overrides['model_parallel'] = opt['model_parallel']
         overrides['no_cuda'] = opt['no_cuda']
         if self.opt['query_generator_truncate'] > 0:
             # Apply the query-generator-specific truncation to both text
             # and general truncate settings.
             overrides['text_truncate'] = self.opt[
                 'query_generator_truncate']
             overrides['truncate'] = self.opt['query_generator_truncate']
         base_agent = create_agent_from_model_file(model_file,
                                                   opt_overrides=overrides)
         assert isinstance(base_agent, TorchAgent)
         self.agents = [base_agent]
         # One agent per (batch row x RAG turn); eval batch size counts too.
         bsz = max(
             opt.get('batchsize') or 1,
             opt.get('eval_batchsize') or 1)
         rag_turn_n_turns = opt.get('rag_turn_n_turns', 1)
         if bsz > 1 or rag_turn_n_turns > 1:
             self.agents += [
                 create_agent_from_shared(self.agents[0].share())
                 for _ in range((bsz * rag_turn_n_turns) - 1)
             ]
         self.agent_dict = self.agents[0].build_dictionary()
         logging.enable()
Example #13
0
 def __init__(
     self,
     opt: Opt,
     dictionary: DictionaryAgent,
     query_encoder: Optional[torch.nn.Module] = None,
     shared=None,
 ):
     """
     Initialize the memory retriever.

     Builds query and document (memory) encoders, a retriever tokenizer,
     and per-slot memory vector/encoding buffers — one slot per
     (batch row x RAG turn).

     :param opt:
         ParlAI options.
     :param dictionary:
         dictionary agent used by the retriever tokenizer.
     :param query_encoder:
         optional pre-built query encoder; built from opt when omitted.
     :param shared:
         shared state forwarded to the superclass.
     """
     super().__init__(opt, dictionary, shared)
     self.n_docs = opt['n_docs']
     if query_encoder is None:
         self.query_encoder = DprQueryEncoder(
             opt,
             dpr_model=opt['memory_reader_model'],
             pretrained_path=opt['dpr_model_file'],
         )
     else:
         self.query_encoder = query_encoder
     # Memory writer is inference-only, hence .eval().
     self.memory_encoder = DprDocumentEncoder(
         opt,
         dpr_model=opt['memory_writer_model'],
         pretrained_path=opt['memory_writer_model_file'],
     ).eval()
     self._tokenizer = RagRetrieverTokenizer(
         datapath=opt['datapath'],
         query_model=opt['query_model'],
         dictionary=dictionary,
         delimiter='\n',
         # Prefer the memory-specific truncation when set (> 0).
         max_length=opt['memory_retriever_truncate']
         if opt['memory_retriever_truncate'] > 0 else
         opt['rag_query_truncate'],
     )
     self.max_memories = opt.get('max_memories', 100)
     # One memory slot per (batch row x RAG turn).
     self.num_memory_slots = opt.get('batchsize', 1) * opt.get(
         'rag_turn_n_turns', 1)
     # Pre-allocated token-id buffers for each slot's memories.
     self.memory_vec_dict: Dict[int, torch.LongTensor] = {  # type: ignore
         k: torch.zeros(self.max_memories,
                        opt['max_doc_token_length']).to(torch.int64)
         for k in range(self.num_memory_slots)
     }
     # Pre-allocated embedding buffers for each slot's memories.
     self.memory_enc_dict: Dict[int, torch.Tensor] = {
         k: torch.zeros(self.max_memories, opt['retriever_embedding_size'])
         for k in range(self.num_memory_slots)
     }
     self.active_memory_slots: List[int] = []
     self.dict = dictionary
Example #14
0
 def __init__(self, opt: Opt, shared: TShared = None):
     """
     Initialize the image-chat teacher.

     Resolves data/personality/image paths, then either reuses data from
     ``shared`` or loads it from disk via ``_setup_data``.
     """
     super().__init__(opt, shared)
     self.opt = opt
     self.image_mode = opt.get('image_mode', 'no_image_model')
     self.data_path, personalities_data_path, self.image_path = _path(opt)
     self.datatype = opt['datatype'].split(':')[0]
     self.include_personality = opt.get('include_personality')
     # Images are only included when both flags are on.
     self.include_image = opt.get('include_image') and opt.get('load_images')
     self.num_cands = opt.get('num_cands')
     if shared and 'data' in shared:
         # Reuse data already loaded by another copy of this teacher.
         self.data = shared['data']
         self.personalities = shared['personalities']
         self.image_loader = shared['image_loader']
     else:
         self.image_loader = ImageLoader(opt)
         # Populates self.data and self.personalities.
         self._setup_data(self.data_path, personalities_data_path)
     self.num_exs = sum(len(d['dialog']) for d in self.data)
     self.reset()
Example #15
0
 def _init_mutators(self, opt: Opt):
     """
     Initialize mutator objects for sub agents.

     Builds ``self.krm_mutators`` and ``self.drm_mutators`` from the
     corresponding message-mutator opts (leaving them as None when unset).
     """

     def _build_mutators(key: str, label: str):
         # Shared construction logic for both mutator kinds (was duplicated).
         if not opt[key]:
             return None
         logging.warning(
             f'WARNING: If specifying {label} Mutators, they MUST be message mutators'
         )
         mutator_types = Mutator.load_mutator_types(opt.get(key))
         return [mutator(opt) for mutator in mutator_types]

     self.krm_mutators = _build_mutators('krm_message_mutators', 'KRM')
     self.drm_mutators = _build_mutators('drm_message_mutators', 'DRM')
Example #16
0
 def init_predictor(self, opt: Opt, shared=None):
     """
     Build (or reuse) the predictor agent.

     On the primary copy, loads the predictor from
     ``self.predictor_model_file`` with overrides that put it in scoring
     mode; shared copies reuse ``shared['predictor']``.
     """
     if not shared:
         override = {
             'return_cand_scores': True,
             'datatype': 'valid',
             'no_cuda': opt['reranker_no_cuda'],
             'interactive_mode': opt.get('interactive_mode', True),
             'ignore_bad_candidates': True,
             'encode_candidate_vecs': True,
             'interactive_candidates': 'inline',
         }  # to not init optim
         if opt.get('predictor_characters_file'):
             # Score against a fixed candidate list when provided.
             override['fixed_candidates_path'] = opt[
                 'predictor_characters_file']
         self.predictor = create_agent_from_model_file(
             self.predictor_model_file, opt_overrides=override)
     else:
         self.predictor = shared['predictor']
Example #17
0
    def __init__(self, opt: Opt, shared=None):
        """
        Initialize the agent and compute the episode range for this worker.

        Builds single-goal episodes on the primary copy; shared copies rely
        on episodes handled by the parent class. Under distributed training,
        caps episodes to this rank's chunk.
        """
        super().__init__(opt, shared)
        self.epochDone = False
        if shared is None:
            self.episodes = self._setup_single_goal_episodes()
        else:
            # Handled fine in _TodDataDumpAgent
            pass

        self.max_episodes = len(self.episodes)
        if opt.get("num_episodes", 0) > 0:
            self.max_episodes = min(self.max_episodes, opt.get("num_episodes"))
        if is_distributed():  # cause gotta manually handle
            # Cap to this rank's chunk; NOTE(review): unlike the sibling
            # agent, episode_idx is not offset here — confirm intentional.
            rank = get_rank()
            chunk_size = ceil(self.max_episodes / num_workers())
            self.max_episodes = min(self.max_episodes, (rank + 1) * chunk_size)

        self._setup_next_episode()
Example #18
0
    def __init__(self, opt: Opt):
        """
        Store run configuration taken from ``opt``.
        """
        # Required options.
        self.task_name = opt['task_name']
        self.output_folder = opt['output_folder']
        self.database_path = opt['database_path']
        # Optional output format, defaulting to JSON.
        self.results_format = opt.get('results_format', 'json')

        # We lazily load these later, or inject their mock version during testing.
        self._mephisto_db = None
        self._mephisto_data_browser = None
Example #19
0
def _get_task_world(opt: Opt, user_agents, default_world=None):
    """
    Resolve the world class and task agents for ``opt['task']``.

    Supports direct class paths (``pkg.module:Class``), the
    ``internal:`` prefix for private repos, and task names that map to a
    ``<repo>.tasks.<task>.worlds`` module.

    :return (world_class, task_agents):
        the resolved world class and the created task agents.
    """
    task_agents = _create_task_agents(opt)
    sp = opt['task'].strip()
    repo = 'parlai'
    if sp.startswith('internal:'):
        # To switch to local repo, useful for non-public projects
        # (make a directory called 'parlai_internal' with your private agents)
        repo = 'parlai_internal'
        sp = sp[9:]  # strip the 'internal:' prefix (9 chars)
    sp = sp.split(':')
    if '.' in sp[0]:
        # The case of opt['task'] = 'parlai.tasks.squad.agents:DefaultTeacher'
        # (i.e. specifying your own path directly, assumes DialogPartnerWorld)
        if default_world is not None:
            world_class = default_world
        elif len(task_agents + user_agents) == 2:
            world_class = DialogPartnerWorld
        else:
            world_class = MultiAgentDialogWorld
    else:
        task = sp[0].lower()
        if len(sp) > 1:
            # Capitalize the teacher name to form "<Teacher>World".
            sp[1] = sp[1][0].upper() + sp[1][1:]
            world_name = sp[1] + "World"
            if opt.get('interactive_task', False):
                world_name = "Interactive" + world_name
        else:
            if opt.get('interactive_task', False):
                world_name = "InteractiveWorld"
            else:
                world_name = "DefaultWorld"
        module_name = "%s.tasks.%s.worlds" % (repo, task)
        try:
            my_module = importlib.import_module(module_name)
            world_class = getattr(my_module, world_name)
        except (ModuleNotFoundError, AttributeError):
            # Defaults to this if you did not specify a world for your task.
            if default_world is not None:
                world_class = default_world
            elif len(task_agents + user_agents) == 2:
                world_class = DialogPartnerWorld
            else:
                world_class = MultiAgentDialogWorld
    return world_class, task_agents
Example #20
0
 def __init__(self, opt: Opt, shared=None):
     """
     Initialize the teacher.

     Sets opt/id/metrics only when a subclass has not already done so,
     reusing metrics from ``shared`` when available.
     """
     if not hasattr(self, 'opt'):
         self.opt = copy.deepcopy(opt)
     if not hasattr(self, 'id'):
         self.id = opt.get('task', 'teacher')
     if not hasattr(self, 'metrics'):
         if shared and shared.get('metrics'):
             # Share a single metrics aggregator across copies.
             self.metrics = shared['metrics']
         else:
             self.metrics = Metrics(opt)
     self.epochDone = False
Example #21
0
def get_n_positions_from_options(opt: Opt):
    """
    Determine n_positions from options dict.
    """
    explicit = opt.get('n_positions')
    if explicit:
        # An explicitly provided position count always wins.
        n_positions = explicit
    else:
        # Otherwise use the worst case implied by truncation settings,
        # defaulting to 1024 when nothing is set.
        truncation_limits = (
            opt.get('truncate') or 0,
            opt.get('text_truncate') or 0,
            opt.get('label_truncate') or 0,
        )
        n_positions = max(truncation_limits) or 1024
    if n_positions < 0:
        raise ValueError('n_positions must be positive')
    return n_positions
Example #22
0
def create_task(opt: Opt, user_agents, default_world=None):
    """
    Create a world + task_agents (aka a task).

    Assuming ``opt['task']="task_dir:teacher_class:options"`` e.g. ``"babi:Task1k:1"``
    or ``"#babi-1k"`` or ``"#QA"``, see ``parlai/tasks/tasks.py`` and see
    ``parlai/tasks/task_list.py`` for list of tasks.

    :param opt:
        ParlAI options; ``opt['task']`` selects the task(s).
    :param user_agents:
        a single agent or list of agents to place in the world.
    :param default_world:
        optional world class to use when a task does not define one.

    :raises RuntimeError:
        if no task is specified.
    """
    task = opt.get('task')
    if not task:
        raise RuntimeError('No task specified. Please select a task with ' +
                           '--task {task_name}.')
    if type(user_agents) != list:
        user_agents = [user_agents]

    # Convert any hashtag task labels to task directory path names.
    # (e.g. "#QA" to the list of tasks that are QA tasks).
    opt = copy.deepcopy(opt)
    opt['task'] = ids_to_tasks(opt['task'])
    print('[creating task(s): ' + opt['task'] + ']')

    # check if single or multithreaded, and single-example or batched examples
    if ',' not in opt['task']:
        # Single task
        world = create_task_world(opt,
                                  user_agents,
                                  default_world=default_world)
    else:
        # Multitask teacher/agent
        # TODO: remove and replace with multiteachers only?
        world = MultiWorld(opt, user_agents, default_world=default_world)

    if opt.get('numthreads', 1) > 1:
        # use hogwild world if more than one thread requested
        # hogwild world will create sub batch worlds as well if bsz > 1
        world = HogwildWorld(opt, world)
    elif opt.get('batchsize', 1) > 1:
        # otherwise check if should use batchworld
        world = BatchWorld(opt, world)

    return world
Example #23
0
def get_dialogue_task_mutators(opt: Opt) -> str:
    """
    Set the mutators appropriately for the dialogue tasks.
    """
    base_mutators = (
        'flatten',
        'extract_entity_for_knowledge_model',
        'skip_retrieval_mutator',
    )
    mutators = '+'.join(base_mutators)
    extra = opt.get('mutators')
    if extra:
        # Append any user-specified mutators after the required ones.
        mutators = f'{mutators}+{extra}'
    logging.warning(f'overriding mutators to {mutators}')
    return mutators
Example #24
0
 def __init__(self, opt: Opt, shared=None):
     """
     Initializes reranker.

     Resolves the predictor model path (supporting "zoo:" paths), records
     reranking configuration, then builds/reuses the predictor.

     :param opt:
         ParlAI options.
     :param shared:
         shared state; when set, the predictor is reused from it.
     """
     self.predictor_model_file = modelzoo_path(opt['datapath'],
                                               opt['predictor_model_file'])
     self.reranker_strategy = opt['reranker_strategy']
     self.normalize_candidates = opt['normalize_candidates']
     self.delimiter = opt.get('delimiter', '\n')
     # Defaults; subclasses may override these toggles.
     self.include_context = True
     self.include_label_cand_only = False
     self.init_predictor(opt, shared)
    def _initialize_bart(self, opt: Opt) -> Opt:
        """
        Download and convert BART pre-trained models.

        Additionally, convert `init-fairseq-model` if necessary.

        :param opt:
            ParlAI-parsed options

        :return opt:
            return opt with BART-specific args.
        """
        if not opt.get('converting'):
            # Fetch the pretrained BART weights and point init_model at them.
            download(opt['datapath'])
            opt['init_model'] = os.path.join(opt['datapath'],
                                             'models/bart/bart_large/model')
        if opt.get('init_fairseq_model'):
            opt = self._convert_model(opt)
        # Force BART-specific architecture args onto opt.
        opt.update(BART_ARGS)
        compare_init_model_opts(opt, opt)
        return opt
Example #26
0
 def __init__(self, opt: Opt, agents=None, shared=None, default_world=None):
     """
     Initialize one sub-world per comma-separated task in ``opt['task']``.

     Also precomputes cumulative task weights for sampling (optionally
     proportional to episode counts via ``multitask_weights='stochastic'``)
     and validates that sub-teacher ids do not collide, which would break
     metrics aggregation.
     """
     super().__init__(opt)
     self.worlds: List[World] = []
     for index, k in enumerate(opt['task'].split(',')):
         k = k.strip()
         if k:
             if shared:
                 # Create worlds based on shared data.
                 s = shared['worlds'][index]
                 self.worlds.append(s['world_class'](s['opt'], None, s))
             else:
                 # Agents are already specified.
                 opt_singletask = copy.deepcopy(opt)
                 opt_singletask['task'] = k
                 self.worlds.append(
                     create_task_world(
                         opt_singletask, agents, default_world=default_world
                     )
                 )
     self.world_idx = -1
     self.new_world = True
     self.parleys = -1
     # Check to see if we are training
     self.is_training = DatatypeHelper.is_training(opt.get('datatype'))
     # Make multi-task task probabilities.
     self.cum_task_weights = [1] * len(self.worlds)
     self.task_choices = range(len(self.worlds))
     weights = self.opt.get('multitask_weights', [1])
     if weights == 'stochastic':
         # Weight each task by its number of episodes.
         weights = [w.num_episodes() for w in self.worlds]
     # Renamed from ``sum`` to avoid shadowing the builtin.
     cumulative = 0
     for i in self.task_choices:
         # Tasks beyond the provided weights list default to weight 1.
         weight = weights[i] if len(weights) > i else 1
         self.cum_task_weights[i] = weight + cumulative
         cumulative += weight
     task_ids: Dict[str, Teacher] = {}
     # Having overlap in teacher ids will cause issues for metrics aggregation.
     for each_world in self.worlds:
         world_id = each_world.getID()
         if world_id in task_ids:
             raise AssertionError(
                 '{} and {} teachers have overlap in id {}.'.format(
                     task_ids[world_id],
                     each_world.get_agents()[0].__class__,
                     world_id,
                 )
             )
         else:
             task_ids[world_id] = each_world.get_task_agent()
Example #27
0
def bpe_factory(opt: Opt, shared: TShared) -> 'BPEHelper':
    """
    BPE Helper Factory.

    Returns the appropriate BPE helper given the opt
    as well as available libraries.

    :param opt:
        options
    :param shared:
        shared dict

    :return BPEHelper:
        returns the appropriate BPEHelper object
    """
    from parlai.core.dict import DictionaryAgent

    tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)

    bpe_helper: Optional[BPEHelper] = None

    if tokenizer == 'bytelevelbpe':
        # Attempt to instantiate HF tokenizer; if the library is missing and
        # a dict was already built, fall back to the slow pure-python BPE.
        try:
            bpe_helper = HuggingFaceBpeHelper(opt, shared)
        except ImportError:
            if not opt['dict_loaded']:
                raise ImportError(
                    'Please install HuggingFace tokenizer with: pip install tokenizers.\n'
                )
            warn_once(
                ''
                '\n\n--------------------------------------------------\n\n'
                'WARNING: You have chosen to use Huggingface\'s tokenizer.\n'
                'Please install HuggingFace tokenizer with: pip install tokenizers.\n'
                'For now, defaulting to the GPT2Tokenizer.'
                '\n\n--------------------------------------------------\n\n'
            )
            tokenizer = 'slow_bytelevel_bpe'

    # Remaining tokenizer names map directly to helper classes; this also
    # catches the slow_bytelevel_bpe fallback assigned above.
    helper_classes = {
        'slow_bytelevel_bpe': SlowBytelevelBPE,
        'gpt2': Gpt2BpeHelper,
        'bpe': SubwordBPEHelper,
    }
    if tokenizer in helper_classes:
        bpe_helper = helper_classes[tokenizer](opt, shared)

    assert (
        bpe_helper is not None
    ), f"bpe_factory called with invalid tokenizer: {tokenizer}"

    return bpe_helper
Example #28
0
def _all_split_datafiles(opt: Opt) -> List[str]:
    """
    Collect the datafiles for every dataset split under the configured
    split type.
    """
    splits = ['train', 'valid', 'test']
    split_type = SplitType(opt.get("cmu_dog_split_type"))
    if split_type in {SplitType.SEEN, SplitType.UNSEEN}:
        # For seen/unseen split, the full set of dialogs is split
        # across train, valid, test seen, and test unseen
        datafiles = [_datafile(split, SplitType.SEEN) for split in splits]
        datafiles.append(_datafile('test', SplitType.UNSEEN))
    else:
        datafiles = [_datafile(split, split_type) for split in splits]
    return datafiles
Example #29
0
    def __init__(self, opt: Opt, shared: TShared = None):
        """
        Initialize the image loader wrapper.

        Shared copies reuse the loader from ``shared``; otherwise missing
        default options are filled from a fresh parser before building a
        new ImageLoader.
        """
        self.opt = opt
        self.image_model = opt.get("image_mode")
        if shared:
            self.image_loader = shared["image_loader"]
        else:
            opt.setdefault("image_mode", self.image_model)
            # Fill in any defaults missing from opt without overriding
            # values that are already set.
            new_opt = ParlaiParser(True, False).parse_args([])
            for k, v in new_opt.items():
                if k not in opt:
                    opt[k] = v

            self.image_loader = ImageLoader(opt)
Example #30
0
 def __init__(self, opt: Opt, shared=None):
     """
     Setup reranker.

     Parses the comma-separated inference strategies (falling back to the
     plain ``inference`` opt) and builds or reuses the reranker instance.
     """
     super().__init__(opt, shared)
     reranker_class = self.get_reranker_class()
     # ``inference_strategies`` takes precedence over plain ``inference``.
     self.inference_strategies = (opt['inference_strategies']
                                  or opt['inference']).split(',')
     self.debug_mode = opt.get('debug_mode', False)
     if not shared:
         self.reranker = reranker_class(opt, shared=None)
     else:
         # Reuse the reranker built by the primary copy.
         self.reranker = reranker_class(opt, shared=shared['reranker'])