def _lazy_import_torch(self):
        try:
            import torch
        except ImportError:
            raise ImportError('Need to install PyTorch: go to pytorch.org')
        import torchvision
        import torchvision.transforms as transforms
        import torch.nn as nn

        self.use_cuda = not self.opt.get('no_cuda', False) and torch.cuda.is_available()
        if self.use_cuda:
            logging.debug('Using CUDA')
            torch.cuda.set_device(self.opt.get('gpu', -1))
        self.torch = torch
        self.torchvision = torchvision
        self.transforms = transforms
        self.nn = nn
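
The same lazy-import pattern can be sketched as a standalone helper; this is a minimal illustration under the assumption that torch may or may not be installed, not the class method above:

def lazy_torch(no_cuda=False, gpu=0):
    # import torch only when first needed, with a readable error otherwise
    try:
        import torch
    except ImportError:
        raise ImportError('Need to install PyTorch: go to pytorch.org')
    use_cuda = not no_cuda and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(gpu)
    return torch, use_cuda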
Example #2
    def __init__(self, opt: Opt):
        try:
            # tensorboard is a very expensive thing to import. Wait until the
            # last second to import it.
            from tensorboardX import SummaryWriter
        except ImportError:
            raise ImportError('Please run `pip install tensorboard tensorboardX`.')

        if opt['tensorboard_logdir'] is not None:
            tbpath = opt['tensorboard_logdir']
        else:
            tbpath = opt['model_file'] + '.tensorboard'

        logging.debug(f'Saving tensorboard logs to: {tbpath}')
        if not PathManager.exists(tbpath):
            PathManager.mkdirs(tbpath)
        self.writer = SummaryWriter(tbpath, comment=json.dumps(opt))
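
As a quick, hedged sketch of what the writer above is used for (assuming tensorboardX is installed; the run directory and tag names here are made up):

from tensorboardX import SummaryWriter

# minimal sketch: log a few scalar values to a local run directory
writer = SummaryWriter('runs/example.tensorboard')
for step, loss in enumerate([0.9, 0.7, 0.5]):
    writer.add_scalar('train/loss', loss, global_step=step)
writer.close()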
Example #3
    def checksum(self, dpath):
        """
        Verify the SHA-256 checksum of a given file.

        :param dpath: path to the folder containing the downloaded file.
        """
        sha256_hash = hashlib.sha256()
        with PathManager.open(os.path.join(dpath, self.file_name), "rb") as f:
            for byte_block in iter(lambda: f.read(65536), b""):
                sha256_hash.update(byte_block)
            if sha256_hash.hexdigest() != self.hashcode:
                # remove_dir(dpath)
                raise AssertionError(
                    f"Checksum for {self.file_name} from \n{self.url}\n"
                    "does not match the expected checksum. Please try again.")
            else:
                logging.debug("Checksum Successful")
Example #4
    def finalize(self,
                 frequencies: Dict[str, int],
                 num_symbols: int = 30000,
                 minfreq: int = 2) -> bool:
        """
        Build the codecs.

        :param frequencies:
            dictionary of (token: frequency) pairs
        :param num_symbols:
            Number of BPE symbols. Recommend 30000-40000.  If <= 0, default
            30000 will be used.
        :param minfreq:
            Minimum frequency of a token before forced BPE decomposition. If <=
            0 will use subword-nmt default of 2.

        :return did_finalize:
            return whether codecs are finalized this call.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        logging.debug(f'Saving bpe codecs to {self.codecs}')

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2

        codec_dir, _ = os.path.split(self.codecs)
        PathManager.mkdirs(codec_dir)
        with PathManager.open(self.codecs, 'w', encoding='utf-8') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True
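
A hedged sketch of how the frequencies argument might be produced from tokenized text before calling finalize; bpe_helper is a hypothetical instance of whatever class owns the method above:

from collections import Counter

corpus = ['hello world', 'hello there world']
frequencies = Counter(tok for line in corpus for tok in line.split())
# hypothetical call into the method above:
# bpe_helper.finalize(frequencies, num_symbols=30000, minfreq=2)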
Example #5
    def batch_act_sdm(
        self,
        observations: List[Dict[str, Message]],
        knowledge_agent_observations: List[Message],
    ) -> Tuple[List[Message], List[int], List[Message]]:
        """
        Search Decision batch act.

        :param observations:
            observations for batch act.
        :param knowledge_agent_observations:
            observations to modify with the decision from the search decision agent.

        :return (batch_reply, search_indices, observations):
            batch_reply: reply from the search decision agent
            search_indices: batch indices with which to use search.
            observations: modified knowledge agent observations
        """
        search_indices = []
        batch_reply_sdm = [{} for _ in range(len(knowledge_agent_observations))]
        if self.search_decision is SearchDecision.ALWAYS:
            [o.force_set('skip_retrieval', False) for o in knowledge_agent_observations]
            search_indices = list(range(len(knowledge_agent_observations)))
        elif self.search_decision is SearchDecision.NEVER:
            [o.force_set('skip_retrieval', True) for o in knowledge_agent_observations]
        else:
            assert self.search_decision is SearchDecision.COMPUTE
            assert self.search_decision_agent
            batch_reply_sdm = self.search_decision_agent.batch_act(
                [o['search_decision_agent'] for o in observations]
            )
            for i, reply in enumerate(batch_reply_sdm):
                logging.debug(f"Example {i}: {reply['text']}")
                if reply['text'] == self.opt['search_decision_do_search_reply']:
                    search_indices.append(i)
                    knowledge_agent_observations[i].force_set('skip_retrieval', False)
                elif reply['text'] == self.opt['search_decision_dont_search_reply']:
                    knowledge_agent_observations[i].force_set('skip_retrieval', True)
                else:
                    logging.error(
                        f"SDM Reply: {reply['text']}; defaulting to no search"
                    )
                    knowledge_agent_observations[i].force_set('skip_retrieval', True)

        return batch_reply_sdm, search_indices, knowledge_agent_observations
Example #6
def _untar(path, fname, delete=True, flatten=False):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.

    :param bool flatten:
        If true, the archive contents will be extracted directly into ``path``,
        ignoring any subdirectory structure inside the archive.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully extract files manually so that we can use PathManager.open
    # instead, in case we are using fb internal file services

    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            if flatten:
                # flatten the tar file if there are subdirectories
                fn = os.path.join(path, os.path.split(item_name)[-1])
            else:
                fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(
                        item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")

    if delete:
        PathManager.rm(fullpath)
Example #7
def untar(path, fname, deleteTar=True):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool deleteTar:
        If true, the archive will be deleted after extraction.
    """
    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    shutil.unpack_archive(fullpath, path)
    if deleteTar:
        os.remove(fullpath)
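
For reference, shutil.unpack_archive infers the archive format from the filename, so this helper handles .tar.gz, .tgz, and .zip files alike. A hedged usage sketch (paths are placeholders):

# unpack /tmp/mytask/data.tar.gz into /tmp/mytask, then delete the archive
untar('/tmp/mytask', 'data.tar.gz', deleteTar=True)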
Example #8
    def batch_act_sqm(
        self, observations: List[Dict[str, Message]], search_indices: List[int]
    ) -> List[Message]:
        """
        Search Query Generator batch act.

        :param observations:
            list of observations
        :param search_indices:
            list of batch indices for which search is required.

        :return batch_reply:
            return the batch reply from the search query agent
        """
        batch_reply_sqm = [{} for _ in range(len(observations))]
        if self.search_query_agent and search_indices:
            batch_replies_with_search = self.search_query_agent.batch_act(
                [
                    o
                    for i, o in enumerate(
                        [o['search_query_agent'] for o in observations]
                    )
                    if i in search_indices
                ]
            )
            for i, reply in zip(search_indices, batch_replies_with_search):
                batch_reply_sqm[i] = reply
            search_queries = [o.get('text', '') for o in batch_reply_sqm]
            if self.inject_query_string:
                for i in range(len(search_queries)):
                    if search_queries[i]:
                        search_queries[i] = ' '.join(
                            [search_queries[i], self.inject_query_string]
                        )
            logging.debug(f"Search Queries: {search_queries}")
            self.knowledge_agent.model_api.set_search_queries(search_queries)
        else:
            try:
                self.knowledge_agent.model_api.set_search_queries([])
            except AttributeError:
                # Gold Documents, most likely
                pass

        return batch_reply_sqm
Example #9
    def retrieve_and_score(
            self, query: torch.LongTensor
    ) -> Tuple[List[List[Document]], torch.Tensor]:
        """
        Retrieve and score.

        Encode the query, score all memories against it, and return the top
        memories as Documents with their scores.

        :param query:
            query tokens

        :return (docs, scores):
            docs: list of Documents for each batch example
            scores: doc scores
        """
        query_enc = self.query_encoder(query)
        scores = self.score_memories(query_enc)

        top_docs, top_doc_scores = [], []
        for i in range(query.size(0)):
            scores_i = scores[i]
            memories_i, scores_i = argsort_scores_and_docs(
                scores_i,
                self.memory_vec_dict[i],
                self.n_docs  # type: ignore
            )
            mem_docs = []
            for mem in memories_i:
                mem_doc = Document('', self._tokenizer.decode(mem),
                                   '')  # type: ignore
                mem_doc.TITLE_DELIM = self.opt['memory_doc_title_delimiter']
                mem_docs.append(mem_doc)

            if len(mem_docs) < self.n_docs:
                # add dummy docs
                num_blank = self.n_docs - len(mem_docs)
                mem_docs += [BLANK_DOC] * num_blank
                scores_i = torch.cat(
                    [scores_i, torch.zeros(num_blank).to(scores_i)])
            top_docs.append(mem_docs)
            top_doc_scores.append(scores_i)
            logging.debug(scores_i)

        return top_docs, torch.stack(top_doc_scores)
Example #10
def unzip(path, fname, deleteZip=True):
    """
    Unzip the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool deleteZip:
        If true, the archive will be deleted after extraction.
    """
    logging.debug(f'unzipping {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(fullpath, "r") as zip_ref:
        zip_ref.extractall(path)
    if deleteZip:
        os.remove(fullpath)
Example #11
def dump_data(opt):
    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)
    opt.log()
    if opt['outfile'] is None:
        outfile = tempfile.mkstemp(prefix='{}_{}_'.format(
            opt['task'], opt['datatype']),
                                   suffix='.txt')[1]
    else:
        outfile = opt['outfile']

    if opt['num_examples'] == -1:
        num_examples = world.num_examples()
    else:
        num_examples = opt['num_examples']
    log_timer = TimeLogger()

    logging.debug('starting to convert...')
    logging.info(f'saving output to {outfile}')
    fw = open(outfile, 'w')
    text = ''
    for _ in range(num_examples):
        world.parley()
        world.acts[0]['labels'] = world.acts[0].get(
            'labels', world.acts[0].pop('eval_labels', None))

        samp = world.acts[0]
        text += samp["text"].replace("\n", " ") + " "
        fw.write("__label__%s %s\n" %
                 (samp["labels"][0].replace(' ', '_'), text))
        if world.acts[0].get('episode_done', False):
            text = ''

        if log_timer.time() > opt['log_every_n_secs']:
            text, _log = log_timer.log(world.total_parleys,
                                       world.num_examples())
            logging.info(text)

        if world.epoch_done():
            logging.info('epoch done')
            break
    fw.close()
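
Each written line follows the fastText supervised-training convention, i.e. a __label__ prefix followed by the accumulated episode text. A made-up sample:

sample = {'labels': ['four stars'], 'text': 'great movie ! loved it'}
line = "__label__%s %s\n" % (sample['labels'][0].replace(' ', '_'), sample['text'])
# -> "__label__four_stars great movie ! loved it\n"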
Example #12
    def _place_modulelist(self, submodule: torch.nn.Module) -> None:
        if not isinstance(submodule, torch.nn.ModuleList):
            # not a ModuleList, leave it untouched
            return
        if getattr(submodule, 'model_parallel_exempt', False):
            return

        assert isinstance(submodule, torch.nn.ModuleList)  # for typechecker
        layers = submodule

        # mark this section as MP
        layers.is_model_parallel = True  # type: ignore

        # next, let's figure out how many parameters we can assign to each GPU,
        # but not make actual assignments yet. Assignments come later because we
        # want consecutive layers to be collocated
        keyfunc = self.__device_allocations.__getitem__
        layer_assignments = {k: 0 for k in self.devices}
        for layer_no, layer in enumerate(layers):
            if layer_no == 0:
                # hard code the first layer to be 0.
                mostfree = 'cuda:0'
            else:
                # otherwise dynamic allocation
                mostfree = min(self.devices, key=keyfunc)
            # 32 is a totally arbitrary, made up number that worked in practice
            # on the large models I tested on. I believe it should be roughly
            # batch size, but this was set empirically.
            self.__device_allocations[mostfree] += trainable_parameters(layer) * 32
            # mark a layer as going to the given element
            layer_assignments[mostfree] += 1

        devices = [d for i, d in enumerate(self.devices[:]) if layer_assignments[d] > 0]
        for layer_no, layer in enumerate(layers):
            layer_gpu = devices[0]
            assert layer_assignments[layer_gpu] > 0
            logging.debug(f"Model Parallel: Assigning {layer_no} to {layer_gpu}")
            layer._mp_gpu = layer_gpu
            layers[layer_no] = layer.to(layer_gpu)
            layer_assignments[layer_gpu] -= 1
            if layer_assignments[layer_gpu] == 0:
                devices.pop(0)
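
A simplified, standalone sketch of the same greedy heuristic: assign each layer to whichever device currently carries the smallest parameter load, weighted by the same empirical factor of 32. This only illustrates the allocation idea, not ParlAI's actual model-parallel helper:

import torch.nn as nn

def greedy_assign(layers, devices, weight=32):
    load = {d: 0 for d in devices}        # running parameter load per device
    assignments = []
    for i, layer in enumerate(layers):
        # pin the first layer to the first device, otherwise pick the least loaded
        dev = devices[0] if i == 0 else min(devices, key=load.get)
        n_params = sum(p.numel() for p in layer.parameters() if p.requires_grad)
        load[dev] += n_params * weight
        assignments.append(dev)
    return assignments

layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(6)])
print(greedy_assign(layers, ['cuda:0', 'cuda:1']))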
Example #13
    def setup_data(self, fold):
        domains = self.opt.get('domains', DOMAINS)
        chunks = self._load_data(fold, domains)
        domains_cnt = Counter()
        for _, row in chunks.iterrows():
            domains_cnt[row['domain']] += 1
            first = True
            utterances = row['utterances'][:]
            if (len(utterances) >= 3 and utterances[0]['speaker'] == 'USER'
                    and utterances[1]['speaker'] == 'ASSISTANT'
                    and utterances[2]['speaker'] == 'ASSISTANT'
                    and "help you?" in utterances[1]['text']):
                # skip this one
                utterances.pop(1)
            if self.opt['include_ontology']:
                yield {
                    'text': f"{ONTO_TOKEN} {row['ontology']}",
                    'label': ''
                }, True
                first = False
            while utterances:
                utt = utterances.pop(0)
                segtxt, slots = self._segments2text(utt.get('segments', []))
                if utt['speaker'] == 'USER':
                    yield {
                        'text': utt['text'],
                        'label': f'{CALL_TOKEN} {segtxt}',
                        'domain': row['domain'],
                        'slots': slots,
                        'type': 'apicall',
                    }, first
                    first = False
                elif utt['speaker'] == 'ASSISTANT':
                    yield {
                        'text': f'{RESP_TOKEN} {segtxt}',
                        'label': utt['text'],
                        'domain': row['domain'],
                        'slots': slots,
                        'type': 'apiresp',
                    }, first
                    first = False
        logging.debug(f"Fold {fold} domains: {domains_cnt}")
Example #14
    def _batch_generate(self, texts: List[str]) -> List[str]:
        """
        Batch generate items from an input list of texts.

        :param texts:
            list of texts

        :return generations:
            return agent generations for each input.
        """
        start = time.time()
        active_agents = self.agents[: len(texts)]
        for agent_i, t_i in zip(active_agents, texts):
            agent_i.observe(Message({'text': t_i, 'episode_done': True}))
        agent_replies = self.agents[0].batch_act([a.observation for a in active_agents])
        logging.debug(f'Generated: {time.time() - start:.2f}')
        for agent_i, reply_i in zip(active_agents, agent_replies):
            agent_i.self_observe(reply_i)
        self.generations = [r.get('text', 'dummy') for r in agent_replies]
        return self.generations
Example #15
def dump_data(opt):
    # create repeat label agent and assign it to the specified task
    agent = RepeatLabelAgent(opt)
    world = create_task(opt, agent)
    opt.log()
    ignorefields = opt.get('ignore_fields', '')
    if opt['outfile'] is None:
        outfile = tempfile.mkstemp(prefix='{}_{}_'.format(
            opt['task'], opt['datatype']),
                                   suffix='.txt')[1]
    else:
        outfile = opt['outfile']

    if opt['num_examples'] == -1:
        num_examples = world.num_examples()
    else:
        num_examples = opt['num_examples']
    log_timer = TimeLogger()

    logging.debug('starting to convert...')
    logging.info(f'saving output to {outfile}')
    fw = open(outfile, 'w')
    for _ in range(num_examples):
        world.parley()
        acts = world.get_acts()
        value = acts[0].get('labels', acts[0].pop('eval_labels', None))
        acts[0].force_set('labels', value)
        txt = msg_to_str(acts[0], ignore_fields=ignorefields)
        fw.write(txt + '\n')
        if acts[0].get('episode_done', False):
            fw.write('\n')

        if log_timer.time() > opt['log_every_n_secs']:
            text, _log = log_timer.log(world.total_parleys,
                                       world.num_examples())
            logging.info(text)

        if world.epoch_done():
            logging.info('epoch done')
            break
    fw.close()
Example #16
    def _load_metadata(self, datapath):
        """
        Load metadata.

        Metadata should be saved at <identifier>.metadata
        Metadata should be of the following format:
        {
            'date': <date collected>,
            'opt': <opt used to collect the data>,
            'speakers': <identity of speakers>,
            ...
            Other arguments.
        }
        """
        try:
            metadata = Metadata(datapath)
            return metadata
        except RuntimeError:
            logging.debug(
                'Metadata does not exist. Please double check your datapath.')
            return None
Example #17
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath,
                                                               'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
Example #18
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath,
                                                               'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        PathManager.rm(fullpath)
Example #19
    def write_memory(self, mem_dict: Dict[int, torch.LongTensor]):
        """
        Write vectors to memory.

        Assume that we clear the memory as well.

        :param mem_dict:
            mapping from memory slot to 2D-tokenized memories
        """
        self.active_memory_slots = list(mem_dict.keys())
        with torch.no_grad():
            slot_num_mems = [m.size(0) for m in mem_dict.values()]
            logging.debug(f'Writing {slot_num_mems} memories')
            mem_vecs = torch.cat(list(mem_dict.values()), dim=0)
            mem_encs = self.memory_encoder(mem_vecs)
            offset = 0
            for mem_slot, num_mems in zip(mem_dict.keys(), slot_num_mems):
                self.memory_vec_dict[mem_slot] = mem_vecs[  # type: ignore
                    offset:offset + num_mems]
                self.memory_enc_dict[mem_slot] = mem_encs[offset:offset +
                                                          num_mems]
                offset += num_mems
Example #20
    def generate_memories(
        self, input: torch.LongTensor, num_inputs: torch.LongTensor
    ) -> List[List[str]]:
        """
        Generate memories from input.

        Each input is split into the lines of conversational context.
        These are considered independently.

        We then assign a prefix ("your/partner's persona:") depending on
        whether the bot or its partner said the line.

        :param input:
            input to the memory decoder
        :param num_inputs:
            number of lines per batch item
        """
        assert self.agent_dict is not None
        memories = []
        offset = 0
        for idx, i in enumerate(input):
            if num_inputs[idx] == 0:
                continue
            context_lines_vec = i[offset : offset + num_inputs[idx]]
            offset += num_inputs[idx]
            context_lines = [
                self.agent_dict.vec2txt(self.clean_input(j)) for j in context_lines_vec
            ]
            raw_memories_i = list(reversed(self._batch_generate(context_lines)))
            logging.debug(f'raw memories: {raw_memories_i}')
            memories_i = self._extract_from_raw_memories(raw_memories_i)
            logging.debug(f'memories to write: {memories_i}')
            mem_string = '\n'.join(memories_i)
            logging.verbose(f'Writing memories: {mem_string}')
            memories.append(memories_i)

        self.memories_full_list = memories
        return memories
Example #21
    def search(self, query_vectors: np.array,
               top_docs: int) -> List[Tuple[List[int], List[np.array]]]:
        """
        Search FAISS index.

        :param query_vectors:
            query vectors into the index
        :param top_docs:
            number of docs to return

        :return top_docs:
            returns, for each query vector:
                a list of document ids (according to db),
                a list of reconstructed document vectors
        """
        query_vectors = self.get_search_vectors(query_vectors)
        logging.debug(f'query_vectors {query_vectors.shape}')
        _scores, indexes, vectors = self.index.search_and_reconstruct(
            query_vectors, top_docs)
        db_ids = [[self.index_id_to_db_id[i] for i in query_top_idxs]
                  for query_top_idxs in indexes]
        result = [(db_ids[i], vectors[i]) for i in range(len(db_ids))]
        return result
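
A minimal standalone FAISS sketch of the same search step, using an exact inner-product index over random vectors (dimensions and data are made up; the wrapper above additionally reconstructs document vectors and maps index ids back to database ids):

import faiss
import numpy as np

d, n_docs, top_docs = 64, 1000, 5
doc_vectors = np.random.rand(n_docs, d).astype('float32')
index = faiss.IndexFlatIP(d)        # exact inner-product (dot product) index
index.add(doc_vectors)              # index every document vector
query_vectors = np.random.rand(2, d).astype('float32')
scores, indexes = index.search(query_vectors, top_docs)
print(indexes)                      # top-5 document ids per query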
Example #22
def download(url, path, fname, redownload=False, num_retries=5):
    """
    Download file using `requests`.

    If ``redownload`` is set to false, the file will not be downloaded again if
    it is already present (default ``False``).
    """
    outfile = os.path.join(path, fname)
    download = not PathManager.exists(outfile) or redownload
    logging.info(f"Downloading {url} to {outfile}")
    retry = num_retries
    exp_backoff = [2**r for r in reversed(range(retry))]

    pbar = tqdm.tqdm(unit='B',
                     unit_scale=True,
                     desc='Downloading {}'.format(fname))

    while download and retry > 0:
        response = None

        with requests.Session() as session:
            try:
                response = session.get(url, stream=True, timeout=5)

                # negative reply could be 'none' or just missing
                CHUNK_SIZE = 32768
                total_size = int(response.headers.get('Content-Length', -1))
                # server returns remaining size if resuming, so adjust total
                pbar.total = total_size
                done = 0

                with PathManager.open(outfile, 'wb') as f:
                    for chunk in response.iter_content(CHUNK_SIZE):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                        if total_size > 0:
                            done += len(chunk)
                            if total_size < done:
                                # don't freak out if content-length was too small
                                total_size = done
                                pbar.total = total_size
                            pbar.update(len(chunk))
                    break
            except (
                    requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout,
            ):
                retry -= 1
                pbar.clear()
                if retry > 0:
                    pl = 'y' if retry == 1 else 'ies'
                    logging.debug(
                        f'Connection error, retrying. ({retry} retr{pl} left)')
                    time.sleep(exp_backoff[retry])
                else:
                    logging.error('Retried too many times, stopped retrying.')
            finally:
                if response:
                    response.close()
    if retry <= 0:
        raise RuntimeError(
            'Connection broken too many times. Stopped retrying.')

    if download and retry > 0:
        pbar.update(done - pbar.n)
        if done < total_size:
            raise RuntimeError(
                f'Received less data than specified in Content-Length header for '
                f'{url}. There may be a download problem.')

    pbar.close()
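
A hedged usage sketch (URL and paths are placeholders); note that the helper retries with exponential backoff and raises if fewer bytes arrive than the Content-Length header promised:

# fetch example.tar.gz into /tmp/mytask, skipping the download if the file exists
download('http://example.com/example.tar.gz', '/tmp/mytask', 'example.tar.gz',
         redownload=False, num_retries=5)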
Example #23
    def custom_evaluation(
        self,
        teacher_action: Message,
        labels: Optional[Tuple[str]],
        model_response: Message,
    ):
        if 'metrics' in model_response and 'type' in teacher_action:
            # keep copies of metrics across both api calls/responses
            prefix = teacher_action['type']
            keys = list(model_response['metrics'].keys())
            for k in keys:
                self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])

        if 'text' not in model_response or not labels or 'type' not in teacher_action:
            return

        domain = teacher_action['domain']

        if teacher_action['type'] == 'apicall':
            # also count slot accuracy
            text = model_response['text']
            slot_guesses = set(
                text.replace(
                    CALL_TOKEN + " ",
                    "").split(' ; '))  # prevent cheating via repeated guesses
            correct = 0
            for slot_guess in slot_guesses:
                if ' = ' not in slot_guess:
                    continue
                try:
                    slot, guess = slot_guess.split(' = ')
                except ValueError:
                    continue
                if teacher_action['slots'].get(slot) == guess:
                    self.metrics.add('slot_p', AverageMetric(1))
                    self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
                    correct += 1
                else:
                    self.metrics.add('slot_p', AverageMetric(0))
                    self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
                    logging.debug(
                        f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
                    )
            if teacher_action['slots']:
                self.metrics.add(
                    'slot_r',
                    AverageMetric(correct, len(teacher_action['slots'])))
                self.metrics.add(
                    f'{domain}_slot_r',
                    AverageMetric(correct, len(teacher_action['slots'])),
                )
                self.metrics.add(
                    'jga',
                    AverageMetric(correct == len(teacher_action['slots'])))

        elif teacher_action['type'] == 'apiresp':
            # keep track of statistics by domain
            f1_metric = F1Metric.compute(model_response['text'], labels)
            bleu_metric = BleuMetric.compute(model_response['text'], labels)
            self.metrics.add(f'{domain}_lex_f1', f1_metric)
            self.metrics.add(f'{domain}_lex_bleu', bleu_metric)

            delex_text = model_response['text']
            delex_label = labels[0]
            # compute delexicalized string metrics
            for slot, value in teacher_action['slots'].items():
                delex_text = delex_text.replace(value, slot)
                delex_label = delex_label.replace(value, slot)
            f1_metric = F1Metric.compute(delex_text, (delex_label, ))
            self.metrics.add('delex_f1', f1_metric)
            self.metrics.add(f'{domain}_delex_f1', f1_metric)
            bleu_metric = BleuMetric.compute(delex_text, [delex_label])
            self.metrics.add('delex_bleu', bleu_metric)
            self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
Example #24
    def access_long_term_memory(
        self,
        query_vec: torch.LongTensor,
        memory_indices: torch.LongTensor,
        memory_vec: Optional[torch.LongTensor],
        num_memories: torch.LongTensor,
        memory_decoder_vec: Optional[torch.LongTensor],
        generated_memories: List[List[str]],
    ) -> Tuple[Optional[List[List[Document]]], Optional[torch.Tensor]]:
        """
        Access long term memory.

        :param query_vec:
            retrieval vector for the long-term memory
        :param memory_indices:
            indices to access memory slots
        :param memory_vec:
            extracted memories from the observation
        :param num_memories:
            bsz-length tensor corresponding to number of memories per batch item
        :param memory_decoder_vec:
            input to the memory decoder
        :param generated_memories:
            memories generated by the memory decoder

        :return memories, memory_scores:
            return memories and memory scores, if there are memories retrieved
        """
        start = time.time()
        memories = None
        memory_scores = None
        memory_dict = {}
        indices = memory_indices.tolist()

        if memory_vec is not None:
            # Only look in memory_vec for batch elements with memories
            memory_ids = [m for m in indices if num_memories[m] > 0]
            memory_dict = {
                batch_id: memory_vec[batch_id, :num_memories[mem_id]]
                for batch_id, mem_id in enumerate(memory_ids)
            }
        if memory_decoder_vec is not None:
            for batch_id in indices:
                new_mems_i = generated_memories[batch_id]
                if not new_mems_i:
                    continue
                tokenized = [
                    self.long_term_memory.tokenize_query(m)
                    for m in generated_memories[batch_id]
                ]
                if batch_id in memory_dict:
                    tokenized += memory_dict[batch_id].tolist()
                new_mems_i, _ = padded_tensor(
                    tokenized,
                    pad_idx=self.dict[self.dict.null_token]  # type: ignore
                )
                memory_dict[batch_id] = new_mems_i.to(query_vec)
        if self.knowledge_access_method in [
                KnowledgeAccessMethod.ALL,
                KnowledgeAccessMethod.MEMORY_ONLY,
        ]:
            # Add dummy memories just in case we are retrieving from memories
            if memory_vec is not None:
                seqlen = memory_vec.size(-1)
            elif memory_decoder_vec is not None:
                seqlen = memory_decoder_vec.size(-1)
            else:
                seqlen = query_vec.size(-1)
            for batch_id in indices:
                if batch_id not in memory_dict:
                    memory_dict[batch_id] = torch.zeros(1,
                                                        seqlen).to(query_vec)
        if memory_dict:
            # first make sure all memories are padded to the same length.
            max_length = max([m.size(-1) for m in memory_dict.values()])
            for batch_id in memory_dict:
                vec = memory_dict[batch_id]
                if vec.size(-1) < max_length:
                    memory_dict[batch_id] = torch.cat(
                        [
                            vec,
                            torch.zeros(
                                (*vec.shape[:-1],
                                 max_length - vec.size(-1))).fill_(
                                     self.dict[self.dict.null_token]).to(vec),
                        ],
                        dim=1,
                    )
            self.long_term_memory.write_memory(memory_dict)  # type: ignore
            logging.debug(f'Write Memory Complete: {time.time() - start:.2f}')
        if self.long_term_memory.has_memory():
            memories, memory_scores = self.long_term_memory.retrieve(
                query_vec[memory_indices]  # type: ignore
            )
            logging.debug(
                f'Memory Retrieval Complete: {time.time() - start:.2f}')
            logging.debug(f'memories: {memories}')
            logging.verbose('Reading from Memory')

        return memories, memory_scores
Example #25
    def retrieve_and_concat(
        self,
        input: torch.LongTensor,
        input_lengths: torch.LongTensor,
        query_generator_vec: torch.LongTensor,
        query_vec: torch.LongTensor,
        input_turns_cnt: torch.LongTensor,
        memory_vec: torch.LongTensor,
        num_memories: torch.LongTensor,
        gold_doc_vec: torch.LongTensor,
        gold_doc_title_vec: torch.LongTensor,
        num_gold_docs: torch.LongTensor,
        memory_decoder_vec: torch.LongTensor,
        num_memory_decoder_vecs: torch.LongTensor,
        skip_search: torch.BoolTensor,
    ) -> Tuple[torch.LongTensor, List[List[Document]], torch.Tensor]:
        """
        Override RagModel.retrieve_and_concat to perform different retrieval, depending
        on the RetrieverType.
        """
        self.flush_previous_retriever_search_results()
        start = time.time()
        logging.debug(f'Begin encoder: {time.time() - start:.2f}')
        if input_turns_cnt is not None:
            if query_generator_vec is not None:
                query_generator_vec = query_generator_vec.repeat_interleave(
                    input_turns_cnt, dim=0)  # type: ignore
            if memory_vec is not None:
                memory_vec = memory_vec.repeat_interleave(
                    input_turns_cnt, dim=0)  # type: ignore
            if num_memories is not None:
                num_memories = num_memories.repeat_interleave(
                    input_turns_cnt, dim=0)  # type: ignore
            if memory_decoder_vec is not None:
                memory_decoder_vec = memory_decoder_vec.repeat_interleave(
                    input_turns_cnt, dim=0)  # type: ignore
            if num_memory_decoder_vecs is not None:
                num_memory_decoder_vecs = num_memory_decoder_vecs.repeat_interleave(
                    input_turns_cnt, dim=0)  # type: ignore
        n_input = (input_turns_cnt.sum().item()
                   if input_turns_cnt is not None else input.size(0))
        # 0a. Classify retrieval type, if necessary
        generated_memories = [[] for _ in range(int(n_input))]
        if memory_decoder_vec is not None:
            generated_memories = self.memory_decoder.generate_memories(
                memory_decoder_vec, num_memory_decoder_vecs)
        if self.should_generate_query:
            assert self.has_query_generator()
            retrieval_type, search_queries = self.query_generator.classify_retrieval(
                query_generator_vec, num_memories, generated_memories,
                skip_search)
            logging.debug(f'Classify Retrieval: {time.time() - start:.2f}')
        else:
            retrieval_type = torch.LongTensor(input.size(0))
            search_queries = None

        # 1. Retrieve
        top_docs: List[List[Document]] = [[] for _ in range(int(n_input))]
        doc_scores: List[List[torch.Tensor]] = [[]
                                                for _ in range(int(n_input))]

        # 1a. retrieve from faiss or search
        search_indices = self.get_retrieval_indices(retrieval_type,
                                                    RetrievalType.SEARCH)
        if search_indices.numel() > 0:
            search_docs, search_doc_scores = self.perform_search(
                search_queries, query_vec, search_indices)
            logging.debug(f'Search Complete: {time.time() - start:.2f}')
            logging.debug(f'search: {search_docs}')
            if gold_doc_vec is not None:
                logging.debug(f'num gold docs: {num_gold_docs}')
            self._fill_docs_and_scores(
                top_docs,
                doc_scores,
                search_indices,
                search_docs,
                search_doc_scores,
                gold_doc_vec,
                gold_doc_title_vec,
                num_gold_docs,
            )

        # 1b. memory search
        memory_indices = self.get_retrieval_indices(retrieval_type,
                                                    RetrievalType.MEMORY)
        if memory_indices.numel() > 0:
            memories, memory_scores = self.access_long_term_memory(
                query_vec,
                memory_indices,
                memory_vec,
                num_memories,
                memory_decoder_vec,
                generated_memories,
            )
            logging.debug(f'Memory Access Complete: {time.time() - start:.2f}')
            if memories is not None and memory_scores is not None:
                self._fill_docs_and_scores(top_docs, doc_scores,
                                           memory_indices, memories,
                                           memory_scores)

        # 1c. no search
        no_search_indices = self.get_retrieval_indices(retrieval_type,
                                                       RetrievalType.NONE)
        if no_search_indices.numel() > 0:
            dummy_docs, dummy_scores = self.dummy_retriever.retrieve(
                query_vec[no_search_indices]  # type: ignore
            )
            logging.debug('no search')
            self._fill_docs_and_scores(top_docs, doc_scores, no_search_indices,
                                       dummy_docs, dummy_scores)

        # 2. Expand the input
        if input_turns_cnt is not None:
            input = input.repeat_interleave(input_turns_cnt,
                                            dim=0)  # type: ignore
            input_lengths = input_lengths.repeat_interleave(
                input_turns_cnt, dim=0)  # type: ignore

        # Filtering empty doc_scores added due to dynamic batching (if used)
        doc_scores = [[s for s in ds if s is not None] for ds in doc_scores
                      if ds]
        top_doc_scores = torch.stack(
            [torch.cat([s_i for s_i in scores_i]) for scores_i in doc_scores])
        expanded_input = self.concat_docs_and_input(input, input_lengths,
                                                    top_docs,
                                                    top_doc_scores.size(1))
        return expanded_input, top_docs, top_doc_scores
Example #26
def download_multiprocess(urls,
                          path,
                          num_processes=32,
                          chunk_size=100,
                          dest_filenames=None,
                          error_path=None):
    """
    Download items in parallel (e.g. for an image + dialogue task).

    WARNING: may have issues with OS X.

    :param urls:
        Array of urls to download
    :param path:
        directory to save items in
    :param num_processes:
        number of processes to use
    :param chunk_size:
        chunk size to use
    :param dest_filenames:
        optional array of the same length as urls, giving filenames. Images will
        be saved as path + dest_filename.
    :param error_path:
        where to save error logs
    :return:
        array of tuples of (destination filename, http status code, error
        message if any). Note that upon failure, file may not actually be
        created.
    """

    pbar = tqdm.tqdm(total=len(urls), position=0)

    # Resume TODO: isfile() may take too long ?? Should I try in a .tmp file
    if dest_filenames:
        if len(dest_filenames) != len(urls):
            raise Exception(
                'If specified, destination filenames must equal url array in length.'
            )
    else:

        def _naming_fn(url, url_metadata=None):
            return hashlib.md5(url.encode('utf-8')).hexdigest()

        dest_filenames = [_naming_fn(url) for url in urls]

    items = zip(urls, dest_filenames)
    remaining_items = [
        it for it in items if not PathManager.exists(os.path.join(path, it[1]))
    ]
    logging.info(
        f'Of {len(urls)} items, {len(urls) - len(remaining_items)} already existed; only going to download {len(remaining_items)} items.'
    )
    pbar.update(len(urls) - len(remaining_items))

    pool_chunks = ((remaining_items[i:i + chunk_size], path,
                    _download_multiprocess_single)
                   for i in range(0, len(remaining_items), chunk_size))
    remaining_chunks_count = math.ceil(float(
        len(remaining_items) / chunk_size))
    logging.info(
        f'Going to download {remaining_chunks_count} chunks with {chunk_size} images per chunk using {num_processes} processes.'
    )

    pbar.desc = 'Downloading'
    all_results = []
    collected_errors = []

    with Pool(num_processes) as pool:
        for idx, chunk_result in enumerate(
                pool.imap_unordered(_download_multiprocess_map_chunk,
                                    pool_chunks, 2)):
            all_results.extend(chunk_result)
            for dest_file, http_status_code, error_msg in chunk_result:
                if http_status_code != 200:
                    # msg field available as third item in the tuple
                    # not using b/c error log file would blow up
                    collected_errors.append({
                        'dest_file': dest_file,
                        'status_code': http_status_code,
                        'error': error_msg,
                    })
                    logging.error(
                        f'Bad download - chunk: {idx}, dest_file: {dest_file}, http status code: {http_status_code}, error_msg: {error_msg}'
                    )
            pbar.update(len(chunk_result))
    pbar.close()

    if error_path:
        now = time.strftime("%Y%m%d-%H%M%S")
        error_filename = os.path.join(
            error_path, 'parlai_download_multiprocess_errors_%s.log' % now)

        with PathManager.open(os.path.join(error_filename), 'w') as error_file:
            error_file.write(json.dumps(collected_errors))
            logging.error(f'Summary of errors written to {error_filename}')

    logging.info(f'Of {len(remaining_items)} items attempted downloading, '
                 f'{len(collected_errors)} had errors.')

    logging.debug('Finished downloading chunks.')
    return all_results
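
A hedged usage sketch (URLs and destination directory are placeholders); each returned tuple carries the destination filename, the HTTP status code, and an error message if any:

urls = ['http://example.com/img1.jpg', 'http://example.com/img2.jpg']
results = download_multiprocess(urls, '/tmp/images', num_processes=4)
failures = [r for r in results if r[1] != 200]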
Example #27
    def retrieve_and_score(
        self, query: torch.LongTensor
    ) -> Tuple[List[List[Document]], torch.Tensor]:
        """
        Retrieve relevant documents for the query (the conversation context). This
        method conducts three main steps, which are flagged in the code below.

        Step 1: generate search queries for the conversation context batch. This step
        uses the query generator model (self.query_generator).

        Step 2: use the search client to retrieve documents. This step uses the
        retrieval API agent (self.search_client).

        Step 3: generate the list of Document objects from the retrieved content.
        Here, if the documents are too long, the code splits them and chooses a
        chunk based on the `doc_chunks_ranker` selected in the opt.
        """
        # step 1
        search_queries = self.generate_search_query(query)

        # step 2
        search_results_batch = self.search_client.retrieve(search_queries, self.n_docs)

        # step 3
        top_docs = []
        top_doc_scores = []
        max_n_docs: int = self.n_docs
        for sq, search_results in zip(search_queries, search_results_batch):
            if not search_results:
                search_results = self._empty_docs(self.n_docs)
            elif len(search_results) < self.n_docs:
                remain_docs = self.n_docs - len(search_results)
                search_results.extend(self._empty_docs(remain_docs))
            docs_i = []
            scors_i = []
            # Change this debug later
            logging.debug(f'URLS:\n{self._display_urls(search_results)}')
            for i, doc in enumerate(search_results):
                url = doc['url']
                title = doc['title']
                dcontent = doc['content']
                assert type(dcontent) in (
                    str,
                    list,
                ), f'Unrecognized retrieved doc: {dcontent}'
                full_text = (
                    dcontent if isinstance(dcontent, str) else '\n'.join(doc['content'])
                )
                doc_chunks = [
                    dc[0] for dc in self.pick_chunk(sq, title, full_text, url)
                ]
                for splt_id, splt_content in enumerate(doc_chunks):
                    docs_i.append(
                        Document(
                            docid=url, text=splt_content, title=f'{title}_{splt_id}'
                        )
                    )
                    scors_i.append(self.rank_score(i))
            max_n_docs = max(max_n_docs, len(docs_i))
            top_docs.append(docs_i)
            top_doc_scores.append(scors_i)
        # Pad with empty docs
        for i in range(len(top_docs)):
            n_empty = max_n_docs - len(top_docs[i])
            if n_empty:
                top_docs[i] = top_docs[i] + [BLANK_DOC] * n_empty
                top_doc_scores[i] = top_doc_scores[i] + [0] * n_empty
        self.top_docs = top_docs
        return top_docs, torch.Tensor(top_doc_scores).to(query.device)
Example #28
    def shutdown(self):
        logging.debug("Killing all the worker processes")
        for p in self._process_pool.processes:
            p.kill()
        super().shutdown()
Example #29
def build_dict(opt, skip_if_built=False):
    if isinstance(opt, ParlaiParser):
        logging.error('Should be passed opt not Parser')
        opt = opt.parse_args()
    if not opt.get('dict_file'):
        logging.error(
            'Tried to build dictionary but `--dict-file` is not set. Set '
            'this param so the dictionary can be saved.')
        return
    if skip_if_built and PathManager.exists(opt['dict_file']):
        # Dictionary already built, skip all loading or setup
        logging.debug("dictionary already built.")
        return None

    if opt.get('dict_class'):
        # Custom dictionary class
        dictionary = str2class(opt['dict_class'])(opt)
    else:
        # Default dictionary class
        dictionary = DictionaryAgent(opt)

    if PathManager.exists(
            opt['dict_file']) or (hasattr(dictionary, 'is_prebuilt')
                                  and dictionary.is_prebuilt()):
        # Dictionary already built, return loaded dictionary agent
        logging.debug("dictionary already built.")
        return dictionary

    if is_distributed():
        raise ValueError(
            'Dictionaries should be pre-built before distributed train.')

    ordered_opt = copy.deepcopy(opt)
    cnt = 0
    # we use train set to build dictionary

    ordered_opt['batchsize'] = 1
    # Set this to none so that image features are not calculated when Teacher is
    # instantiated while building the dict
    ordered_opt['image_mode'] = 'no_image_model'

    ordered_opt.log()

    datatypes = ['train:ordered:stream']
    if opt.get('dict_include_valid'):
        datatypes.append('valid:stream')
    if opt.get('dict_include_test'):
        datatypes.append('test:stream')
    cnt = 0
    for dt in datatypes:
        ordered_opt['datatype'] = dt
        world_dict = create_task(ordered_opt, dictionary)
        # pass examples to dictionary
        log_time = TimeLogger()
        total = world_dict.num_examples()
        if opt['dict_maxexs'] >= 0:
            total = min(total, opt['dict_maxexs'])

        log_every_n_secs = opt.get('log_every_n_secs', None)
        if log_every_n_secs:
            pbar = tqdm.tqdm(total=total,
                             desc='Building dictionary',
                             unit='ex',
                             unit_scale=True)
        else:
            pbar = None
        while not world_dict.epoch_done():
            cnt += 1
            if cnt > opt['dict_maxexs'] and opt['dict_maxexs'] >= 0:
                logging.info('Processed {} exs, moving on.'.format(
                    opt['dict_maxexs']))
                # don't wait too long...
                break
            world_dict.parley()
            if pbar:
                pbar.update(1)
        if pbar:
            pbar.close()

    dictionary.save(opt['dict_file'], sort=True)
    logging.info(f'dictionary built with {len(dictionary)} tokens '
                 f'in {log_time.total_time():.1f}s')
    return dictionary
Example #30
    def retrieve_and_score(
        self, query: torch.LongTensor
    ) -> Tuple[List[List[Document]], torch.Tensor]:
        """
        Override retrieve and score to filter out docs that contain the label string.

        Copy over the whole thing because we need to check before chunking.
        """
        # step 1
        search_queries = self.generate_search_query(query)  # type: ignore

        # step 2
        search_results_batch = self.search_client.retrieve(
            search_queries, self.n_docs
        )  # type: ignore

        # step 3
        top_docs = []
        top_doc_scores = []
        max_n_docs: int = self.n_docs  # type: ignore
        for batch_id, (sq, search_results) in enumerate(
            zip(search_queries, search_results_batch)
        ):
            if not search_results:
                search_results = self._empty_docs(self.n_docs)  # type: ignore
            elif len(search_results) < self.n_docs:  # type: ignore
                remain_docs = self.n_docs - len(search_results)  # type: ignore
                search_results.extend(self._empty_docs(remain_docs))  # type: ignore
            docs_i = []
            scors_i = []
            # Change this debug later
            logging.debug(
                f'URLS:\n{self._display_urls(search_results)}'
            )  # type: ignore
            label_text = self.dict.vec2txt(
                self.label_vec[batch_id, :-1]
            )  # type: ignore
            for i, doc in enumerate(search_results):
                url = doc['url']
                title = doc['title']
                dcontent = doc['content']
                assert type(dcontent) in (
                    str,
                    list,
                ), f'Unrecognized retrieved doc: {dcontent}'
                full_text = (
                    dcontent if isinstance(dcontent, str) else '\n'.join(doc['content'])
                )
                if label_text in full_text:
                    docs_i.append(BLANK_DOC)
                    scors_i.append(0)
                else:
                    doc_chunks = [
                        dc[0]
                        for dc in self.pick_chunk(
                            sq, title, full_text, url
                        )  # type: ignore
                    ]
                    for splt_id, splt_content in enumerate(doc_chunks):
                        docs_i.append(
                            Document(
                                docid=url, text=splt_content, title=f'{title}_{splt_id}'
                            )
                        )
                        scors_i.append(self.rank_score(i))  # type: ignore
            max_n_docs = max(max_n_docs, len(docs_i))
            top_docs.append(docs_i)
            top_doc_scores.append(scors_i)
        # Pad with empty docs
        for i in range(len(top_docs)):
            n_empty = max_n_docs - len(top_docs[i])
            if n_empty:
                top_docs[i] = top_docs[i] + [BLANK_DOC] * n_empty
                top_doc_scores[i] = top_doc_scores[i] + [0] * n_empty
        self.top_docs = top_docs
        self.search_queries = search_queries
        return top_docs, torch.Tensor(top_doc_scores).to(query.device)