Esempio n. 1
0
 def save(self, path=None):
     """
     Call ``self.rebuild()`` and then write two files next to the model.

     The options dictionary is dumped as JSON to ``<model_file>.opt`` and the
     model file itself is overwritten with a single newline. ``path`` is
     accepted for interface compatibility but is not consulted here.
     """
     self.rebuild()

     opt_file = self.opt['model_file'] + '.opt'
     with PathManager.open(opt_file, 'w') as opt_handle:
         json.dump(self.opt, opt_handle)

     model_file = self.opt['model_file']
     with PathManager.open(model_file, 'w') as model_handle:
         model_handle.write('\n')
Esempio n. 2
0
 def data_to_json(self, pd, file_name):
     """
     Write ``pd`` to ``<self.data_path>/<file_name>`` as indented JSON.

     ``pd`` must provide ``to_dict('records')``; the resulting records are
     serialized with a four-space indent.
     """
     records = pd.to_dict('records')
     target = os.path.join(self.data_path, file_name)
     with PathManager.open(target, 'w') as out:
         serialized = json.dumps(records, indent=4)
         out.write(serialized)
Esempio n. 3
0
 def _load_data_dump(self):
     """
     Parse the JSON file at ``self.data_path`` and return its 'adversarial' entry.
     """
     with PathManager.open(self.data_path, 'rb') as dump_file:
         contents = json.load(dump_file)
     adversarial = contents['adversarial']
     return adversarial
Esempio n. 4
0
    def _setup_data(self, base_datatype):
        """
        Parse the comma-separated file at ``self.datapath`` into ``self.data``.

        Consecutive lines are compared pairwise: when both share the same first
        field they belong to one episode and the earlier line becomes the
        context for the later line's label. Each such pair is stored in one of
        two per-episode lists depending on the parity of the later line's turn
        number, and ``self._select_dialogues_to_add`` decides what goes into
        ``self.data`` once the episode ends.

        :param base_datatype:
            only used to build the file name of the ``deepmoji`` embedding
            array (``opt['deepmoji'] + base_datatype + ".npy"``).

        :raises ImportError: if fastText predictions are requested but the
            ``fastText`` module cannot be imported.
        :raises ValueError: if a line has neither 8 nor 9 comma-separated fields.
        :raises AssertionError: if turn numbers within an episode do not
            increase by exactly one.
        """

        # Optional precomputed embeddings, indexed by line position below.
        if self.opt.get('deepmoji') is not None:
            self.embed = np.load(self.opt['deepmoji'] + base_datatype + ".npy")

        # The fastText model is only loaded when labels are to be prepended.
        if self.opt.get('fasttextloc') is not None and self.opt.get(
                'prepend', -1) > 0:
            try:
                import fastText
            except ImportError:
                raise ImportError("Please run 'pip install fasttext'.")
            ftpath = self.opt['fasttextloc']
            ftmodel = fastText.FastText.load_model(ftpath)

        with PathManager.open(self.datapath) as f:
            df = f.readlines()

        turn_idx = 1
        responder_text_dialogue = []
        experiencer_text_dialogue = []
        self.data = []
        # Walk over adjacent line pairs (previous line = context, current = label).
        # NOTE(review): df[0] is presumably a header row, which would make the
        # first comparison fall into the "new episode" branch — confirm.
        for i in range(1, len(df)):

            cparts = df[i - 1].strip().split(",")
            sparts = df[i].strip().split(",")

            # Field 0 identifies the episode; equal values mean same episode.
            if cparts[0] == sparts[0]:

                # Check that the turn number has incremented correctly
                turn_idx += 1
                assert (int(cparts[1]) + 1 == int(sparts[1])
                        and int(sparts[1]) == turn_idx)

                # Literal commas inside fields are stored as "_comma_".
                contextt = cparts[5].replace("_comma_", ",")
                label = sparts[5].replace("_comma_", ",")
                prompt = sparts[2]
                sit = sparts[3].replace("_comma_", ",")
                # A ninth field, when present, holds '|'-separated candidates.
                if len(sparts) == 9:
                    if sparts[8] != '':
                        inline_label_candidates = [
                            cand.replace("_comma_",
                                         ",").replace("_pipe_", "|")
                            for cand in sparts[8].split('|')
                        ]
                    else:
                        inline_label_candidates = []
                elif len(sparts) == 8:
                    inline_label_candidates = []
                else:
                    raise ValueError(
                        f'Line {i:d} has the wrong number of fields!')

                context_emb, cand_emb = None, None
                if self.opt.get('deepmoji') is not None:
                    # Embedding rows are offset by one relative to file lines:
                    # line i-1 (context) -> row i-2, line i (label) -> row i-1.
                    context_emb = self.embed[i - 2]
                    cand_emb = self.embed[i - 1]

                ft_ctx, ft_cand = None, None
                if (self.opt.get('fasttextloc') is not None
                        and self.opt.get('prepend', -1) > 0):
                    # Build space-separated strings from the last "_"-delimited
                    # piece of each predicted label; each new piece is put in
                    # front, so the predictions end up in reverse order.
                    ft_ctx = ""
                    gettop, _ = ftmodel.predict(contextt,
                                                k=self.opt['prepend'])
                    for f in gettop:
                        ft_ctx = f.split("_")[-1] + " " + ft_ctx
                    ft_cand = ""
                    gettop, _ = ftmodel.predict(label, k=self.opt['prepend'])
                    for f in gettop:
                        ft_cand = f.split("_")[-1] + " " + ft_cand

                # Check if either the text or label are marked as being political
                is_political = '<POLITICAL>' in cparts[
                    7] or '<POLITICAL>' in sparts[7]

                dialogue_parts = [
                    contextt,
                    label,
                    prompt,
                    sit,
                    context_emb,
                    cand_emb,
                    ft_ctx,
                    ft_cand,
                    inline_label_candidates,
                    is_political,
                ]

                # Parity of the label line's turn number picks the target list.
                if int(sparts[1]) % 2 == 0:
                    # experiencer is the "text" and responder is the "label"
                    experiencer_text_dialogue.append(dialogue_parts)
                else:
                    # responder is the "text" and experiencer is the "label"
                    responder_text_dialogue.append(dialogue_parts)

            else:

                # We've finished the previous episode, so add it to the data
                turn_idx = 1
                self.data += self._select_dialogues_to_add(
                    experiencer_text_dialogue, responder_text_dialogue)
                experiencer_text_dialogue = []
                responder_text_dialogue = []

        # Add in the final episode
        self.data += self._select_dialogues_to_add(experiencer_text_dialogue,
                                                   responder_text_dialogue)
Esempio n. 5
0
def download(url, path, fname, redownload=False, num_retries=5):
    """
    Download file using `requests`.

    If ``redownload`` is set to false, then will not download tar file again if it is
    present (default ``False``).

    :param url: address to fetch.
    :param path: directory the file is written into.
    :param fname: name of the target file inside ``path``.
    :param redownload: fetch even when the target file already exists.
    :param num_retries: number of attempts allowed before giving up.

    :raises RuntimeError: if a download was needed and every attempt ended in
        a connection error or read timeout, or if fewer bytes arrived than the
        Content-Length header announced.
    """
    outfile = os.path.join(path, fname)
    download = not PathManager.exists(outfile) or redownload
    logging.info(f"Downloading {url} to {outfile}")
    retry = num_retries
    # Indexed by the number of retries left, so waits grow as retries run out.
    exp_backoff = [2**r for r in reversed(range(retry))]

    pbar = tqdm.tqdm(unit='B',
                     unit_scale=True,
                     desc='Downloading {}'.format(fname))

    # The progress bar is closed on every exit path, including errors.
    try:
        while download and retry > 0:
            response = None

            with requests.Session() as session:
                try:
                    response = session.get(url, stream=True, timeout=5)

                    # negative reply could be 'none' or just missing
                    CHUNK_SIZE = 32768
                    total_size = int(
                        response.headers.get('Content-Length', -1))
                    # server returns remaining size if resuming, so adjust total
                    pbar.total = total_size
                    done = 0

                    with PathManager.open(outfile, 'wb') as f:
                        for chunk in response.iter_content(CHUNK_SIZE):
                            if chunk:  # filter out keep-alive new chunks
                                f.write(chunk)
                            if total_size > 0:
                                done += len(chunk)
                                if total_size < done:
                                    # don't freak out if content-length was too small
                                    total_size = done
                                    pbar.total = total_size
                                pbar.update(len(chunk))
                        break
                except (
                        requests.exceptions.ConnectionError,
                        requests.exceptions.ReadTimeout,
                ):
                    retry -= 1
                    pbar.clear()
                    if retry > 0:
                        pl = 'y' if retry == 1 else 'ies'
                        logging.debug(
                            f'Connection error, retrying. ({retry} retr{pl} left)'
                        )
                        time.sleep(exp_backoff[retry])
                    else:
                        logging.error(
                            'Retried too many times, stopped retrying.')
                finally:
                    # A Response is falsy for 4xx/5xx statuses, so compare
                    # against None to make sure error responses are closed too.
                    if response is not None:
                        response.close()

        # Only report a connection failure when a download was actually needed.
        if download and retry <= 0:
            raise RuntimeError(
                'Connection broken too many times. Stopped retrying.')

        if download:
            # Bring the bar in line with the bytes counted for the last attempt.
            pbar.update(done - pbar.n)
            if done < total_size:
                raise RuntimeError(
                    f'Received less data than specified in Content-Length header for '
                    f'{url}. There may be a download problem.')
    finally:
        pbar.close()
Esempio n. 6
0
    def _setup_data(self, opt):
        """
        Build the list of flattened examples derived from the original LIGHT data.

        Steps, in order: optionally mix in and shuffle new data, flatten every
        episode into single-example episodes, optionally pair each example with
        its counterfactual flip (never for the 'test' datatype), tag each
        example with its bucket (optionally appending the bucket to the text),
        and print the bucket distribution.

        :param opt: options; ``opt['datatype']`` is read here and ``opt`` is
            forwarded to the data-loading helpers.
        :return: the examples kept after the optional ``bucket_only`` filter.
        """
        base_datatype = opt['datatype'].split(':')[0]

        # Start from the original episodes, optionally extended and shuffled.
        episodes = OrigLightTeacher(opt).episodes
        if self.add_new_data:
            episodes = episodes + self._get_new_data(opt)
            self.fixed_random.shuffle(episodes)

        # One example per turn, each carrying its dialogue context.
        examples = []
        for episode in episodes:
            examples.extend(
                flatten(episode, -1, include_labels=True, delimiter='\n'))

        if self.add_counterfactual and base_datatype != 'test':
            swap_file = os.path.join(_path(opt), COUNTERFACTUALS)
            with PathManager.open(swap_file, 'rb') as f:
                self.swap_dct = json.load(f)

            with_flips = []
            for example in examples:
                flipped = self._flip_ex(example)
                # Flag which member of the pair is the swapped one.
                example['counterfactual'] = False
                flipped['counterfactual'] = True
                with_flips.extend((example, flipped))
            examples = with_flips

        bucket_counts = {}
        kept = []
        for example in examples:
            label_field = 'eval_labels' if 'labels' not in example else 'labels'
            bucket = self.get_bucket(example[label_field][0])
            bucket_counts[bucket] = bucket_counts.get(bucket, 0) + 1

            if self.add_conditional:
                if self.force_conditional is not None:
                    # Always show the forced bucket; the example's own bucket
                    # is still recorded in the 'bucket' field below.
                    suffix = self.force_conditional
                else:
                    suffix = '\n' + bucket
                example.force_set('text', example['text'] + suffix)

            example['bucket'] = bucket
            if self.bucket_only is None or self.bucket_only == bucket:
                kept.append(example)

        # Report how the labels spread over the buckets.
        print('Distribution of bins:')
        total = sum(bucket_counts.values())
        summary = sorted(
            f'{name}: {round((count / total) * 100, 2)}%'
            for name, count in bucket_counts.items())
        for line in summary:
            print(line)

        return kept
Esempio n. 7
0
    def setup_data(self, datafile: str):
        """
        Yield ``(Message, is_first_turn)`` pairs built from a conversations file.

        Conversations whose rating is not listed in ``opt['cmu_dog_rating']``
        are dropped. Every remaining conversation is visited twice: first
        taking the turns at even positions as labels, then the turns at odd
        positions.

        :param datafile: file name inside the ``conversations`` directory.
        """
        datapath = _datapath(self.opt)
        conversations_file = os.path.join(datapath,
                                          f"conversations/{datafile}")
        with PathManager.open(conversations_file) as f:
            data = json.load(f)
        with PathManager.open(os.path.join(datapath, "wiki_data.json")) as f:
            wiki_data = json.load(f)

        # Keep only conversations with an accepted rating.
        conversations = [
            conv for conv in data.values()
            if conv["rating"] in self.opt["cmu_dog_rating"]
        ]

        # Pass 0 labels the turns at even positions, pass 1 those at odd ones.
        for start_idx in range(2):
            for conv_data in conversations:
                dialog = _collapse_multi_msgs(
                    conv_data["history"],
                    self.opt['cmu_dog_multi_msg_delimiter'],
                )
                movie_article = wiki_data[str(conv_data["wikiDocumentIdx"])]

                # Sometimes only one participant has access to the article.
                if (self.opt["cmu_dog_only_with_knowledge"]
                        and dialog[start_idx]["uid"]
                        not in conv_data["whoSawDoc"]):
                    continue

                for idx in range(start_idx, len(dialog), 2):
                    label_turn = dialog[idx]
                    label = label_turn["text"].strip()

                    # The displayed section changes across the conversation.
                    doc_idx = str(label_turn["docIdx"])
                    gold_knowledge = _article_section_to_text(
                        movie_article[doc_idx],
                        self.opt['cmu_dog_fact_delimiter'],
                    )
                    if label_turn["uid"] in conv_data["whoSawDoc"]:
                        section = movie_article[doc_idx]
                    else:
                        section = None
                    section_text = _article_section_to_text(
                        section,
                        self.opt['cmu_dog_fact_delimiter'],
                        self.opt.get('cmu_dog_include_knowledge_keys').split(
                            ','),
                    )

                    is_first_turn = idx == start_idx
                    if not is_first_turn:
                        context = dialog[idx - 1]["text"].strip()
                    elif self.opt['cmu_dog_provide_movie_context']:
                        context = section_text
                    else:
                        # By default the conversation opens with silence.
                        context = SILENCE

                    message = Message({
                        'text': context,
                        'labels': [label],
                        'available_knowledge_raw': section,
                        'available_knowledge_text': section_text,
                        'title': movie_article['0']['movieName'],
                        'checked_sentence': gold_knowledge,
                    })
                    yield message, is_first_turn
Esempio n. 8
0
    def __init__(self, datapath: str = None):
        """
        Load a phrase list and register every phrase plus its affixed variants.

        When ``datapath`` ends in '.txt' it is read directly as the phrase
        file. Otherwise it is treated as the data directory (taken from the
        default parser options when ``None``) and the stock list is downloaded
        into it if it has not been built yet.
        """
        import parlai.core.build_data as build_data
        from parlai.core.dict import DictionaryAgent

        self.tokenize = DictionaryAgent.split_tokenize

        def _fetch_stock_list():
            """Make sure the stock list is built under self.datapath; return its path."""
            fname = 'OffensiveLanguage.txt'
            version = 'v1.0'
            dpath = os.path.join(self.datapath, 'OffensiveLanguage')
            if not build_data.built(dpath, version):
                logging.info(f'building data: {dpath}')
                if build_data.built(dpath):
                    # Files from an older version are present; drop them first.
                    build_data.remove_dir(dpath)
                build_data.make_dir(dpath)

                url = 'http://parl.ai/downloads/offensive_language/' + fname
                build_data.download(url, dpath, fname)

                build_data.mark_done(dpath, version)
            return os.path.join(dpath, fname)

        custom_file_given = datapath is not None and datapath.endswith('.txt')
        if custom_file_given:
            self.datafile = datapath
        else:
            if datapath is not None:
                self.datapath = datapath
            else:
                # No directory supplied: fall back to the parser's default.
                from parlai.core.params import ParlaiParser

                self.datapath = ParlaiParser(False, False).parse_args(
                    [])['datapath']
            self.datafile = _fetch_stock_list()

        # store a token trie: e.g.
        # {'2': {'girls': {'1': {'cup': {'__END__': True}}}}
        self.END = '__END__'
        self.max_len = 1
        self.offensive_trie = {}
        self.word_prefixes = [
            'de', 'de-', 'dis', 'dis-', 'ex', 'ex-',
            'mis', 'mis-', 'pre', 'pre-', 'non', 'non-',
            'semi', 'semi-', 'sub', 'sub-', 'un', 'un-',
        ]
        self.word_suffixes = [
            'a', 'able', 'as', 'dom', 'ed', 'er', 'ers',
            'ery', 'es', 'est', 'ful', 'fy', 'ies', 'ify',
            'in', 'ing', 'ish', 'less', 'ly', 's', 'y',
        ]
        self.allow_list = [
            'butter', 'buttery', 'spicy', 'spiced',
            'spices', 'spicier', 'spicing', 'twinkies',
        ]

        with PathManager.open(self.datafile, 'r') as f:
            for phrase in f.read().splitlines():
                # The phrase itself, then prefixed forms, then suffixed forms.
                variants = [phrase]
                variants.extend(
                    prefix + phrase for prefix in self.word_prefixes)
                variants.extend(
                    phrase + suffix for suffix in self.word_suffixes)
                for variant in variants:
                    if variant in self.allow_list:
                        continue
                    self.add_phrase(variant)
def main(opt):
    """
    Extract classifier training examples from a file of dialogue episodes.

    Every parley of every episode in ``opt['infile']`` updates a running
    history; depending on ``opt['mode']`` ('bot' or 'human') an example is
    built from that history and collected. All examples are finally written to
    ``opt['outfile']``, one JSON object per line.
    """
    examples = []

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            ctx = parley.context

            if ctx.startswith(INITIAL_PROMPT) or ctx.startswith(NEWTOPIC):
                # A prompt: the response opens a fresh history. Such
                # one-utterance histories are allowed to become examples.
                history = [parley.response]
            elif ctx.startswith(EXP_REQUEST) or ctx.startswith(RAT_REQUEST):
                # An explanation or rating was requested. Depending on the
                # mode and its filter flag, the example added just before an
                # explanation request is discarded.
                mode = opt['mode']
                if mode == 'human':
                    drop_previous = opt['filter_accusation']
                elif mode == 'bot':
                    drop_previous = opt['filter_mistake']
                else:
                    drop_previous = False
                if drop_previous and ctx.startswith(EXP_REQUEST) and examples:
                    examples.pop()

                # The history is reset and this parley yields no example.
                history = []
                continue
            else:
                # Extend the history; for a CONTINUE context only the part
                # after the last ':' is kept.
                if CONTINUE in ctx:
                    spoken = ctx[ctx.rindex(':') + 1:]
                else:
                    spoken = ctx
                history.append(spoken)
                history.append(parley.response)

            mode = opt['mode']
            if mode == 'bot' and len(history) >= 2:
                # The label is the most recent context (second to last entry).
                tail = 2
            elif mode == 'human':
                # The label is the most recent response (last entry).
                tail = 1
            else:
                tail = None

            if tail is not None:
                if len(history) == tail:
                    # Nothing precedes the label, so the context is null.
                    context = '__null__'
                else:
                    # last_speaker=1 is used in both modes.
                    context = add_person_tokens(history[:-tail],
                                                last_speaker=1)
                examples.append(
                    Parley(context=context, response=history[-tail]))

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for example in examples:
            outfile.write(json.dumps(example.to_dict()) + '\n')

    print(f"Extracted {len(examples)} examples out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
Esempio n. 10
0
def prepare_integration_tests(opt: Opt):
    """
    Collect model responses for a safety task and write human-evaluation files.

    Two files are written into ``opt["log_folder"]``: ``task_data.jsonl`` with
    one chat episode per line and ``annotation_indices.jsonl`` with, per
    episode, a one-element list holding ``2 * number_of_turn_pairs - 1``.
    """
    logging.info("[ Preparing data for adversarial human evaluation ... ]")

    _check_log_dir(opt["log_folder"])
    wrapper_name = opt["wrapper"]
    wrapper = load_wrapper_module(wrapper_name)()
    safety_setting = opt["safety_setting"]

    if safety_setting != "adversarial":
        task_name = "bot_adversarial_dialogue:HumanNonadvSafetyEvaluation"
        task_specific_opt = {"datatype": "valid"}
    else:
        task_name = "bot_adversarial_dialogue:HumanSafetyEvaluation"
        task_specific_opt = {
            "flatten_dialogue": True,
            "datatype": "valid",
            "bad_include_persona": False,
        }

    # Run the model over the task, keeping the teacher acts.
    teacher_acts, model_responses = collect_task_data_and_model_response(
        wrapper,
        opt=opt,
        task_name=task_name,
        task_specific_opt=task_specific_opt,
        save_teacher_acts=True,
    )

    # Reshape each dialogue into alternating human/bot turn pairs.
    chat_logs = []
    for inp_act, resp in zip(teacher_acts, model_responses):
        utterances = inp_act["text"].split("\n")
        if len(utterances) % 2 == 0:
            # An even count means one extra leading line: fold it into the
            # line that follows it.
            first_inp, utterances = utterances[0], utterances[1:]
            utterances[0] = "\n".join([utterances[0], first_inp])
        utterances.append(resp)

        # The list now has even length, so it splits cleanly into pairs.
        episode = [
            [
                {
                    'text': human_text,
                    'episode_done': False,
                    'id': 'human'
                },
                {
                    'text': bot_text,
                    'episode_done': False,
                    'id': 'bot'
                },
            ]
            for human_text, bot_text in zip(utterances[0::2],
                                            utterances[1::2])
        ]
        # The bot turn of the final pair closes the episode.
        episode[-1][1]['episode_done'] = True

        if "human_eval_turn_range" in inp_act:
            bounds = [
                int(part)
                for part in inp_act["human_eval_turn_range"].split("|")
            ]
            episode = episode[bounds[0]:bounds[1] + 1]

        chat_logs.append(episode)

    task_data_path = os.path.join(opt["log_folder"], "task_data.jsonl")
    indices_path = os.path.join(opt["log_folder"], "annotation_indices.jsonl")
    with PathManager.open(task_data_path, 'w') as task_file:
        for episode in chat_logs:
            task_file.write(json.dumps(episode) + '\n')
    with PathManager.open(indices_path, 'w') as indices_file:
        for episode in chat_logs:
            indices_file.write(f'[{len(episode) * 2 -1}]' + '\n')

    _next_steps(safety_setting, task_data_path, indices_path)
Esempio n. 11
0
 def get_data_from_file(self, filepath):
     """
     Parse ``filepath`` as JSON lines and return the decoded objects as a list.
     """
     with PathManager.open(filepath) as f:
         return [json.loads(line) for line in f]
Esempio n. 12
0
    def __init__(self, opt: Opt, shared=None):
        """
        Set up classes, threshold, model, criterion and optimizer.

        With ``shared`` given, the class information, the model and (when
        present) the optimizer are taken from it; otherwise they are built
        from ``opt``.

        :param opt: options; this method reads ``classes``,
            ``classes_from_file``, ``class_weights``, ``ref_class``,
            ``threshold`` and ``update_classifier_head_only``.
        :param shared: optional dictionary providing ``class_list``,
            ``class_dict``, ``class_weights``, ``model`` and optionally
            ``optimizer``.

        :raises RuntimeError: if neither ``classes`` nor ``classes_from_file``
            is set.
        :raises AttributeError: if ``build_model()`` or ``build_criterion()``
            returns ``None``.
        """
        # Resolved before the parent constructor runs.
        init_model, self.is_finetune = self._get_init_model(opt, shared)
        super().__init__(opt, shared)

        # set up classes
        if opt.get('classes') is None and opt.get('classes_from_file') is None:
            raise RuntimeError(
                'Must specify --classes or --classes-from-file argument.')
        if not shared:
            # NOTE(review): the check above uses .get() but this line indexes
            # directly, so a missing 'classes_from_file' key raises KeyError
            # even when 'classes' is set — confirm the key is always present.
            if opt['classes_from_file'] is not None:
                # One class name per line.
                with PathManager.open(opt['classes_from_file']) as f:
                    self.class_list = f.read().splitlines()
            else:
                self.class_list = opt['classes']
            # Maps each class name to its position in class_list at this point.
            self.class_dict = {val: i for i, val in enumerate(self.class_list)}
            if opt.get('class_weights', None) is not None:
                self.class_weights = opt['class_weights']
            else:
                # Default: every class weighted equally.
                self.class_weights = [1.0 for c in self.class_list]
            self.reset_metrics()
        else:
            self.class_list = shared['class_list']
            self.class_dict = shared['class_dict']
            self.class_weights = shared['class_weights']

        # in binary classification, opt['threshold'] applies to ref class
        if opt['ref_class'] is None or opt['ref_class'] not in self.class_dict:
            self.ref_class = self.class_list[0]
        else:
            self.ref_class = opt['ref_class']
            ref_class_id = self.class_list.index(self.ref_class)
            if ref_class_id != 0:
                # move to the front of the class list
                # NOTE(review): self.class_dict was built before this in-place
                # reorder and is not rebuilt in this method — confirm whether
                # its indices are meant to follow the new order.
                self.class_list.insert(0, self.class_list.pop(ref_class_id))

        # set up threshold, only used in binary classification
        if len(self.class_list) == 2 and opt.get('threshold', 0.5) != 0.5:
            self.threshold = opt['threshold']
        else:
            # A threshold of exactly 0.5 is also stored as None.
            self.threshold = None

        # set up model and optimizers
        states = {}
        if shared:
            self.model = shared['model']
        else:
            self.model = self.build_model()
            # freeze the encoder and update the classifier only
            if opt.get("update_classifier_head_only", False):
                for _param_name, _param_value in self.model.named_parameters():
                    if not _param_name.startswith('additional_linear_layer'):
                        _param_value.requires_grad = False

            self.criterion = self.build_criterion()
            if self.model is None or self.criterion is None:
                raise AttributeError(
                    'build_model() and build_criterion() need to return the model or criterion'
                )
            if init_model:
                logging.info(
                    f'Loading existing model parameters from {init_model}')
                # The loaded states are passed to build_lr_scheduler below.
                states = self.load(init_model)
            if self.use_cuda:
                if self.model_parallel:
                    ph = PipelineHelper()
                    ph.check_compatibility(self.opt)
                    self.model = ph.make_parallel(self.model)
                else:
                    self.model.cuda()
                if self.data_parallel:
                    self.model = torch.nn.DataParallel(self.model)
                self.criterion.cuda()

            train_params = trainable_parameters(self.model)
            total_params = total_parameters(self.model)
            logging.info(
                f"Total parameters: {total_params:,d} ({train_params:,d} trainable)"
            )

        if shared:
            # We don't use get here because hasattr is used on optimizer later.
            if 'optimizer' in shared:
                self.optimizer = shared['optimizer']
        elif self._should_initialize_optimizer():
            # Only parameters that still require gradients are optimized.
            optim_params = [
                p for p in self.model.parameters() if p.requires_grad
            ]
            self.init_optim(optim_params)
            self.build_lr_scheduler(states, hard_reset=self.is_finetune)
Esempio n. 13
0
    def _save_outputs(self, opt, world, logger, episode_metrics):
        """
        Log the final report and write the requested report and world logs.

        :param opt: options; ``report_filename``, ``world_logs`` and
            ``save_format`` are read here.
        :param world: world whose ``report()`` supplies the metrics and which
            is handed to ``logger.write``.
        :param logger: object whose ``write`` method stores the world logs.
        :param episode_metrics: iterable of ``(goal, metrics)`` pairs, where
            ``metrics`` provides ``report()``.

        :return: the report dictionary, including a ``tod_metrics`` list with
            one entry per episode.
        """
        if is_distributed():  # flatten everything intelligently if need be
            world_report = aggregate_unnamed_reports(
                all_gather_list(world.report()))
            # One list per rank; merge them into a single flat list.
            episode_metrics = [
                elem for rank_elem in all_gather_list(episode_metrics)
                for elem in rank_elem
            ]
        else:
            world_report = world.report()
        logging.report("Final report:\n" + nice_report(world_report))

        report = dict_report(world_report)

        def get_episode_report(goal, episode_metric):
            metrics_dict = dict_report(episode_metric.report())
            metrics_dict["goal"] = goal
            return metrics_dict

        report["tod_metrics"] = [
            get_episode_report(g, e) for g, e in episode_metrics
        ]

        if "report_filename" in opt and opt["report_filename"] is not None:
            if len(world_report) == 0:
                # The file is still written below (it holds opt and
                # tod_metrics), so the warning must not claim otherwise.
                logging.warning("Report is empty; saving it anyway")

            report_fname = f"{opt['report_filename']}.json"
            # Save report
            if not is_distributed() or is_primary_worker():
                with PathManager.open(report_fname, "w") as f:
                    logging.info(f"Saving model report to {report_fname}")
                    json.dump({"opt": opt, "report": report}, f, indent=4)
                    f.write("\n")  # for jq

        if "world_logs" in opt and opt["world_logs"] is not None:
            if is_distributed():  # Save separately, then aggregate together
                rank = get_rank()
                log_outfile_part = (
                    f"{opt['world_logs']}_{opt['save_format']}_{rank}.jsonl")
                logger.write(log_outfile_part,
                             world,
                             file_format=opt["save_format"])
                # Wait until every worker has written its part.
                sync_object(None)
                if is_primary_worker():
                    log_outfile = f"{opt['world_logs']}_{opt['save_format']}.jsonl"
                    log_outfile_metadata = (
                        f"{opt['world_logs']}_{opt['save_format']}.metadata")
                    with open(log_outfile, "w+") as outfile:
                        # Distinct loop names keep this worker's own rank and
                        # part path from being overwritten.
                        for part_rank in range(num_workers()):
                            part_path = (
                                f"{opt['world_logs']}_{opt['save_format']}_{part_rank}.jsonl"
                            )
                            with open(part_path) as infile:
                                for line in infile:
                                    json_blob = json.loads(line.strip())
                                    if (
                                            len(json_blob["dialog"]) < 2
                                    ):  # skip when we don't have generation
                                        continue
                                    json_blob[
                                        "metadata_path"] = log_outfile_metadata
                                    outfile.write(json.dumps(json_blob))
                                    outfile.write("\n")
                            part_metadata = f"{opt['world_logs']}_{opt['save_format']}_{part_rank}.metadata"
                            if part_rank == 0:
                                # Only the first part's metadata is kept.
                                copyfile(part_metadata, log_outfile_metadata)
                            os.remove(part_path)
                            os.remove(part_metadata)
            else:
                log_outfile = f"{opt['world_logs']}_{opt['save_format']}.jsonl"
                logger.write(log_outfile,
                             world,
                             file_format=opt["save_format"])

        return report
Esempio n. 14
0
def _download_with_retry(download_fn, label, *args):
    """
    Call ``download_fn(*args)``, retrying exactly once on FileNotFoundError.

    :param download_fn: callable that performs the download and processing
    :param label: text naming the download in the retry message
    :param args: positional arguments forwarded to ``download_fn``
    :return: whatever ``download_fn`` returns
    :raises FileNotFoundError: if the second attempt fails as well
    """
    try:
        return download_fn(*args)
    except FileNotFoundError:
        sleep(60)
        print("retrying %s once" % (label))
        return download_fn(*args)


def _load_processed_qa(names, output_dir):
    """
    Build the per-name output paths and load any previously saved QA lists.

    :param names: iterable of subreddit names
    :param output_dir: directory containing the ``processed_data`` folder
    :return: ``(output_files, qa_dict)`` where ``output_files`` maps name to
        its ``*_qalist.json`` path and ``qa_dict`` maps name to a dict of the
        entries already stored in that file (empty when the file is absent)
    """
    output_files = {
        name: "%s/processed_data/%s_qalist.json" % (output_dir, name)
        for name in names
    }
    qa_dict = {name: {} for name in names}
    for name, fname in output_files.items():
        if isfile(fname):
            print("loading already processed documents from", fname)
            with open(fname) as f:
                qa_dict[name] = dict(json.load(f))
            print("loaded already processed documents")
    return output_files, qa_dict


def _merge_comments(qa_for_name, comments):
    """
    Attach each comment to its parent entry, keeping comments sorted by score.

    Comments whose parent id is not a key of ``qa_for_name`` are ignored.

    :param qa_for_name: dict of id -> entry dict, updated in place
    :param comments: iterable of comment dicts with 'parent_id' and 'score'
    :return: number of comments that were attached
    """
    merged = 0
    for dct in comments:
        # parent_id is split on '_' and the last piece is used as the key
        did = dct['parent_id'].split('_')[-1]
        if did in qa_for_name:
            merged += 1
            comments_list = qa_for_name[did].get('comments', []) + [dct]
            qa_for_name[did]['comments'] = sorted(
                comments_list, key=lambda x: x['score'], reverse=True
            )
    return merged


def _write_qa_lists(output_files, qa_dict):
    """
    Dump every ``qa_dict[name]`` as a list of (id, entry) pairs to its file.
    """
    for name, out_file_name in output_files.items():
        with open(out_file_name, "w") as fo:
            json.dump(list(qa_dict[name].items()), fo)


def main():
    """
    Download reddit submissions/comments and write per-subreddit QA lists.

    Without ``opt['id_list']`` the monthly dumps between the configured start
    and end dates are processed; with it, only the listed posts are fetched.
    ``opt['answers_only']`` skips submissions and ``opt['questions_only']``
    skips comments and the final post-processing pass. Results are written to
    ``<output_dir>/processed_data/<name>_qalist.json``.
    """
    opt = setup_args()
    output_dir = pjoin(opt['datapath'], opt['output_dir'])
    ### collect submissions and comments monthly URLs
    date_to_url_submissions = gather_dump_urls(REDDIT_URL, "submissions")
    date_to_url_comments = gather_dump_urls(REDDIT_URL, "comments")
    date_to_urls = {
        k: (v, date_to_url_comments.get(k, ''))
        for k, v in date_to_url_submissions.items()
    }
    ### download, filter, process, remove
    # The mkdir return code is not checked, so an existing directory is fine.
    subprocess.run(['mkdir', pjoin(output_dir, 'reddit_tmp')], stdout=subprocess.PIPE)
    st_time = time()
    if not opt['id_list']:
        subreddit_names = json.loads(opt['subreddit_list'])
        output_files, qa_dict = _load_processed_qa(subreddit_names, output_dir)
        # slice file save
        # get monthly reddit dumps
        for year in range(opt['start_year'], opt['end_year'] + 1):
            st_month = opt['start_month'] if year == opt['start_year'] else 1
            end_month = opt['end_month'] if year == opt['end_year'] else 12
            for month in range(st_month, end_month + 1):
                submissions_url, comments_url = date_to_urls[(year, month)]
                if not opt['answers_only']:
                    processed_submissions = _download_with_retry(
                        download_and_process,
                        submissions_url,
                        submissions_url,
                        'submissions',
                        subreddit_names,
                        st_time,
                        output_dir,
                    )
                    for name in subreddit_names:
                        for dct in processed_submissions[name]:
                            qa_dict[name][dct['id']] = dct
                if not opt['questions_only']:
                    processed_comments = _download_with_retry(
                        download_and_process,
                        comments_url,
                        comments_url,
                        'comments',
                        subreddit_names,
                        st_time,
                        output_dir,
                    )
                    # merge submissions and comments
                    for name in subreddit_names:
                        merged_comments = _merge_comments(
                            qa_dict[name], processed_comments[name]
                        )
                        print(
                            "----- added to global dictionary",
                            name,
                            year,
                            month,
                            time() - st_time,
                            merged_comments,
                            len(qa_dict[name]),
                        )
                # checkpoint after every month so progress survives a crash
                _write_qa_lists(output_files, qa_dict)

    # get specific reddit posts
    if opt['id_list']:
        with PathManager.open(opt['id_list']) as f:
            post_ids = json.load(f)
        # Defined up front so the write loops below are no-ops (rather than a
        # NameError) when both answers_only and questions_only are set.
        output_files, qa_dict = {}, {}
        if not opt['answers_only']:
            # The retry label is a fixed description: there is no per-month
            # URL in this branch, and the previous message referenced an
            # unbound variable.
            sr_names, processed_submissions = _download_with_retry(
                download_and_process_posts,
                "submissions for the listed post ids",
                post_ids,
                st_time,
            )
            output_files, qa_dict = _load_processed_qa(sr_names, output_dir)
            for name in sr_names:
                for dct in processed_submissions[name]:
                    qa_dict[name][dct['id']] = dct
            if not opt['questions_only']:
                # The comment stage below reloads qa_dict from disk; persist
                # the new submissions first so they are not discarded and the
                # comments can be attached to them.
                _write_qa_lists(output_files, qa_dict)

        if not opt['questions_only']:
            sr_names, processed_comments = _download_with_retry(
                download_and_process_comments,
                "comments for the listed post ids",
                post_ids,
                st_time,
            )
            output_files, qa_dict = _load_processed_qa(sr_names, output_dir)
            # merge submissions and comments
            for name in sr_names:
                merged_comments = _merge_comments(
                    qa_dict[name], processed_comments[name]
                )
                print(
                    "----- added to global dictionary",
                    name,
                    time() - st_time,
                    merged_comments,
                    len(qa_dict[name]),
                )

        _write_qa_lists(output_files, qa_dict)

    if not opt['questions_only']:
        # Final pass: keep only post-processed entries that still have
        # comments and whose url mentions the subreddit name.
        for name, out_file_name in output_files.items():
            qa_dct_list = [
                (k, post_process(rdct, name))
                for k, rdct in qa_dict[name].items()
                if 'comments' in rdct
            ]
            qa_dct_list = [
                x
                for x in qa_dct_list
                if len(x[1]['comments']) > 0 and name in x[1]['url'].lower()
            ]
            with open(out_file_name, "w") as fo:
                json.dump(qa_dct_list, fo)
Esempio n. 15
0
def make_parlai_format(outpath, dtype, data):
    """
    Write one data split to ``<outpath>/<dtype>.txt``.

    Every element of ``data`` is converted with ``_handle_data_point`` and the
    resulting text is written to the file in iteration order.

    :param outpath: directory receiving the output file
    :param dtype: split name, used as the file stem
    :param data: iterable of data points
    """
    print('building parlai:' + dtype)
    destination = os.path.join(outpath, dtype + '.txt')
    with PathManager.open(destination, 'w') as out_handle:
        for example in data:
            rendered = _handle_data_point(example)
            out_handle.write(rendered)
Esempio n. 16
0
 def _setup_data(self, fold):
     """
     Load the JSON-lines file for ``fold`` into ``self.data``.

     The file is ``<datapath>/dailydialog/<fold>.json``; each line is parsed
     as one JSON document and appended in file order.
     """
     self.data = []
     json_path = os.path.join(self.opt['datapath'], 'dailydialog', fold + '.json')
     with PathManager.open(json_path) as handle:
         self.data.extend(json.loads(raw_line) for raw_line in handle)
Esempio n. 17
0
    def set_fixed_candidates(self, shared):
        """
        Load a set of fixed candidates and their vectors (or vectorize them here).

        self.fixed_candidates will contain a [num_cands] list of strings
        self.fixed_candidate_vecs will contain a [num_cands, seq_len] LongTensor

        See the note on the --fixed-candidate-vecs flag for an explanation of the
        'reuse', 'replace', or path options.

        Note: TorchRankerAgent by default converts candidates to vectors by vectorizing
        in the common sense (i.e., replacing each token with its index in the
        dictionary). If a child model wants to additionally perform encoding, it can
        overwrite the vectorize_fixed_candidates() method to produce encoded vectors
        instead of just vectorized ones.

        :param shared:
            if truthy, a mapping holding 'fixed_candidates',
            'fixed_candidate_vecs', 'fixed_candidate_encs' and
            'num_fixed_candidates', which are copied onto this instance
            without touching the filesystem.
        """
        if shared:
            self.fixed_candidates = shared['fixed_candidates']
            self.fixed_candidate_vecs = shared['fixed_candidate_vecs']
            self.fixed_candidate_encs = shared['fixed_candidate_encs']
            self.num_fixed_candidates = shared['num_fixed_candidates']
            return

        self.num_fixed_candidates = 0
        opt = self.opt
        cand_path = self.fixed_candidates_path
        if 'fixed' not in (self.candidates, self.eval_candidates):
            # Fixed candidates are not used for either training or evaluation.
            self.fixed_candidates = None
            self.fixed_candidate_vecs = None
            self.fixed_candidate_encs = None
            return

        if not cand_path:
            # Attempt to get a standard candidate set for the given task
            path = self.get_task_candidates_path()
            if path:
                logging.info(f"setting fixed_candidates path to: {path}")
                self.fixed_candidates_path = path
                cand_path = self.fixed_candidates_path

        # Load candidates
        logging.info(f"Loading fixed candidate set from {cand_path}")
        with PathManager.open(cand_path, 'r', encoding='utf-8') as f:
            cands = [line.strip() for line in f.readlines()]

        def _cache_path(kind):
            # <model_dir>/<model_name>.<cands_name>.<kind>. Computed on demand
            # so the encodings branch works even when the vectors came from an
            # explicit path, where these names used to be unbound.
            model_dir, model_file = os.path.split(opt['model_file'])
            model_name = os.path.splitext(model_file)[0]
            cands_name = os.path.splitext(os.path.basename(cand_path))[0]
            return os.path.join(
                model_dir, '.'.join([model_name, cands_name, kind])
            )

        # Load or create candidate vectors
        setting = opt['fixed_candidate_vecs']
        if PathManager.exists(setting):
            # The flag is an existing path: load the vectors from it directly.
            vecs = self.load_candidates(setting)
        else:
            vecs_path = _cache_path('vecs')
            if setting == 'reuse' and PathManager.exists(vecs_path):
                vecs = self.load_candidates(vecs_path)
            else:  # setting == 'replace' OR generating for the first time
                vecs = self._make_candidate_vecs(cands)
                self._save_candidates(vecs, vecs_path)

        self.fixed_candidates = cands
        self.num_fixed_candidates = len(self.fixed_candidates)
        self.fixed_candidate_vecs = vecs
        if self.use_cuda:
            self.fixed_candidate_vecs = self.fixed_candidate_vecs.cuda()

        if not self.encode_candidate_vecs:
            self.fixed_candidate_encs = None
            return

        # candidate encodings are fixed so set them up now
        enc_path = _cache_path('encs')
        if setting == 'reuse' and PathManager.exists(enc_path):
            encs = self.load_candidates(enc_path, cand_type='encodings')
        else:
            encs = self._make_candidate_encs(self.fixed_candidate_vecs)
            self._save_candidates(encs, path=enc_path, cand_type='encodings')
        self.fixed_candidate_encs = encs
        if self.use_cuda:
            self.fixed_candidate_encs = self.fixed_candidate_encs.cuda()
        if self.fp16:
            self.fixed_candidate_encs = self.fixed_candidate_encs.half()
        else:
            self.fixed_candidate_encs = self.fixed_candidate_encs.float()
Esempio n. 18
0
def build(opt):
    """
    Build the 'empatheticdialoguesru' data by machine-translating the English CSVs.

    When the data at ``<datapath>/empatheticdialoguesru`` is not yet built at
    the current version, the English data is built via ``build_en_data``, the
    "Helsinki-NLP/opus-mt-en-ru" model is loaded, and the train/valid/test CSV
    files are rewritten with fields 3 and 5 (and the '|'-separated candidates
    in field 8, when present) replaced by translations. The data is then
    marked as done.

    :param opt: options mapping; reads 'datapath' and 'batch_size'.
    :raises ValueError: if a CSV row has neither 8 nor 9 comma-separated fields.
    """
    version = '1.1'
    dpath = os.path.join(opt['datapath'], 'empatheticdialoguesru')

    if not build_data.built(dpath, version_string=version):
        print(f'[building data: {dpath}]')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(os.path.join(dpath, 'empatheticdialoguesru'))

        build_en_data(opt)

        # Translation model; run on the first GPU when one is available.
        mname = "Helsinki-NLP/opus-mt-en-ru"
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
        else:
            device = torch.device('cpu')
        tokenizer = AutoTokenizer.from_pretrained(mname)
        model = MarianMTModel.from_pretrained(mname)
        model.to(device)

        for base_datatype in ['train', 'valid', 'test']:
            en_dfpath = os.path.join(
                opt['datapath'],
                'empatheticdialogues',
                'empatheticdialogues',
                base_datatype + '.csv',
            )
            with PathManager.open(en_dfpath) as f:
                df = f.readlines()

            def _translate_utterances(utterances):
                """
                Translate a list of strings, returning the decoded outputs in order.
                """
                dataset = _SimpleDataset(utterances)
                # NOTE(review): opt.get returns None when 'batch_size' is
                # absent -- confirm that is an acceptable DataLoader setting.
                dataloader = DataLoader(dataset,
                                        batch_size=opt.get('batch_size'),
                                        shuffle=False)

                outputs = []
                for batch in dataloader:
                    tokens = tokenizer(batch,
                                       return_tensors='pt',
                                       padding=True)['input_ids']
                    # Generated ids are moved back to the CPU before being kept.
                    outputs.append(
                        model.generate(tokens.to(device)).to(
                            torch.device('cpu')))

                # Flatten the per-batch outputs into one list of decoded strings.
                translated = [
                    tokenizer.decode(output[i], skip_special_tokens=True)
                    for output in outputs for i in range(output.shape[0])
                ]
                return translated

            def _translate_and_repack(utterances):
                """
                Restore real commas, translate, then re-escape commas as "_comma_".
                """
                input_utterances = [
                    utterance.replace("_comma_", ",")
                    for utterance in utterances
                ]
                translated = _translate_utterances(input_utterances)
                return [
                    utterance.replace(",", "_comma_")
                    for utterance in translated
                ]

            # str.replace rewrites every occurrence of 'empatheticdialogues'
            # in the path, i.e. both directory components built above.
            # NOTE(review): it would also rewrite an occurrence inside
            # opt['datapath'] itself.
            dfpath = en_dfpath.replace('empatheticdialogues',
                                       'empatheticdialoguesru')

            with PathManager.open(dfpath, mode='w') as f:
                # The first line of the input is copied through untranslated.
                f.write(df[0])
                turn_idx = 1
                # jobs: source text -> list of positions where its translation
                # goes; identical texts within an episode share one entry.
                jobs = {}
                # lines: the split rows of the episode being collected.
                lines = []
                # lines_with_cands: "line_idx:in_line_idx" -> list with one
                # slot per candidate, filled in by _translate_episode.
                lines_with_cands = {}
                for i in tqdm(range(1, len(df)),
                              f"Translating dataset: {base_datatype}"):
                    cparts = df[i - 1].strip().split(",")
                    sparts = df[i].strip().split(",")

                    # Collect turn's utterances
                    def _collect():
                        """
                        Record the current row and queue its fields for translation.
                        """
                        lines.append(sparts)
                        line_idx = len(lines) - 1
                        for in_line_idx in [3, 5]:
                            jobs.setdefault(sparts[in_line_idx], []).append({
                                'line_idx':
                                line_idx,
                                'in_line_idx':
                                in_line_idx
                            })

                        if len(sparts) == 9:
                            if sparts[8] != '':
                                in_line_idx = 8
                                # Field 8 holds '|'-separated candidates with
                                # literal pipes escaped as "_pipe_".
                                for cand_idx, cand in enumerate(
                                        sparts[8].split('|')):
                                    jobs.setdefault(
                                        cand.replace("_pipe_", "|"),
                                        []).append({
                                            'line_idx': line_idx,
                                            'in_line_idx': in_line_idx,
                                            'cand_idx': cand_idx
                                        })
                                    lines_with_cands.setdefault(
                                        f"{line_idx}:{in_line_idx}",
                                        []).append(None)

                        elif len(sparts) == 8:
                            pass
                        else:
                            raise ValueError(
                                f'Line {i:d} has the wrong number of fields!')

                    # Rows sharing field 0 with the previous row belong to the
                    # same episode.
                    if cparts[0] == sparts[0]:
                        # Check that the turn number has incremented correctly
                        turn_idx += 1
                        assert (int(cparts[1]) + 1 == int(sparts[1])
                                and int(sparts[1]) == turn_idx)

                        _collect()
                    else:
                        # We've finished the previous episode, so translate it
                        def _translate_episode():
                            """
                            Translate the queued texts, patch the rows, and write them out.
                            """
                            # Add indirection level to reduce memory use
                            inputs = []
                            positions = []
                            for key, value in jobs.items():
                                inputs.append(key)
                                positions.append(value)

                            # Nothing queued (e.g. before the first episode).
                            if len(inputs) == 0:
                                return
                            outputs = _translate_and_repack(inputs)

                            for out_idx, output in enumerate(outputs):
                                for position in positions[out_idx]:
                                    if 'cand_idx' not in position:
                                        lines[position['line_idx']][
                                            position['in_line_idx']] = output
                                    else:
                                        # Candidate slot: re-escape pipes so
                                        # the '|' join below stays unambiguous.
                                        lines_with_cands[
                                            f"{position['line_idx']}:{position['in_line_idx']}"][
                                                position[
                                                    'cand_idx']] = output.replace(
                                                        "|", "_pipe_")
                            for key, value in lines_with_cands.items():
                                line_idx, pos_idx = key.split(':')
                                line_idx = int(line_idx)
                                pos_idx = int(pos_idx)
                                # Assert we found every single output that was supposed to be here
                                assert all([val is not None for val in value])
                                lines[line_idx][pos_idx] = '|'.join(value)

                            for line in lines:
                                f.write(','.join(line) + '\n')

                        _translate_episode()

                        # Reset the per-episode state before collecting the
                        # first row of the new episode.
                        turn_idx = 1
                        jobs = {}
                        lines = []
                        lines_with_cands = {}
                        # First utterance of any episode requires special processing
                        _collect()
                # Translate the final episode
                # NOTE(review): _translate_episode is only defined once the
                # else branch above has run, so an input with no data rows
                # would raise NameError here -- confirm inputs are never empty.
                _translate_episode()

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
Esempio n. 19
0
def self_chat(opt):
    """
    Run ``opt['num_self_chats']`` self-chat episodes and save the logs.

    The first agent is created from ``opt``. The second is a clone of it
    unless ``opt['partner_model_file']`` is set, in which case it is created
    from that model file, optionally with overrides loaded from
    ``opt['partner_opt_file']``.

    :param opt: options mapping for the run
    :return: the logs collected by the world logger
    """
    random.seed(opt['seed'])
    partner = opt['partner_model_file']
    partner_opt_file = opt.get('partner_opt_file')

    # Create agents
    agent1 = create_agent(opt, requireModelExists=True)
    agent1.opt.log("Agent 1 Opt")
    if partner is not None:
        # Self chat with different models
        partner_opt = {}
        if partner_opt_file:
            print(f"WARNING: Loading override opts from: {partner_opt_file}")
            with PathManager.open(partner_opt_file) as override_file:
                partner_opt = json.load(override_file)
        partner_opt['interactive_mode'] = opt.get('interactive_mode', True)
        print(
            f"WARNING: Setting partner interactive mode to: {partner_opt['interactive_mode']}"
        )
        agent2 = create_agent_from_model_file(partner, partner_opt)
        agent2.opt.log("Agent 2 Opt")
    else:
        # Self chat with same model
        agent2 = agent1.clone()

    # Set IDs
    agent1.id = agent1.id + "_1"
    agent2.id = agent2.id + "_2"
    model_id = "_".join([agent1.id, agent2.id])

    world = create_task(opt, user_agents=[agent1, agent2])

    # Set up world logging
    logger = WorldLogger(opt)
    log_time = TimeLogger()

    # Run some self chats.
    for episode_no in range(1, opt['num_self_chats'] + 1):
        _run_self_chat_episode(opt, world, logger)
        report = world.report()
        text, report = log_time.log(episode_no, opt['num_self_chats'], report)
        logging.info(text)

    # Save chats
    outfile = opt['outfile']
    if outfile is None:
        outfile = '/tmp/{}_selfchat'.format(model_id)

    world_writes_conversations = opt['save_format'] == 'conversations' and hasattr(
        world, 'write'
    )
    if not world_writes_conversations:
        # use default logger write function
        logger.write(outfile, world, opt['save_format'])
    else:
        # use self chat specific world to write conversation
        # this might be useful for logging extra contextual
        # information (like personas)
        world.write(logger, outfile)

    return logger.get_logs()
Esempio n. 20
0
    def _setup_data(self, path):
        """
        Load a ParlAI-format text file into ``self.episodes``.

        Each line is parsed with ``str_to_msg``. Lines that parse to an empty
        message are skipped. Messages are filtered by the
        'bad_speaker_to_eval' and 'bad_safety_mix' options, optionally
        prefixed with the bot persona ('bad_include_persona'), and truncated
        to the last 'bad_num_turns' newline-separated turns when that option
        is positive. ``self.num_exs`` counts the kept messages.

        :param path: path of the data file to read
        :raises ValueError: if a message contains 'eval_labels' or lacks a
            'text' or 'labels' field
        """
        logging.info(f"Loading ParlAI text data: {path}")

        self.episodes = []
        self.num_exs = 0
        eps = []
        with PathManager.open(path, newline='\n', encoding='utf-8') as read:
            for line_no, line in enumerate(read, 1):
                msg = str_to_msg(line.rstrip('\n'))
                if not msg:
                    # Empty messages carry no example. Skip them here rather
                    # than failing on the msg[...] lookups further down.
                    continue
                if 'eval_labels' in msg:
                    raise ValueError(
                        f"It looks like you've written eval_labels as a key in your "
                        f"data file. This is not appropriate; labels will be converted "
                        f"for you automatically. This is happening on Line {line_no} "
                        f"in {path}. The line is:\n\t{line}")
                if 'text' not in msg:
                    raise ValueError(
                        f'ParlaiDialogTeacher requires a "text" field in every '
                        f'entry, but one is missing in Line {line_no} in {path}. '
                        f'The line is:\n\t{line}')
                if 'labels' not in msg:
                    raise ValueError(
                        f'ParlaiDialogTeacher requires a "labels" field in every '
                        f'entry, but one is missing in Line {line_no} in {path}. '
                        f'The line is:\n\t{line}')

                # Option-driven filters: 'all' disables the respective filter.
                if (self.opt['bad_speaker_to_eval'] != 'all'
                        and self.opt['bad_speaker_to_eval'] !=
                        msg['speaker_to_eval']):
                    continue
                if (self.opt['bad_safety_mix'] != 'all'
                        and SAFETY_DICT[self.opt['bad_safety_mix']] !=
                        msg['labels'][0]):
                    continue

                dialog = msg['text'].split('\n')
                if self.opt['bad_include_persona'] and msg[
                        'speaker_to_eval'] == 'bot':
                    # only display persona if it's asked to and if the last turn is bot.
                    if len(msg['bot_persona'].strip()) > 0:
                        dialog[0] = msg['bot_persona'] + '\n' + dialog[0]
                if self.opt['bad_num_turns'] > 0:
                    # Slice the list entries; a prepended persona stays part
                    # of dialog[0].
                    msg_text = '\n'.join(dialog[-self.opt['bad_num_turns']:])
                else:
                    msg_text = '\n'.join(dialog)

                msg.force_set('text', msg_text)
                self.num_exs += 1
                eps.append(msg)
                if msg.get('episode_done', False):
                    self.episodes.append(eps)
                    eps = []
        if len(eps) > 0:
            # add last episode
            eps[-1].force_set('episode_done', True)
            self.episodes.append(eps)
        # line_no is only read when at least one episode exists, i.e. the
        # loop above ran at least once.
        if len(self.episodes) == 1 and line_no > 100:
            logging.error(
                f'The data in {path} looks like one very long episode. If this '
                f'is intentional, you may ignore this, but you MAY have a bug in '
                f'your data.')
Esempio n. 21
0
 def _load_from_codecs(self):
     """
     Build ``self.bpe`` from the codecs file at ``self.codecs``.

     The file is opened as UTF-8 text and handed to ``apply_bpe.BPE``.
     """
     with PathManager.open(self.codecs, 'r', encoding='utf-8') as merges_handle:
         bpe_model = apply_bpe.BPE(merges_handle)
         self.bpe = bpe_model
Esempio n. 22
0
 def _setup_data(self, data_path, personalities_data_path):
     """
     Load two JSON files into ``self.data`` and ``self.personalities``.

     :param data_path: JSON file whose content becomes ``self.data``
     :param personalities_data_path: JSON file whose content becomes
         ``self.personalities``
     """
     print('loading: ' + data_path)
     sources = (
         ('data', data_path),
         ('personalities', personalities_data_path),
     )
     for attribute, json_path in sources:
         with PathManager.open(json_path) as handle:
             setattr(self, attribute, json.load(handle))
Esempio n. 23
0
    def __init__(self, opt):
        """
        Set up the agent, world, timers and bookkeeping for a training run.

        If ``<model_file>.checkpoint`` exists and 'load_from_checkpoint' is
        set, the model is initialised from it. Previously saved training
        statistics (``.trainstats`` or ``.checkpoint.trainstats``) are
        restored when present.

        :param opt: options mapping; note that 'init_model', 'dict_file' and
            'validation_metric_mode' may be written back into it.
        :raises RuntimeError: if neither 'dict_file' nor 'model_file' is given.
        """
        # if python is called from a non-interactive shell, like a bash script,
        # it will by-default ignore SIGINTs, and KeyboardInterrupt exceptions are
        # not produced. This line brings them back
        signal.signal(signal.SIGINT, signal.default_int_handler)
        # Possibly load from checkpoint
        trainstats_suffix = '.trainstats'  # we might load training statistics from here
        if (opt['load_from_checkpoint'] and opt.get('model_file')
                and PathManager.exists(opt['model_file'] + '.checkpoint')):
            opt['init_model'] = opt['model_file'] + '.checkpoint'
            trainstats_suffix = '.checkpoint.trainstats'
        # Possibly build a dictionary (not all models do this).
        if not (opt.get('dict_file') or opt.get('model_file')):
            raise RuntimeError(
                'WARNING: For train_model, please specify either a '
                'model_file or dict_file.')
        if 'dict_file' in opt:
            if opt['dict_file'] is None and opt.get('model_file'):
                opt['dict_file'] = opt['model_file'] + '.dict'
            logging.info("building dictionary first...")
            build_dict(opt, skip_if_built=True)

        # Create model and assign it to the specified task
        self.agent = create_agent(opt)
        self.agent.opt.log()
        self.world = create_task(opt, self.agent)
        # set up timers
        self.train_time = Timer()
        self.validate_time = Timer()
        self.log_time = Timer()
        self.save_time = Timer()

        def _positive_or_inf(key):
            # Non-positive limits mean "no limit".
            value = opt[key]
            return value if value > 0 else float('inf')

        self.parleys = 0
        self.max_num_epochs = _positive_or_inf('num_epochs')
        self.max_train_time = _positive_or_inf('max_train_time')
        self.log_every_n_secs = _positive_or_inf('log_every_n_secs')
        self.val_every_n_secs = _positive_or_inf('validation_every_n_secs')
        self.save_every_n_secs = _positive_or_inf('save_every_n_secs')
        self.val_every_n_epochs = _positive_or_inf('validation_every_n_epochs')

        # smart defaults for --validation-metric-mode
        if opt['validation_metric'] in {'loss', 'ppl', 'mean_rank'}:
            opt['validation_metric_mode'] = 'min'
        elif opt['validation_metric'] in {
                'accuracy', 'hits@1', 'hits@5', 'f1', 'bleu'
        }:
            opt['validation_metric_mode'] = 'max'
        if opt.get('validation_metric_mode') is None:
            opt['validation_metric_mode'] = 'max'

        self.last_valid_epoch = 0
        # +1 when a larger validation metric is better, -1 otherwise
        self.valid_optim = 1 if opt['validation_metric_mode'] == 'max' else -1
        self.train_reports = []
        self.valid_reports = []
        self.best_valid = None

        self.impatience = 0
        self.saved = False
        self.valid_worlds = None
        self.opt = opt

        # we may have been preempted, make sure we note that amount
        self._preempted_epochs = 0.0
        if opt.get('model_file') and PathManager.exists(opt['model_file'] +
                                                        trainstats_suffix):
            # looks like we were preempted. make sure we load up our total
            # training stats, etc
            with PathManager.open(opt['model_file'] + trainstats_suffix) as ts:
                obj = json.load(ts)
                self.parleys = obj.get('parleys', 0)
                self._preempted_epochs = obj.get('total_epochs', 0)
                self.train_time.total = obj.get('train_time', 0)
                self.impatience = obj.get('impatience', 0)
                self.valid_reports = obj.get('valid_reports', [])
                self.train_reports = obj.get('train_reports', [])
                if 'best_valid' in obj:
                    self.best_valid = obj['best_valid']
                else:
                    # old method: the best value lives in a separate file.
                    # model_file is known to be set in this branch.
                    best_valid_path = opt['model_file'] + '.best_valid'
                    if PathManager.exists(best_valid_path):
                        # the with block closes the file; no explicit close
                        with PathManager.open(best_valid_path, 'r') as f:
                            self.best_valid = float(f.readline())

        if opt['tensorboard_log'] and is_primary_worker():
            self.tb_logger = TensorboardLogger(opt)
Esempio n. 24
0
 def _setup_data(self, data_path):
     """
     Read every line of ``data_path`` into ``self.episodes``.

     Lines are stored exactly as returned by ``readlines()``.
     """
     print('loading: ' + data_path)
     with PathManager.open(data_path) as episode_file:
         raw_lines = episode_file.readlines()
         self.episodes = raw_lines
Esempio n. 25
0
def download_multiprocess(urls,
                          path,
                          num_processes=32,
                          chunk_size=100,
                          dest_filenames=None,
                          error_path=None):
    """
    Download items in parallel (e.g. for an image + dialogue task).

    Items whose destination file already exists under ``path`` are skipped, so
    an interrupted run can be resumed by calling this function again.

    WARNING: may have issues with OS X.

    :param urls:
        Array of urls to download
    :param path:
        directory to save items in
    :param num_processes:
        number of processes to use
    :param chunk_size:
        number of items handed to a worker process per chunk
    :param dest_filenames:
        optional array of same length as url with filenames.  Images will be
        saved as path + dest_filename. When not given, each file is named
        after the MD5 hex digest of its url.
    :param error_path:
        directory in which to write a JSON log of failed downloads; no log is
        written when this is empty or None
    :return:
        array of tuples of (destination filename, http status code, error
        message if any), one per item attempted in this call. Note that upon
        failure, file may not actually be created.
    :raises Exception:
        if dest_filenames is given but differs from urls in length
    """
    # Validate before creating the progress bar so that a bad call does not
    # leave an unclosed bar behind.
    if dest_filenames:
        if len(dest_filenames) != len(urls):
            raise Exception(
                'If specified, destination filenames must equal url array in length.'
            )
    else:
        dest_filenames = [
            hashlib.md5(url.encode('utf-8')).hexdigest() for url in urls
        ]

    # Resume support: anything already on disk is not downloaded again.
    # TODO: the per-file existence check may be slow for very large url lists.
    remaining_items = [
        item for item in zip(urls, dest_filenames)
        if not PathManager.exists(os.path.join(path, item[1]))
    ]
    num_existing = len(urls) - len(remaining_items)
    logging.info(
        f'Of {len(urls)} items, {num_existing} already existed; only going to download {len(remaining_items)} items.'
    )

    # Lazily built work units: (items of the chunk, target dir, per-item fn).
    pool_chunks = ((remaining_items[i:i + chunk_size], path,
                    _download_multiprocess_single)
                   for i in range(0, len(remaining_items), chunk_size))
    remaining_chunks_count = math.ceil(len(remaining_items) / chunk_size)
    logging.info(
        f'Going to download {remaining_chunks_count} chunks with {chunk_size} images per chunk using {num_processes} processes.'
    )

    all_results = []
    collected_errors = []

    pbar = tqdm.tqdm(total=len(urls), position=0)
    try:
        # Already-present files count as done.
        pbar.update(num_existing)
        pbar.desc = 'Downloading'

        # Nothing left to fetch: do not pay for spawning worker processes.
        if remaining_items:
            with Pool(num_processes) as pool:
                for idx, chunk_result in enumerate(
                        pool.imap_unordered(_download_multiprocess_map_chunk,
                                            pool_chunks, 2)):
                    all_results.extend(chunk_result)
                    for dest_file, http_status_code, error_msg in chunk_result:
                        if http_status_code != 200:
                            # Record the failure (including its message) for
                            # the optional error log, and report it right away.
                            collected_errors.append({
                                'dest_file': dest_file,
                                'status_code': http_status_code,
                                'error': error_msg,
                            })
                            logging.error(
                                f'Bad download - chunk: {idx}, dest_file: {dest_file}, http status code: {http_status_code}, error_msg: {error_msg}'
                            )
                    pbar.update(len(chunk_result))
    finally:
        # Always release the bar, even if the pool or a worker raised.
        pbar.close()

    if error_path:
        now = time.strftime("%Y%m%d-%H%M%S")
        error_filename = os.path.join(
            error_path, 'parlai_download_multiprocess_errors_%s.log' % now)

        with PathManager.open(error_filename, 'w') as error_file:
            error_file.write(json.dumps(collected_errors))
        logging.error(f'Summary of errors written to {error_filename}')

    logging.info(f'Of {len(remaining_items)} items attempted downloading, '
                 f'{len(collected_errors)} had errors.')

    logging.debug('Finished downloading chunks.')
    return all_results
Esempio n. 26
0
    def save_conversations(
        cls,
        act_list,
        datapath,
        opt,
        save_keys='all',
        context_ids='context',
        self_chat=False,
        **kwargs,
    ):
        """
        Dump an act list to disk, one JSON conversation per line, then save metadata.

        The act list is expected to be a list of episodes; every episode is a list
        of act pairs, and every act pair is the list of dictionaries produced by
        one parley. Empty episodes are skipped.

        :param act_list:
            list of episodes as described above
        :param datapath:
            location passed to ``cls._get_path`` to obtain the output file
        :param opt:
            options forwarded to ``Metadata.save_metadata``
        :param save_keys:
            ``'all'`` to keep every key of an act except ``'metrics'``, or a
            comma-separated list of keys to keep
        :param context_ids:
            comma-separated ids whose acts are stored under ``'context'`` instead
            of ``'dialog'``
        :param self_chat:
            forwarded to ``Metadata.save_metadata``
        :param kwargs:
            forwarded to ``Metadata.save_metadata``
        """
        to_save = cls._get_path(datapath)
        context_id_list = context_ids.split(',')

        # ids of non-context acts, in order of first appearance
        speakers = []
        with PathManager.open(to_save, 'w') as outfile:
            for episode in act_list:
                if not episode:
                    continue

                dialog_turns = []
                context_turns = []
                convo = {
                    'dialog': dialog_turns,
                    'context': context_turns,
                    'metadata_path': Metadata._get_path(to_save),
                }

                for act_pair in episode:
                    spoken = []
                    for act in act_pair:
                        act_id = act.get('id')
                        is_context = act_id in context_id_list
                        if not is_context and act_id not in speakers:
                            speakers.append(act_id)

                        # decide which keys of this act are written out
                        if save_keys == 'all':
                            wanted = [k for k in act.keys() if k != 'metrics']
                        else:
                            wanted = save_keys.split(',')

                        turn = {key: act.get(key, '') for key in wanted}
                        turn['id'] = act_id

                        if is_context:
                            context_turns.append(turn)
                        else:
                            spoken.append(turn)

                    # act pairs made only of context acts add nothing to the dialog
                    if spoken:
                        dialog_turns.append(spoken)

                outfile.write(json.dumps(convo) + '\n')
        logging.info(f'Conversations saved to file: {to_save}')

        # save metadata
        Metadata.save_metadata(to_save,
                               opt,
                               self_chat=self_chat,
                               speakers=speakers,
                               **kwargs)
Esempio n. 27
0
 def _load_data_dump(self):
     """
     Parse the JSON file at ``self.data_path`` and return its 'standard' entry.
     """
     with PathManager.open(self.data_path, 'rb') as dump_file:
         parsed = json.load(dump_file)
     standard_split = parsed['standard']
     return standard_split
Esempio n. 28
0
def create_supp(opt):
    """
    Build a supplemental dataset from the examples a model gets wrong.

    The model described by ``opt`` is run over the task one example at a time.
    Every miss (reported accuracy below 1.0) is converted into a supplemental
    example with probability ``opt['conversion_rate']``. A converted example
    receives the first eval label with probability ``opt['conversion_acc']``
    and otherwise a random pick among the first ``NUM_INLINE_CANDS - 1`` label
    candidates. The converted examples are written to ``opt['outfile']`` as one
    JSON object per line, after a summary has been printed.

    Summary ratios whose denominator is zero (nothing seen, no misses, or
    nothing converted) are printed as ``nan`` rather than raising, so the
    collected examples are always written out.

    :param opt:
        options; this function reads 'conversion_rate', 'conversion_acc',
        'model_file', 'task' and 'outfile', and passes ``opt`` on to
        ``create_agent`` and ``create_task``
    :return:
        None
    """

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN instead of crashing the run
        # right before the output file is written.
        return numerator / denominator if denominator else float('nan')

    # Create model and assign it to the specified task
    agent = create_agent(opt, requireModelExists=True)
    world = create_task(opt, agent)

    # Extract supp examples from misses on deploy set
    num_seen = 0
    num_misses = 0
    num_supp = 0
    num_supp_correct = 0
    examples = []
    while not world.epoch_done():
        world.parley()
        # Examples are considered one at a time
        num_seen += 1
        if num_seen % 1000 == 0:
            print(f"{num_seen}/{world.num_examples()}")
        report = world.report()
        if report['accuracy'] < 1.0:
            # Example is a miss (i.e., model got it wrong)
            num_misses += 1
            if random.random() < opt['conversion_rate']:
                # Example will be converted (e.g., bot recognized mistake and asked)
                num_supp += 1
                texts = world.acts[0]['text'].split('\n')
                # The last line is the current utterance, everything before it
                # is kept as memories.
                context = texts[-1]
                memories = texts[:-1]
                candidates = world.acts[0]['label_candidates']
                # Reward of 1 indicates positive, -1 indicates negative (for training)
                # For now, we only train with positives, and the reward field is unused
                reward = 1

                if random.random() < opt['conversion_acc']:
                    # Example will be converted correctly (e.g., good user response)
                    num_supp_correct += 1
                    response = world.acts[0]['eval_labels'][0]
                else:
                    # Example will be converted incorrectly (e.g., bad user response)
                    response = random.choice(
                        candidates[:NUM_INLINE_CANDS - 1])

                example = Parley(context, response, reward, candidates,
                                 memories)
                examples.append(example)
        # Metrics are reset so that the next report covers a single example.
        world.reset_metrics()

    print("EPOCH DONE")
    print(f"Model file: {opt['model_file']}")
    print(f"Deploy file: {opt['task']}")
    print(f"Supp file: {opt['outfile']}")
    print(f"Deploy size (# examples seen): {num_seen}")
    print(f"Supp size (# examples converted): {num_supp}")

    acc = 1 - _ratio(num_misses, num_seen)
    conversion_rate = _ratio(num_supp, num_misses)
    conversion_acc = _ratio(num_supp_correct, num_supp)
    print(f"Accuracy (% of deploy): {acc * 100:.1f}% ({num_misses} misses)")
    print(f"Conversion rate (% of misses): {conversion_rate * 100:.2f}% "
          f"({num_supp}/{num_misses})")
    print(
        f"Conversion acc (% of converted): {conversion_acc * 100:.2f}% "
        f"({num_supp_correct}/{num_supp})")

    with PathManager.open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')
Esempio n. 29
0
 def _setup_data(self):
     """
     Parse the JSON file at ``self.data_path`` and store the result in ``self.data``.
     """
     source = self.data_path
     print('loading: ' + source)
     with PathManager.open(source) as handle:
         parsed = json.load(handle)
     self.data = parsed
Esempio n. 30
0
    def setup_data(self, datafile):
        """
        Read one split of the dataset and yield its turns episode by episode.

        :param datafile:
            directory containing ``train.txt``, ``valid.txt`` and ``test.txt``;
            the file is picked from the prefix of ``self.datatype``.
        :return:
            generator of ``(Message, is_first_turn)`` tuples, where the flag is
            True for the first turn of each episode.
        """
        print('loading: ' + datafile)
        # Pick the split file from the datatype prefix; anything that is neither
        # train nor valid reads test.txt.
        if self.datatype.startswith('train'):
            path_to_open = os.path.join(datafile, 'train.txt')
        elif self.datatype.startswith('valid'):
            path_to_open = os.path.join(datafile, 'valid.txt')
        else:
            path_to_open = os.path.join(datafile, 'test.txt')

        # The split file holds one JSON object per line.
        with PathManager.open(path_to_open) as f:
            raw_data = [json.loads(line.strip()) for line in f]

        data = []
        # Placeholder only: rebound to a list for every dialog in the loop below.
        label_speaker_id_range = {}
        # Stays empty unless predicted summaries are requested.
        predicted_summary_dict = {}
        if self.use_predicted_summary:
            # Session-level unless the persona type mentions 'utt_'.
            is_session_level = not ('utt_' in self.previous_persona_type)
            predsum_path = get_predicted_summary_path(self.msc_dpath,
                                                      is_session_level)
            logger.warning(f"use the predicted summary from {predsum_path}")
            with PathManager.open(predsum_path) as jsonfile:
                predicted_summary_dict = json.load(jsonfile)

        # Format a time gap as "<num> <unit>", prefixed by time_token and a
        # space when a non-empty token is given.
        def _get_time_gap(time_num, time_unit, time_token=""):
            time_gap = str(time_num) + ' ' + time_unit
            return f'{time_token} {time_gap}' if len(
                time_token) > 0 else time_gap

        # Build the two persona strings and the (possibly padded) dialogs for
        # one labelled speaker. For any label_speaker_id other than 'self' or
        # 'their' both persona strings stay empty and nothing is padded.
        def _compile_persona_dialog_input(dialog, personas, previous_dialogs,
                                          label_speaker_id):
            # Work on copies so that the inserts below do not touch the
            # caller's data.
            new_dialog = copy.deepcopy(dialog)
            new_previous_dialogs = copy.deepcopy(previous_dialogs)
            your_persona = ""
            partner_persona = ""
            if label_speaker_id == 'self':
                # personas[1] is used as "your" persona, personas[0] as the
                # partner's.
                your_persona = '\n'.join(
                    [f'your persona: {x}' for x in personas[1]])
                partner_persona = '\n'.join(
                    [f"partner's persona: {x}" for x in personas[0]])
            elif label_speaker_id == 'their':
                # Roles are swapped relative to the 'self' case.
                your_persona = '\n'.join(
                    [f'your persona: {x}' for x in personas[0]])
                partner_persona = '\n'.join(
                    [f"partner's persona: {x}" for x in personas[1]])
                for prev_dialog in new_previous_dialogs:
                    # A dummy first turn shifts every turn by one position.
                    prev_dialog['dialog'].insert(0, {"text": DUMMY_TEXT})
                    # Without person tokens, an odd-length dialog also gets a
                    # trailing dummy turn.
                    if len(prev_dialog['dialog']) % 2 == 1 and (
                            self.history_person_tokens is None):
                        prev_dialog['dialog'].append({"text": DUMMY_TEXT})
                new_dialog.insert(0, {"text": DUMMY_TEXT})

            return your_persona, partner_persona, new_dialog, new_previous_dialogs

        for dialog_dict in raw_data:
            initial_data_id = dialog_dict['metadata']['initial_data_id']
            # 'both' produces one episode per speaker side.
            if self.label_speaker_id == 'both':
                label_speaker_id_range = ['their', 'self']
            else:
                label_speaker_id_range = [self.label_speaker_id]

            for label_speaker_id in label_speaker_id_range:
                # Choose the persona source ("complie" is the existing spelling
                # of this local name).
                if self.use_predicted_summary:
                    # Looked up by the string form of session_id - 1, then by
                    # initial_data_id.
                    personas_to_complie = predicted_summary_dict[str(
                        self.session_id - 1)][initial_data_id]
                elif self.previous_persona_type.startswith('init'):
                    personas_to_complie = dialog_dict['init_personas']
                else:
                    personas_to_complie = dialog_dict['personas']

                (
                    your_persona,
                    partner_persona,
                    new_dialog,
                    new_previous_dialogs,
                ) = _compile_persona_dialog_input(
                    dialog_dict['dialog'],
                    personas_to_complie,
                    dialog_dict['previous_dialogs'],
                    label_speaker_id,
                )
                previous_sessions_msgs = []
                if self.previous_persona_type == 'raw_history':
                    # Flatten every previous session into one newline-joined
                    # string.
                    for d_id in range(len(new_previous_dialogs)):
                        previous_dialog_msg = [
                            x['text']
                            for x in new_previous_dialogs[d_id]['dialog']
                        ]
                        if self.history_person_tokens:
                            # Prefix each turn with the token for its position
                            # parity and drop dummy turns.
                            previous_dialog_msg = [
                                self.history_person_tokens[i % 2] + ' ' + text
                                for i, text in enumerate(previous_dialog_msg)
                                if text != DUMMY_TEXT
                            ]
                        if self.history_time_gaps_token:
                            # The session's time gap is appended as a last line.
                            time_gap_i = _get_time_gap(
                                new_previous_dialogs[d_id]['time_num'],
                                new_previous_dialogs[d_id]['time_unit'],
                                time_token=self.history_time_gaps_token,
                            )
                            previous_sessions_msgs.append(
                                '\n'.join(previous_dialog_msg + [time_gap_i]))
                        else:
                            previous_sessions_msgs.append(
                                '\n'.join(previous_dialog_msg))

                if self.previous_session_delimiter is not None:
                    # Follow every session string with the delimiter.
                    previous_sessions_msgs = [
                        val for pair in zip(
                            previous_sessions_msgs,
                            [self.previous_session_delimiter] *
                            len(previous_sessions_msgs),
                        ) for val in pair
                    ]
                previous_sessions_msgs = '\n'.join(previous_sessions_msgs)

                episode = []
                # Walk the dialog in pairs: the turn at the even index is the
                # text and the turn right after it is the label.
                for i in range(0, len(new_dialog) - 1, 2):
                    text = new_dialog[i]['text']
                    # NOTE(review): neither of these two values depends on i.
                    partner_persona_one_line = partner_persona.replace(
                        '\n', '').split("partner's persona: ")
                    your_persona_one_line = your_persona.replace(
                        '\n', '').split("your persona: ")
                    action = {
                        'id':
                        self.id,
                        'text':
                        self.normalize_replies(text),
                        'labels':
                        [self.normalize_replies(new_dialog[i + 1]['text'])],
                        'session_id':
                        self.session_id,
                        'initial_data_id':
                        initial_data_id,
                        'personas':
                        f'{partner_persona}\n{your_persona}',
                        'personas_one_line':
                        f"partner's persona: {' '.join(partner_persona_one_line)}\nyour persona: {' '.join(your_persona_one_line)}",
                    }
                    if i == 0:
                        # Only the first turn carries the time gap of the last
                        # previous dialog.
                        # NOTE(review): the [-1] lookups assume
                        # 'previous_dialogs' is never empty -- confirm.
                        action.update({
                            'time_num':
                            dialog_dict['previous_dialogs'][-1]['time_num'],
                            'time_unit':
                            dialog_dict['previous_dialogs'][-1]['time_unit'],
                        })

                    episode.append(action)
                    # Keep only the first turn pair when session_openning is
                    # set.
                    if self.session_openning:
                        break

                # Context selection: substring checks for 'self', 'their' and
                # 'both' are tried in that order, then an exact match on
                # 'raw_history'.
                persona_context_str = ""
                if 'self' in self.previous_persona_type:
                    persona_context_str = your_persona
                elif 'their' in self.previous_persona_type:
                    persona_context_str = partner_persona
                elif 'both' in self.previous_persona_type:
                    # The first persona is followed by a newline only when it
                    # is non-empty.
                    if self.your_persona_first:
                        persona_context_str = (
                            (your_persona + '\n')
                            if len(your_persona) > 0 else "") + partner_persona
                    else:
                        persona_context_str = (
                            (partner_persona + '\n')
                            if len(partner_persona) > 0 else "") + your_persona
                elif self.previous_persona_type == 'raw_history':
                    persona_context_str = previous_sessions_msgs

                if self.include_last_time_gap:
                    # Append the last time gap, in square brackets, on its own
                    # line.
                    time_gap = _get_time_gap(
                        dialog_dict['previous_dialogs'][-1]['time_num'],
                        dialog_dict['previous_dialogs'][-1]['time_unit'],
                    )
                    persona_context_str = (
                        (persona_context_str + '\n') if
                        len(persona_context_str) > 0 else "") + f'[{time_gap}]'

                # Prepend the context to the text of the first turn.
                # NOTE(review): episode is empty when new_dialog has fewer than
                # two turns, in which case episode[0] raises IndexError --
                # confirm the data never contains such dialogs.
                if persona_context_str and len(persona_context_str) > 0:
                    episode[0]['text'] = persona_context_str + '\n' + episode[
                        0]['text']

                data.append(episode)

        for episode in data:
            start_idx = 0
            # The second tuple element is True only for the first turn of each
            # episode.
            for i, turn in enumerate(episode):
                yield Message(turn), i == start_idx