def main(config):
    """
    Creates input files for y_exp mturk task from conversation/rating mturk task.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with turn-by-turn
        quality ratings 1-5
    output: file of logs (in ParlaiDialog format) sliced up to begin at the start of
        an episode or following a new topic request, and ending with a y_exp
    """
    new_episodes = []
    old_episodes = list(extract_parlai_episodes(config['infile']))
    for episode in old_episodes:
        # Bind new_episode up front: previously it was only assigned inside the
        # prompt/topic-request branch, so a malformed log whose first parley was
        # not a prompt raised NameError on the unconditional append below.
        new_episode = []
        for parley in episode:
            if any(
                parley.context.startswith(x)
                for x in (NEW_TOPIC_REQUEST.lower(), INITIAL_PROMPT.lower())
            ):
                # A prompt or new-topic request starts a fresh slice
                new_episode = []
            new_episode.append(parley)
            if parley.context.startswith(SUGGESTION_REQUEST.lower()):
                # A suggestion request (y_exp) ends the slice; keep it
                new_episodes.append(new_episode)

    # Create parlai dialog file for easy viewing
    with open(config['outfile'], 'w') as f:
        for episode in new_episodes:
            num_parleys = len(episode)
            for i, parley in enumerate(episode):
                if i == num_parleys - 1:
                    # Mark the last parley so readers see the episode boundary
                    parley.episode_done = True
                f.write(f"{i}\t{parley.to_parlai()}\n")
    print(
        f"Extracted {len(new_episodes)} episodes out of {len(old_episodes)} "
        f"original episodes and wrote them to {config['outfile']}."
    )
# Example #2
# 0
def main(opt):
    """Extract y_exp (feedback) training examples from Mturk conversation logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with turn-by-turn
        quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
        positive/negative example
    """
    examples = []
    positives = opt['positives'].split(',')
    negatives = opt['negatives'].split(',')
    # A rating string may not be listed as both positive and negative
    assert not set(positives) & set(negatives)

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1

            ctx = parley.context
            # Update history (stock control-flow responses are excluded)
            if ctx.startswith(INITIAL_PROMPT):
                # Conversation prompt + first utterance: start a fresh history
                history = [parley.response]
            elif ctx.startswith(EXP_REQUEST):
                # Asked for y_exp, got y_exp: emit an example, then blast the
                # history since the bot messed up
                examples.append(
                    Parley(
                        context=add_person_tokens(history[:-2], last_speaker=1),
                        response=parley.response,  # y_exp
                    )
                )
                history = []
            elif ctx.startswith(NEWTOPIC):
                # New topic + first utterance: restart the history
                history = [parley.response]
            elif ctx.startswith(RAT_REQUEST):
                # One-word rating: nothing to record
                pass
            elif CONTINUE in ctx:
                # Neutral/positive feedback (negative ones get blasted via
                # EXP_REQUEST): keep extending the history
                history.append(ctx[ctx.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # Ordinary exchange: record both sides
                history.append(ctx)
                history.append(parley.response)

    with open(opt['outfile'], 'w') as outfile:
        outfile.writelines(json.dumps(ex.to_dict()) + '\n' for ex in examples)

    print(f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
def main(config):
    """
    Creates .stitched files from .suggested files.

    input: a .suggested file of logs (in ParlaiDialog format) from Mturk task 2, each of
        which starts with an initial prompt or topic request, and ends with a y
        that corresponds to the y_exp given in the previous turn
    output: a .stitched file (in self-feeding format) with the original mistake by the
        bot replace with the mturked y (based on y_exp)
    """
    examples = []
    episodes = list(extract_parlai_episodes(config['infile']))
    for episode in episodes:
        history = []
        last = len(episode) - 1
        for i, parley in enumerate(episode):
            if i == 0:
                # Topic request: keep only the human's opener
                history.append(parley.response)
            elif i == last - 2 or i == last - 1:
                # Skip the mistake/negative-feedback turn and the
                # suggestion-request/y_exp turn
                continue
            elif i == last:
                # Final turn: verbatim request and y -> emit the example
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.response,  # y
                    )
                )
            else:
                # Ordinary turn: extend the history with both sides
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
# Example #4
# 0
def main(config):
    """
    Creates .identity files from .sliced files.

    input: a .sliced file of logs (in ParlaiDialog format) from Mturk task 1, each of
        which starts with an initial prompt or topic request, and ends with a y_exp
    output: an .identity file (in self-feeding format) with y_exps used as though they
        were ys
    """
    examples = []
    episodes = list(extract_parlai_episodes(config['infile']))
    for episode in episodes:
        final = len(episode) - 1
        history = []
        for idx, parley in enumerate(episode):
            if idx == 0:
                # Topic request: only the human's reply enters the history
                history.append(parley.response)
            elif idx == final - 1:
                # Mistake + negative feedback: drop this turn
                pass
            elif idx == final:
                # Correction request + y_exp: use the y_exp as the label
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.response,  # y_exp
                    )
                )
            else:
                # Ordinary turn: record both utterances
                history.extend((parley.context, parley.response))

    # Write new episodes to self-feeding format
    with open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
# Example #5
# 0
def main(config):
    """
    Creates .unfiltered files from .sliced files.

    input: a .sliced file of logs (in ParlaiDialog format) from Mturk task 1, each of
        which starts with an initial prompt or topic request, and ends with a y_exp
    output: a .unfiltered file (in self-feeding format) with every utterance output by
        bot used as a label (i.e., act as though the bot was a human and we want to
        train in a normal supervised way).
    """
    examples = []
    episodes = list(extract_parlai_episodes(config['infile']))
    for episode in episodes:
        history = []
        last_idx = len(episode) - 1
        for j, parley in enumerate(episode):
            if j == 0:
                # Topic request: only the human's reply enters the history
                history.append(parley.response)
            elif j == last_idx:
                # Final turn is the correction request/explanation: skip it
                continue
            else:
                # Label every bot utterance as if it were a supervised target
                examples.append(
                    Parley(
                        context=add_person_tokens(history, last_speaker=1),
                        response=parley.context,  # What the bot said
                    )
                )
                history.append(parley.context)
                history.append(parley.response)

    # Write new episodes to self-feeding format
    with PathManager.open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} self-feeding episodes out of "
        f"{len(episodes)} parlai episodes and wrote them to {config['outfile']}."
    )
# Example #6
# 0
def main(opt):
    """Extracts supervised dialog examples from Mturk logs.

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with turn-by-turn
        quality ratings 1-5
    output: file of examples (self-feeding format); depending on opt['mode'],
        either the bot's utterances ('bot') or the partner's utterances ('human')
        are used as labels.

    NOTE(review): opt['filter_accusation'] / opt['filter_mistake'] drop the
    most-recently-emitted example when a y_exp request appears — presumably the
    turn where the human complained / the bot erred; confirm against the task flow.
    """
    examples = []

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1
            # Update history (not including stock control flow responses)
            if (parley.context.startswith(INITIAL_PROMPT) or
                    parley.context.startswith(NEWTOPIC)):
                # a prompt, first utterance
                # Begin history
                history = [parley.response]
                # NOTE: we now allow these one-utterance episodes to be examples
                # continue
            elif (parley.context.startswith(EXP_REQUEST) or
                  parley.context.startswith(RAT_REQUEST)):
                # If 'filter_accusation' is on and the last example added was a human,
                # toss the previous example, which is when the human expressed
                # dissatisfaction
                if (opt['mode'] == 'human' and
                    opt['filter_accusation'] and
                    parley.context.startswith(EXP_REQUEST) and
                        len(examples) > 0):
                    examples.pop()
                # If 'filter_mistake' is on and the last example in the queue was a bot,
                # toss it too, since that's when the bot messed up
                if (opt['mode'] == 'bot' and
                    opt['filter_mistake'] and
                    parley.context.startswith(EXP_REQUEST) and
                        len(examples) > 0):
                    examples.pop()

                # Asked for y_exp or rating, got it
                # Messed up, so blast history
                # (the `continue` also skips example emission for this parley)
                history = []
                continue
            elif CONTINUE in parley.context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                # (keep only the text after the last ':' in the control message)
                history.append(parley.context[parley.context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # normal turn: maintain the history
                history.append(parley.context)
                history.append(parley.response)

            # In 'bot' mode, label the bot's latest utterance (history[-2]);
            # with only two utterances there is no prior context, so '__null__'
            # is used (labeling history[0] in that special case).
            if opt['mode'] in ['bot'] and len(history) >= 2:
                if len(history) == 2:
                    example = Parley(
                        context='__null__',
                        response=history[0],
                    )
                else:
                    example = Parley(
                        context=add_person_tokens(history[:-2], last_speaker=1),
                        response=history[-2],  # What the bot said
                    )
                examples.append(example)

            # In 'human' mode, label the most recent utterance (history[-1]),
            # again falling back to '__null__' when there is no prior context.
            if opt['mode'] in ['human']:
                if len(history) == 1:
                    example = Parley(
                        context='__null__',
                        response=history[0],
                    )
                else:
                    example = Parley(
                        # this is not technically true:
                        # the last speaker was the bot (__p2__),
                        # not the human (__p1__), but in all our data, __p1__ is always
                        # the speaking partner of the learner
                        context=add_person_tokens(history[:-1], last_speaker=1),
                        response=history[-1],  # What the bot said
                    )
                examples.append(example)

    # One self-feeding-format example (JSON) per line
    with open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} examples out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
          f"histsz == {opt['history_size']}.")
def main(config):
    """Extracts training data for the negative response classifier (NRC) from Mturk logs

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with turn-by-turn
        quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
        positive/negative example
    """
    examples = []
    positives = config['positives'].split(',')
    negatives = config['negatives'].split(',')
    # The positive and negative rating sets must not overlap
    assert set(positives).isdisjoint(set(negatives))

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(config['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1

            ctx = parley.context
            # Update history (stock control-flow responses are excluded)
            if ctx.startswith(INITIAL_PROMPT.lower()):
                # Conversation prompt + first utterance: fresh history
                history = [parley.response]
            elif ctx.startswith(SUGGESTION_REQUEST.lower()):
                # y_exp exchange: leave the history untouched
                pass
            elif ctx.startswith(NEW_TOPIC_REQUEST.lower()):
                # New topic + first utterance: restart the history
                history = [parley.response]
            else:
                # Ordinary exchange: record both sides
                history.extend((ctx, parley.response))

            # Only create a new example if this parley's rating is relevant
            if parley.reward not in positives + negatives:
                continue

            # Concatenate history and add speaker tokens as necessary.
            # history_size is the total number of utterances to keep:
            # negative -> keep everything; 0 -> classify from '__null__'
            # (i.e., no context at all); positive -> keep the last N.
            # The response being classified should always be preceded by
            # __p1__ (the human), not __p2__ (the bot).
            hist_sz = config['history_size']
            if hist_sz < 0:
                utterances = history
            elif hist_sz == 0:
                utterances = ['__null__']
            else:
                utterances = history[-hist_sz:]

            context = add_person_tokens(utterances, last_speaker=1)
            label = 1 if parley.reward in positives else -1
            examples.append(Parley(context, label))

    with open(config['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
          f"({num_parleys} parleys) and wrote them to {config['outfile']} with "
          f"histsz == {config['history_size']}.")
# Example #8
# 0
def main(opt):
    """Extracts training data for the negative response classifier (NRC) from Mturk logs

    input: file of logs (in ParlaiDialog format) from Mturk task 1 with turn-by-turn
        quality ratings 1-5
    output: file of episodes (self-feeding format) w/ +1/-1 ratings indicating
        positive/negative example
    """
    examples = []
    positives = opt['positives'].split(',')
    negatives = opt['negatives'].split(',')
    # Positive and negative rating strings must be distinct sets
    assert len(set(positives) & set(negatives)) == 0

    num_episodes = 0
    num_parleys = 0
    for episode in extract_parlai_episodes(opt['infile']):
        num_episodes += 1
        history = []
        for parley in episode:
            num_parleys += 1

            # Update history (not including stock control flow responses)
            if parley.context.startswith(INITIAL_PROMPT):
                # Conversation prompt + first utterance: start the history fresh
                history = [parley.response]
            elif parley.context.startswith(EXP_REQUEST):
                # y_exp exchange means the bot messed up: blast the history
                history = []
            elif parley.context.startswith(NEWTOPIC):
                # New topic + first utterance: begin a new history
                history = [parley.response]
            elif parley.context.startswith(RAT_REQUEST):
                # One-word rating: maybe emit a classifier example.
                # history_size is the number of context utterances to keep:
                # negative -> all; 0 -> classify from '__null__' (no context);
                # positive -> the last N. The response being classified should
                # always be preceded by __p1__ (the human), not __p2__ (the bot).
                hist_sz = opt['history_size']
                if hist_sz < 0:
                    utterances = history
                elif hist_sz == 0:
                    utterances = ['__null__']
                else:
                    utterances = history[-hist_sz:]
                context = add_person_tokens(utterances, last_speaker=1)

                label = 0
                if parley.response in positives:
                    label = 1
                elif parley.response in negatives:
                    label = -1

                # Ratings outside both sets (label == 0) are skipped
                if label:
                    examples.append(Parley(context, label))

            elif CONTINUE in parley.context:
                # if response was negative, history will get blasted in EXP_REQUEST
                # if we're here, response was neutral/positive, so continue the history
                history.append(parley.context[parley.context.rindex(':') + 1:])
                history.append(parley.response)
            else:
                # Ordinary exchange: record both sides
                history.append(parley.context)
                history.append(parley.response)

    with open(opt['outfile'], 'w') as outfile:
        for ex in examples:
            outfile.write(json.dumps(ex.to_dict()) + '\n')

    print(
        f"Extracted {len(examples)} ratings out of {num_episodes} episodes "
        f"({num_parleys} parleys) and wrote them to {opt['outfile']} with "
        f"histsz == {opt['history_size']}."
    )