Code Example #1
File: dump_db_neg.py  Project: rashirungta/cocoa
def log_worker_id_to_json(db_path, batch_results):
    '''
    {chat_id: {'0': worker_id, '1': worker_id}}
    '''
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    code_to_wid_list = []
    for batch_result in batch_results:
        if batch_result.endswith('csv'):
            code_to_wid = read_results_csv(batch_result)
        else:  # fig8 JSON
            code_to_wid = read_results_json(batch_result)

        code_to_wid_list.append(code_to_wid)

    worker_ids = chat_to_worker_id(cursor, code_to_wid_list)

    output_dir = os.path.dirname(batch_results[0])
    # outfile_name = os.path.splitext(os.path.basename(batch_results[0]))[0] + '_worker_ids.json'
    outfile_name = 'worker_ids.json'
    outfile_path = os.path.join(output_dir, outfile_name)
    write_json(worker_ids, outfile_path)
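
Every example on this page calls a write_json helper (several also use read_json); these live in cocoa's utility code and are not shown in the excerpts. A minimal sketch of what they presumably do, written here as an assumption rather than the project's actual implementation:

import json

def read_json(path):
    # Assumed behavior: parse a JSON file into a Python object.
    with open(path) as fin:
        return json.load(fin)

def write_json(obj, path):
    # Assumed behavior: serialize a Python object to a JSON file.
    with open(path, 'w') as fout:
        json.dump(obj, fout)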
Code Example #2
File: visualize_data.py  Project: emilyahn/cocoa
def write_metadata(transcripts, outdir, responses=None):
    metadata = {'data': []}
    for chat in transcripts:
        if len(chat['events']) == 0:
            continue
        row = {}
        row['dialogue_id'] = chat['uuid']
        row['scenario_id'] = chat['scenario_uuid']
        scenario = get_scenario(chat)
        row['num_items'] = len(scenario.kbs[0].items)
        row['num_attrs'] = len(scenario.attributes)
        row['outcome'] = 'fail' if chat['outcome']['reward'] == 0 else 'success'
        row['agent0'] = AGENT_NAMES[chat['agents']['0']]
        row['agent1'] = AGENT_NAMES[chat['agents']['1']]
        if responses:
            dialogue_response = responses[chat['uuid']]
            question_scores = defaultdict(list)
            for agent_id, scores in dialogue_response.iteritems():
                for question in QUESTIONS:
                    question_scores[question].extend(scores[question])
            for question, scores in question_scores.iteritems():
                row[question] = np.mean(scores)
        metadata['data'].append(row)
    write_json(metadata, os.path.join(outdir, 'metadata.json'))
Code Example #3
    def dump_chats(cls, cursor, scenario_db, json_path, uids=None):
        """Dump chat transcripts to a JSON file.

        Args:
            scenario_db (ScenarioDB): retrieve Scenario by logged uuid.
            json_path (str): output path.
            uids (list): if provided, only log chats from these users.

        """
        if uids is None:
            cursor.execute('SELECT DISTINCT chat_id FROM event')
            ids = cursor.fetchall()
        else:
            ids = []
            # uids = [(x,) for x in uids]
            for uid in uids:
                # cursor.execute('SELECT chat_id FROM mturk_task WHERE name=?', uid)
                cursor.execute('SELECT chat_id FROM mturk_task WHERE name=?',
                               (uid, ))
                ids_ = cursor.fetchall()
                ids.extend(ids_)

        def is_single_agent(chat):
            agent_event = {0: 0, 1: 0}
            for event in chat.events:
                agent_event[event.agent] += 1
            return agent_event[0] == 0 or agent_event[1] == 0

        examples = []
        for chat_id in ids:
            ex = cls.get_chat_example(cursor, chat_id[0], scenario_db)
            if ex is None or is_single_agent(ex):
                continue
            examples.append(ex)

        write_json([ex.to_dict() for ex in examples], json_path)
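
The method above is a classmethod whose enclosing class is not shown in the excerpt. A hypothetical call site, assuming the class is cocoa's database reader (named DatabaseReader here) and reusing the sqlite connection and scenario-loading conventions from the other examples; db_path, schema, and scenarios_path are placeholders:

import sqlite3

conn = sqlite3.connect(db_path)                  # db_path: placeholder path to the chat database
scenario_db = ScenarioDB.from_dict(schema, read_json(scenarios_path))     # as in Code Example #7
DatabaseReader.dump_chats(conn.cursor(), scenario_db, 'transcripts.json')  # class name assumed
conn.close()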
Code Example #4
    kbs = [KB(scenario_attributes, items) for items in agent_items]
    scenarios = []
    for style in styles:
        scenario = Scenario(generate_uuid('S'), scenario_attributes, kbs, style, [alphas[attr] for attr in scenario_attributes])
        scenarios.append(scenario)
    return scenarios

# Generate scenarios
schema = Schema(args.schema_path, args.domain)
scenario_list = []
while len(scenario_list) < args.num_scenarios * args.num_styles:
    s_list = generate_scenario(schema)
    for s in s_list:
        if s is not None:
            scenario_list.append(s)

scenario_db = ScenarioDB(scenario_list)
write_json(scenario_db.to_dict(), args.scenarios_path)

# Output a sample of what we've generated
for i in range(min(100, len(scenario_db.scenarios_list))):
    print '---------------------------------------------------------------------------------------------'
    print '---------------------------------------------------------------------------------------------'
    scenario = scenario_db.scenarios_list[i]
    print "Scenario id: %s" % scenario.uuid
    print "Alphas: [%s]" % ", ".join(["%2.1f" % alpha for alpha in scenario.alphas])
    for agent in (0, 1):
        kb = scenario.kbs[agent]
        kb.dump()
Code Example #5
    question_scores = defaultdict(lambda: defaultdict(list))
    raw_chats = read_json(args.dialogue_transcripts)
    uuid_to_chat = {chat['uuid']: chat for chat in raw_chats}
    schema = Schema(args.schema_path)
    scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path))
    dialogue_ids = filter(raw_eval, uuid_to_chat)

    for eval_ in raw_eval:
        read_eval(eval_, question_scores, mask=dialogue_ids)

    if args.hist:
        hist(question_scores, args.outdir, partner=args.partner)

    if args.summary:
        summary = summarize(question_scores)
        write_json(summary, args.stats)

    if args.analyze:
        schema = Schema(args.schema_path)
        lexicon = Lexicon(schema,
                          False,
                          scenarios_json=args.scenarios_path,
                          stop_words=args.stop_words)
        preprocessor = Preprocessor(schema, lexicon, 'canonical', 'canonical',
                                    'canonical')
        analyze(question_scores, uuid_to_chat, preprocessor)

    # Visualize
    if args.html_output:
        visualize(args.viewer_mode, args.html_output, question_scores,
                  uuid_to_chat)
Code Example #6
File: split_dataset.py  Project: anushabala/cocoa-old
                    help='Transcripts paths',
                    nargs='*',
                    default=[])
parser.add_argument('--train-frac',
                    help='Fraction of training examples',
                    type=float,
                    default=0.6)
parser.add_argument('--test-frac',
                    help='Fraction of test examples',
                    type=float,
                    default=0.2)
parser.add_argument('--dev-frac',
                    help='Fraction of dev examples',
                    type=float,
                    default=0.2)
parser.add_argument('--output-path', help='Output path for splits')
args = parser.parse_args()

np.random.seed(0)
json_data = ([], [], [])
for path in args.example_paths:
    examples = read_json(path)
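    # np.random.choice draws one fold index per example: indices 0, 1, 2 correspond to
    # train, dev, test, matching the probability order [train_frac, dev_frac, test_frac]
    # and the fold-name order used when the splits are written out below.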
    folds = np.random.choice(
        3, len(examples), p=[args.train_frac, args.dev_frac, args.test_frac])
    for ex, fold in izip(examples, folds):
        json_data[fold].append(ex)

for fold, dataset in izip(('train', 'dev', 'test'), json_data):
    if len(dataset) > 0:
        write_json(dataset, '%s%s.json' % (args.output_path, fold))
Code Example #7
File: main.py  Project: tigerneil/cocoa
        if args.test and args.best:
            ckpt = tf.train.get_checkpoint_state(args.init_from + '-best')
        else:
            ckpt = tf.train.get_checkpoint_state(args.init_from)
        assert ckpt, 'No checkpoint found'
        assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'

        # Load vocab
        mappings = read_pickle(vocab_path)
        print 'Done [%fs]' % (time.time() - start)
    else:
        # Save config
        if not os.path.isdir(args.checkpoint):
            os.makedirs(args.checkpoint)
        config_path = os.path.join(args.checkpoint, 'config.json')
        write_json(vars(args), config_path)
        model_args = args
        mappings = None
        ckpt = None

    schema = Schema(model_args.schema_path, model_args.domain)
    scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path))
    dataset = read_dataset(scenario_db, args)
    print 'Building lexicon...'
    start = time.time()
    lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words)
    print '%.2f s' % (time.time() - start)

    # Dataset
    use_kb = False if model_args.model == 'encdec' else True
    copy = True if model_args.model == 'attn-copy-encdec' else False
Code Example #8
                    help='Path to transcripts of mixed partners')
parser.add_argument('--output', help='Output directory')
args = parser.parse_args()

chats = read_json(args.transcripts)
chats_by_agents = defaultdict(list)
scenario_agents = defaultdict(set)
for chat in chats:
    agents = chat['agents']
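    # Canonicalize the pair so the human agent comes first; this assumes each chat
    # pairs one human with one system, so (human, bot) and (bot, human) transcripts
    # fall into the same group.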
    if agents['0'] == 'human':
        agents = (agents['0'], agents['1'])
    else:
        agents = (agents['1'], agents['0'])
    chats_by_agents[agents].append(chat)

    scenario_id = chat['scenario_uuid']
    scenario_agents[scenario_id].add(agents)

# Only keep scenarios with all 4 agents
scenario_subset = set(
    [s for s, a in scenario_agents.iteritems() if len(a) == 4])
print 'Number of scenarios:', len(scenario_subset)

for agents, chats in chats_by_agents.iteritems():
    chats = [c for c in chats if c['scenario_uuid'] in scenario_subset]
    print agents, len(chats)
    path = os.path.join(args.output, '%s_transcripts.json' % '-'.join(agents))
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    write_json(chats, path)