def log_worker_id_to_json(db_path, batch_results):
    '''Map each logged chat to the worker ids of its two agents and dump to JSON.

    Output format: {chat_id: {'0': worker_id, '1': worker_id}}
    '''
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    code_to_wid_list = []
    for batch_result in batch_results:
        if batch_result.endswith('csv'):
            code_to_wid = read_results_csv(batch_result)
        else:
            # fig8 JSON
            code_to_wid = read_results_json(batch_result)
        code_to_wid_list.append(code_to_wid)
    worker_ids = chat_to_worker_id(cursor, code_to_wid_list)
    output_dir = os.path.dirname(batch_results[0])
    outfile_name = 'worker_ids.json'
    outfile_path = os.path.join(output_dir, outfile_name)
    write_json(worker_ids, outfile_path)
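# Illustrative usage (a sketch; the database and batch-result paths below are
# hypothetical, not files shipped with this code). CSV files are parsed as
# MTurk batch results and everything else as Figure Eight (fig8) JSON exports;
# worker_ids.json is written next to the first batch file:
#
#     log_worker_id_to_json('web_output/chat_state.db',
#                           ['batch_results/batch1.csv',
#                            'batch_results/batch2.json'])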
def write_metadata(transcripts, outdir, responses=None):
    metadata = {'data': []}
    for chat in transcripts:
        if len(chat['events']) == 0:
            continue
        row = {}
        row['dialogue_id'] = chat['uuid']
        row['scenario_id'] = chat['scenario_uuid']
        scenario = get_scenario(chat)
        row['num_items'] = len(scenario.kbs[0].items)
        row['num_attrs'] = len(scenario.attributes)
        row['outcome'] = 'fail' if chat['outcome']['reward'] == 0 else 'success'
        row['agent0'] = AGENT_NAMES[chat['agents']['0']]
        row['agent1'] = AGENT_NAMES[chat['agents']['1']]
        if responses:
            dialogue_response = responses[chat['uuid']]
            question_scores = defaultdict(list)
            for agent_id, scores in dialogue_response.iteritems():
                for question in QUESTIONS:
                    question_scores[question].extend(scores[question])
            for question, scores in question_scores.iteritems():
                row[question] = np.mean(scores)
        metadata['data'].append(row)
    write_json(metadata, os.path.join(outdir, 'metadata.json'))
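# Each row of metadata.json summarizes one dialogue. A sketch of the output
# (all field values below are hypothetical, and the per-question mean-score
# columns appear only when evaluation responses are passed in):
#
#     {"data": [{"dialogue_id": "C_abc123", "scenario_id": "S_def456",
#                "num_items": 10, "num_attrs": 3, "outcome": "success",
#                "agent0": "Human", "agent1": "Rule-based",
#                "fluent": 4.5, "correct": 5.0}]}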
def dump_chats(cls, cursor, scenario_db, json_path, uids=None):
    """Dump chat transcripts to a JSON file.

    Args:
        scenario_db (ScenarioDB): retrieve Scenario by logged uuid.
        json_path (str): output path.
        uids (list): if provided, only log chats from these users.

    """
    if uids is None:
        cursor.execute('SELECT DISTINCT chat_id FROM event')
        ids = cursor.fetchall()
    else:
        ids = []
        for uid in uids:
            # sqlite query parameters must be passed as a tuple.
            cursor.execute('SELECT chat_id FROM mturk_task WHERE name=?', (uid,))
            ids_ = cursor.fetchall()
            ids.extend(ids_)

    def is_single_agent(chat):
        # A chat where one agent never produced an event is not a dialogue.
        agent_event = {0: 0, 1: 0}
        for event in chat.events:
            agent_event[event.agent] += 1
        return agent_event[0] == 0 or agent_event[1] == 0

    examples = []
    for chat_id in ids:
        ex = cls.get_chat_example(cursor, chat_id[0], scenario_db)
        if ex is None or is_single_agent(ex):
            continue
        examples.append(ex)

    write_json([ex.to_dict() for ex in examples], json_path)
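# Sketch of how this classmethod might be driven (the enclosing class name
# 'DatabaseReader', the connection, and the paths are all assumptions for
# illustration):
#
#     conn = sqlite3.connect('web_output/chat_state.db')
#     DatabaseReader.dump_chats(conn.cursor(), scenario_db,
#                               'transcripts/transcripts.json',
#                               uids=['A1B2C3WORKER'])
#
# Single-agent chats are filtered out before writing.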
    kbs = [KB(scenario_attributes, items) for items in agent_items]
    scenarios = []
    for style in styles:
        scenario = Scenario(generate_uuid('S'), scenario_attributes, kbs, style,
                            [alphas[attr] for attr in scenario_attributes])
        scenarios.append(scenario)
    return scenarios

# Generate scenarios
schema = Schema(args.schema_path, args.domain)
scenario_list = []
while len(scenario_list) < args.num_scenarios * args.num_styles:
    s_list = generate_scenario(schema)
    for s in s_list:
        if s is not None:
            scenario_list.append(s)
scenario_db = ScenarioDB(scenario_list)
write_json(scenario_db.to_dict(), args.scenarios_path)

# Output a sample of what we've generated
for i in range(min(100, len(scenario_db.scenarios_list))):
    print '---------------------------------------------------------------------------------------------'
    print '---------------------------------------------------------------------------------------------'
    scenario = scenario_db.scenarios_list[i]
    print "Scenario id: %s" % scenario.uuid
    print "Alphas: [%s]" % ", ".join(["%2.1f" % alpha for alpha in scenario.alphas])
    for agent in (0, 1):
        kb = scenario.kbs[agent]
        kb.dump()
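# Hypothetical invocation (the script name, and any flag spellings beyond the
# args attributes read above, are assumptions):
#
#     python generate_scenarios.py --schema-path data/schema.json \
#         --domain MutualFriends --num-scenarios 500 --num-styles 1 \
#         --scenarios-path data/scenarios.json
#
# The while loop keeps generating until at least num_scenarios * num_styles
# scenarios are collected; the sample loop then prints both agents' KBs for up
# to 100 of them as a sanity check.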
question_scores = defaultdict(lambda: defaultdict(list))
raw_chats = read_json(args.dialogue_transcripts)
uuid_to_chat = {chat['uuid']: chat for chat in raw_chats}
schema = Schema(args.schema_path)
scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path))
dialogue_ids = filter(raw_eval, uuid_to_chat)
for eval_ in raw_eval:
    read_eval(eval_, question_scores, mask=dialogue_ids)

if args.hist:
    hist(question_scores, args.outdir, partner=args.partner)
if args.summary:
    summary = summarize(question_scores)
    write_json(summary, args.stats)
if args.analyze:
    schema = Schema(args.schema_path)
    lexicon = Lexicon(schema, False, scenarios_json=args.scenarios_path,
                      stop_words=args.stop_words)
    preprocessor = Preprocessor(schema, lexicon, 'canonical', 'canonical', 'canonical')
    analyze(question_scores, uuid_to_chat, preprocessor)

# Visualize
if args.html_output:
    visualize(args.viewer_mode, args.html_output, question_scores, uuid_to_chat)
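# Given the defaultdict(lambda: defaultdict(list)) declaration above,
# question_scores is a two-level mapping filled in by read_eval; a plausible
# shape (the question and partner-type keys here are hypothetical):
#
#     question_scores['fluent']['rulebased']  # -> list of collected ratings
#
# hist, summarize, and analyze all consume this same nested structure.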
parser.add_argument('--example-paths', help='Transcripts paths', nargs='*', default=[])
parser.add_argument('--train-frac', help='Fraction of training examples',
                    type=float, default=0.6)
parser.add_argument('--test-frac', help='Fraction of test examples',
                    type=float, default=0.2)
parser.add_argument('--dev-frac', help='Fraction of dev examples',
                    type=float, default=0.2)
parser.add_argument('--output-path', help='Output path for splits')
args = parser.parse_args()

np.random.seed(0)
json_data = ([], [], [])
for path in args.example_paths:
    examples = read_json(path)
    folds = np.random.choice(3, len(examples),
                             p=[args.train_frac, args.dev_frac, args.test_frac])
    for ex, fold in izip(examples, folds):
        json_data[fold].append(ex)
for fold, dataset in izip(('train', 'dev', 'test'), json_data):
    if len(dataset) > 0:
        write_json(dataset, '%s%s.json' % (args.output_path, fold))
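# Example invocation (the script name is hypothetical). Note that the three
# fractions must sum to 1 for np.random.choice, and --output-path acts as a
# filename prefix: 'data/split_' yields data/split_train.json,
# data/split_dev.json, and data/split_test.json.
#
#     python split_dataset.py --example-paths transcripts.json \
#         --train-frac 0.6 --dev-frac 0.2 --test-frac 0.2 \
#         --output-path data/split_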
if args.init_from:
    if args.test and args.best:
        ckpt = tf.train.get_checkpoint_state(args.init_from + '-best')
    else:
        ckpt = tf.train.get_checkpoint_state(args.init_from)
    assert ckpt, 'No checkpoint found'
    assert ckpt.model_checkpoint_path, 'No model path found in checkpoint'
    # Load vocab
    mappings = read_pickle(vocab_path)
    print 'Done [%fs]' % (time.time() - start)
else:
    # Save config
    if not os.path.isdir(args.checkpoint):
        os.makedirs(args.checkpoint)
    config_path = os.path.join(args.checkpoint, 'config.json')
    write_json(vars(args), config_path)
    model_args = args
    mappings = None
    ckpt = None

schema = Schema(model_args.schema_path, model_args.domain)
scenario_db = ScenarioDB.from_dict(schema, read_json(args.scenarios_path))
dataset = read_dataset(scenario_db, args)

print 'Building lexicon...'
start = time.time()
lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words)
print '%.2f s' % (time.time() - start)

# Dataset
use_kb = False if model_args.model == 'encdec' else True
copy = True if model_args.model == 'attn-copy-encdec' else False
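# How the branch above plays out in practice (the script name and checkpoint
# directories are hypothetical; flags follow the args attributes used above):
#
#     python main.py --checkpoint checkpoints/run1 ...          # fresh run: saves config.json
#     python main.py --init-from checkpoints/run1 ...           # resume: restores latest checkpoint
#     python main.py --init-from checkpoints/run1 --test --best # evaluate the '-best' checkpoint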
parser.add_argument('--transcripts', help='Path to transcripts of mixed partners')
parser.add_argument('--output', help='Output directory')
args = parser.parse_args()

chats = read_json(args.transcripts)
chats_by_agents = defaultdict(list)
scenario_agents = defaultdict(set)
for chat in chats:
    agents = chat['agents']
    # Normalize the pair so the human agent (if any) comes first.
    if agents['0'] == 'human':
        agents = (agents['0'], agents['1'])
    else:
        agents = (agents['1'], agents['0'])
    chats_by_agents[agents].append(chat)
    scenario_id = chat['scenario_uuid']
    scenario_agents[scenario_id].add(agents)

# Only keep scenarios played by all 4 agent pairs
scenario_subset = set([s for s, a in scenario_agents.iteritems() if len(a) == 4])
print 'Number of scenarios:', len(scenario_subset)

for agents, chats in chats_by_agents.iteritems():
    chats = [c for c in chats if c['scenario_uuid'] in scenario_subset]
    print agents, len(chats)
    path = os.path.join(args.output, '%s_transcripts.json' % '-'.join(agents))
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    write_json(chats, path)
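# Output sketch: one file per agent pairing, named by joining the normalized
# pair (the pair names below are hypothetical examples, e.g. a human partnered
# with each system type):
#
#     human-human_transcripts.json
#     human-rulebased_transcripts.json
#     human-static-neural_transcripts.json
#     human-dynamic-neural_transcripts.json
#
# Restricting to scenarios seen by all four pairings keeps the per-pair
# subsets directly comparable on the same scenarios.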