from collections import defaultdict
from multiprocessing import Pool

import numpy as np
import pandas as pd
import progressbar

import data_readers
import data_util

# Config, get_response_time_label, process_session, log_info, and
# get_stratified_session_ids are assumed to be importable from elsewhere in
# this repo; their import paths depend on the repo layout.


def build_question_and_duration(split="tiny"):
    data = data_readers.read_corpus(split)
    questions = []
    question_durations_sec = []
    response_times_sec = []
    session_ids = []

    sessions = data_util.get_sessions(data)
    for session in progressbar.progressbar(sessions):
        for question, response in session.iter_question_and_response():
            questions.append(question.row.text)
            question_durations_sec.append(question.duration)
            # timedelta.seconds is the seconds field, not total_seconds();
            # this assumes the response arrives within a day of the question.
            response_times_sec.append((response.row.created_at - question.row.created_at).seconds)
            session_ids.append(session.id)

    dataset = pd.DataFrame.from_dict({
        "session_id": session_ids,
        "question": questions,
        "question_duration_sec": question_durations_sec,
        "response_time_sec": response_times_sec,
    })
    return dataset

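# Minimal usage sketch (not part of the original module): build the duration
# dataset for one split and persist it. The split name and CSV path are
# assumptions; the repo may store datasets elsewhere.
def _demo_build_question_and_duration():
    dataset = build_question_and_duration(split="dev")
    print(dataset.head())
    dataset.to_csv("question_and_duration.dev.csv", index=False)
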
def build_question_and_sentiment(split="tiny"):
    data = data_readers.read_corpus(split)
    sessions = data_util.get_sessions(data)

    # Sentiment scoring is comparatively slow, so fan the sessions out to a
    # worker pool; each call to process_session returns a dict of parallel
    # lists, which are merged here.
    combined_results = defaultdict(list)
    with Pool(12) as pool:
        for session_result in progressbar.progressbar(pool.imap(process_session, sessions),
                                                      max_value=len(sessions)):
            for k, v in session_result.items():
                combined_results[k].extend(v)

    dataset = pd.DataFrame.from_dict({
        "session_id": combined_results["session_ids"],
        "question": combined_results["questions"],
        "question_sentiment": combined_results["sentiments"],
        "response_time_sec": combined_results["response_times_sec"],
    })
    return dataset

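# build_question_and_sentiment expects process_session to return a dict of
# parallel lists keyed "session_ids", "questions", "sentiments", and
# "response_times_sec". The real implementation lives elsewhere in the repo;
# this is only a sketch of that contract, with the sentiment scorer (a
# hypothetical score_sentiment helper) left abstract. It must be a top-level
# function so Pool can pickle it.
def _process_session_sketch(session):
    result = defaultdict(list)
    for question, response in session.iter_question_and_response():
        result["session_ids"].append(session.id)
        result["questions"].append(question.row.text)
        result["sentiments"].append(score_sentiment(question.row.text))  # assumed helper
        result["response_times_sec"].append(
            (response.row.created_at - question.row.created_at).seconds)
    return result
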
def build_question_only(split="tiny", concatenator=None):
    data = data_readers.read_corpus(split)
    questions = []
    response_times_sec = []
    session_ids = []

    sessions = data_util.get_sessions(data)
    progress = progressbar.ProgressBar(max_value=len(sessions)).start()
    for i, session in enumerate(sessions):
        for question, response in session.iter_question_and_response(concatenator=concatenator):
            questions.append(question.row.text)
            response_times_sec.append((response.row.created_at - question.row.created_at).seconds)
            session_ids.append(session.id)
        progress.update(i)

    dataset = pd.DataFrame.from_dict({
        "session_id": session_ids,
        "question": questions,
        "response_time_sec": response_times_sec,
    })
    progress.finish()
    return dataset

def build_label_counts(split="tiny"):
    data = data_readers.read_corpus(split)
    label_counts = []
    response_times_sec = []
    session_ids = []

    sessions = data_util.get_sessions(data)
    for session in progressbar.progressbar(sessions):
        counts = defaultdict(int)
        for question, response in session.iter_question_and_response():
            response_time_sec = (response.row.created_at - question.row.created_at).seconds
            response_times_sec.append(response_time_sec)
            # Snapshot the counts *before* folding in the current question,
            # so the feature only reflects earlier questions in the session
            # and never leaks the label it is meant to predict.
            label_counts.append(tuple(counts[label] for label in Config.LABELS))
            session_ids.append(session.id)
            counts[get_response_time_label(response_time_sec)] += 1

    dataset = pd.DataFrame.from_dict({
        "session_id": session_ids,
        "response_time_sec": response_times_sec,
        "label_counts": label_counts,
    })
    return dataset

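# Worked example of the running counts above, assuming (hypothetically) that
# Config.LABELS is ("SHORT", "LONG") and a session's response times map to
# the labels SHORT, SHORT, LONG in order:
#   question 1 -> label_counts (0, 0)   then counts["SHORT"] += 1
#   question 2 -> label_counts (1, 0)   then counts["SHORT"] += 1
#   question 3 -> label_counts (2, 0)   then counts["LONG"]  += 1
# Each row only sees labels from earlier questions in the same session.
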
def build_question_with_context_window(split="tiny", window_size=0):
    data = data_readers.read_corpus(split)
    sessions = data_util.get_sessions(data)

    questions = []
    response_times_sec = []
    session_ids = []
    turn_texts = defaultdict(list)
    turn_speakers = defaultdict(list)
    turn_times = defaultdict(list)

    for session in progressbar.progressbar(sessions):
        for question, response in session.iter_question_and_response():
            questions.append(question.row.text)
            response_times_sec.append((response.row.created_at - question.row.created_at).seconds)
            session_ids.append(session.id)

            # Walk backwards from the question over the preceding turns;
            # the defaults fill in for sessions shorter than the window.
            times = defaultdict(int)
            texts = defaultdict(list)
            speakers = defaultdict(lambda: Config.EMPTY_TAG)
            prev = question.row.created_at
            for i, turn in enumerate(session.iter_turns(start_row=question.index,
                                                        num_turns=window_size + 1,
                                                        direction=-1)):
                texts[i] = turn.text
                speakers[i] = turn.sent_from
                times[i] = int((prev - turn.created_at).seconds)
                prev = turn.created_at

            # Index 0 is the question itself, so the context columns start at 1.
            for i in range(1, window_size + 1):
                turn_texts["turn_text-%d" % i].append(texts[i])
                turn_speakers["turn_speaker-%d" % i].append(speakers[i])
                turn_times["turn_time-%d" % i].append(times[i])

    columns = {"session_id": session_ids, "question": questions, "response_time_sec": response_times_sec}
    columns.update(turn_texts)
    columns.update(turn_speakers)
    columns.update(turn_times)
    dataset = pd.DataFrame.from_dict(columns)
    return dataset

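# Minimal usage sketch (assumed, not from the original module): with
# window_size=2 the frame gains turn_text-1/2, turn_speaker-1/2, and
# turn_time-1/2 columns describing the two turns preceding each question.
def _demo_context_window():
    dataset = build_question_with_context_window(split="dev", window_size=2)
    print(sorted(dataset.columns))
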
def split_data(data, tiny_f=0.01, train_f=0.7, dev_f=0.15, test_f=0.15):
    session_ids = data.session_id.unique()
    # The fractions must sum to 1 up to a tolerance too small to shift a
    # whole session between splits.
    assert abs(train_f + dev_f + test_f - 1.0) * len(session_ids) < 1
    assert tiny_f < train_f
    log_info("Splitting %d session_ids" % len(session_ids))

    log_info("Extracting Sessions")
    sessions = data_util.get_sessions(data)
    print("\tExtracted %d sessions" % len(sessions))

    # Bucket sessions by how many questions they contain so the splits can
    # be stratified on session length.
    session_id_to_num_questions = defaultdict(int)
    num_questions_to_session_ids = defaultdict(list)
    for session in sessions:
        num_questions = len(tuple(session.iter_question_and_response()))
        session_id_to_num_questions[session.id] = num_questions
        num_questions_to_session_ids[num_questions].append(session.id)

    groups = get_stratified_session_ids(num_questions_to_session_ids,
                                        min([train_f, dev_f, test_f]))

    session_id_splits = defaultdict(list)
    for group in groups:
        # Reseed before each shuffle so the assignment is reproducible.
        np.random.seed(seed=Config.SEED)
        np.random.shuffle(group)
        train_split, dev_split, test_split = np.split(
            group,
            [int(np.round(train_f * len(group))),
             int(np.round((train_f + dev_f) * len(group)))])
        session_id_splits["train"].extend(train_split)
        session_id_splits["dev"].extend(dev_split)
        session_id_splits["test"].extend(test_split)

    # tiny is a subset of train
    session_id_splits["tiny"] = session_id_splits["train"][:int(np.round(tiny_f * len(session_ids)))]

    for split, ids in session_id_splits.items():
        num_q = sum(session_id_to_num_questions[session_id] for session_id in ids)
        print("\t%s: %d sessions, %d questions" % (split, len(ids), num_q))

    return (data[data.session_id.isin(session_id_splits["tiny"])],
            data[data.session_id.isin(session_id_splits["train"])],
            data[data.session_id.isin(session_id_splits["dev"])],
            data[data.session_id.isin(session_id_splits["test"])])

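# get_stratified_session_ids is defined elsewhere in the repo. A minimal
# sketch of the behavior split_data relies on, under the assumption that it
# merges the per-question-count buckets into groups large enough that even
# the smallest split fraction receives at least one session from each group:
def _get_stratified_session_ids_sketch(num_questions_to_session_ids, min_f):
    min_group_size = int(np.ceil(1.0 / min_f))
    groups, current = [], []
    for num_questions in sorted(num_questions_to_session_ids):
        current.extend(num_questions_to_session_ids[num_questions])
        if len(current) >= min_group_size:
            groups.append(current)
            current = []
    if current:
        # Fold any undersized remainder into the last group.
        if groups:
            groups[-1].extend(current)
        else:
            groups.append(current)
    return groups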