def build_question_and_duration(split="tiny"):
    """Build a dataset pairing each question with its duration and response time.

    Reads the corpus for `split`, walks every (question, response) pair in every
    session, and returns a DataFrame with columns: session_id, question,
    question_duration_sec, response_time_sec.
    """
    corpus = data_readers.read_corpus(split)
    # Column-name -> accumulator list; dict order fixes the DataFrame column order.
    records = {
        "session_id": [],
        "question": [],
        "question_duration_sec": [],
        "response_time_sec": [],
    }
    for sess in progressbar.progressbar(data_util.get_sessions(corpus)):
        for q, resp in sess.iter_question_and_response():
            records["session_id"].append(sess.id)
            records["question"].append(q.row.text)
            records["question_duration_sec"].append(q.duration)
            # Elapsed seconds between the question being asked and the response.
            records["response_time_sec"].append((resp.row.created_at - q.row.created_at).seconds)
    return pd.DataFrame.from_dict(records)
def build_question_and_sentiment(split="tiny"):
    """Build a dataset pairing each question with its sentiment and response time.

    Sessions are processed in parallel via `process_session`, which returns a
    dict of column-name -> list; the per-session dicts are merged into one
    DataFrame with columns: session_id, question, question_sentiment,
    response_time_sec.
    """
    data = data_readers.read_corpus(split)
    sessions = data_util.get_sessions(data)
    combined_results = defaultdict(list)
    # FIX: the original never closed the pool, leaking 12 worker processes per
    # call. The context manager terminates and reaps the workers on exit, even
    # if an exception interrupts the loop.
    with Pool(12) as pool:
        for session_result in progressbar.progressbar(pool.imap(process_session, sessions), max_value=len(sessions)):
            # Merge this session's column lists into the combined accumulators.
            for k, v in session_result.items():
                combined_results[k].extend(v)
    dataset = pd.DataFrame.from_dict({
        "session_id": combined_results["session_ids"],
        "question": combined_results["questions"],
        "question_sentiment": combined_results["sentiments"],
        "response_time_sec": combined_results["response_times_sec"],
    })
    return dataset
def build_question_only(split="tiny", concatenator=None):
    """Build a dataset of question texts and their response times.

    `concatenator` is forwarded to `iter_question_and_response`, which may use
    it to join multi-row questions. Returns a DataFrame with columns:
    session_id, question, response_time_sec.
    """
    corpus = data_readers.read_corpus(split)
    sessions = data_util.get_sessions(corpus)
    session_ids = []
    questions = []
    response_times_sec = []
    progress = progressbar.ProgressBar(max_value=len(sessions)).start()
    for idx, sess in enumerate(sessions):
        for q, resp in sess.iter_question_and_response(concatenator=concatenator):
            session_ids.append(sess.id)
            questions.append(q.row.text)
            response_times_sec.append((resp.row.created_at - q.row.created_at).seconds)
        # One progress tick per session processed.
        progress.update(idx)
    dataset = pd.DataFrame.from_dict({
        "session_id": session_ids,
        "question": questions,
        "response_time_sec": response_times_sec,
    })
    progress.finish()
    return dataset
def build_label_counts(split="tiny"):
    """Build a dataset of running per-session response-time label counts.

    For each (question, response) pair, records the response time together with
    a tuple counting how many earlier questions in the same session fell into
    each label in `Config.LABELS`. Returns a DataFrame with columns:
    session_id, response_time_sec, label_counts.
    """
    corpus = data_readers.read_corpus(split)
    session_ids = []
    response_times_sec = []
    label_counts = []
    for sess in progressbar.progressbar(data_util.get_sessions(corpus)):
        # Running label tally, reset at the start of every session.
        seen = defaultdict(int)
        for q, resp in sess.iter_question_and_response():
            rt_sec = (resp.row.created_at - q.row.created_at).seconds
            session_ids.append(sess.id)
            response_times_sec.append(rt_sec)
            # Snapshot the counts BEFORE tallying the current question, so each
            # row reflects only the labels of questions preceding it.
            label_counts.append(tuple(seen[lbl] for lbl in Config.LABELS))
            seen[get_response_time_label(rt_sec)] += 1
    return pd.DataFrame.from_dict({
        "session_id": session_ids,
        "response_time_sec": response_times_sec,
        "label_counts": label_counts,
    })
def build_question_with_context_window(split="tiny", window_size=0):
    """Build a dataset of questions plus a fixed-size window of preceding turns.

    For each (question, response) pair, records the question text, the response
    time in seconds, and — for each of the `window_size` turns immediately
    preceding the question — that turn's text (column "turn_text-i"), speaker
    ("turn_speaker-i"), and the gap in seconds to the following turn
    ("turn_time-i"), with i = 1 the turn closest to the question.
    """
    data = data_readers.read_corpus(split)
    sessions = data_util.get_sessions(data)
    questions = []
    response_times_sec = []
    session_ids = []
    # Per-column accumulators, keyed "turn_text-1", "turn_speaker-1", ... so
    # they can be merged into the DataFrame column dict at the end.
    turn_texts = defaultdict(list)
    turn_speakers = defaultdict(list)
    turn_times = defaultdict(list)
    for session in progressbar.progressbar(sessions):
        for question, response in session.iter_question_and_response():
            questions.append(question.row.text)
            response_times_sec.append((response.row.created_at - question.row.created_at).seconds)
            session_ids.append(session.id)
            # Defaults pad windows that extend past the start of the session:
            # 0 seconds, empty text, and the EMPTY_TAG speaker.
            times = defaultdict(lambda: 0)
            texts = defaultdict(lambda: [])
            speakers = defaultdict(lambda: Config.EMPTY_TAG)
            prev = question.row.created_at
            # Walk backwards from the question (direction=-1). Index 0 is the
            # question row itself; indices 1..window_size are the turns before
            # it, nearest first. `times[i]` is the gap between turn i and the
            # more recent turn i-1.
            for i, turn in enumerate(session.iter_turns(start_row=question.index, num_turns=window_size+1, direction=-1)):
                texts[i] = turn.text
                speakers[i] = turn.sent_from
                times[i] = int((prev - turn.created_at).seconds)
                prev = turn.created_at
            # Emit exactly window_size context columns per question (index 0,
            # the question itself, is deliberately skipped).
            for i in range(1, window_size+1):
                turn_texts["turn_text-%d" % i].append(texts[i])
                turn_speakers["turn_speaker-%d" % i].append(speakers[i])
                turn_times["turn_time-%d" % i].append(times[i])
    columns = {"session_id": session_ids, "question": questions, "response_time_sec": response_times_sec}
    columns.update(turn_texts)
    columns.update(turn_speakers)
    columns.update(turn_times)
    dataset = pd.DataFrame.from_dict(columns)
    return dataset
    # Tail of a correlation helper whose `def` lies outside this view: Pearson
    # correlation between train-split response times and the given feature
    # column. Returns scipy's (r, p-value) pair.
    return pearsonr(data.train['response_time_sec'], data.train[col_name])


if __name__ == '__main__':
    results = {}
    # Baseline: Jensen-Shannon feature computed over the full vocabulary.
    data = read_dataset_splits(
        reader=data_readers.read_question_and_context_data,
        window_size=10,
        include_question_text=True,
        include_context_text=True,
        include_context_speaker=False,
        include_context_times=False)
    data = add_jensen_shannon(data)
    results['question_and_js'] = calc_correlation(data, 'jensen_shannon')
    # Variants: recompute with the top-N most frequent train words removed as
    # stopwords. `df.text` is assumed to hold token lists — TODO confirm.
    df = read_corpus(split='train')
    all_words = [item for sublist in df.text for item in sublist]
    for N_words in [25, 50, 100]:
        top_words = [item[0] for item in Counter(all_words).most_common(N_words)]
        # NOTE(review): the splits are re-read from disk each iteration because
        # add_jensen_shannon appears to mutate/augment `data`.
        data = read_dataset_splits(
            reader=data_readers.read_question_and_context_data,
            window_size=10,
            include_question_text=True,
            include_context_text=True,
            include_context_speaker=False,
            include_context_times=False)
        data = add_jensen_shannon(data, stopwords=top_words)
        results['question_and_js_top' + str(N_words)] = calc_correlation(data, 'jensen_shannon')
                # Tail of a generator method (its `def` lies outside this view)
                # that merges corpus rows into (question, response) pairs.
                # Accumulate this row's tokens, optionally inserting the
                # concatenator between merged rows.
                question_text.extend(row.text)
                if concatenator is not None:
                    question_text.append(concatenator)
            # Drop the trailing concatenator left by the loop above.
            if concatenator is not None and question_text[-1] == concatenator:
                question_text = question_text[:-1]
            question.text = question_text
            # Wall-clock span of the merged question, in whole seconds.
            duration = (self.rows.iloc[end - 1].created_at - self.rows.iloc[start].created_at).seconds
            prev_i = response_index
            yield IndexedRow(question_index, question, duration), IndexedRow(response_index, response)
        # NOTE(review): raising StopIteration inside a generator is converted
        # to RuntimeError on Python 3.7+ (PEP 479); a bare `return` is the safe
        # equivalent — confirm against the full method before changing.
        raise StopIteration


def get_sessions(data):
    """Partition the corpus DataFrame into one Session object per session_id."""
    session_ids = data.session_id.unique()
    return [
        Session(session_id, data.loc[data.session_id == session_id].reset_index())
        for session_id in session_ids
    ]


if __name__ == "__main__":
    import cProfile, data_readers
    data = data_readers.read_corpus("dev")
    sessions = get_sessions(data)
    # NOTE(review): profiles a `test` function that is not visible in this
    # view; the stats are written to the file "profile".
    cProfile.run("test(sessions)", filename="profile")
    # Tail of split_data (its `def` lies outside this view): select each
    # split's rows by session-id membership, in tiny/train/dev/test order.
    # Splitting by session keeps all turns of a conversation in one split.
    return data[data.session_id.isin(session_id_splits["tiny"])],\
           data[data.session_id.isin(session_id_splits["train"])],\
           data[data.session_id.isin(session_id_splits["dev"])],\
           data[data.session_id.isin(session_id_splits["test"])]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--train", dest="train", type=float, default=0.7, help="Fraction of train data")
    parser.add_argument("-d", "--dev", dest="dev", type=float, default=0.15, help="Fraction of dev data")
    parser.add_argument("-e", "--test", dest="test", type=float, default=0.15, help="Fraction of test data")
    parser.add_argument("-y", "--tiny", dest="tiny", type=float, default=0.01, help="Fraction of tiny data")
    args = parser.parse_args()
    # tiny is presumably carved out of (or must fit inside) the train
    # fraction — confirm against split_data's sampling logic.
    assert args.tiny < args.train
    log_info("Reading Corpus")
    data = data_readers.read_corpus()
    tiny, train, dev, test = split_data(data, tiny_f=args.tiny, train_f=args.train, dev_f=args.dev, test_f=args.test)
    splits = {"tiny": tiny, "train": train, "dev": dev, "test": test}
    # Write each split to its destination CSV, dropping the pandas index.
    for name, split in splits.items():
        dest = get_dest(split=name)
        log_info("Writing %d %s rows to %s" % (split.shape[0], name, dest))
        split.to_csv(dest, index=False)