def read_label_counts_data(split="tiny"):
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {"label_counts": ast.literal_eval}
    path = Config.LABEL_COUNTS_DATASET_FILE(split)
    data = pd.read_csv(path, sep=",", header=0, dtype=dtypes, converters=converters)
    log_info("Read %s data with %d rows" % (path.stem, data.shape[0]))
    return data
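# The "label_counts" column is serialized in the CSV as a Python-literal
# string, so ast.literal_eval restores it to a dict at load time. A minimal
# sketch of the round trip (the label names and counts here are hypothetical):
#
#   >>> import ast
#   >>> ast.literal_eval("{'positive': 2, 'negative': 1}")
#   {'positive': 2, 'negative': 1}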
def read_question_and_response_data(split="tiny"):
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {"question": ast.literal_eval, "response": ast.literal_eval}
    path = Config.QUESTION_TEXT_AND_RESPONSE_TEXT_DATASET_FILE(split)
    data = pd.read_csv(path, sep=",", header=0, dtype=dtypes, converters=converters)
    log_info("Read %s data with %d rows" % (path.stem, data.shape[0]))
    return data
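# Usage sketch (assumes the split CSVs have already been written by the
# dataset builder; the token values shown are hypothetical):
#
#   qr = read_question_and_response_data(split="tiny")
#   qr["question"].iloc[0]   # -> a token list, e.g. ['how', 'are', 'you']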
def read_question_and_context_data(split="tiny", window_size=1,
                                   include_question_text=True,
                                   include_context_text=True,
                                   include_context_speaker=True,
                                   include_context_times=False):
    assert window_size <= Config.MAX_CONTEXT_WINDOW_SIZE
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {}
    if include_context_speaker:
        for i in range(1, window_size + 1):
            dtypes["turn_speaker-%d" % i] = str

    def to_float(t):
        # Missing or malformed turn times fall back to -1.
        try:
            return np.float32(t)
        except (TypeError, ValueError):
            return np.float32(-1)

    if include_context_times:
        for i in range(1, window_size + 1):
            converters["turn_time-%d" % i] = to_float
    if include_question_text:
        converters["question"] = ast.literal_eval
    if include_context_text:
        for i in range(1, window_size + 1):
            converters["turn_text-%d" % i] = ast.literal_eval

    path = Config.QUESTION_AND_CONTEXT_WINDOW_DATASET_FILE(split)
    data = pd.read_csv(path, sep=",", header=0, dtype=dtypes, converters=converters)
    # Keep only the columns that were explicitly typed or converted above.
    drop_columns = set(data.columns.values) - (set(dtypes.keys()) | set(converters.keys()))
    data.drop(labels=drop_columns, axis="columns", inplace=True)
    log_info("Read %s data with %d rows" % (path.stem, data.shape[0]))
    return data
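# Usage sketch for the context-window reader (column names follow the
# "turn_*-<i>" pattern above; every other CSV column is dropped):
#
#   data = read_question_and_context_data(split="dev", window_size=2,
#                                         include_context_times=True)
#   # Expected columns: response_time_sec, session_id, question,
#   # turn_speaker-1, turn_speaker-2, turn_text-1, turn_text-2,
#   # turn_time-1, turn_time-2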
def split_data(data, tiny_f=0.01, train_f=0.7, dev_f=0.15, test_f=0.15):
    session_ids = data.session_id.unique()
    assert abs(train_f + dev_f + test_f - 1.0) * len(session_ids) < 1
    assert tiny_f < train_f
    log_info("Splitting %d session_ids" % len(session_ids))

    log_info("Extracting Sessions")
    sessions = data_util.get_sessions(data)
    print("\tExtracted %d sessions" % len(sessions))

    session_id_to_num_questions = defaultdict(int)
    num_questions_to_session_ids = defaultdict(list)
    for session in sessions:
        num_questions = len(tuple(session.iter_question_and_response()))
        session_id_to_num_questions[session.id] = num_questions
        num_questions_to_session_ids[num_questions].append(session.id)

    # Stratify sessions by question count so each split sees a similar
    # distribution of session lengths.
    groups = get_stratified_session_ids(num_questions_to_session_ids,
                                        min([train_f, dev_f, test_f]))
    session_id_splits = defaultdict(list)
    for group in groups:
        np.random.seed(seed=Config.SEED)
        np.random.shuffle(group)
        train_split, dev_split, test_split = np.split(
            group, [int(np.round(train_f * len(group))),
                    int(np.round((train_f + dev_f) * len(group)))])
        session_id_splits["train"].extend(train_split)
        session_id_splits["dev"].extend(dev_split)
        session_id_splits["test"].extend(test_split)

    # tiny is a subset of train
    session_id_splits["tiny"] = session_id_splits["train"][:int(np.round(tiny_f * len(session_ids)))]

    for split, ids in session_id_splits.items():
        num_q = sum(session_id_to_num_questions[id] for id in ids)
        print("\t%s: %d sessions, %d questions" % (split, len(ids), num_q))

    return (data[data.session_id.isin(session_id_splits["tiny"])],
            data[data.session_id.isin(session_id_splits["train"])],
            data[data.session_id.isin(session_id_splits["dev"])],
            data[data.session_id.isin(session_id_splits["test"])])
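# Worked example of the np.split boundaries (hypothetical group of 20 session
# ids with the default fractions): np.round(0.7 * 20) = 14 and
# np.round(0.85 * 20) = 17, so np.split(group, [14, 17]) yields 14 train,
# 3 dev, and 3 test sessions from that group.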
def read_corpus(split=None):
    dtypes = {
        'session_id': np.int32,
        'created_at': object,
        'sent_from': str,
        'sent_to': str,
        'content_type': str
    }
    converters = {"text": ast.literal_eval}
    if split is None:
        path = Config.CORPUS_FILE
        split = "entire"
    else:
        path = Config.CORPUS_SPLIT_FILE(split)
    data = pd.read_csv(path, sep=",", header=0, dtype=dtypes,
                       parse_dates=["created_at"], converters=converters)
    log_info("Read %s corpus with %d rows" % (split, data.shape[0]))
    return data
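# Sketch of the two call paths (both file paths come from Config):
#
#   corpus = read_corpus()          # reads Config.CORPUS_FILE, logged as "entire"
#   dev = read_corpus(split="dev")  # reads Config.CORPUS_SPLIT_FILE("dev")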
    dest = os.path.join(Config.DATA_DIR, destname)
    return dest


if __name__ == "__main__":
    assert Path(Config.CORPUS_FILE).exists(), "%s does not exist" % Config.CORPUS_FILE
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dataset", dest="dataset", type=str,
                        default=Dataset.QUESTION_ONLY.name,
                        help="Which dataset to build. Defaults to QUESTION_ONLY")
    args = parser.parse_args()
    args.dataset = Dataset[args.dataset]

    builders = {Dataset.QUESTION_ONLY: build_question_only,
                Dataset.QUESTION_AND_INDEX: build_question_and_index,
                Dataset.QUESTION_AND_DURATION: build_question_and_duration,
                Dataset.QUESTION_AND_SENTIMENT: build_question_and_sentiment,
                Dataset.QUESTION_AND_NEWLINES: lambda split: build_question_only(split, concatenator="\n"),
                Dataset.QUESTION_AND_CONTEXT_WINDOW: lambda split: build_question_with_context_window(
                    split, window_size=Config.MAX_CONTEXT_WINDOW_SIZE),
                Dataset.QUESTION_TEXT_AND_RESPONSE_TEXT: build_question_text_and_response_text,
                Dataset.LABEL_COUNTS: build_label_counts}

    log_info("Building the %s dataset" % args.dataset.name.lower())
    for split in Config.SPLITS:
        log_info("Building %s" % split)
        dataset = builders[args.dataset](split)
        print("\tExtracted %d samples" % dataset.shape[0])
        dest = get_dest_name(split)
        print("\tWriting dataset to %s" % dest)
        dataset.to_csv(dest, index=False)
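# Example invocation (the script name is hypothetical; valid dataset names
# come from the Dataset enum used in the builders table above):
#
#   $ python build_datasets.py --dataset QUESTION_AND_CONTEXT_WINDOW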
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--train", dest="train", type=float, default=0.7,
                        help="Fraction of train data")
    parser.add_argument("-d", "--dev", dest="dev", type=float, default=0.15,
                        help="Fraction of dev data")
    parser.add_argument("-e", "--test", dest="test", type=float, default=0.15,
                        help="Fraction of test data")
    parser.add_argument("-y", "--tiny", dest="tiny", type=float, default=0.01,
                        help="Fraction of tiny data")
    args = parser.parse_args()
    assert args.tiny < args.train

    log_info("Reading Corpus")
    data = data_readers.read_corpus()
    tiny, train, dev, test = split_data(data, tiny_f=args.tiny, train_f=args.train,
                                        dev_f=args.dev, test_f=args.test)

    splits = {"tiny": tiny, "train": train, "dev": dev, "test": test}
    for name, split in splits.items():
        dest = get_dest(split=name)
        log_info("Writing %d %s rows to %s" % (split.shape[0], name, dest))
        split.to_csv(dest, index=False)
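# Example invocation (the script name is hypothetical; train/dev/test
# fractions must sum to 1, and tiny must be smaller than train):
#
#   $ python make_splits.py --train 0.7 --dev 0.15 --test 0.15 --tiny 0.01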
"--dest", dest="dest", type=str, default=None, help="Path to destination file. Defaults to {datafile}_processed.csv") args = parser.parse_args() path = Path(args.datafile).resolve() assert path.exists() and path.is_file() and path.suffix == '.csv' if args.dest is None: args.dest = os.path.join(str(path.parent), path.stem + "_preprocessed" + path.suffix) if Path(Config.REMOVED_ROWS_FILE).exists(): log_info("Deleting %s" % Config.REMOVED_ROWS_FILE) os.remove(Config.REMOVED_ROWS_FILE) log_info("Reading CSV file") data = read_csv(args.datafile) log_info("Parsing timestamps") data = parse_timestamps(data) log_info("Sorting data") data = data.sort_values(by=["session_id", "created_at"], ascending=[True, True], axis="index") log_info("Deduping utterances") data = dedupe_utterances(data)