def standardize(self):
    """ Standardizes the features between 0 and self.feature_max. """
    LOGGER.info(f"Standardizing target data for task {self.task}")
    features = self.targets["train"]

    # fit the scaler on the flattened training targets only
    scaler = MinMaxScaler(feature_range=(0, self.feature_max))
    flat_features = [j for i in features for j in i]
    scaler.fit(flat_features)

    self.targets["train"] = [list(scaler.transform(i)) for i in features]
    self.targets["val"] = [list(scaler.transform(i)) for i in self.targets["val"]]
    self.targets["test"] = [list(scaler.transform(i)) for i in self.targets["test"]]

    # dump the scaled test targets for inspection
    filen = "scaled-test-" + self.task + ".csv"
    print(filen)
    flat_preds = [j for i in self.targets["test"] for j in i]
    preds_pd = pd.DataFrame(flat_preds, columns=[
        "n_fix", "first_fix_dur", "first_pass_dur", "total_fix_dur",
        "mean_fix_dur", "fix_prob", "n_refix", "reread_prob"
    ])
    preds_pd.to_csv(filen)
    print("saved.")

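# A minimal, self-contained sketch of the standardization step above (not part of the
# class): the scaler is fit on word-level rows pooled from the training split only and
# then applied sentence by sentence, so val/test share the training scale. Values are
# illustrative.
#
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler
#
# train_sentences = [np.array([[2.0, 150.0], [0.0, 0.0]]), np.array([[4.0, 300.0]])]
# scaler = MinMaxScaler(feature_range=(0, 100))
# scaler.fit(np.concatenate(train_sentences))      # pool all word rows from train
# scaled = [scaler.transform(s) for s in train_sentences]
# print(scaled[0])                                 # each feature now lies in [0, 100]
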
def load_data(self):
    LOGGER.info(f"Loading data for task {self.task}")
    for mode in self.modes:
        print(mode)
        dataset_pd = pd.read_csv(os.path.join(self.dir, mode + "_dataset.csv"),
                                 na_filter=False, index_col=0)
        word_func = lambda s: [w for w in s["word"].values.tolist()]
        features_func = lambda s: [
            np.array(s.drop(columns=["sentence_num", "word"]).iloc[i])
            for i in range(len(s))
        ]
        self.text_inputs[mode] = dataset_pd.groupby("sentence_num").apply(word_func).tolist()
        print(len(self.text_inputs[mode]))
        self.targets[mode] = dataset_pd.groupby("sentence_num").apply(features_func).tolist()

    # check for duplicate sentences shared between the train and test sets
    dups = []
    for i, s in enumerate(self.text_inputs["train"]):
        if s in self.text_inputs["test"]:
            print("WARNING! Duplicate in test set....")
            dups.append(i)

    # remove the duplicates from the training data
    print(len(self.text_inputs["train"]))
    print(len(dups))
    for d in sorted(dups, reverse=True):
        del self.text_inputs["train"][d]
        del self.targets["train"][d]
    print(len(self.text_inputs["train"]))

def read_words(self):
    """ Word list is extracted from the full subject. """
    LOGGER.info(f"Reading words for task {self.task}")
    if self.task == "geco-nl":
        # for geco-nl, self.full_subj holds several subjects, so words are collected from each
        for frag in self.frags:
            for s in self.full_subj:
                flt_file = self.file
                isfrag = flt_file["PART"] == frag
                issubj = flt_file["PP_NR"] == s
                flt_file = flt_file[isfrag]
                flt_file = flt_file[issubj]
                self.flat_words.extend([str(w) for w in flt_file["WORD"].tolist()])
        print(len(self.flat_words))
    else:
        for frag in self.frags:
            flt_file = self.file
            isfrag = flt_file["PART"] == frag
            issubj = flt_file["PP_NR"] == self.full_subj
            flt_file = flt_file[isfrag]
            flt_file = flt_file[issubj]
            self.flat_words.extend([str(w) for w in flt_file["WORD"].tolist()])
        print(len(self.flat_words))

def train(self):
    n_batches_one_epoch = len(self.train_dl)
    n_params = sum(p.numel() for p in self.model.parameters())
    mlflow.log_metric("n_params", n_params)
    LOGGER.info(f"Num epochs: {self.n_epochs}")
    LOGGER.info(f"Num parameters: {n_params}")
    LOGGER.info(f"Begin training task {self.task}")

    self.model.to(self.device)
    self.model.train()

    epoch_loss_ls = []
    it = 1
    for epoch in tqdm(range(1, self.n_epochs + 1)):
        for batch in tqdm(self.train_dl):
            it += 1
            loss = self.train_one_step(batch)
            self.writer.add_scalar("train/loss", loss, it)
            epoch_loss_ls.append(loss)

        epoch_loss_avg = sum(epoch_loss_ls) / len(epoch_loss_ls)
        epoch_loss_ls = []
        LOGGER.info(f"Done epoch {epoch} / {self.n_epochs}")
        LOGGER.info(f"Avg loss epoch {epoch}: {epoch_loss_avg:.4f}")

        self.early_stop()
        for key, metric in self.early_stop.tester.metrics.items():
            self.writer.add_scalar(f"val/{key}", metric, it // n_batches_one_epoch)
        if self.early_stop.stop:
            break

def calc_features(self):
    LOGGER.info(f"Start of features calculation for task {self.task}")
    for i, frag in enumerate(self.frags):
        LOGGER.info(f"Processing fragment {i + 1} out of {len(self.frags)}")

        # one dataframe per feature with shape (n_words, n_subjects); the values are
        # averaged over the subject dimension (columns) at the end
        empty = [[None] * len(self.subjs)] * len(self.frags_words[frag])
        nfx_pd = pd.DataFrame(empty, columns=self.subjs)
        ffd_pd = pd.DataFrame(empty, columns=self.subjs)
        fpd_pd = pd.DataFrame(empty, columns=self.subjs)
        tfd_pd = pd.DataFrame(empty, columns=self.subjs)
        mfd_pd = pd.DataFrame(empty, columns=self.subjs)
        fxp_pd = pd.DataFrame(empty, columns=self.subjs)
        nrfx_pd = pd.DataFrame(empty, columns=self.subjs)
        rrdp_pd = pd.DataFrame(empty, columns=self.subjs)

        for j, subj in enumerate(self.subjs):
            with open(self.files[frag][subj][0], errors="ignore") as f:
                lines = [l.split() for l in f.readlines()[1:]]
                lines_1 = [[l[0]] + [int(i) for i in l[1:]] for l in lines]
            with open(self.files[frag][subj][1], errors="ignore") as f:
                lines = [l.split() for l in f.readlines()[1:]]
                lines_2 = [[l[0]] + [int(i) for i in l[1:]] for l in lines]

            for k, w in enumerate(self.frags_words[frag]):
                idx_1 = self.words_idxs[frag][subj][k][1]
                idx_2 = self.words_idxs[frag][subj][k][2]
                nfx = self.get_n_fix(w, lines_2[idx_2:])
                ffd = 0 if idx_1 == -1 else self.get_first_fix_dur(lines_1[idx_1:])
                fpd = 0 if idx_1 == -1 else self.get_first_pass_dur(w, lines_1[idx_1:])
                tfd = self.get_total_fix_dur(w, lines_2[idx_2:])
                mfd = get_mean_fix_dur(nfx, tfd)
                fxp = get_fix_prob(nfx)
                nrfx = get_n_refix(nfx)
                rrdp = get_reread_prob(nrfx)
                nfx_pd[subj][k] = nfx
                ffd_pd[subj][k] = ffd
                fpd_pd[subj][k] = fpd
                tfd_pd[subj][k] = tfd
                mfd_pd[subj][k] = mfd
                fxp_pd[subj][k] = fxp
                nrfx_pd[subj][k] = nrfx
                rrdp_pd[subj][k] = rrdp

        # average each feature over subjects
        nfx = nfx_pd.mean(axis=1).tolist()
        ffd = ffd_pd.mean(axis=1).tolist()
        fpd = fpd_pd.mean(axis=1).tolist()
        tfd = tfd_pd.mean(axis=1).tolist()
        mfd = mfd_pd.mean(axis=1).tolist()
        fxp = fxp_pd.mean(axis=1).tolist()
        nrfx = nrfx_pd.mean(axis=1).tolist()
        rrdp = rrdp_pd.mean(axis=1).tolist()

        features = [nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]
        self.frags_features[frag] = [np.array(i) for i in list(zip(*features))]
        self.flat_words.extend(self.frags_words[frag])
        self.flat_features.extend(self.frags_features[frag])

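# The derived measures above come from module-level helpers that are not shown in this
# excerpt. A plausible sketch of their definitions, inferred from how they are called
# (these exact formulas are an assumption, not the repository's verified code):
#
# def get_mean_fix_dur(n_fix, total_fix_dur):
#     # mean fixation duration; 0 when the word was never fixated
#     return 0 if n_fix == 0 else total_fix_dur / n_fix
#
# def get_fix_prob(n_fix):
#     # per-subject fixation indicator; averaging over subjects yields a probability
#     return 0 if n_fix == 0 else 1
#
# def get_n_refix(n_fix):
#     # number of refixations: fixations beyond the first one
#     return max(n_fix - 1, 0)
#
# def get_reread_prob(n_refix):
#     # per-subject rereading indicator; averaging over subjects yields a probability
#     return 0 if n_refix == 0 else 1
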
def calc_features(self):
    LOGGER.info(f"Start of features calculation for task {self.task}")

    # dicts indexed by sentence number and word number inside the sentence
    nfx_ls = {}
    ffd_ls = {}
    fpd_ls = {}
    tfd_ls = {}
    mfd_ls = {}
    fxp_ls = {}
    nrfx_ls = {}
    rrdp_ls = {}

    for i, subj in enumerate(self.subjs):
        LOGGER.info(f"Processing subject {i + 1} out of {len(self.subjs)}")
        mat = scipy.io.loadmat(self.files[subj])
        sentence_data = mat["sentenceData"][0]

        for j, row in enumerate(sentence_data):
            word_data = row["word"][0]

            # skip sentences whose word data is missing (stored as NaN)
            cont = False
            try:
                cont = np.isnan(word_data[0])
            except TypeError:
                pass
            if cont:
                continue

            if i == 0:
                nfx_ls[j] = [[] for _ in range(len(word_data))]
                ffd_ls[j] = [[] for _ in range(len(word_data))]
                fpd_ls[j] = [[] for _ in range(len(word_data))]
                tfd_ls[j] = [[] for _ in range(len(word_data))]
                mfd_ls[j] = [[] for _ in range(len(word_data))]
                fxp_ls[j] = [[] for _ in range(len(word_data))]
                nrfx_ls[j] = [[] for _ in range(len(word_data))]
                rrdp_ls[j] = [[] for _ in range(len(word_data))]

            for k, item in enumerate(word_data):
                nfx_ls[j][k].append(0 if len(item["nFixations"]) == 0 else item["nFixations"][0][0])
                ffd_ls[j][k].append(0 if len(item["FFD"]) == 0 else item["FFD"][0][0])
                fpd_ls[j][k].append(0 if len(item["GD"]) == 0 else item["GD"][0][0])
                tfd_ls[j][k].append(0 if len(item["TRT"]) == 0 else item["TRT"][0][0])
                mfd_ls[j][k].append(get_mean_fix_dur(nfx_ls[j][k][-1], tfd_ls[j][k][-1]))
                fxp_ls[j][k].append(get_fix_prob(nfx_ls[j][k][-1]))
                nrfx_ls[j][k].append(get_n_refix(nfx_ls[j][k][-1]))
                rrdp_ls[j][k].append(get_reread_prob(nrfx_ls[j][k][-1]))

    # average each feature over subjects, word by word
    for s in nfx_ls:
        for f1, f2, f3, f4, f5, f6, f7, f8 in zip(nfx_ls[s], ffd_ls[s], fpd_ls[s], tfd_ls[s],
                                                  mfd_ls[s], fxp_ls[s], nrfx_ls[s], rrdp_ls[s]):
            nfx = np.average(f1)
            ffd = np.average(f2)
            fpd = np.average(f3)
            tfd = np.average(f4)
            mfd = np.average(f5)
            fxp = np.average(f6)
            nrfx = np.average(f7)
            rrdp = np.average(f8)
            self.flat_features.append(np.array([nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]))

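# Illustrative only, based on how the fields are accessed above (the exact .mat layout is
# an assumption and the file name is hypothetical): scipy.io.loadmat returns nested object
# arrays, so scalar fields are unwrapped with [0][0] and missing measures are empty arrays.
#
# import scipy.io
#
# mat = scipy.io.loadmat("resultsZAB_NR.mat")      # hypothetical subject file
# sentence_data = mat["sentenceData"][0]
# first_word = sentence_data[0]["word"][0][0]
# n_fix = 0 if len(first_word["nFixations"]) == 0 else first_word["nFixations"][0][0]
# print(n_fix)
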
def read_files(self):
    LOGGER.info(f"Reading files for task {self.task}")
    for file in sorted(os.listdir(self.dir)):
        if not file.endswith(".mat"):
            continue
        # the subject id is the last three characters of the first underscore-separated field
        subj = file.split("_")[0][-3:]
        fpath = os.path.join(self.dir, file)
        self.files[subj] = fpath

def calc_attn_masks(self):
    """ Calculates key padding attention masks for the BERT model. """
    LOGGER.info(f"Calculating attention masks for task {self.task}")
    for mode in self.modes:
        self.masks[mode] = [[j != self.tokenizer.pad_token_id for j in i]
                            for i in self.text_inputs[mode]]

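# Illustrative only: with a pad id of 0, a padded id sequence maps to a boolean mask that
# is True on real tokens and False on padding. The ids below are made up.
#
# pad_token_id = 0
# input_ids = [101, 2023, 2003, 102, 0, 0]
# mask = [tok != pad_token_id for tok in input_ids]
# print(mask)   # [True, True, True, True, False, False]
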
def dummy_gaze(self):
    """ Builds dummy gaze numpy arrays filled with NaNs. """
    LOGGER.info(f"Assigning dummy gaze for task {self.task}")
    for mode in self.modes:
        self.gaze_inputs[mode] = np.full((len(self.text_inputs[mode]), 2, 1), np.nan)

def read_file(self):
    LOGGER.info(f"Reading file for task {self.task}")
    if self.task == "geco-nl":
        # Dutch part of the GECO corpus
        self.file = pd.read_excel(os.path.join(self.dir, "L1ReadingData.xlsx"),
                                  na_filter=False)
    else:
        # English part of the GECO corpus
        self.file = pd.read_excel(os.path.join(self.dir, "MonolingualReadingData.xlsx"),
                                  na_filter=False)

def calc_numpy(self):
    LOGGER.info(f"Calculating numpy arrays for task {self.task}")
    for mode in self.modes:
        input_numpy = np.asarray(self.text_inputs[mode], dtype=np.int64)
        mask_numpy = np.asarray(self.masks[mode], dtype=np.float32)
        target_numpy = np.asarray(self.targets[mode], dtype=np.float32)
        self.numpy[mode] = list(zip(input_numpy, target_numpy, mask_numpy))

def calc_features(self):
    LOGGER.info(f"Start of features calculation for task {self.task}")

    # for each feature, build a pd.DataFrame with shape (len(words), len(subjs)) to be
    # filled with data; the data will be averaged over the subject dimension (columns)
    empty = [[None] * len(self.subjs)] * len(self.flat_words)
    nfx_pd = pd.DataFrame(empty, columns=self.subjs)
    ffd_pd = pd.DataFrame(empty, columns=self.subjs)
    fpd_pd = pd.DataFrame(empty, columns=self.subjs)
    tfd_pd = pd.DataFrame(empty, columns=self.subjs)
    mfd_pd = pd.DataFrame(empty, columns=self.subjs)
    fxp_pd = pd.DataFrame(empty, columns=self.subjs)
    nrfx_pd = pd.DataFrame(empty, columns=self.subjs)
    rrdp_pd = pd.DataFrame(empty, columns=self.subjs)

    for i in range(len(self.file)):
        if i % self.print_every == 0:
            LOGGER.info(f"Processing line {i + 1} out of {len(self.file)}")
        row = self.file.iloc[i]
        subj = row["PP_NR"]

        # i iterates over all lines of the dataset; word_index is the index of the
        # corresponding word in the flat word list. This calculation requires some care
        # because not all subjects are complete and the list of missing words is not reported.
        word_index = self.find_idx(i)

        # missing measures are stored as "."
        nfx = 0 if row["WORD_FIXATION_COUNT"] == "." else row["WORD_FIXATION_COUNT"]
        ffd = 0 if row["WORD_FIRST_FIXATION_DURATION"] == "." else row["WORD_FIRST_FIXATION_DURATION"]
        fpd = 0 if row["WORD_GAZE_DURATION"] == "." else row["WORD_GAZE_DURATION"]
        tfd = 0 if row["WORD_TOTAL_READING_TIME"] == "." else row["WORD_TOTAL_READING_TIME"]
        mfd = get_mean_fix_dur(nfx, tfd)
        fxp = get_fix_prob(nfx)
        nrfx = get_n_refix(nfx)
        rrdp = get_reread_prob(nrfx)

        nfx_pd[subj][word_index] = nfx
        ffd_pd[subj][word_index] = ffd
        fpd_pd[subj][word_index] = fpd
        tfd_pd[subj][word_index] = tfd
        mfd_pd[subj][word_index] = mfd
        fxp_pd[subj][word_index] = fxp
        nrfx_pd[subj][word_index] = nrfx
        rrdp_pd[subj][word_index] = rrdp

    # average each feature over subjects
    nfx = nfx_pd.mean(axis=1).tolist()
    ffd = ffd_pd.mean(axis=1).tolist()
    fpd = fpd_pd.mean(axis=1).tolist()
    tfd = tfd_pd.mean(axis=1).tolist()
    mfd = mfd_pd.mean(axis=1).tolist()
    fxp = fxp_pd.mean(axis=1).tolist()
    nrfx = nrfx_pd.mean(axis=1).tolist()
    rrdp = rrdp_pd.mean(axis=1).tolist()

    features = [nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]
    self.flat_features = [np.array(i) for i in list(zip(*features))]
    print(len(self.flat_words))
    print(len(self.flat_features))

def read_files(self):
    LOGGER.info(f"Reading files for task {self.task}")
    for file in sorted(os.listdir(self.dir)):
        if not file.endswith(".dat"):
            continue
        # the file name encodes the subject (first two characters) and the fragment (next two)
        frag = int(file[2:4])
        subj = file[:2]
        fpath = os.path.join(self.dir, file)
        self.add_file(fpath, frag, subj)

def read_subjs(self):
    """ Reads the sorted list of subjects. """
    LOGGER.info(f"Reading subjects for task {self.task}")
    self.subjs = sorted(self.file["DATA_FILE"].unique())
    print(self.subjs)
    print(len(self.subjs))

def read_files(self):
    LOGGER.info(f"Reading files for task {self.task}")
    for file in sorted(os.listdir(self.dir)):
        if not file.endswith(".txt"):
            continue
        # the fragment and subject ids are encoded in the underscore-separated file name
        frag = file.split("_")[2][4:6]
        subj = file.split("_")[1]
        fpath = os.path.join(self.dir, file)
        self.add_file(fpath, frag, subj)

def read_subjs(self):
    """ Reads the list of subjects and sorts it so that the full subject is in the first position. """
    LOGGER.info(f"Reading subjects for task {self.task}")
    self.subjs = sorted(self.file["PP_NR"].unique())
    for i, subj in enumerate(self.subjs):
        if subj == self.full_subj:
            break
    self.subjs.insert(0, self.subjs.pop(i))

def digitize_gaze(self):
    """ Quantizes the gaze features into self.n_bins bins. """
    LOGGER.info(f"Digitizing gaze data for task {self.task}")
    bins = np.linspace(0, self.gaze_max, self.n_bins - 1)
    for mode in self.modes:
        self.gaze_inputs[mode] = [
            np.digitize(i, bins) if not np.isnan(i).any() else i
            for i in self.gaze_inputs[mode]
        ]

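# Illustrative only: np.digitize maps each continuous value to the index of the bin it
# falls into; n_bins - 1 edges between 0 and gaze_max yield bin indices 0 .. n_bins - 1.
#
# import numpy as np
#
# gaze_max, n_bins = 100.0, 5
# bins = np.linspace(0, gaze_max, n_bins - 1)      # edges: [0., 33.3, 66.7, 100.]
# values = np.array([0.0, 10.0, 40.0, 99.0, 150.0])
# print(np.digitize(values, bins))                 # [1 1 2 3 4]
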
def calc_input_ids(self):
    """ Converts tokens to ids for the BERT model. """
    LOGGER.info(f"Calculating input ids for task {self.task}")
    for mode in self.modes:
        ids = [
            self.tokenizer.prepare_for_model(
                self.tokenizer.convert_tokens_to_ids(s))["input_ids"]
            for s in self.text_inputs[mode]
        ]
        self.text_inputs[mode] = pad_sequences(ids,
                                               value=self.tokenizer.pad_token_id,
                                               padding="post")

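# A minimal sketch of the id-conversion step, assuming a Hugging Face BERT tokenizer and
# the Keras pad_sequences helper (the exact import path used by the repository may differ);
# the sentences are made up. prepare_for_model wraps the token ids with [CLS]/[SEP].
#
# from transformers import BertTokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
#
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# sentences = [["hello", "world"], ["a", "longer", "example", "sentence"]]
# ids = [tokenizer.prepare_for_model(tokenizer.convert_tokens_to_ids(s))["input_ids"]
#        for s in sentences]
# padded = pad_sequences(ids, value=tokenizer.pad_token_id, padding="post")
# print(padded.shape)   # (2, 6): both rows padded to the longest sequence
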
def calc_features(self):
    LOGGER.info(f"Start of features calculation for task {self.task}")
    for i, frag in enumerate(self.frags):
        LOGGER.info(f"Processing fragment {i + 1} out of {len(self.frags)}")

        empty = [[None] * len(self.subjs)] * len(self.frags_words[frag])
        nfx_pd = pd.DataFrame(empty, columns=self.subjs)
        ffd_pd = pd.DataFrame(empty, columns=self.subjs)
        fpd_pd = pd.DataFrame(empty, columns=self.subjs)
        tfd_pd = pd.DataFrame(empty, columns=self.subjs)
        mfd_pd = pd.DataFrame(empty, columns=self.subjs)
        fxp_pd = pd.DataFrame(empty, columns=self.subjs)
        nrfx_pd = pd.DataFrame(empty, columns=self.subjs)
        rrdp_pd = pd.DataFrame(empty, columns=self.subjs)

        for j, subj in enumerate(self.subjs):
            flt_file = pd.read_csv(self.files[frag][self.full_subj][0], sep=",", header=0)
            for k, w in enumerate(self.frags_words[frag]):
                ffd = flt_file["FFD"].tolist()[k]
                fpd = flt_file["FPRT"].tolist()[k]
                tfd = flt_file["TFT"].tolist()[k]
                nfx = flt_file["nFix"].tolist()[k]
                mfd = get_mean_fix_dur(nfx, tfd)
                fxp = get_fix_prob(nfx)
                nrfx = get_n_refix(nfx)
                rrdp = get_reread_prob(nrfx)
                nfx_pd[subj][k] = nfx
                ffd_pd[subj][k] = ffd
                fpd_pd[subj][k] = fpd
                tfd_pd[subj][k] = tfd
                mfd_pd[subj][k] = mfd
                fxp_pd[subj][k] = fxp
                nrfx_pd[subj][k] = nrfx
                rrdp_pd[subj][k] = rrdp

        nfx = nfx_pd.mean(axis=1).tolist()
        ffd = ffd_pd.mean(axis=1).tolist()
        fpd = fpd_pd.mean(axis=1).tolist()
        tfd = tfd_pd.mean(axis=1).tolist()
        mfd = mfd_pd.mean(axis=1).tolist()
        fxp = fxp_pd.mean(axis=1).tolist()
        nrfx = nrfx_pd.mean(axis=1).tolist()
        rrdp = rrdp_pd.mean(axis=1).tolist()

        features = [nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]
        self.frags_features[frag] = [np.array(i) for i in list(zip(*features))]
        self.flat_words.extend(self.frags_words[frag])
        self.flat_features.extend(self.frags_features[frag])

def read_words(self):
    """ Word list is extracted from the full subject. """
    LOGGER.info(f"Reading words for task {self.task}")
    for frag in self.frags:
        self.frags_words[frag] = []
        flt_file = pd.read_csv(self.files[frag][self.full_subj][0], sep=",", header=0)
        for index, row in flt_file.iterrows():
            # mark the last word of each sentence with an <eos> tag
            if index != len(flt_file) - 1:
                if flt_file.at[index + 1, "SentenceBegin"] == 1:
                    self.frags_words[frag].append(str(row["WORD"]) + "<eos>")
                else:
                    self.frags_words[frag].append(str(row["WORD"]))
            else:
                self.frags_words[frag].append(str(row["WORD"]) + "<eos>")

def pad_gaze(self):
    """ Adds the start and end tokens in the positions of the [CLS] and [SEP] tokens, and
    pads the gaze sequences with the pad token. """
    LOGGER.info(f"Padding gaze data for task {self.task}")
    for mode in self.modes:
        gaze_pad_idxs = np.full((1, self.d_gaze), self.gaze_pad_idx)
        gaze_start_idxs = np.full((1, self.d_gaze), self.gaze_start_idx)
        gaze_end_idxs = np.full((1, self.d_gaze), self.gaze_end_idx)
        gaze_inputs = [
            np.concatenate((gaze_start_idxs, i, gaze_end_idxs))
            for i in self.gaze_inputs[mode]
        ]
        self.gaze_inputs[mode] = pad_sequences(gaze_inputs,
                                               value=gaze_pad_idxs,
                                               padding="post")

def predict_gaze(self):
    """ Predicts or uniformly samples gaze features. """
    LOGGER.info(f"Predicting gaze data for task {self.task}")
    for mode in self.modes:
        # start from NaN placeholders; a random subset of sentences is then processed
        # in batches by ensemble_predict
        self.gaze_inputs[mode] = [
            np.full((len(i), self.d_gaze), np.nan) for i in self.text_inputs[mode]
        ]
        len_predict = round(len(self.text_inputs[mode]) * self.predict_percs[mode])
        predict_idxs = random.sample(range(len(self.text_inputs[mode])), k=len_predict)
        for batch_index in range(0, len(predict_idxs), self.gaze_bs):
            batch_idxs = predict_idxs[batch_index:batch_index + self.gaze_bs]
            self.ensemble_predict(batch_idxs, mode)

def tokenize_with_gaze(self):
    """ Tokenizes the sentences in the dataset with the pre-trained tokenizer, preserving
    the order of the targets and of the predicted gaze features. """
    LOGGER.info(f"Tokenizing sentences for task {self.task}")
    for mode in self.modes:
        tokenized = []
        gazes = []
        maps = []
        for s, g in zip(self.text_inputs[mode], self.gaze_inputs[mode]):
            tokens, gaze, word_map = self.tokenize_preserve_and_map(s, g)
            tokenized.append(tokens)
            gazes.append(gaze)
            maps.append(word_map)
        self.text_inputs[mode] = tokenized
        self.gaze_inputs[mode] = gazes
        self.maps[mode] = maps

def tokenize_from_words(self):
    """ Tokenizes the sentences in the dataset with the pre-trained tokenizer, storing the
    start index of each word. """
    LOGGER.info(f"Tokenizing sentences for task {self.task}")
    for mode in self.modes:
        print(mode)
        print(len(self.text_inputs[mode]))
        tokenized = []
        maps = []
        for s in self.text_inputs[mode]:
            tokens, word_map = self.tokenize_and_map(s)
            tokenized.append(tokens)
            maps.append(word_map)
        print("max tokenized seq len: ", max(len(l) for l in tokenized))
        self.text_inputs[mode] = tokenized
        self.maps[mode] = maps

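# The helper tokenize_and_map is not shown in this excerpt. A plausible sketch of its
# behaviour, inferred from how its outputs are used (hypothetical implementation,
# assuming a Hugging Face word-piece tokenizer):
#
# def tokenize_and_map(self, sentence):
#     # returns the flat sub-token list plus the index of the first sub-token of each word
#     tokens, word_map = [], []
#     for word in sentence:
#         word_map.append(len(tokens))
#         tokens.extend(self.tokenizer.tokenize(word))
#     return tokens, word_map
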
def __call__(self):
    if self.run_patience == self.patience:
        LOGGER.info("Patience exceeded, stopping")
        self.stop = True
        return

    self.tester.evaluate()
    score = self.tester.metrics[self.monitor]
    cf = Config.load_json(os.path.join("results", "gaze", "config.json"))
    self.model.train()

    if self.best_score is None or (self.monitor_mode == "min" and score < self.best_score) or \
            (self.monitor_mode == "max" and score > self.best_score):
        for key, value in self.tester.metrics.items():
            mlflow.log_metric(f"val_{key}", value)
        self.best_score = score
        LOGGER.info("Metric has improved, saving the model")
        torch.save(
            self.model.state_dict(),
            os.path.join(
                self.dir,
                "model-" + cf.model_pretrained + "-" + str(cf.full_finetuning) + "-" +
                str(RANDOM_STATE) + ".pth"))
        self.run_patience = 0
    else:
        self.run_patience += 1
        LOGGER.info(
            f"No improvement in the last epoch, patience {self.run_patience} out of {self.patience}")

def read_pipeline(self):
    LOGGER.info(f"Begin loading data for task {self.task}")
    if self.task == "dundee":
        DundeeDataNormalizer.read_files(self)
        DundeeDataNormalizer.read_frags(self)
        DundeeDataNormalizer.read_subjs(self)
        DundeeDataNormalizer.read_words(self)
        DundeeDataNormalizer.calc_features(self)
    elif self.task in ("geco", "geco-nl"):
        GECODataNormalizer.read_file(self)
        GECODataNormalizer.read_frags(self)
        GECODataNormalizer.read_subjs(self)
        GECODataNormalizer.read_words(self)
        GECODataNormalizer.calc_features(self)
    elif self.task in ("zuco11", "zuco12"):
        # if using the old MATLAB files, change back to ZuCo1DataNormalizer
        ZuCo2DataNormalizer.read_files(self)
        ZuCo2DataNormalizer.read_subjs(self)
        ZuCo2DataNormalizer.read_words(self)
        ZuCo2DataNormalizer.calc_features(self)
    elif self.task == "zuco21":
        ZuCo2DataNormalizer.read_files(self)
        ZuCo2DataNormalizer.read_subjs(self)
        ZuCo2DataNormalizer.read_words(self)
        ZuCo2DataNormalizer.calc_features(self)
    elif self.task == "potsdam":
        PotsdamDataNormalizer.read_files(self)
        PotsdamDataNormalizer.read_frags(self)
        PotsdamDataNormalizer.read_subjs(self)
        PotsdamDataNormalizer.read_words(self)
        PotsdamDataNormalizer.calc_features(self)
    elif self.task == "rsc":
        RussSentCorpDataNormalizer.read_file(self)
        RussSentCorpDataNormalizer.read_frags(self)
        RussSentCorpDataNormalizer.read_subjs(self)
        RussSentCorpDataNormalizer.read_words(self)
        RussSentCorpDataNormalizer.calc_features(self)
    self.split()

def save_datasets(self):
    LOGGER.info(f"Saving flat dataset for task {self.task}")
    flat_words_pd = pd.DataFrame(self.flat_words, columns=["word"])
    flat_features_pd = pd.DataFrame(self.flat_features,
                                    columns=["n_fix", "first_fix_dur", "first_pass_dur",
                                             "total_fix_dur", "mean_fix_dur", "fix_prob",
                                             "n_refix", "reread_prob"])
    flat_dataset_pd = pd.concat((flat_words_pd, flat_features_pd), axis=1)
    flat_dataset_pd.to_csv(os.path.join(self.dir, "dataset.csv"))

    LOGGER.info(f"Saving split datasets for task {self.task}")
    for mode in self.modes:
        dataset = {
            "sentence_num": [],
            "word": [],
            "n_fix": [],
            "first_fix_dur": [],
            "first_pass_dur": [],
            "total_fix_dur": [],
            "mean_fix_dur": [],
            "fix_prob": [],
            "n_refix": [],
            "reread_prob": []
        }
        for k in range(len(self.words[mode])):
            for j in range(len(self.words[mode][k])):
                dataset["sentence_num"].append(k)
                dataset["word"].append(self.words[mode][k][j])
                dataset["n_fix"].append(self.features[mode][k][j][0])
                dataset["first_fix_dur"].append(self.features[mode][k][j][1])
                dataset["first_pass_dur"].append(self.features[mode][k][j][2])
                dataset["total_fix_dur"].append(self.features[mode][k][j][3])
                dataset["mean_fix_dur"].append(self.features[mode][k][j][4])
                dataset["fix_prob"].append(self.features[mode][k][j][5])
                dataset["n_refix"].append(self.features[mode][k][j][6])
                dataset["reread_prob"].append(self.features[mode][k][j][7])
        dataset_pd = pd.DataFrame(dataset)
        dataset_pd.to_csv(os.path.join(self.dir, mode + "_dataset.csv"))

def evaluate(self):
    LOGGER.info(f"Begin evaluation task {self.task}")
    self.predict()
    LOGGER.info("Calculating metrics")
    self.calc_metrics()
    for key in self.metrics:
        LOGGER.info(f"val_{key}: {self.metrics[key]:.4f} {self.units[key]}")

def split(self):
    LOGGER.info(f"Splitting the data into sentences for task {self.task}")
    flat_sentences = []
    flat_features = []
    sentence = []
    features = []
    for i in range(len(self.flat_words)):
        sentence.append(self.flat_words[i])
        features.append(self.flat_features[i])
        if sentence[-1].endswith(tuple(self.stopchars)):
            sentence[-1] = sentence[-1].replace("<eos>", "")
            flat_sentences.append(sentence)
            flat_features.append(features)
            # sanity check: flag suspiciously long sentences
            if len(sentence) > 100:
                print(sentence)
                print(len(sentence))
            sentence = []
            features = []

    LOGGER.info(f"Splitting data for task {self.task}")
    pairs = list(zip(flat_sentences, flat_features))
    shuffled_pairs = random.sample(pairs, len(pairs))
    len_train = round(self.split_percs["train"] * len(pairs))
    len_val = round(self.split_percs["val"] * len(pairs))
    corpora = {
        "train": shuffled_pairs[:len_train],
        "val": shuffled_pairs[len_train:len_train + len_val],
        "test": shuffled_pairs[len_train + len_val:]
    }
    for mode in self.modes:
        self.words[mode], self.features[mode] = map(list, zip(*corpora[mode]))

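# Illustrative only: random.sample(pairs, len(pairs)) is simply a shuffle without
# replacement, and the split sizes follow the configured percentages. Values are made up.
#
# import random
#
# pairs = list(zip(["s1", "s2", "s3", "s4", "s5"], [1, 2, 3, 4, 5]))
# split_percs = {"train": 0.6, "val": 0.2, "test": 0.2}
# shuffled = random.sample(pairs, len(pairs))
# len_train = round(split_percs["train"] * len(pairs))   # 3
# len_val = round(split_percs["val"] * len(pairs))       # 1
# train, val = shuffled[:len_train], shuffled[len_train:len_train + len_val]
# test = shuffled[len_train + len_val:]
# print(len(train), len(val), len(test))                 # 3 1 1
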
def pad_targets(self):
    """ Adds the pad tokens in the positions of the [CLS] and [SEP] tokens, adds the pad
    tokens in the positions of the subtokens, and pads the targets with the pad token. """
    LOGGER.info(f"Padding targets for task {self.task}")
    for mode in self.modes:
        # scatter each word-level target onto the first sub-token of its word; every
        # other position keeps the pad value
        targets = [
            np.full((len(i), self.d_out), self.target_pad)
            for i in self.text_inputs[mode]
        ]
        for k, (i, j) in enumerate(zip(self.targets[mode], self.maps[mode])):
            targets[k][j, :] = i

        # account for [CLS] and [SEP], then pad all sequences to the same length
        target_pad_vector = np.full((1, self.d_out), self.target_pad)
        targets = [
            np.concatenate((target_pad_vector, i, target_pad_vector))
            for i in targets
        ]
        self.targets[mode] = pad_sequences(targets,
                                           value=self.target_pad,
                                           padding="post")

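# Illustrative only: numpy fancy indexing places each word-level target row at the first
# sub-token position recorded in the word map; the remaining rows keep the pad value.
#
# import numpy as np
#
# target_pad = -1.0
# word_targets = np.array([[0.3], [0.7]])     # two words, one feature each
# word_map = [0, 2]                           # first sub-token index of each word
# targets = np.full((4, 1), target_pad)       # the sentence has four sub-tokens
# targets[word_map, :] = word_targets
# print(targets.ravel())                      # [ 0.3 -1.   0.7 -1. ]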