Example no. 1
    def standardize(self):
        """
        Standardizes the features between 0 and self.feature_max.
        """
        LOGGER.info(f"Standardizing target data for task {self.task}")
        features = self.targets["train"]
        scaler = MinMaxScaler(feature_range=(0, self.feature_max))
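        # fit the scaler on the flattened word-level training targets, then apply it to every split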
        flat_features = [j for i in features for j in i]
        scaler.fit(flat_features)

        self.targets["train"] = [list(scaler.transform(i)) for i in features]
        self.targets["val"] = [
            list(scaler.transform(i)) for i in self.targets["val"]
        ]
        self.targets["test"] = [
            list(scaler.transform(i)) for i in self.targets["test"]
        ]

        filen = "scaled-test-" + self.task + ".csv"

        LOGGER.info(f"Saving scaled test targets to {filen}")

        flat_preds = [j for i in self.targets["test"] for j in i]

        preds_pd = pd.DataFrame(flat_preds,
                                columns=[
                                    "n_fix", "first_fix_dur", "first_pass_dur",
                                    "total_fix_dur", "mean_fix_dur",
                                    "fix_prob", "n_refix", "reread_prob"
                                ])
        preds_pd.to_csv(filen)

        print("saved.")
Example no. 2
    def load_data(self):
        LOGGER.info(f"Loading data for task {self.task}")
        for mode in self.modes:
            LOGGER.info(f"Loading the {mode} split")
            dataset_pd = pd.read_csv(os.path.join(self.dir,
                                                  mode + "_dataset.csv"),
                                     na_filter=False,
                                     index_col=0)
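            # each CSV row holds one word with its gaze features; group by sentence_num to rebuild per-sentence inputs and targets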
            word_func = lambda s: s["word"].tolist()
            features_func = lambda s: [
                np.array(s.drop(columns=["sentence_num", "word"]).iloc[i])
                for i in range(len(s))
            ]

            self.text_inputs[mode] = dataset_pd.groupby("sentence_num").apply(
                word_func).tolist()
            LOGGER.info(f"{mode}: {len(self.text_inputs[mode])} sentences")
            self.targets[mode] = dataset_pd.groupby("sentence_num").apply(
                features_func).tolist()

        # check for sentences duplicated between the train and test sets
        dups = []
        for i, s in enumerate(self.text_inputs["train"]):
            if s in self.text_inputs["test"]:
                print("WARNING! Duplicate in test set....")
                dups.append(i)

        # remove the duplicates from the training data
        LOGGER.info(f"Train sentences before deduplication: {len(self.text_inputs['train'])}")
        LOGGER.info(f"Duplicates found: {len(dups)}")
        for d in sorted(dups, reverse=True):
            del self.text_inputs["train"][d]
            del self.targets["train"][d]
        LOGGER.info(f"Train sentences after deduplication: {len(self.text_inputs['train'])}")
    def read_words(self):
        """
        Word list is extracted from the full subject.
        """
        LOGGER.info(f"Reading words for task {self.task}")
        if self.task == "geco-nl":
            for frag in self.frags:
                for s in self.full_subj:
                    flt_file = self.file
                    flt_file = flt_file[(flt_file["PART"] == frag)
                                        & (flt_file["PP_NR"] == s)]
                    self.flat_words.extend([str(w) for w in flt_file["WORD"].tolist()])
            LOGGER.info(f"Read {len(self.flat_words)} words")
        else:
            for frag in self.frags:
                flt_file = self.file
                flt_file = flt_file[(flt_file["PART"] == frag)
                                    & (flt_file["PP_NR"] == self.full_subj)]
                self.flat_words.extend([str(w) for w in flt_file["WORD"].tolist()])
            LOGGER.info(f"Read {len(self.flat_words)} words")
Example no. 4
    def train(self):
        n_batches_one_epoch = len(self.train_dl)
        n_params = sum(p.numel() for p in self.model.parameters())
        mlflow.log_metric("n_params", n_params)
        LOGGER.info(f"Num epochs: {self.n_epochs}")
        LOGGER.info(f"Num parameters: {n_params}")
        LOGGER.info(f"Begin training task {self.task}")

        self.model.to(self.device)
        self.model.train()

        epoch_loss_ls = []
        it = 1

        for epoch in tqdm(range(1, self.n_epochs + 1)):
            for batch in tqdm(self.train_dl):
                it += 1

                loss = self.train_one_step(batch)
                self.writer.add_scalar("train/loss", loss, it)
                epoch_loss_ls.append(loss)

            epoch_loss_avg = sum(epoch_loss_ls) / len(epoch_loss_ls)
            epoch_loss_ls = []
            LOGGER.info(f"Done epoch {epoch} / {self.n_epochs}")
            LOGGER.info(f"Avg loss epoch {epoch}: {epoch_loss_avg:.4f}")

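            # run validation and update the early-stopping state once per epoch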
            self.early_stop()

            for key, metric in self.early_stop.tester.metrics.items():
                self.writer.add_scalar(f"val/{key}", metric, it // n_batches_one_epoch)

            if self.early_stop.stop:
                break
    def calc_features(self):
        LOGGER.info(f"Start of features calculation for task {self.task}")
        for i, frag in enumerate(self.frags):
            LOGGER.info(f"Processing fragment {i + 1} out of {len(self.frags)}")

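            # one DataFrame per gaze feature, shaped (n_words, n_subjects); values are averaged over the subject columns at the end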
            empty = [[None] * len(self.subjs)] * len(self.frags_words[frag])
            nfx_pd = pd.DataFrame(empty, columns=self.subjs)
            ffd_pd = pd.DataFrame(empty, columns=self.subjs)
            fpd_pd = pd.DataFrame(empty, columns=self.subjs)
            tfd_pd = pd.DataFrame(empty, columns=self.subjs)
            mfd_pd = pd.DataFrame(empty, columns=self.subjs)
            fxp_pd = pd.DataFrame(empty, columns=self.subjs)
            nrfx_pd = pd.DataFrame(empty, columns=self.subjs)
            rrdp_pd = pd.DataFrame(empty, columns=self.subjs)


            for j, subj in enumerate(self.subjs):
                with open(self.files[frag][subj][0], errors="ignore") as f:
                    lines = [l.split() for l in f.readlines()[1:]]
                    lines_1 = [[l[0]] + [int(i) for i in l[1:]] for l in lines]
                with open(self.files[frag][subj][1], errors="ignore") as f:
                    lines = [l.split() for l in f.readlines()[1:]]
                    lines_2 = [[l[0]] + [int(i) for i in l[1:]] for l in lines]
                for k, w in enumerate(self.frags_words[frag]):
                    idx_1 = self.words_idxs[frag][subj][k][1]
                    idx_2 = self.words_idxs[frag][subj][k][2]

                    nfx = self.get_n_fix(w, lines_2[idx_2:])
                    ffd = 0 if idx_1 == -1 else self.get_first_fix_dur(lines_1[idx_1:])
                    fpd = 0 if idx_1 == -1 else self.get_first_pass_dur(w, lines_1[idx_1:])
                    tfd = self.get_total_fix_dur(w, lines_2[idx_2:])

                    mfd = get_mean_fix_dur(nfx, tfd)
                    fxp = get_fix_prob(nfx)
                    nrfx = get_n_refix(nfx)
                    rrdp = get_reread_prob(nrfx)

                    nfx_pd.at[k, subj] = nfx
                    ffd_pd.at[k, subj] = ffd
                    fpd_pd.at[k, subj] = fpd
                    tfd_pd.at[k, subj] = tfd
                    mfd_pd.at[k, subj] = mfd
                    fxp_pd.at[k, subj] = fxp
                    nrfx_pd.at[k, subj] = nrfx
                    rrdp_pd.at[k, subj] = rrdp

            nfx = nfx_pd.mean(axis=1).tolist()
            ffd = ffd_pd.mean(axis=1).tolist()
            fpd = fpd_pd.mean(axis=1).tolist()
            tfd = tfd_pd.mean(axis=1).tolist()
            mfd = mfd_pd.mean(axis=1).tolist()
            fxp = fxp_pd.mean(axis=1).tolist()
            nrfx = nrfx_pd.mean(axis=1).tolist()
            rrdp = rrdp_pd.mean(axis=1).tolist()

            features = [nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]
            self.frags_features[frag] = [np.array(i) for i in list(zip(*features))]
            self.flat_words.extend(self.frags_words[frag])
            self.flat_features.extend(self.frags_features[frag])
    def calc_features(self):
        LOGGER.info(f"Start of features calculation for task {self.task}")
        #  dicts indexed by sentence number and word number inside the sentence
        nfx_ls = {}
        ffd_ls = {}
        fpd_ls = {}
        tfd_ls = {}
        mfd_ls = {}
        fxp_ls = {}
        nrfx_ls = {}
        rrdp_ls = {}

        for i, subj in enumerate(self.subjs):
            LOGGER.info(f"Processing subject {i + 1} out of {len(self.subjs)}")
            mat = scipy.io.loadmat(self.files[subj])
            sentence_data = mat["sentenceData"][0]

            for j, row in enumerate(sentence_data):
                word_data = row["word"][0]
                # skip sentences whose word data is NaN (no word-level recordings)
                skip = False
                try:
                    skip = np.isnan(word_data[0])
                except (TypeError, ValueError):
                    pass
                if skip:
                    continue

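                # on the first subject, allocate one per-word accumulator list for each feature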
                if i == 0:
                    nfx_ls[j] = [[] for _ in range(len(word_data))]
                    ffd_ls[j] = [[] for _ in range(len(word_data))]
                    fpd_ls[j] = [[] for _ in range(len(word_data))]
                    tfd_ls[j] = [[] for _ in range(len(word_data))]
                    mfd_ls[j] = [[] for _ in range(len(word_data))]
                    fxp_ls[j] = [[] for _ in range(len(word_data))]
                    nrfx_ls[j] = [[] for _ in range(len(word_data))]
                    rrdp_ls[j] = [[] for _ in range(len(word_data))]

                for k, item in enumerate(word_data):
                    nfx_ls[j][k].append(0 if len(item["nFixations"]) == 0 else item["nFixations"][0][0])
                    ffd_ls[j][k].append(0 if len(item["FFD"]) == 0 else item["FFD"][0][0])
                    fpd_ls[j][k].append(0 if len(item["GD"]) == 0 else item["GD"][0][0])
                    tfd_ls[j][k].append(0 if len(item["TRT"]) == 0 else item["TRT"][0][0])
                    mfd_ls[j][k].append(get_mean_fix_dur(nfx_ls[j][k][-1], tfd_ls[j][k][-1]))
                    fxp_ls[j][k].append(get_fix_prob(nfx_ls[j][k][-1]))
                    nrfx_ls[j][k].append(get_n_refix(nfx_ls[j][k][-1]))
                    rrdp_ls[j][k].append(get_reread_prob(nrfx_ls[j][k][-1]))

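        # average each feature over the subjects for every word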
        for s in nfx_ls:
            for f1, f2, f3, f4, f5, f6, f7, f8 in zip(nfx_ls[s], ffd_ls[s], fpd_ls[s], tfd_ls[s],
                                                      mfd_ls[s], fxp_ls[s], nrfx_ls[s], rrdp_ls[s]):
                nfx = np.average(f1)
                ffd = np.average(f2)
                fpd = np.average(f3)
                tfd = np.average(f4)
                mfd = np.average(f5)
                fxp = np.average(f6)
                nrfx = np.average(f7)
                rrdp = np.average(f8)
                self.flat_features.append(np.array([nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]))
 def read_files(self):
     LOGGER.info(f"Reading files for task {self.task}")
     for file in sorted(os.listdir(self.dir)):
         if not file.endswith(".mat"):
             continue
         subj = file.split("_")[0][-3:]
         fpath = os.path.join(self.dir, file)
         self.files[subj] = fpath
Example no. 8
 def calc_attn_masks(self):
     """
      Calculates key padding attention masks for the BERT model.
     """
     LOGGER.info(f"Calculating attention masks for task {self.task}")
     for mode in self.modes:
         self.masks[mode] = [[j != self.tokenizer.pad_token_id for j in i]
                             for i in self.text_inputs[mode]]
Example no. 9
 def dummy_gaze(self):
     """
     Builds dummy gaze numpy arrays filled with nans.
     """
     LOGGER.info(f"Assigning dummy gaze for task {self.task}")
     for mode in self.modes:
         self.gaze_inputs[mode] = np.full(
             (len(self.text_inputs[mode]), 2, 1), np.nan)
Example no. 10
 def read_file(self):
     LOGGER.info(f"Reading file for task {self.task}")
     if self.task == "geco-nl":
         # Dutch part of the GECO corpus
         self.file = pd.read_excel(os.path.join(self.dir, "L1ReadingData.xlsx"), na_filter=False)
     else:
         # English part of the GECO corpus
         self.file = pd.read_excel(os.path.join(self.dir, "MonolingualReadingData.xlsx"), na_filter=False)
Example no. 11
    def calc_numpy(self):
        LOGGER.info(f"Calculating numpy arrays for task {self.task}")
        for mode in self.modes:
            input_numpy = np.asarray(self.text_inputs[mode], dtype=np.int64)
            mask_numpy = np.asarray(self.masks[mode], dtype=np.float32)
            target_numpy = np.asarray(self.targets[mode], dtype=np.float32)

            self.numpy[mode] = list(zip(input_numpy, target_numpy, mask_numpy))
Example no. 12
    def calc_features(self):
        LOGGER.info(f"Start of features calculation for task {self.task}")
        # for each feature, build a pd.DataFrame of shape (len(words), len(subjs)); the data
        # will be averaged over the subject dimension (columns)
        empty = [[None] * len(self.subjs)] * len(self.flat_words)
        nfx_pd = pd.DataFrame(empty, columns=self.subjs)
        ffd_pd = pd.DataFrame(empty, columns=self.subjs)
        fpd_pd = pd.DataFrame(empty, columns=self.subjs)
        tfd_pd = pd.DataFrame(empty, columns=self.subjs)
        mfd_pd = pd.DataFrame(empty, columns=self.subjs)
        fxp_pd = pd.DataFrame(empty, columns=self.subjs)
        nrfx_pd = pd.DataFrame(empty, columns=self.subjs)
        rrdp_pd = pd.DataFrame(empty, columns=self.subjs)

        for i in range(len(self.file)):
            if i % self.print_every == 0:
                LOGGER.info(f"Processing line {i + 1} out of {len(self.file)}")

            row = self.file.iloc[i]
            subj = row["PP_NR"]

            # i iterates over all the lines of the dataset; word_index is the index of the
            # corresponding word in the flat word list. This mapping needs some care because
            # not all subjects are complete and the list of missing words is not reported.
            word_index = self.find_idx(i)

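            # a '.' entry means the measure is missing for this word/subject; treat it as zero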
            nfx = 0 if row["WORD_FIXATION_COUNT"] == "." else row["WORD_FIXATION_COUNT"]
            ffd = 0 if row["WORD_FIRST_FIXATION_DURATION"] == "." else row["WORD_FIRST_FIXATION_DURATION"]
            fpd = 0 if row["WORD_GAZE_DURATION"] == "." else row["WORD_GAZE_DURATION"]
            tfd = 0 if row["WORD_TOTAL_READING_TIME"] == "." else row["WORD_TOTAL_READING_TIME"]
            mfd = get_mean_fix_dur(nfx, tfd)
            fxp = get_fix_prob(nfx)
            nrfx = get_n_refix(nfx)
            rrdp = get_reread_prob(nrfx)

            nfx_pd.at[word_index, subj] = nfx
            ffd_pd.at[word_index, subj] = ffd
            fpd_pd.at[word_index, subj] = fpd
            tfd_pd.at[word_index, subj] = tfd
            mfd_pd.at[word_index, subj] = mfd
            fxp_pd.at[word_index, subj] = fxp
            nrfx_pd.at[word_index, subj] = nrfx
            rrdp_pd.at[word_index, subj] = rrdp

        nfx = nfx_pd.mean(axis=1).tolist()
        ffd = ffd_pd.mean(axis=1).tolist()
        fpd = fpd_pd.mean(axis=1).tolist()
        tfd = tfd_pd.mean(axis=1).tolist()
        mfd = mfd_pd.mean(axis=1).tolist()
        fxp = fxp_pd.mean(axis=1).tolist()
        nrfx = nrfx_pd.mean(axis=1).tolist()
        rrdp = rrdp_pd.mean(axis=1).tolist()

        features = [nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]
        self.flat_features = [np.array(i) for i in list(zip(*features))]

        LOGGER.info(f"Extracted {len(self.flat_words)} words and {len(self.flat_features)} feature vectors")
Example no. 13
 def read_files(self):
     LOGGER.info(f"Reading files for task {self.task}")
     for file in sorted(os.listdir(self.dir)):
         if not file.endswith(".dat"):
             continue
         frag = int(file[2:4])
         subj = file[:2]
         fpath = os.path.join(self.dir, file)
         self.add_file(fpath, frag, subj)
Example no. 14
    def read_subjs(self):
        """
        Reads list of subjs and sorts it such that the full subject is in the first position.
        """
        LOGGER.info(f"Reading subjects for task {self.task}")
        self.subjs = sorted(self.file["DATA_FILE"].unique())

        LOGGER.info(f"Found {len(self.subjs)} subjects: {self.subjs}")
Example no. 15
 def read_files(self):
     LOGGER.info(f"Reading files for task {self.task}")
     for file in sorted(os.listdir(self.dir)):
         if not file.endswith(".txt"):
             continue
         frag = file.split("_")[2][4:6]
         subj = file.split("_")[1]
         fpath = os.path.join(self.dir, file)
         self.add_file(fpath, frag, subj)
Example no. 16
 def read_subjs(self):
     """
     Reads list of subjs and sorts it such that the full subject is in the first position.
     """
     LOGGER.info(f"Reading subjects for task {self.task}")
     self.subjs = sorted(self.file["PP_NR"].unique())
      self.subjs.insert(0, self.subjs.pop(self.subjs.index(self.full_subj)))
Example no. 17
    def digitize_gaze(self):
        """
        Quantizes gaze features in self.n_bins bins.
        """
        LOGGER.info(f"Digitizing gaze data for task {self.task}")
        bins = np.linspace(0, self.gaze_max, self.n_bins - 1)

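        # all-NaN sequences are dummy gaze placeholders (see dummy_gaze) and are left unquantized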
        for mode in self.modes:
            self.gaze_inputs[mode] = [
                np.digitize(i, bins) if not np.isnan(i).any() else i
                for i in self.gaze_inputs[mode]
            ]
Example no. 18
 def calc_input_ids(self):
     """
     Converts tokens to ids for the BERT model.
     """
     LOGGER.info(f"Calculating input ids for task {self.task}")
     for mode in self.modes:
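          # convert_tokens_to_ids maps tokens to vocabulary ids; prepare_for_model adds the special [CLS] and [SEP] tokens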
         ids = [
             self.tokenizer.prepare_for_model(
                 self.tokenizer.convert_tokens_to_ids(s))["input_ids"]
             for s in self.text_inputs[mode]
         ]
         self.text_inputs[mode] = pad_sequences(
             ids, value=self.tokenizer.pad_token_id, padding="post")
Example no. 19
    def calc_features(self):
        LOGGER.info(f"Start of features calculation for task {self.task}")

        for i, frag in enumerate(self.frags):
            LOGGER.info(f"Processing fragment {i + 1} out of {len(self.frags)}")
            empty = [[None] * len(self.subjs)] * len(self.frags_words[frag])
            nfx_pd = pd.DataFrame(empty, columns=self.subjs)
            ffd_pd = pd.DataFrame(empty, columns=self.subjs)
            fpd_pd = pd.DataFrame(empty, columns=self.subjs)
            tfd_pd = pd.DataFrame(empty, columns=self.subjs)
            mfd_pd = pd.DataFrame(empty, columns=self.subjs)
            fxp_pd = pd.DataFrame(empty, columns=self.subjs)
            nrfx_pd = pd.DataFrame(empty, columns=self.subjs)
            rrdp_pd = pd.DataFrame(empty, columns=self.subjs)

            for j, subj in enumerate(self.subjs):
                flt_file = pd.read_csv(self.files[frag][self.full_subj][0], sep=",", header=0)
                for k, w in enumerate(self.frags_words[frag]):
                    ffd = flt_file["FFD"].tolist()[k]
                    fpd = flt_file["FPRT"].tolist()[k]
                    tfd = flt_file["TFT"].tolist()[k]
                    nfx = flt_file["nFix"].tolist()[k]
                    mfd = get_mean_fix_dur(nfx, tfd)
                    fxp = get_fix_prob(nfx)
                    nrfx = get_n_refix(nfx)
                    rrdp = get_reread_prob(nrfx)

                    nfx_pd.at[k, subj] = nfx
                    ffd_pd.at[k, subj] = ffd
                    fpd_pd.at[k, subj] = fpd
                    tfd_pd.at[k, subj] = tfd
                    mfd_pd.at[k, subj] = mfd
                    fxp_pd.at[k, subj] = fxp
                    nrfx_pd.at[k, subj] = nrfx
                    rrdp_pd.at[k, subj] = rrdp

            nfx = nfx_pd.mean(axis=1).tolist()
            ffd = ffd_pd.mean(axis=1).tolist()
            fpd = fpd_pd.mean(axis=1).tolist()
            tfd = tfd_pd.mean(axis=1).tolist()
            mfd = mfd_pd.mean(axis=1).tolist()
            fxp = fxp_pd.mean(axis=1).tolist()
            nrfx = nrfx_pd.mean(axis=1).tolist()
            rrdp = rrdp_pd.mean(axis=1).tolist()

            features = [nfx, ffd, fpd, tfd, mfd, fxp, nrfx, rrdp]
            self.frags_features[frag] = [np.array(i) for i in list(zip(*features))]
            self.flat_words.extend(self.frags_words[frag])
            self.flat_features.extend(self.frags_features[frag])
Example no. 20
 def read_words(self):
     """
     Word list is extracted from the full subject.
     """
     LOGGER.info(f"Reading words for task {self.task}")
     for frag in self.frags:
         self.frags_words[frag] = []
         flt_file = pd.read_csv(self.files[frag][self.full_subj][0], sep=",", header=0)
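          # append an "<eos>" marker to the last word of each sentence (the next row has SentenceBegin == 1)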
         for index, row in flt_file.iterrows():
             if index != len(flt_file)-1:
                 if flt_file.at[index+1, 'SentenceBegin'] == 1:
                     self.frags_words[frag].append(str(row['WORD'])+"<eos>")
                 else:
                     self.frags_words[frag].append(str(row['WORD']))
             else:
                 self.frags_words[frag].append(str(row['WORD']) + "<eos>")
Example no. 21
    def pad_gaze(self):
        """
        Adds the pad tokens in the positions of the [CLS] and [SEP]
        tokens, and pads the gaze sequences with the pad token.
        """
        LOGGER.info(f"Padding gaze data for task {self.task}")
        for mode in self.modes:
            gaze_pad_idxs = np.full((1, self.d_gaze), self.gaze_pad_idx)
            gaze_start_idxs = np.full((1, self.d_gaze), self.gaze_start_idx)
            gaze_end_idxs = np.full((1, self.d_gaze), self.gaze_end_idx)
            gaze_inputs = [
                np.concatenate((gaze_start_idxs, i, gaze_end_idxs))
                for i in self.gaze_inputs[mode]
            ]

            self.gaze_inputs[mode] = pad_sequences(gaze_inputs,
                                                   value=gaze_pad_idxs,
                                                   padding="post")
Example no. 22
    def predict_gaze(self):
        """
        Predicts or uniformly samples gaze features.
        """
        LOGGER.info(f"Predicting gaze data for task {self.task}")
        for mode in self.modes:
            self.gaze_inputs[mode] = [
                np.full((len(i), self.d_gaze), np.nan)
                for i in self.text_inputs[mode]
            ]

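            # predict gaze features only for a random subset of sentences (predict_percs), processed in batches of gaze_bs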
            len_predict = round(
                len(self.text_inputs[mode]) * self.predict_percs[mode])
            predict_idxs = random.sample(range(len(self.text_inputs[mode])),
                                         k=len_predict)

            for batch_index in range(0, len(predict_idxs), self.gaze_bs):
                batch_idxs = predict_idxs[batch_index:batch_index +
                                          self.gaze_bs]
                self.ensemble_predict(batch_idxs, mode)
Example no. 23
    def tokenize_with_gaze(self):
        """
        Tokenizes the sentences in the dataset with the pre-trained
        tokenizer, preserving targets and predicted gaze features order.
        """
        LOGGER.info(f"Tokenizing sentences for task {self.task}")
        for mode in self.modes:
            tokenized = []
            gazes = []
            maps = []

            for s, g in zip(self.text_inputs[mode], self.gaze_inputs[mode]):
                tokens, gaze, map = self.tokenize_preserve_and_map(s, g)

                tokenized.append(tokens)
                gazes.append(gaze)
                maps.append(map)

            self.text_inputs[mode] = tokenized
            self.gaze_inputs[mode] = gazes
            self.maps[mode] = maps
Example no. 24
    def tokenize_from_words(self):
        """
        Tokenizes the sentences in the dataset with the pre-trained tokenizer, storing the start index of each word.
        """
        LOGGER.info(f"Tokenizing sentences for task {self.task}")
        for mode in self.modes:
            LOGGER.info(f"Tokenizing the {mode} split ({len(self.text_inputs[mode])} sentences)")
            tokenized = []
            maps = []

            for s in self.text_inputs[mode]:
                tokens, map = self.tokenize_and_map(s)

                tokenized.append(tokens)
                maps.append(map)
            print("max tokenized seq len: ", max(len(l) for l in tokenized))

            self.text_inputs[mode] = tokenized
            self.maps[mode] = maps
Example no. 25
    def __call__(self):
        if self.run_patience == self.patience:
            LOGGER.info("Patience exceeded, stopping")
            self.stop = True
            return

        self.tester.evaluate()
        score = self.tester.metrics[self.monitor]

        cf = Config.load_json(os.path.join("results", "gaze", "config.json"))

        self.model.train()

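        # an improvement is a lower score in "min" mode and a higher score in "max" mode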
        if self.best_score is None or (self.monitor_mode == "min" and score < self.best_score) or \
                (self.monitor_mode == "max" and score > self.best_score):
            for key, value in self.tester.metrics.items():
                mlflow.log_metric(f"val_{key}", value)

            self.best_score = score

            LOGGER.info("Metric has improved, saving the model")
            torch.save(
                self.model.state_dict(),
                os.path.join(
                    self.dir, "model-" + cf.model_pretrained + "-" +
                    str(cf.full_finetuning) + "-" + str(RANDOM_STATE) +
                    ".pth"))

            self.run_patience = 0
        else:
            self.run_patience += 1
            LOGGER.info(
                f"No improvement in the last epoch, patience {self.run_patience} out of {self.patience}"
            )
Example no. 26
    def read_pipeline(self):
        LOGGER.info(f"Begin loading data for task {self.task}")
        if self.task == "dundee":
            DundeeDataNormalizer.read_files(self)
            DundeeDataNormalizer.read_frags(self)
            DundeeDataNormalizer.read_subjs(self)
            DundeeDataNormalizer.read_words(self)
            DundeeDataNormalizer.calc_features(self)
        elif self.task == "geco" or self.task == "geco-nl":
            GECODataNormalizer.read_file(self)
            GECODataNormalizer.read_frags(self)
            GECODataNormalizer.read_subjs(self)
            GECODataNormalizer.read_words(self)
            GECODataNormalizer.calc_features(self)
        elif self.task == "zuco11" or self.task == "zuco12":
            # if using the old MATLAB file, change back to ZuCo1DataNormalizer
            ZuCo2DataNormalizer.read_files(self)
            ZuCo2DataNormalizer.read_subjs(self)
            ZuCo2DataNormalizer.read_words(self)
            ZuCo2DataNormalizer.calc_features(self)
        elif self.task == "zuco21":
            ZuCo2DataNormalizer.read_files(self)
            ZuCo2DataNormalizer.read_subjs(self)
            ZuCo2DataNormalizer.read_words(self)
            ZuCo2DataNormalizer.calc_features(self)
        elif self.task == "potsdam":
            PotsdamDataNormalizer.read_files(self)
            PotsdamDataNormalizer.read_frags(self)
            PotsdamDataNormalizer.read_subjs(self)
            PotsdamDataNormalizer.read_words(self)
            PotsdamDataNormalizer.calc_features(self)
        elif self.task == "rsc":
            RussSentCorpDataNormalizer.read_file(self)
            RussSentCorpDataNormalizer.read_frags(self)
            RussSentCorpDataNormalizer.read_subjs(self)
            RussSentCorpDataNormalizer.read_words(self)
            RussSentCorpDataNormalizer.calc_features(self)


        self.split()
Example no. 27
    def save_datasets(self):
        LOGGER.info(f"Saving flat dataset for task {self.task}")
        flat_words_pd = pd.DataFrame(self.flat_words, columns=["word"])
        flat_features_pd = pd.DataFrame(self.flat_features, columns=["n_fix", "first_fix_dur", "first_pass_dur",
                                                                     "total_fix_dur", "mean_fix_dur", "fix_prob",
                                                                     "n_refix", "reread_prob"])
        flat_dataset_pd = pd.concat((flat_words_pd, flat_features_pd), axis=1)
        flat_dataset_pd.to_csv(os.path.join(self.dir, "dataset.csv"))

        LOGGER.info(f"Saving split datasets for task {self.task}")
        for mode in self.modes:
            dataset = {
                "sentence_num": [],
                "word": [],
                "n_fix": [],
                "first_fix_dur": [],
                "first_pass_dur": [],
                "total_fix_dur": [],
                "mean_fix_dur": [],
                "fix_prob": [],
                "n_refix": [],
                "reread_prob": []
            }
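            # one output row per word, keyed by its sentence number within the split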
            for k in range(len(self.words[mode])):
                for j in range(len(self.words[mode][k])):
                    dataset["sentence_num"].append(k)
                    dataset["word"].append(self.words[mode][k][j])
                    dataset["n_fix"].append(self.features[mode][k][j][0])
                    dataset["first_fix_dur"].append(self.features[mode][k][j][1])
                    dataset["first_pass_dur"].append(self.features[mode][k][j][2])
                    dataset["total_fix_dur"].append(self.features[mode][k][j][3])
                    dataset["mean_fix_dur"].append(self.features[mode][k][j][4])
                    dataset["fix_prob"].append(self.features[mode][k][j][5])
                    dataset["n_refix"].append(self.features[mode][k][j][6])
                    dataset["reread_prob"].append(self.features[mode][k][j][7])

            dataset_pd = pd.DataFrame(dataset)
            dataset_pd.to_csv(os.path.join(self.dir, mode + "_dataset.csv"))
Example no. 28
    def evaluate(self):
        LOGGER.info(f"Begin evaluation task {self.task}")
        self.predict()

        LOGGER.info("Calulating metrics")
        self.calc_metrics()

        for key in self.metrics:
            LOGGER.info(f"val_{key}: {self.metrics[key]:.4f} {self.units[key]}")
Example no. 29
    def split(self):
        LOGGER.info(f"Splitting the data into sentences for task {self.task}")
        flat_sentences = []
        flat_features = []
        sentence = []
        features = []

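        # accumulate words until one ends with a stop character, which closes the current sentence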
        for i in range(len(self.flat_words)):
            sentence.append(self.flat_words[i])
            features.append(self.flat_features[i])

            if sentence[-1].endswith(tuple(self.stopchars)):
                sentence[-1] = sentence[-1].replace("<eos>", "")
                flat_sentences.append(sentence)
                flat_features.append(features)
                # sanity check: an unusually long sentence hints at a missed sentence boundary
                if len(sentence) > 100:
                    LOGGER.warning(f"Suspiciously long sentence ({len(sentence)} words): {sentence}")
                sentence = []
                features = []

        LOGGER.info(f"Splitting data for task {self.task}")

        pairs = list(zip(flat_sentences, flat_features))
        shuffled_pairs = random.sample(pairs, len(pairs))
        len_train = round(self.split_percs["train"] * len(pairs))
        len_val = round(self.split_percs["val"] * len(pairs))

        corpora = {
            "train": shuffled_pairs[:len_train],
            "val": shuffled_pairs[len_train:len_train + len_val],
            "test": shuffled_pairs[len_train + len_val:]
        }
        for mode in self.modes:
            self.words[mode], self.features[mode] = map(list, zip(*corpora[mode]))
Example no. 30
    def pad_targets(self):
        """
        Adds the pad tokens in the positions of the [CLS] and [SEP] tokens, adds the pad
        tokens in the positions of the subtokens, and pads the targets with the pad token.
        """
        LOGGER.info(f"Padding targets for task {self.task}")
        for mode in self.modes:
            targets = [
                np.full((len(i), self.d_out), self.target_pad)
                for i in self.text_inputs[mode]
            ]
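            # scatter each word-level target to its first sub-token position (given by the map); remaining positions keep the pad value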
            for k, (i, j) in enumerate(zip(self.targets[mode],
                                           self.maps[mode])):
                targets[k][j, :] = i

            target_pad_vector = np.full((1, self.d_out), self.target_pad)
            targets = [
                np.concatenate((target_pad_vector, i, target_pad_vector))
                for i in targets
            ]

            self.targets[mode] = pad_sequences(targets,
                                               value=self.target_pad,
                                               padding="post")