Example #1
def do_train_test_split(args):
    logger.info("Performing train-test split for all datasets")
    task_dic = {"complexity": args.complexity, "eyetracking": args.eyetracking, "readability": args.readability}
    for task, do_task in task_dic.items():
        folder = f"{args.data_dir}/{task}/train_test"
        if not os.path.exists(folder) and do_task:
            os.makedirs(folder)
            if task == "complexity":
                train, test = train_test_split(args.pc, test_size=args.test_size, random_state=args.seed)
            elif task == "eyetracking":
                if args.eyetracking_mode == "word":
                    train, test = train_test_split_sentences(
                        args.et, test_frac=args.test_size, sentenceid_col="sentence_id"
                    )
                elif args.eyetracking_mode == "sentence":
                    train, test = train_test_split(args.et, test_size=args.test_size, random_state=args.seed)
            elif task == "readability":
                train, test = train_test_split(
                    args.ra, test_size=args.test_size, random_state=args.seed, stratify=args.ra[["reading_level"]]
                )
            save_tsv(train, f"{folder}/train.tsv")
            save_tsv(test, f"{folder}/test.tsv")
            logger.info(f"Train-test data saved in {folder}")
        elif do_task:
            logger.info("Train-test data already exist in path, not overwriting them.")
def get_et_metrics(sentences,
                   model=None,
                   save_path=None,
                   load_path=None,
                   id="model"):
    if load_path is not None and os.path.exists(load_path):
        logger.info(f"Loading predicted eye-tracking metrics from {load_path}")
        df = read_tsv(load_path)
    else:
        logger.info(f"Inferencing eye-tracking predictions with model {model}")
        # Remove all whitespaces before punctuation, to make sure that format actually
        # matches the one used in eye-tracking files on which the model was trained.
        sentences = ([{
            "text": re.sub(r"\s+([^\w\s])", r"\1", s)
        } for s in sentences] if type(sentences[0]) is str else sentences)
        model = MultitaskInferencer.load(model, gpu=True, level="token")
        res = model.inference_from_dicts(dicts=sentences)
        for i, sent in enumerate(res):
            for j, tok in enumerate(sent):
                res[i][j]["sentence_id"] = i
                res[i][j]["token_id"] = j
        res = [token for sentence in res for token in sentence]
        df = pd.DataFrame.from_dict(res)
        df["context"] = [c.rstrip() for c in df["context"]]
        if save_path is not None:
            logger.info(f"Saving inferenced predictions to {save_path}")
            save_tsv(df, f"{save_path}/{id}_preds.tsv")
    return df
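A hypothetical call, with placeholder paths and model names; on later runs the TSV cached under save_path can be passed as load_path to skip inference:

sentences = ["The cat sat on the mat .", "A second example sentence ."]
et_df = get_et_metrics(
    sentences,
    model="saved_models/multitask_et",   # placeholder model directory
    save_path="outputs/et",              # predictions cached as outputs/et/model_preds.tsv
)
# et_df = get_et_metrics(sentences, load_path="outputs/et/model_preds.tsv")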
def evaluate_kfold(args, data_silo, processor):
    silos = DataSiloForCrossVal.make(data_silo, n_splits=args.folds)
    # Run the full training with early stopping to get a model,
    # then evaluate it on the test set of each fold
    dict_preds_labels = {}
    for task in args.label_columns:
        dict_preds_labels[task] = {}
        dict_preds_labels[task]["preds"], dict_preds_labels[task]["labels"] = [], []
    for num_fold, silo in enumerate(silos):
        if not args.do_eval_only:
            model = train_on_split(args, silo, processor, num_fold)
        else:
            model = CustomAdaptiveModel.load(f"{args.model_name}_{num_fold}", device=args.device)
            model.connect_heads_with_processor(silo.processor.tasks, require_labels=True)
        evaluator_test = MultitaskEvaluator(
            data_loader=silo.get_data_loader("test"), tasks=silo.processor.tasks, device=args.device, report=False
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)
        # Exclude total loss
        for res in result[1:]:
            dict_preds_labels[res["task_name"]]["preds"].extend(res.get("preds"))
            dict_preds_labels[res["task_name"]]["labels"].extend(res.get("labels"))
        if args.save_predictions:
            pred_tsv = pd.DataFrame()
            for res in result[1:]:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")
            save_tsv(pred_tsv, os.path.join(args.out_dir, f"{args.run_name}_{num_fold}.tsv"))
    args.logger.info("Final results:")
    for task_name, task in dict_preds_labels.items():
        args.logger.info(f"__{task_name}__")
        metrics = token_level_regression_metrics(task["preds"], task["labels"])
        for metric in metrics.keys():
            args.logger.info(f"{metric}: {metrics[metric]}")
Example #4
def compute_corr_ranks_over_bins(args, config):
    logger.info(
        "Correlate features with task scores over various length bins...")
    # Compute correlation lists for all the length bins
    corr_ranks_per_bin = []
    args.leave_nans = True
    for curr_binsize in range(args.start_bin, args.end_bin + 1, args.bin_step):
        corr_ranks = {}
        for data_name in config.keys():
            data = read_tsv(config[data_name]["path"])
            bin_data = data.loc[
                (data[config[data_name]["length_bin_feat"]] >= curr_binsize - args.bin_width)
                & (data[config[data_name]["length_bin_feat"]] <= curr_binsize + args.bin_width), :
            ]
            logger.info(
                f"Bin {curr_binsize}±{args.bin_width} examples: {len(bin_data)}"
            )
            if args.save_binned_data:
                name = config[data_name]["path"].split(
                    ".")[0] + f"_bin{curr_binsize}.tsv"
                logger.info(
                    f"Saving {curr_binsize}±{args.bin_width} bin to {name}")
                save_tsv(bin_data, name)
            corr_ranks = {
                **corr_ranks,
                **(compute_corr_ranks(args, bin_data, data_name, config[data_name]))
            }
        for task_name in corr_ranks.keys():
            corr_ranks[task_name].sort(key=lambda tup: tup[1].correlation,
                                       reverse=True)
        corr_ranks_per_bin.append(corr_ranks)
    # Order the first bin's correlation lists by the features' correlation intensity
    first_bin_ranks = corr_ranks_per_bin[0]
    for task in first_bin_ranks.keys():
        first_bin_ranks[task].sort(
            key=lambda tup: -1
            if np.isnan(tup[1].correlation) else tup[1].correlation,
            reverse=True)
    # Order all correlation lists based on the one for the first bin
    for i in range(len(corr_ranks_per_bin)):
        for task in corr_ranks_per_bin[i].keys():
            corr_ranks_per_bin[i][task].sort(key=lambda x: [
                first_bin_ranks[task].index(tup)
                for tup in first_bin_ranks[task] if tup[0] == x[0]
            ])
    return corr_ranks_per_bin
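A hypothetical invocation; the paths, bin settings, and the length_bin_feat column name are placeholders, and compute_corr_ranks may read further args fields not set here:

from types import SimpleNamespace

args = SimpleNamespace(start_bin=10, end_bin=40, bin_step=10, bin_width=2,
                       save_binned_data=False, leave_nans=True)
config = {
    "eyetracking": {"path": "data/eyetracking/train.tsv", "length_bin_feat": "n_tokens"},
    "complexity": {"path": "data/complexity/train.tsv", "length_bin_feat": "n_tokens"},
}
ranks_per_bin = compute_corr_ranks_over_bins(args, config)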
Example #5
def preprocess_readability_data(args):
    ra_dir = os.path.join(args.data_dir, RA_FOLDER)
    idxs, texts, reading_levels = [], [], []
    for filename in os.listdir(ra_dir):
        if not filename.endswith(".txt"):
            continue
        name = filename.split("-")[0]
        with open(os.path.join(ra_dir, filename), "r") as f:
            label = f.readline()
            label = label.rstrip("\n")
            sentences = f.readlines()
            sentences = [s.rstrip("\n") for s in sentences]
            sentences = [s for s in sentences if s]
        idxs += [f"{name}-{i}" for i in range(1, len(sentences) + 1)]
        texts += sentences
        reading_levels += [label for i in range(len(sentences))]
    df = pd.DataFrame({"index": idxs, "text": texts, "reading_level": [l.strip() for l in reading_levels]})
    out = os.path.join(args.out_dir, "readability_data.tsv")
    save_tsv(df, out)
    logger.info(f"Readability assessment data were preprocessed and saved as" f" {out} with shape {df.shape}")
    return df
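A toy illustration of the input layout the function expects for each .txt file: the reading-level label on the first line, then one sentence per line. The folder, file name, and label value are invented; the real location is args.data_dir/RA_FOLDER.

import os

os.makedirs("data/readability", exist_ok=True)
with open("data/readability/storyA-doc.txt", "w") as f:
    f.write("elementary\n")            # reading-level label (invented value)
    f.write("The dog runs fast.\n")    # sentence 1 -> index "storyA-1"
    f.write("It likes the park.\n")    # sentence 2 -> index "storyA-2"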
Example #6
def preprocess_complexity_data(args):
    # Use explicit NA values so that the word "null" appearing in a sentence is not parsed as NaN
    pc_file = os.path.join(args.data_dir, PC_DATA)
    pc = pd.read_csv(pc_file, na_values=["N/A"], keep_default_na=False)
    # Remove duplicates
    pc = pc[~pc.duplicated("SENTENCE")]
    pc_vals_start_idx = 2
    # Keep only annotations to compute agreement scores
    vals = pc.iloc[:, pc_vals_start_idx:]
    # Keep only rows where at least `complexity_min_agree` annotators agree on the complexity score
    agreement = [x >= args.complexity_min_agree for x in compute_agreement(vals, vals.mean(axis=1), vals.std(axis=1))]
    df = pd.DataFrame({"index": pc["ID"], "text": pc["SENTENCE"], "score": vals.mean(axis=1)})
    if args.do_features:
        # Load features
        pc_features_file = os.path.join(args.data_dir, PC_FEATURES)
        pc_features = pd.read_csv(pc_features_file, sep="\t")
        # Concatenate PC linguistic features
        df = pd.concat([df.reset_index(drop=True), pc_features.reset_index(drop=True)], axis=1)
    # Filter by agreement
    df = df[agreement]
    out = os.path.join(args.out_dir, "complexity_data.tsv")
    save_tsv(df, out)
    logger.info(f"Perceived complexity data were preprocessed and saved as" f" {out} with shape {df.shape}")
    return df
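A toy frame mirroring the layout read from PC_DATA: an ID column, the sentence text, and per-annotator ratings from the third column onward (annotator column names and values are invented):

import pandas as pd

pc = pd.DataFrame({
    "ID": [1, 2],
    "SENTENCE": ["The cat sat on the mat.", "Notwithstanding prior caveats, the measure passed."],
    "ann_1": [1, 5],   # per-annotator complexity ratings (invented)
    "ann_2": [2, 6],
    "ann_3": [1, 6],
})
vals = pc.iloc[:, 2:]        # annotation columns only, as in the function
score = vals.mean(axis=1)    # per-sentence mean complexity score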
def finetune_sentence_level(args):
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s  %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO)
    args.logger = logging.getLogger(__name__)
    if args.do_logfile:
        filehandler = logging.FileHandler(
            os.path.join(args.log_dir, f"{args.run_name}.log"))
        args.logger.addHandler(filehandler)
    args.logger.info(vars(args))
    # Setup MLFlow
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)
    set_all_seeds(seed=args.seed)
    args.device, args.n_gpu = initialize_device_settings(use_cuda=True)
    # Create a tokenizer
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    tokenizer = CustomTokenizer.load(
        pretrained_model_name_or_path=args.model_name,
        do_lower_case=args.do_lower_case,
        tokenizer_class=tok_class)
    # Create a processor for the dataset
    processor = load_processor(args, tokenizer)
    # Create a DataSilo that loads several datasets (train/dev/test),
    # provides DataLoaders, and calculates descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)
    if args.do_feat_embeds:
        args.feat_size = processor.feat_size
    # Run cross-validation when multiple folds are requested
    if args.folds > 1:
        evaluate_kfold(args, data_silo, processor)
    else:
        adapt_model = train_on_split(args, data_silo, processor)
        evaluator_test = MultitaskEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=args.device)
        result = evaluator_test.eval(adapt_model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(
                                       data_silo.get_data_loader("test")))
        pred_tsv = pd.DataFrame()
        args.logger.info("Test results:")
        for res in result[1:]:
            args.logger.info(f"__{res['task_name']}__")
            if args.train_mode == "classification":
                metrics = classification_metrics(res.get("preds"),
                                                 res.get("labels"))
                args.logger.info(metrics)
            else:
                metrics = regression_metrics(res.get("preds"),
                                             res.get("labels"))
                for metric in metrics.keys():
                    args.logger.info(f"{metric}: {metrics[metric]}")
            if args.save_predictions:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")[0]
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")[0]
        if args.save_predictions:
            save_tsv(pred_tsv,
                     os.path.join(args.out_dir, f"{args.run_name}.tsv"))
        # Load trained model and perform inference
        dicts = [
            {
                "text":
                "The intense interest aroused in the public has now somewhat subsided."
            },
            {
                "text": "The quick brown fox jumped over the lazy dog."
            },
        ]
        model = MultitaskInferencer.load(args.save_dir,
                                         gpu=True,
                                         level="sentence")
        result = model.inference_from_dicts(dicts=dicts)
        args.logger.info("Inference example:")
        args.logger.info(result)
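A hypothetical driver call; only a subset of the args fields read directly in finetune_sentence_level is shown, with placeholder values, and the fields consumed by load_processor and train_on_split are omitted:

from types import SimpleNamespace

args = SimpleNamespace(
    run_name="bert_complexity", experiment_name="sentence_level",
    model_name="bert-base-cased", model_class_name=None, do_lower_case=False,
    do_logfile=False, log_dir="logs", out_dir="outputs",
    save_dir="saved_models/bert_complexity", seed=42, batch_size=32,
    folds=1, do_feat_embeds=False, train_mode="regression", save_predictions=True,
)
# finetune_sentence_level(args)  # plus the processor/training fields it expects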
def get_surprisals(args):
    set_seed(args.seed, cuda=args.cuda)
    logger.info("Importing tokenizer and pre-trained model")
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    ref = args.reference_hf_model if args.reference_hf_model is not None else args.model_name_or_path
    model = AutoModelWithLMHead.from_pretrained(ref)
    # When loading a locally trained model, swap it in as the AutoModel's base model
    if args.reference_hf_model is not None:
        farm_lm = LanguageModel.load(
            args.model_name_or_path,
            language_model_class=args.model_class_name)
        # Set the underlying model to the custom loaded model
        # The LM head used for surprisal is the original pretrained head
        logger.info(
            f"Setting model.{model.base_model_prefix} attribute with model: {args.model_name_or_path}"
        )
        setattr(model, model.base_model_prefix, farm_lm.model)
        tokenizer = CustomTokenizer.load(
            pretrained_model_name_or_path=args.model_name_or_path,
            do_lower_case=args.do_lower_case,
            tokenizer_class=tok_class,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(ref)
    device = torch.device("cuda" if args.cuda else "cpu")
    model.to(device)
    model.eval()
    logger.info(f"Reading sentences from {args.inputf}")
    if args.inputf.endswith(".tsv"):  # lingcomp tsv format
        df = read_tsv(args.inputf)
        sentences = list(df["text"])
    elif args.inputf.endswith(".json"):  # syntaxgym test suite format
        sentences = get_sentences_from_json(args.inputf)
    elif args.inputf.endswith(".txt"):  # one sentencen per line
        sentences = open(args.inputf, "r").read().split("\n")
    else:
        raise AttributeError(
            "Only .tsv, .json and .txt input files are supported.")
    dict_list = []
    for i, sentence in tqdm(enumerate(sentences)):
        surprisals = get_surprisal_scores(sentence, tokenizer, model, device)
        if args.mode in ["token", "sentence"]:
            for token, token_idx, surprisal, _, _ in surprisals:
                dict_list.append({
                    "sentence_id": i + 1,
                    "token_id": token_idx,
                    "token": token,
                    "surprisal": surprisal
                })
        elif args.mode == "word":
            words, word_surps, word_spans = aggregate_word_level(
                sentence, surprisals)
            for j, word in enumerate(words):
                dict_list.append({
                    "start": word_spans[j]["start"],
                    "end": word_spans[j]["end"],
                    "context": word,
                    "surprisal": word_surps[j],
                    "sentence_id": i + 1,
                    "token_id": j + 1,
                })
    out = pd.DataFrame(dict_list)
    if args.mode == "sentence":
        surprisals = list(
            out.groupby("sentence_id", sort=False).sum()["surprisal"])
        assert len(surprisals) == len(
            sentences), "Sentence-surprisal number mismatch"
        dict_list = []
        for k, sent in enumerate(sentences):
            dict_list.append({
                "sentence_id": k + 1,
                "sentence": sent,
                "surprisal": surprisals[k]
            })
        out = pd.DataFrame(dict_list)
    save_tsv(out, args.outputf)
    logger.info(
        f"Surprisal values at {args.mode}-level were saved to {args.outputf}")
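get_surprisal_scores is not shown here. A minimal sketch of per-token surprisal from a causal LM, assuming surprisal is the negative base-2 log probability of a token given its preceding context (illustrative GPT-2 names, not the project's helper):

import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def token_surprisals(sentence):
    ids = tok(sentence, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = lm(ids).logits
    # Surprisal of token t given tokens < t, in bits; the first token has no context
    logprobs = torch.log_softmax(logits[0, :-1], dim=-1)
    targets = ids[0, 1:]
    surp = -logprobs[torch.arange(len(targets)), targets] / math.log(2)
    return list(zip(tok.convert_ids_to_tokens(targets.tolist()), surp.tolist()))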
 def create_preprocessed_dataset(self):
     if self.version in ["zuco1-nr", "zuco1-sr"]:
         df = read_zuco1_mat(self.mat_files_path)
     elif self.version == "zuco2":
         df = read_zuco2_mat(self.mat_files_path)
     else:
         raise AttributeError("Selected version of ZuCo does not exist.")
     # Clean up words since we need to rely on whitespaces for aligning
     # sentences with tokens.
     logger.info("Preprocessing values for the dataset...")
     df["content"] = [str(w).replace(" ", "") for w in df["content"]]
     word_skip = [int(v) for v in list(df["FXC"].isna())]
     # If FXC is NaN, it corresponds to 0 fixations
     df["FXC"] = df["FXC"].fillna(0)
     # Create new fields for the dataset
     word_id = [
         f"{x}-{y}-{z}" for x, y, z in zip(df["task_id"], df["sent_idx"],
                                           df["word_idx"].astype("int32"))
     ]
     length = [len(str(x)) for x in df["content"]]
     mean_fix_dur = []
     for x, y in zip(df["TRT"], df["FXC"]):
         if pd.isna(x) or pd.isna(y):
             mean_fix_dur.append(np.nan)
         elif y == 0:
             mean_fix_dur.append(0)
         else:
             mean_fix_dur.append(x / y)
     refix_count = [
         max(x - 1, 0) if pd.notna(x) else np.nan for x in df["FXC"]
     ]
     reread_prob = [x > 1 if pd.notna(x) else np.nan for x in df["FXC"]]
     # Since here we do not have the selective go past time as for GECO,
     # we approximate it using go-past time minus gaze duration.
     # Note that this approximation is a lower bound in case of multiple regressions.
     tot_regr_from_dur = []
     for x, y in zip(df["GPT"], df["GD"]):
         if pd.isna(x) or pd.isna(y):
             tot_regr_from_dur.append(np.nan)
         else:
             tot_regr_from_dur.append(max(x - y, 0))
     # We do not have POS info for ZuCo corpora
     pos = ["UNK" for x in range(len(df))]
     fix_prob = [1 - x for x in word_skip]
     # Format taken from Hollenstein et al. 2019 "NER at First Sight"
     out = pd.DataFrame({
         # Identifiers
         "participant": df["participant"],
         "text_id": df["task_id"],  # Name of the recorded reading portion
         "sentence_id":
         df["sent_idx"],  # Absolute sentence position in reading portion
         # AOI-level measures
         "word_id": word_id,
         "word": df["content"],
         "length": length,
         "pos": pos,
         # Basic measures
         "fix_count": df["FXC"],
         "fix_prob": fix_prob,
         "mean_fix_dur": mean_fix_dur,
         # Early measures
         "first_fix_dur": df["FFD"],
         "first_pass_dur": df["GD"],
         # Late measures
         "tot_fix_dur": df["TRT"],
         "refix_count": refix_count,
         "reread_prob": reread_prob,
         # Context measures
         "tot_regr_from_dur": tot_regr_from_dur,
         "n-2_fix_prob": ([0, 0] + fix_prob)[:len(df)],
         "n-1_fix_prob": ([0] + fix_prob)[:len(df)],
         "n+1_fix_prob": (fix_prob + [0])[1:],
         "n+2_fix_prob": (fix_prob + [0, 0])[2:],
         "n-2_fix_dur": ([0, 0] + list(df["TRT"]))[:len(df)],
         "n-1_fix_dur": ([0] + list(df["TRT"]))[:len(df)],
         "n+1_fix_dur": (list(df["TRT"]) + [0])[1:],
         "n+2_fix_dur": (list(df["TRT"]) + [0, 0])[2:],
     })
     # Convert to correct data types
     out = out.astype(self.out_types_word)
     # Caching preprocessed dataset for next Processor calls
     save_tsv(out, self.out_preprocessed)
     logger.info(f"{self.version} data were preprocessed and saved as"
                 f" {self.out_preprocessed} with shape {out.shape}")
     self.preprocessed_data = out
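A tiny worked example of the shift-and-pad construction used above for the n-2/n-1/n+1/n+2 context columns:

fix_prob = [1, 0, 1, 1]
n_minus_1 = ([0] + fix_prob)[:len(fix_prob)]   # [0, 1, 0, 1] -> previous word's value
n_plus_1 = (fix_prob + [0])[1:]                # [0, 1, 1, 0] -> following word's value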
 def create_preprocessed_dataset(self):
     df = pd.read_csv(
         self.data_path,
         usecols=DUNDEE_DATA_COLS,
         sep="\t",
         quoting=csv.QUOTE_NONE,
         engine="python",
         na_values=[""],
         keep_default_na=False,
     )
     # Clean up words since we need to rely on whitespaces for aligning
     # sentences with tokens.
     df["WORD"] = [str(w).replace(" ", "") for w in df["WORD"]]
     logger.info("Preprocessing values for the dataset...")
     keep_idx = []
     curr_sent_id, curr_wnum = 1, 0
     curr_val = df.loc[0, "SentenceID"]
     curr_pp = df.loc[0, "Participant"]
     sent_ids, word_ids = [], []
     for _, r in tqdm(df.iterrows()):
         # Tokens are split from punctuation for POS tagging, we need to reassemble regions.
         # We use WNUM to check if the token belongs to the same region.
         if r["WNUM"] == curr_wnum:
             keep_idx.append(False)
             continue
         keep_idx.append(True)
         curr_wnum = r["WNUM"]
         # Advance sentence id
         if r["SentenceID"] != curr_val:
             curr_sent_id += 1
             curr_val = r["SentenceID"]
         # Data are ordered, so we can reset sentence indexes when switching participants
         if r["Participant"] != curr_pp:
             curr_sent_id = 1
             curr_pp = r["Participant"]
         sent_ids.append(curr_sent_id)
         word_ids.append(
             f'{int(r["Itemno"])}-{int(r["SentenceID"])}-{int(r["ID"])}')
     # Filter out duplicates
     df = df[keep_idx]
     out = pd.DataFrame({
         # Identifiers
         "participant": df["Participant"],
         "text_id": df["Itemno"],
         "sentence_id": sent_ids,
         # AOI-level measures
         "word_id": word_ids,
         "word": df["WORD"],
         "length": df["WLEN"],
         "pos": df["UniversalPOS"],
         # Basic measures
         "fix_count": df["nFix"],
         "fix_prob": df["Fix_prob"],
         "mean_fix_dur": df["Mean_fix_dur"],
         # Early measures
         "first_fix_dur": df["First_fix_dur"],
         "first_pass_dur": df["First_pass_dur"],
         # Late measures
         "tot_fix_dur": df["Tot_fix_dur"],
         "refix_count": df["nRefix"],
         "reread_prob": df["Re-read_prob"],
         # Context measures
         "tot_regr_from_dur": df["Tot_regres_from_dur"],
         "n-2_fix_prob": df["n-2_fix_prob"],
         "n-1_fix_prob": df["n-1_fix_prob"],
         "n+1_fix_prob": df["n+1_fix_prob"],
         "n+2_fix_prob": df["n+2_fix_prob"],
         "n-2_fix_dur": df["n-2_fix_dur"],
         "n-1_fix_dur": df["n-1_fix_dur"],
         "n+1_fix_dur": df["n+1_fix_dur"],
         "n+2_fix_dur": df["n+2_fix_dur"],
     })
     # Convert to correct data types
     out = out.astype(self.out_types_word)
     # Caching preprocessed dataset for next Processor calls
     save_tsv(out, self.out_preprocessed)
     logger.info(f"Dundee data were preprocessed and saved as"
                 f" {self.out_preprocessed} with shape {out.shape}")
     self.preprocessed_data = out
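A toy illustration of the WNUM-based de-duplication above: tokens split from punctuation share a WNUM, and only the first row of each region is kept (values invented):

import pandas as pd

toy = pd.DataFrame({"WORD": ["Hello", ",", "world"], "WNUM": [1, 1, 2]})
keep, prev = [], None
for _, r in toy.iterrows():
    keep.append(r["WNUM"] != prev)   # keep only the first token of each region
    prev = r["WNUM"]
print(toy[keep])                     # rows "Hello" and "world"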
 def create_preprocessed_dataset(self):
     data = pd.read_excel(
         self.data_path,
         usecols=GECO_DATA_COLS,
         sheet_name="DATA",
         na_values=GECO_NA_VALUES,
         keep_default_na=False,
     )
     extra = pd.read_excel(self.materials_path,
                           sheet_name="ALL",
                           na_values=["N/A"],
                           keep_default_na=False,
                           usecols=GECO_MATERIAL_COLS)
     sent_ids = read_tsv(self.sentence_ids_path)
     logger.info("Preprocessing values for the dataset...")
     df = pd.merge(data, extra, how="left", on="WORD_ID")
     df = pd.merge(df, sent_ids, how="left", on="WORD_ID")
     # Clean up words since we need to rely on whitespaces for aligning
     # sentences with tokens.
     df["WORD"] = [str(w).replace(" ", "") for w in df["WORD"]]
     # Create new fields for the dataset
     text_id = [f"{x}-{y}" for x, y in zip(df["PART"], df["TRIAL"])]
     length = [len(str(x)) for x in df["WORD"]]
     # Handle the case where we don't fill NaN values
     mean_fix_dur = []
     for x, y in zip(df["WORD_TOTAL_READING_TIME"],
                     df["WORD_FIXATION_COUNT"]):
         if pd.isna(x):
             mean_fix_dur.append(np.nan)
         elif y == 0:
             mean_fix_dur.append(0)
         else:
             mean_fix_dur.append(x / y)
     refix_count = [max(x - 1, 0) for x in df["WORD_RUN_COUNT"]]
     reread_prob = [x > 1 for x in df["WORD_FIXATION_COUNT"]]
     # Handle the case where we don't fill NaN values
     tot_regr_from_dur = []
     for x, y in zip(df["WORD_GO_PAST_TIME"],
                     df["WORD_SELECTIVE_GO_PAST_TIME"]):
         if pd.isna(x) or pd.isna(y):
             tot_regr_from_dur.append(np.nan)
         else:
             tot_regr_from_dur.append(max(x - y, 0))
     # 2050 tokens per participant do not have POS info.
     # We use a special UNK token for missing pos tags.
     pos = [
         GECO_POS_MAP[x] if not pd.isnull(x) else GECO_POS_MAP["UNK"]
         for x in df["PART_OF_SPEECH"]
     ]
     fix_prob = [1 - x for x in df["WORD_SKIP"]]
     # Format taken from Hollenstein et al. 2019 "NER at First Sight"
     out = pd.DataFrame({
         # Identifiers
         "participant": df["PP_NR"],
         "text_id": text_id,  # PART-TRIAL for GECO
         "sentence_id": df["SENTENCE_ID"],  # Absolute sentence position for GECO
         # AOI-level measures
         "word_id": df["WORD_ID"],
         "word": df["WORD"],
         "length": length,
         "pos": pos,
         # Basic measures
         "fix_count": df["WORD_FIXATION_COUNT"],
         "fix_prob": fix_prob,
         "mean_fix_dur": mean_fix_dur,
         # Early measures
         "first_fix_dur": df["WORD_FIRST_FIXATION_DURATION"],
         "first_pass_dur": df["WORD_GAZE_DURATION"],
         # Late measures
         "tot_fix_dur": df["WORD_TOTAL_READING_TIME"],
         "refix_count": refix_count,
         "reread_prob": reread_prob,
         # Context measures
         "tot_regr_from_dur": tot_regr_from_dur,
         "n-2_fix_prob": ([0, 0] + fix_prob)[:len(df)],
         "n-1_fix_prob": ([0] + fix_prob)[:len(df)],
         "n+1_fix_prob": (fix_prob + [0])[1:],
         "n+2_fix_prob": (fix_prob + [0, 0])[2:],
         "n-2_fix_dur": ([0, 0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)],
         "n-1_fix_dur": ([0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)],
         "n+1_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0])[1:],
         "n+2_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0, 0])[2:],
     })
     # Convert to correct data types
     out = out.astype(self.out_types_word)
     # Caching preprocessed dataset for next Processor calls
     save_tsv(out, self.out_preprocessed)
     logger.info(f"GECO data were preprocessed and saved as"
                 f" {self.out_preprocessed} with shape {out.shape}")
     self.preprocessed_data = out
def evaluate_model_on_suite(model_name_or_path,
                            suite_path,
                            save_path=None,
                            conf_interval=0.95):
    """
    Given a model or its predictions, computes its performance on a test suite
    based on the formulas specified in it.
    Args:
        model_name_or_path: A path to a local folder containing model files
            (HuggingFace or FARM format), or a model name from the HuggingFace
            model hub, or a path to a local TSV file containing model predictions.
        suite_path: A path to a local JSON file containing a suite in SyntaxGym format.
        save_path: If model_name_or_path is a model, its inferred predictions will be saved
            to this path.
        conf_interval: Float between 0 and 1, the confidence interval computed on metric values.
    Returns:
        A dataframe containing average scores across items for each condition and region,
        along with confidence bounds, and a dataframe containing success ratios for each
        score column and prediction formula of the suite. E.g.

        condition_name  region_number  score_name  metric_name  mean  sem  count  region  up_conf  low_conf
        ambig_comma  1  first_fix_dur_score  sum  395  14  24  Start  365  424
        ambig_comma  2  first_fix_dur_score  sum  179   5  24  Verb   167  191
        ambig_comma  4  first_fix_dur_score  sum  228   7  24  NP/Z   213  244
        ambig_comma  5  first_fix_dur_score  sum  158   7  24  Verb   143  173

        prediction_id  prediction_formula  score_column  result
        0  (((5;%ambig_nocomma%) > (5;%ambig_comma%)))        first_fix_dur_score  0.66
        1  (((5;%ambig_nocomma%) > (5;%unambig_nocomma%)))    first_fix_dur_score  0.33
        2  ((((5;%ambig_nocomma%) - (5;%ambig_comma%)) > ...  first_fix_dur_score  0.33
    """
    if os.path.exists(model_name_or_path) and model_name_or_path.endswith(
            ".tsv"):
        pred_suite, df = compute_suite_et_metrics(suite_path,
                                                  load_path=model_name_or_path)
    else:
        pred_suite, df = compute_suite_et_metrics(suite_path,
                                                  model=model_name_or_path,
                                                  save_path=save_path)
    # Average metric values across items for each condition, region, score and metric name
    grp = df.groupby(
        ["condition_name", "region_number", "score_name", "metric_name"])
    avg_df = grp["metric_val"].agg(["mean", "sem", "count"]).reset_index()
    avg_df["region"] = [
        pred_suite.region_names[i - 1] for i in avg_df.region_number
    ]
    # Compute confidence intervals
    avg_df["up_conf"], avg_df["low_conf"] = zip(
        *
        [confidence_intervals(r, conf_interval) for _, r in avg_df.iterrows()])
    avg_df = avg_df.sort_values(
        ["score_name", "condition_name", "region_number"])
    avg_df = avg_df[[
        "condition_name",
        "region_number",
        "region",
        "score_name",
        "metric_name",
        "mean",
        "sem",
        "count",
        "up_conf",
        "low_conf",
    ]]
    pred_df = evaluate_suite(pred_suite)
    res_df = pred_df.groupby(
        ["prediction_id", "prediction_formula",
         "score_column"]).mean()["result"]
    res_df = res_df.reset_index().sort_values(
        ["score_column", "prediction_id"])
    res_df = res_df[[
        "prediction_id", "result", "score_column", "prediction_formula"
    ]]
    if save_path:
        logger.info(f"Saving dataframes to {save_path}")
        save_tsv(avg_df, f"{save_path}/{pred_suite.meta['name']}_avg.tsv")
        save_tsv(res_df, f"{save_path}/{pred_suite.meta['name']}_res.tsv")
    return avg_df, res_df
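A hypothetical usage; the model and suite paths are placeholders. Passing a TSV of cached predictions skips inference, while a model path or hub name triggers it:

avg_df, res_df = evaluate_model_on_suite(
    "saved_models/multitask_et",          # or a TSV of cached predictions
    suite_path="suites/npz_ambiguity.json",
    save_path="outputs/suites",
    conf_interval=0.95,
)
print(res_df.head())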