Code example #1
import os

import numpy as np
from sklearn.model_selection import KFold

# utils and the knn/logreg/randfor classification helpers are project-local modules


def train(features,
          targets,
          num_folds,
          classifiers,
          output_folder,
          seed=None,
          filename_tag="",
          classifier_obj=None):
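    """Cross-validate each classifier over num_folds folds and chart the per-fold accuracies."""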
    # shuffle only when a seed is given (sklearn rejects a random_state without shuffling)
    kf = KFold(n_splits=num_folds, shuffle=seed is not None, random_state=seed)
    folds_idxs = list(kf.split(features))
    trips_array = np.asarray(features)
    targets = np.asarray(targets)

    if not isinstance(classifiers, list):
        classifiers = [classifiers]
    # train/val accuracies
    accuracies = {}
    mean_accuracies = {}
    # classify
    for classifier in classifiers:
        classif_start = utils.tic()
        accuracies[classifier] = []
        print("\nTesting classifier [%s]" % classifier)
        # train & test each classifier
        # for each fold
        for i, (train_idx, val_idx) in enumerate(folds_idxs):
            print("\tClassifying fold %d/%d" % (i + 1, len(folds_idxs)),
                  end=" ")
            train = (trips_array[train_idx], targets[train_idx])
            val = (trips_array[val_idx], targets[val_idx])
            if classifier == "knn":
                k = 5
                accTrain, accVal = knn_classification(
                    train, val, k, classifier_obj=classifier_obj)
            elif classifier == "logreg":
                accTrain, accVal = logreg_classification(
                    train, val, classifier_obj=classifier_obj)
            elif classifier == "randfor":
                accTrain, accVal = randfor_classification(
                    train, val, seed, classifier_obj=classifier_obj)
            else:
                raise ValueError("Unknown classifier: %s" % classifier)
            accuracies[classifier].append((accTrain, accVal))
            print("- accuracies train/val:", accuracies[classifier][-1])
        elapsed = utils.tictoc(classif_start)
        print("Done in:", elapsed)

        # accuracy across all folds
        mean_accuracies[classifier] = [np.mean([x[0] for x in accuracies[classifier]]),
                                       np.mean([x[1] for x in accuracies[classifier]])]
        titlestr = "%s, overall accuracy train/val: %s" % (
            classifier, str(mean_accuracies[classifier]))
        chart_filename = os.path.join(
            output_folder, classifier + "_" + filename_tag + "_chart")
        utils.barchart(list(range(1, num_folds + 1)),
                       accuracies[classifier],
                       title=titlestr,
                       ylabel="accuracy",
                       legend=["train", "val"],
                       save=chart_filename)

    return mean_accuracies
Code example #2
def main(config_file, ignore_undefined=False, load_models_first=False):
    """The main function

    Arguments:
        config_file {str} -- Path for the run's configuration file
    """
    # time the entire run
    with tictoc("Total run"):
        # initialize configuration
        global_config, pipeline, triggers = ConfigReader.read_configuration(
            config_file, ignore_undefined)

        pipeline.configure_names()

        # load models up front if requested or if any trigger needs them
        should_load_models = load_models_first or any(
            trig.requires_model_loading() for trig in triggers)
        if should_load_models:
            error(
                "Should load models but model deserialization is not enabled!",
                not global_config.misc.allow_model_deserialization)
            pipeline.load_models()

        for trig in sorted(triggers, key=lambda x: x.is_blocking):
            trig.link_pipeline(pipeline)
            trig.setup()

        for trig in triggers:
            trig.arm()

        if num_warnings > 0:
            warning("{} warnings occurred.".format(num_warnings))
        info("Logfile is at: {}".format(global_config.logfile))
    tictoc.log(global_config.logfile + ".timings")
Code example #3
    def produce_outputs(self):
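        """Train doc2vec on the training word lists and infer an embedding for every document."""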
        if self.loaded_preprocessed or self.loaded_aggregated:
            return
        info("Mapping to {} embeddings.".format(self.name))
        train_words = [wp[0] for doc in self.text_train for wp in doc]
        self.fit_doc2vec(train_words, self.labels_train)
        d2v = self.get_model()
        self.embeddings = np.empty((0, self.dimension), np.float32)

        # loop over input text bundles (e.g. train & test)
        for dset_idx in range(len(self.text)):
            dset_word_list = self.text[dset_idx]
            with tictoc("Embedding mapping for text bundle {}/{}".format(
                    dset_idx + 1, len(self.text))):
                info("Mapping text bundle {}/{}: {} texts".format(
                    dset_idx + 1, len(self.text), len(self.text[dset_idx])))
                for doc_word_pos in tqdm.tqdm(dset_word_list):
                    doc_words = [wp[0] for wp in doc_word_pos]
                    # debug("Inferring word list:{}".format(doc_words))
                    vec = np.expand_dims(d2v.infer_vector(doc_words), axis=0)
                    self.embeddings = np.append(self.embeddings, vec, axis=0)

        self.set_constant_elements_per_instance()
Code example #4
def question_c(features_file, grid_file, test_file, output_folder, seed,
               classif_file, num_folds):
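    """Train baseline classifiers with cross-validation, improve the random forest, and run the test phase."""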
    total_start = utils.tic()
    df_features = pd.read_csv(features_file)
    features, jid_mapping, targets = jcp.preprocess_train_data(
        df_features, seed)
    classifiers = ["knn", "logreg", "randfor"]
    mean_accuracies = jcp.train(features,
                                targets,
                                num_folds,
                                classifiers,
                                output_folder,
                                seed=seed)

    # print mean accuracy per classifier
    print()
    for classifier in mean_accuracies:
        print(classifier, "accuracy train/val:", mean_accuracies[classifier])

    # select the random forest algorithm to beat the benchmark
    impr_classifier_name = "randfor"
    baseline_accuracy = mean_accuracies[impr_classifier_name][-1]  # validation accuracy

    print()
    print("Improving classification for classifier", impr_classifier_name)
    best_classifier, best_technique, best_accuracy = jcp.improve_randfor(
        baseline_accuracy, features_file, num_folds, output_folder,
        impr_classifier_name, seed)
    jcp.test(best_classifier, best_technique, test_file, grid_file,
             jid_mapping, classif_file)
    elapsed = utils.tictoc(total_start)
    print("Done in:", elapsed)
Code example #5
def find_similar_subroutes_per_test_trip(test_points,
                                         train_df,
                                         k,
                                         paropts=None,
                                         verbosity=False):
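    """Return the k training trips that share the longest common subroutes with the test trip."""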
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")
    max_subseqs = []
    if partype:
        # num threads or processes
        if partype == "processes":
            max_subseqs = exec_with_processes(train_df, numpar, test_lonlat, k)
        elif partype == "threads":
            max_subseqs = exec_with_threads(train_df, numpar, test_lonlat, k)
    else:
        max_subseqs = serial_execution(train_df,
                                       test_lonlat,
                                       k,
                                       verbosity=verbosity)
    if len(max_subseqs) != k:
        print("WARNING: Got %d subsequences instead of the %d requested!" %
              (len(max_subseqs), k))
    print("Extracted %d nearest subsequences of a %d-long test trip in: %s" %
          (k, len(test_points), utils.tictoc(timestart)))
    return max_subseqs
Code example #6
import ast


def serial_execution(df, test_lonlat, k, verbosity=False):
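    """Serially compute LCSS-based common subroutes between the test trip and every training trip."""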
    max_subseqs = []
    # for each trip in the training data
    for index, row in df.iterrows():
        train_points = row["points"]
        # parse the stringified point list from the CSV (safer than eval)
        train_points = ast.literal_eval(train_points)
        train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")
        timestart = utils.tic()
        # compute common subsequences between the test trip and the current candidate
        _, subseqs_idx_list = calc_lcss(test_lonlat, train_lonlat)
        # consider non-consecutive subroutes
        subseqs_idx = list(
            set([idx for seq in subseqs_idx_list for idx in seq]))
        elapsed = utils.tictoc(timestart)
        # sort by decr. length
        subseqs_idx.sort(reverse=True)
        # update the list of the longest subsequences
        if subseqs_idx:
            max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx, k,
                                                   elapsed, row)
            # print("Max subseq length:",len(max_subseqs))
            #print([x[0] for x in max_subseqs])
            # print("Updated max subseqs, lens now:",[len(x[0]) for x in max_subseqs])
    if verbosity:
        print("Got %d subseqs:" % len(max_subseqs),
              [(x, y, z["tripId"]) for (x, y, z) in max_subseqs])

    return max_subseqs
Code example #7
import ast
import threading


def exec_with_threads(df, numpar, test_lonlat, k):
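    """Split the training data into numpar chunks and compute LCSS in one thread per chunk."""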
    max_subseqs = []
    res1 = [[] for _ in range(numpar)]
    res2 = [[] for _ in range(numpar)]
    subframes = utils.get_sub_dataframes(df, numpar)
    # assign data and start the threads
    threads = []
    timestart = utils.tic()
    for i in range(numpar):
        train_lonlat = []
        for index, row in subframes[i].iterrows():
            train_points = ast.literal_eval(row["points"])
            # accumulate the chunk's points (the original overwrote train_lonlat on each row)
            train_lonlat.extend(utils.idx_to_lonlat(train_points, format="tuples"))
        threads.append(
            threading.Thread(target=calc_lcss,
                             # give each thread its own result slot to write into
                             args=(test_lonlat, train_lonlat, res1[i], res2[i])))
        threads[i].start()
    # gather and merge results
    subseqs = []
    subseqs_idx = []
    for i in range(numpar):
        threads[i].join()
        subseqs += res1[i]
        subseqs_idx += res2[i]
    subseqs_idx = sorted(subseqs_idx, key=lambda x: len(x), reverse=True)
    elapsed = utils.tictoc(timestart)
    # note: 'row' here is whatever row the loop above saw last
    max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx, k,
                                           elapsed, row)
    return max_subseqs
Code example #8
    def produce_outputs(self):
        """Apply preprocessing"""

        self.setup_nltk_resources()
        # build an indices object -- this filters out roles with no instances
        if not isinstance(self.indices, Indices):
            self.indices = Indices(self.indices, tags=self.roles)
        self.roles = self.indices.tags
        train_idx, test_idx = self.indices.get_train_test()
        # re-fetch test indices by tag, tolerating their absence
        test_idx = self.indices.get_tag_instances(defs.roles.test, must_exist=False)
        error("Neither train or test indices found to process dataset", not (train_idx.size > 0 or test_idx.size > 0))
        preproc_data = []
        preproc_targets = []

        with tictoc("Preprocessing {}".format(self.name)):
            info("Mapping text training data to word collections.")
            txts, self.vocabulary, discarded_indexes = self.preprocess_text_collection(self.data, train_idx, track_vocabulary=True)
            self.vocabulary = set(self.vocabulary)
            preproc_data.extend(txts)

            if self.has_text_targets():
                info("Mapping text training targets to word collections.")
                txts, voc, _ = self.preprocess_text_collection(self.targets, train_idx, track_vocabulary=True)
                self.vocabulary.update(voc)
                preproc_targets.extend(txts)

            info("Mapping text test data to word collections.")
            txts, _, discarded_indexes = self.preprocess_text_collection(self.data, test_idx)
            preproc_data.extend(txts)
            if self.has_text_targets():
                info("Mapping text test targets to word collections.")
                txts, _, _ = self.preprocess_text_collection(self.targets, test_idx)
                preproc_targets.extend(txts)

            # fix word order and get word indexes
            self.vocabulary = list(self.vocabulary)
            for index, word in enumerate(self.vocabulary):
                self.word_to_index[word] = index
                self.vocabulary_index.append(index)
            # reserve one extra index for unknown/missing words
            self.undefined_word_index = len(self.vocabulary)

            self.data = preproc_data
            self.targets = preproc_targets
Code example #9
def calculate_nns(test_points, train_df, paropts=None, k=5, unique_jids=False):
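    """Compute the k nearest training trips to the test trip, optionally keeping one neighbour per jid."""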
    # parallelization type
    if paropts:
        print("Parallelizing with", paropts)
        partype, numpar = paropts
    else:
        partype, numpar = None, None

    timestart = utils.tic()
    test_lonlat = utils.idx_to_lonlat(test_points, format="tuples")
    nearest_neighbours = [-1 for _ in range(len(train_df.index))]

    if partype:
        # num threads or processes
        if partype == "processes":
            nearest_neighbours = run_with_processes(numpar, test_lonlat,
                                                    train_df)
        elif partype == "threads":
            nearest_neighbours = run_with_threads(numpar, test_lonlat,
                                                  train_df)
    else:
        # serial execution
        nearest_neighbours = calculate_dists(test_lonlat, train_df)
    # sort the list by increasing distance
    nearest_neighbours = sorted(nearest_neighbours, key=lambda x: x[1])
    # keep unique jids, if needed
    if unique_jids:
        print("Restricting to single neighbour per jid")
        keep = [False for _ in range(len(nearest_neighbours))]
        already_encountered = []
        for i, nn in enumerate(nearest_neighbours):
            jid = nn[2]
            if jid not in already_encountered:
                already_encountered.append(jid)
                keep[i] = True
        nearest_neighbours = [
            nearest_neighbours[i] for i in range(len(nearest_neighbours))
            if keep[i]
        ]

    # return the top k
    nearest_neighbours = nearest_neighbours[:k]
    print("Neighbours:", [n[0] for n in nearest_neighbours])
    print("Extracted %d nearest neighbours of a %d-long test trip in: %s" %
          (len(test_points), k, utils.tictoc(timestart)))
    return nearest_neighbours
Code example #10
    def execute_training(self):
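        """Train one model per train/validation split produced by the validation setting."""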
        with tictoc("Training run", do_print=self.do_folds, announce=False):

            # get training - validation instance indexes for building the model
            self.configure_trainval_indexes()

            # iterate over the runs required by the validation setting
            # (e.g. portion splits or folds)
            for iteration_index, trainval in enumerate(
                    self.validation.get_trainval_indexes()):
                # set the train/val data indexes
                self.train_index, self.val_index = trainval

                # train and keep track of the model
                self.model_index = iteration_index
                model = self.acquire_trained_model()
                self.append_model_instance(model)
Code example #11
    def acquire_trained_model(self):
        """Train the learning model or load an existing instance from a persisted file."""
        with tictoc("Training run [{}] - {} on {} training and {} val data.".format(
                get_info_string(self.config), self.model_index,
                len(self.train_index),
                len(self.val_index) if self.val_index is not None else "[none]")):
            # placeholder: a persisted model would be deserialized here;
            # while it stays None, training always runs
            model = None
            if not model:
                model = self.train_model()
                # create the model output directory
                makedirs(self.models_folder, exist_ok=True)
            else:
                info("Skipping training due to existing model successfully loaded.")
        return model
Code example #12
import ast
# assumed import: the original calls ThreadPool despite the function's "processes" name
from multiprocessing.pool import ThreadPool


def exec_with_processes(df, process_num, test_lonlat, k):
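    """Compute LCSS-based subroutes per training trip via a worker pool (a thread pool, despite the name)."""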
    max_subseqs = []
    pool = ThreadPool(processes=process_num)
    for index, row in df.iterrows():
        train_points = row["points"]
        train_points = ast.literal_eval(train_points)
        train_lonlat = utils.idx_to_lonlat(train_points, format="tuples")
        timestart = utils.tic()
        # compute common subsequences between the test trip and the current candidate
        async_result = pool.apply_async(calc_lcss, (test_lonlat, train_lonlat))
        # note: calling get() immediately blocks on each row, so this effectively runs serially
        subseqs, subseqs_idx = async_result.get()
        elapsed = utils.tictoc(timestart)
        # sort by decr. length
        subseqs_idx = sorted(subseqs_idx, key=lambda x: len(x), reverse=True)
        # update the list of the longest subsequences
        max_subseqs = update_current_maxsubseq(max_subseqs, subseqs_idx, k,
                                               elapsed, row)
    print("Got %d common subsequences" % len(max_subseqs))
    pool.close()
    pool.join()
    return max_subseqs
Code example #13
import os
import pickle

import pandas as pd


def question_b(train_file, number_of_cells, output_folder):
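    """Build a lon/lat grid over the training trips and extract bag-of-words features per trip."""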
    # specify files
    grid_file = os.path.join(output_folder, "grid.pickle")
    feature_file = os.path.join(output_folder, "tripFeatures.csv")
    # read data and make the grid
    train_df = pd.read_csv(train_file)
    max_lonlat, min_lonlat, all_lats, all_lons = gvp.find_min_max_latlon(
        train_df, output_folder)
    grid = gvp.create_grid(number_of_cells,
                           max_lonlat,
                           min_lonlat,
                           all_lats,
                           all_lons,
                           output_folder=output_folder)
    # save grid and transform data
    with open(grid_file, "wb") as f:
        pickle.dump(grid, f)

    feats_start = utils.tic()
    gvp.map_to_features_bow(train_df, grid, feature_file)
    print("Generated features in", utils.tictoc(feats_start))
    return feature_file, grid_file
Code example #14
import ast
import os

import pandas as pd


def question_a1(output_folder, clean_file, test_file, paropts, k):
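    """Find and visualize the nearest training neighbours for every test trip."""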
    # the test file has a single Trajectory column; a newline delimiter keeps commas inside each row
    test_df = pd.read_csv(test_file, delimiter="\n")
    train_df = pd.read_csv(clean_file)
    print(
        "Extracting %d nearest neighbours out of %d cleaned train data, for each test trip"
        % (k, len(train_df)))
    print("Using parallelization options:", paropts)
    for index, row in test_df.iterrows():
        print("Examining test element %d / %d" % (index + 1, len(test_df)))
        outfile_name = os.path.join(output_folder, "nn_%d_" % (index + 1))
        # prepare to count time
        millis_start = utils.tic()
        # compute nearest neighbours
        test_points = ast.literal_eval(row["Trajectory"])
        nns_ids_distances = nn.calculate_nns(test_points,
                                             train_df,
                                             paropts=paropts)
        # get time elapsed
        elapsed = utils.tictoc(millis_start)
        # visualize
        nn.visualize_nns(test_points, nns_ids_distances, outfile_name, elapsed,
                         index)
Code example #15
    def produce_outputs(self):
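        """Build a per-instance, per-word report of predictions across chained classification steps."""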
        # get input configuration data
        self.topk = None
        self.messages = []
        self.input_parameters_dict = [
            dp for dp in self.data_pool.data if type(dp.data) == Dictionary
        ][0].data.instances
        self.input_parameters = to_namedtuple(self.input_parameters_dict,
                                              "input_parameters")

        # get reference data by chain name output
        self.label_mapping = []
        for mapping in self.params.label_mappings:
            # read json
            if type(mapping) is str and mapping.endswith(".json"):
                with open(mapping) as f:
                    mapping = json.load(f)
            mapping_dict = {ix: val for (ix, val) in enumerate(mapping)}
            self.label_mapping.append(mapping_dict)

        datapack = [
            x for x in self.data_pool.data if x.chain == self.params.data_chain
        ][0]
        predictions, tagged_idx = [], []
        for i, chain_name in enumerate(self.params.pred_chains):
            # predictions
            chain_preds = [
                x for x in self.data_pool.data if x.chain == chain_name
            ][0]
            predictions.append(chain_preds)

            # get tagged index
            idx_tag_name = self.params.idx_tags[i]
            if idx_tag_name is None:
                continue
            # get data with indices
            idx_data = [
                x for x in self.data_pool.data if type(x.data) == DummyData
                and x.has_usage(Indices, allow_superclasses=False)
            ]
            # get data with indices with the desired tag
            idx_data = [
                x for x in idx_data
                if idx_tag_name in x.get_usage(Indices,
                                               allow_superclasses=False).tags
            ][0]
            idx_data = idx_data.get_usage(Indices, allow_superclasses=False)
            idx = idx_data.get_tag_instances(idx_tag_name)
            tagged_idx.append(idx)

        # for text data, keep just the words
        if type(datapack.data) == Text:
            data = [x["words"] for x in datapack.data.instances]
        else:
            # without text data, `data` would be undefined below
            raise ValueError("Expected Text data in chain %s" % self.params.data_chain)
        res = []
        # contextualize wrt. each instance (specified by the ngram tag)
        num_all_ngrams = len(predictions[0].data.instances)
        num_steps = len(predictions)
        index_mapper = IndexMapper(num_all_ngrams, tagged_idx)

        ngram_tags = sorted(
            [x for x in datapack.usages[0].tags if x.startswith("ngram_inst")])
        with tictoc("Classification report building", announce=False):
            for n, ngram_tag in enumerate(ngram_tags):
                # indexes of the tokens for the current instance
                # to the entire data container
                original_instance_ix_data = datapack.usages[
                    0].get_tag_instances(ngram_tag)
                inst_obj = {
                    "instance": n,
                    "data": [data[i] for i in original_instance_ix_data],
                    "predictions": []
                }

                for local_word_idx, ix in enumerate(original_instance_ix_data):
                    word_obj = {
                        "word": data[ix],
                        "word_idx": int(local_word_idx),
                        "detailed_preds": [],
                        "overall_preds": {}
                    }
                    # for each step
                    for step_idx in range(num_steps):
                        preds = predictions[step_idx].data.instances
                        step_name = self.params.pred_chains[step_idx]
                        step_obj = {"name": step_name, "step_index": step_idx}

                        if step_idx == 0 or index_mapper.index_survives(
                                ix, target_level=step_idx):
                            # we want the position in the prediction container prior to this step
                            surv_idx = index_mapper.convert_index(
                                ix, target_level=step_idx - 1)
                            step_preds = preds[surv_idx, :]
                            scores, classes = self.get_topK_preds(
                                step_preds, self.label_mapping[step_idx],
                                self.params.only_report_labels[step_idx])
                            step_obj["step_preds"] = {
                                c: s
                                for (c, s) in zip(classes[0], scores[0])
                            }

                            if step_idx == num_steps - 1:
                                word_obj["overall_preds"] = step_obj[
                                    "step_preds"]

                            # add to detailed predictions, if not omitted
                            if not self.omit_detailed_results():
                                word_obj["detailed_preds"].append(step_obj)

                        else:
                            if self.params.report_if_fail is not None:
                                if step_name in self.params.report_if_fail:
                                    surv_idx = index_mapper.convert_index(
                                        ix, target_level=step_idx - 1)
                                    if surv_idx is None:
                                        break
                                    step_preds = preds[surv_idx, :]
                                    scores, classes = self.get_topK_preds(
                                        step_preds,
                                        self.label_mapping[step_idx],
                                        self.params.only_report_labels[step_idx])
                                    step_obj["step_preds"] = {
                                        c: s
                                        for (c, s) in zip(classes[0], scores[0])
                                    }

                                    # since this step fails, it is definitely the final step for this word
                                    word_obj["overall_preds"] = step_obj[
                                        "step_preds"]

                                    # add to detailed predictions, if not omitted
                                    if not self.omit_detailed_results():
                                        word_obj["detailed_preds"].append(
                                            step_obj)
                            else:
                                # record empty predictions for the failed step and stop
                                scores, classes = [], []
                                step_obj["step_preds"] = {}
                                break

                    # drop the detailed predictions if they ended up empty
                    if not word_obj["detailed_preds"]:
                        del word_obj["detailed_preds"]
                    inst_obj["predictions"].append(word_obj)
                res.append(inst_obj)

        self.result = {
            "results": res,
            "input_params": self.input_parameters_dict,
            "messages": self.messages
        }
Code example #16
    def produce_outputs(self):
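        """Build a per-word report of modify/delete/replace predictions, applying the input thresholds."""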
        # get input configuration data
        self.topk = None
        self.messages = []
        self.input_parameters_dict = [dp for dp in self.data_pool.data if type(dp.data) == Dictionary][0].data.instances
        self.input_parameters = to_namedtuple(self.input_parameters_dict, "input_parameters")

        # get reference data by chain name output
        self.label_mapping = []
        for mapping in self.params.label_mappings:
            # read json
            if isinstance(mapping, str):
                try:
                    with open(mapping) as f:
                        mapping = json.load(f)
                except (OSError, json.JSONDecodeError):
                    error("Label mapping must be a path to a JSON file or a literal list")
            mapping_dict = {ix: val for (ix, val) in enumerate(mapping)}
            self.label_mapping.append(mapping_dict)

        # make sure every required threshold arrived as an input parameter
        for th in self.params.thresholds:
            if th not in self.input_parameters_dict:
                self.result = {"results": [],
                               "input_params": self.input_parameters_dict,
                               "messages": [f"Threshold {th} missing from input parameters"]}
                return

        datapack = [x for x in self.data_pool.data if x.chain == self.params.data_chain][0]
        predictions, tagged_idx = [], []
        for i, chain_name in enumerate(self.params.pred_chains):
            # predictions
            chain_preds = [x for x in self.data_pool.data if x.chain == chain_name][0]
            predictions.append(chain_preds)

        # for text data, keep just the words
        if type(datapack.data) == Text:
            data = [x["words"] for x in datapack.data.instances]
        else:
            # without text data, `data` would be undefined below
            error("Expected Text data in chain %s" % self.params.data_chain)

        res = []
        
        predictions = [x.data.instances for x in predictions]
        num_all_ngrams = len(predictions[0])
        num_steps = len(predictions)

        # compute thresholding masks; assumes exactly three prediction chains,
        # in the order: modify, delete, replace
        thresholding = np.zeros((num_all_ngrams, len(self.params.thresholds)), bool)
        thresholding[:, 0] = predictions[0][:, 1] > float(self.input_parameters_dict[self.params.thresholds[0]])
        thresholding[:, 1] = predictions[1][:, 1] > float(self.input_parameters_dict[self.params.thresholds[1]])
        thresholding[:, 2] = np.any(predictions[2] > float(self.input_parameters_dict[self.params.thresholds[2]]), axis=1)

        ngram_tags = sorted([x for x in datapack.usages[0].tags if x.startswith("ngram_inst")])

        with tictoc("Classification report building", announce=False):
            for n, ngram_tag in enumerate(ngram_tags):
                # indexes of the tokens for the current instance
                # to the entire data container
                original_instance_ix_data = datapack.usages[0].get_tag_instances(ngram_tag)
                inst_obj = {"instance": n, "data": [data[i] for i in original_instance_ix_data], "predictions": []}

                for local_word_idx, ix in enumerate(original_instance_ix_data):
                    word_obj = {"word": data[ix], "word_idx": int(local_word_idx), "overall_predictions": {}}
                    detailed = []
                    
                    # for each step
                    for step_idx in range(num_steps):
                        preds = predictions[step_idx]
                        step_name = self.params.pred_chains[step_idx]
                        step_obj = {"name": step_name, "step_index": step_idx}

                        step_preds = np.expand_dims(preds[ix, :], axis=0)
                        scores, classes = self.get_topK_preds(step_preds, self.label_mapping[step_idx], self.params.only_report_labels[step_idx])
                        step_preds = {c: round(s, 4) for (c, s) in zip(classes[0], scores[0])}
                        step_obj["step_preds"] = step_preds
                        detailed.append(step_preds)

                    modified, deleted, replaced = thresholding[ix, :]
                    # the first chain is assumed to expose a "modify" label in its report
                    modify_obj = {"modified": int(modified), "prob": detailed[0]["modify"]}
                    word_obj["overall_predictions"]["modify_prediction"] = modify_obj

                    delete_obj = detailed[1]
                    # replacement candidates with their probabilities
                    objs = []
                    for word, prob in detailed[2].items():
                        objs.append({"word": word, "prob": prob})
                    replace_obj = objs

                    if modified:
                        if deleted:
                            # deleted
                            word_obj["overall_predictions"]["delete_prediction"] = delete_obj
                        elif replaced:
                            word_obj["overall_predictions"]["replace_prediction"] = replace_obj

                    if not self.omit_detailed_results():
                        word_obj["detailed_predictions"] = {"modify_prediction": modify_obj, "delete_prediction": delete_obj, "replace_prediction": replace_obj}

                    inst_obj["predictions"].append(word_obj)
                res.append(inst_obj)

        self.result = {"results": res, "input_params": self.input_parameters_dict, "messages": self.messages}