Example #1
0
    def recall_at_limit(self, limit=0, result_format="percentage",
                        final_labels=False):
        """Compute how many papers were labeled before all but `limit`
        of the inclusions had been found, averaged over all loggers.

        Arguments
        ---------
        limit: int
            Number of inclusions that are allowed to remain unfound.
        result_format: str
            Desired output format: "number" or "percentage".
        final_labels: bool
            If True, use `self.final_labels` instead of `self.labels`.
        """
        if final_labels:
            labels = self.final_labels
        else:
            labels = self.labels

        # Total number of inclusions in the dataset.
        n_included = np.sum(labels)
        all_n_labeled = []
        for logger in self.loggers.values():
            n_current_included = 0
            n_labeled = 0
            _, n_initial = _get_labeled_order(logger)
            for query_i in range(logger.n_queries()):
                label_idx = logger.get("label_idx", query_i=query_i)
                for idx in label_idx:
                    inclusion = labels[idx]
                    n_current_included += inclusion
                    n_labeled += 1
                    # Stop as soon as all but `limit` inclusions are found.
                    if n_current_included == n_included - limit:
                        all_n_labeled.append(n_labeled)
                        break
                if n_current_included == n_included - limit:
                    break
        if result_format == "percentage":
            mult = 100/(len(labels)-n_initial)
        else:
            mult = 1
        return mult*(np.average(all_n_labeled)-n_initial)
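
A minimal usage sketch (not part of the source): it assumes an analysis
object, here called `analysis`, whose `labels` and `loggers` have already
been populated from review state files; the constructor shown is an
assumption.

    # Hypothetical construction; `Analysis.from_dir` is an assumed helper.
    analysis = Analysis.from_dir("output/")
    # Percentage of the post-initial dataset read before all but one
    # inclusion was found.
    print(analysis.recall_at_limit(limit=1, result_format="percentage"))
    # Average number of papers labeled (beyond the initial set) to find
    # every inclusion.
    print(analysis.recall_at_limit(limit=0, result_format="number"))
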
Example #2
0
    def avg_time_to_discovery(self, result_format="number"):
        """Estimate the Time to Discovery (TD) for each paper.

        Get the best/last estimate on how long it takes to find a paper.

        Arguments
        ---------
        result_format: str
            Desired output format: "number", "fraction" or "percentage".

        Returns
        -------
        dict:
            For each inclusion, key=paper_id, value=avg time.
        """
        labels = self.labels
        one_labels = np.where(labels == 1)[0]
        time_results = {label: [] for label in one_labels}

        # Iterate over all state files
        for state in self.states.values():
            # Get the order in which records were labeled
            label_order, n = _get_labeled_order(state)
            # Get the ranking of all papers at the last query
            proba_order = _get_last_proba_order(state)

            # Adjust factor, depending on the desired output format
            if result_format == "percentage":
                time_mult = 100 / (len(labels) - n)
            elif result_format == "fraction":
                time_mult = 1 / (len(labels) - n)
            else:
                time_mult = 1

            # Get the time to discovery
            for i_time, idx in enumerate(label_order[n:]):
                # for all inclusions that were found/labeled
                if labels[idx] == 1:
                    time_results[idx].append(time_mult * (i_time + 1))
            for i_time, idx in enumerate(proba_order):
                # for all inclusions that weren't found/labeled
                if labels[idx] == 1 and idx not in label_order[:n]:
                    time_results[idx].append(time_mult *
                                             (i_time + len(label_order) + 1))

        results = {}

        # Merge the results of all state files
        for label, trained_time in time_results.items():
            if len(trained_time) > 0:
                results[label] = np.average(trained_time)

        return results
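
A hedged sketch of how the return value might be used, again assuming the
illustrative `analysis` object from above:

    # Per-paper average time to discovery, as a fraction of the dataset.
    td = analysis.avg_time_to_discovery(result_format="fraction")
    # `td` maps each included record's index to its average discovery time,
    # e.g. {12: 0.031, 57: 0.188, ...} (values here are invented).
    slowest = max(td, key=td.get)
    print(f"Hardest-to-find inclusion: record {slowest} at {td[slowest]:.3f}")
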
Example #3
0
    def avg_time_to_discovery(self, result_format="number"):
        """Get the best/last estimate on how long it takes to find a paper.

        Returns
        -------
        dict:
            For each inclusion, key=paper_id, value=avg time.
        """
        labels = self.labels

        one_labels = np.where(labels == 1)[0]
        time_results = {label: [] for label in one_labels}
        n_initial = []

        for i_file, logger in enumerate(self.loggers.values()):
            # Order in which records were labeled and the size of the
            # initial (prior) set for this logger.
            label_order, n = _get_labeled_order(logger)
            # Ranking of the remaining pool at the last query.
            proba_order = _get_last_proba_order(logger)
            n_initial.append(n)

            # Inclusions that were labeled during the review.
            for i_time, idx in enumerate(label_order):
                if labels[idx] == 1:
                    time_results[idx].append(i_time)

            # Inclusions still in the pool: use their rank in the final
            # model ordering as the estimate.
            for i_time, idx in enumerate(proba_order):
                if labels[idx] == 1 and len(time_results[idx]) <= i_file:
                    time_results[idx].append(i_time + len(label_order))

            # Inclusions that appear in neither order get the maximum time.
            for idx in time_results:
                if len(time_results[idx]) <= i_file:
                    time_results[idx].append(
                        len(label_order) + len(proba_order))

        results = {}
        for label in time_results:
            trained_time = []
            for i_file, time in enumerate(time_results[label]):
                # Skip inclusions that were already in the initial set.
                if time >= n_initial[i_file]:
                    if result_format == "percentage":
                        time_measure = 100 * time / (len(labels) -
                                                     n_initial[i_file])
                    else:
                        time_measure = time
                    trained_time.append(time_measure)
            if len(trained_time) == 0:
                results[label] = 0
            else:
                results[label] = np.average(trained_time)
        return results
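
A worked example of the "percentage" conversion above (all numbers are
invented for illustration):

    # Suppose len(labels) == 1000 and this logger's initial set held
    # n_initial == 10 records. An inclusion whose 0-based position in the
    # labeling order is time == 109 is reported as
    #     100 * 109 / (1000 - 10)   # ~= 11.0
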
Example #4
0
    def _print_logs(self):
        """Build a human-readable summary of the review log."""
        self._log_dict["time"]["end_time"] = str(datetime.now())
        label_order, _ = _get_labeled_order(self)
        try:
            labels_assigned = self.get("labels")[label_order]
        except (KeyError, IndexError):
            return ""
        labels = list(zip(label_order, labels_assigned))

        # One line per labeled record: "<record index> => <label>".
        log_str = "Labeled during review:\n\n"
        for label in labels:
            log_str += f"{label[0]} => {label[1]}\n"

        # Remaining pool, ranked by the last model's inclusion probability.
        pool_order = _get_last_proba_order(self)
        if len(pool_order) > 0:
            log_str += "\n\nMost likely included according to ASReview:\n\n"
            for idx in pool_order:
                log_str += f"{idx}\n"

        return log_str
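
For reference, a sketch of the string this produces (record indices and
labels are invented):

    Labeled during review:

    102 => 1
    57 => 0
    233 => 1

    Most likely included according to ASReview:

    88
    412
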
Example #5
0
    def avg_time_to_discovery(self, result_format="number"):
        """Get the best/last estimate on how long it takes to find a paper.

        Returns
        -------
        dict:
            For each inclusion, key=paper_id, value=avg time.
        """
        labels = self.labels

        one_labels = np.where(labels == 1)[0]
        time_results = {label: [] for label in one_labels}

        for state in self.states.values():
            label_order, n = _get_labeled_order(state)
            proba_order = _get_last_proba_order(state)
            if result_format == "percentage":
                time_mult = 100 / (len(labels) - n)
            elif result_format == "fraction":
                time_mult = 1 / (len(labels) - n)
            else:
                time_mult = 1

            for i_time, idx in enumerate(label_order[n:]):
                if labels[idx] == 1:
                    time_results[idx].append(time_mult * (i_time + 1))

            for i_time, idx in enumerate(proba_order):
                if labels[idx] == 1 and idx not in label_order[:n]:
                    time_results[idx].append(time_mult *
                                             (i_time + len(label_order) + 1))

        results = {}
        for label, trained_time in time_results.items():
            if len(trained_time) > 0:
                results[label] = np.average(trained_time)

        return results
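
One way the per-paper values might be aggregated into a single score; the
aggregation is illustrative and not part of the source:

    import numpy as np

    # `td` is the dict returned above: paper_id -> average time to discovery.
    td = analysis.avg_time_to_discovery(result_format="fraction")
    mean_td = np.mean(list(td.values()))  # mean time to discovery
    print(f"Mean time to discovery over {len(td)} inclusions: {mean_td:.3f}")
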
Example #6
0
    def limits(self, prob_allow_miss=[0.1], result_format="percentage"):
        """For each query, compute the number of papers for a criterium.

        A criterium is the average number of papers missed. For example,
        with 0.1, the criterium is that after reading x papers, there is
        (about) a 10% chance that one paper is not included. Another example,
        with 2.0, there are on average 2 papers missed after reading x papers.
        The value for x is returned for each query and probability by the
        function.

        Arguments
        ---------
        prob_allow_miss: list, float
            Sets the criterium for how many papers can be missed.

        Returns
        -------
        dict:
            "x_range": number of papers read at each query.
            "limits": for each probability, the number of papers that need
            to be read to meet that criterion, at each point of "x_range".
        """
        if not isinstance(prob_allow_miss, list):
            prob_allow_miss = [prob_allow_miss]
        state = self.states[self._first_file]
        n_queries = state.n_queries()
        results = {
            "x_range": [],
            "limits": [[] for _ in range(len(prob_allow_miss))],
        }

        n_train = 0
        _, n_initial = _get_labeled_order(state)
        for query_i in range(n_queries):
            new_limits = _get_limits(self.states,
                                     query_i,
                                     self.labels,
                                     proba_allow_miss=prob_allow_miss)

            try:
                new_train_idx = state.get("train_idx", query_i)
            except KeyError:
                new_train_idx = None

            if new_train_idx is not None:
                n_train = len(new_train_idx)

            if new_limits is not None:
                if result_format == "percentage":
                    normalizer = 100 / (len(self.labels) - n_initial)
                else:
                    normalizer = 1
                results["x_range"].append((n_train - n_initial) * normalizer)
                for i_prob in range(len(prob_allow_miss)):
                    results["limits"][i_prob].append(
                        (new_limits[i_prob] - n_initial) * normalizer)

        if result_format == "percentage":
            res_dtype = np.float
        else:
            res_dtype = np.int

        results["x_range"] = np.array(results["x_range"], dtype=res_dtype)
        for i_prob in range(len(prob_allow_miss)):
            results["limits"][i_prob] = np.array(results["limits"][i_prob],
                                                 res_dtype)
        return results
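
A hedged usage sketch (the `analysis` object and its construction remain
assumptions, as above):

    res = analysis.limits(prob_allow_miss=[0.1, 1.0],
                          result_format="percentage")
    # res["x_range"][i] is the percentage of papers read at query i;
    # res["limits"][0][i] and res["limits"][1][i] are the corresponding
    # reading limits for allowing ~0.1 and ~1.0 missed papers on average.
    for x, lim in zip(res["x_range"], res["limits"][0]):
        print(f"read {x:.1f}% -> limit {lim:.1f}%")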