def recall_at_limit(self, limit=0, result_format="percentage", final_labels=False):
    """Average number of labeled records needed to find all but `limit` inclusions.

    Arguments
    ---------
    limit: int
        Number of inclusions that may remain undiscovered.
    result_format: str
        "percentage" scales the result by the non-initial pool size,
        anything else returns the raw count.
    final_labels: bool
        If True, use `self.final_labels` instead of `self.labels`.

    Returns
    -------
    float:
        Average (over loggers) of labeled records past the initial set,
        optionally expressed as a percentage of the pool.
    """
    labels = self.final_labels if final_labels else self.labels
    n_included = np.sum(labels)
    # Stop once this many inclusions have been seen.
    target = n_included - limit

    all_n_labeled = []
    for logger in self.loggers.values():
        n_found = 0
        n_labeled = 0
        reached_target = False
        _, n_initial = _get_labeled_order(logger)
        for query_i in range(logger.n_queries()):
            for idx in logger.get("label_idx", query_i=query_i):
                n_found += labels[idx]
                n_labeled += 1
                if n_found == target:
                    all_n_labeled.append(n_labeled)
                    reached_target = True
                    break
            if reached_target:
                break

    # NOTE(review): n_initial here is the value from the last logger iterated;
    # this mirrors the original behavior.
    if result_format == "percentage":
        mult = 100 / (len(labels) - n_initial)
    else:
        mult = 1
    return mult * (np.average(all_n_labeled) - n_initial)
def avg_time_to_discovery(self, result_format="number"):
    """Estimate the Time to Discovery (TD) for each inclusion.

    For every relevant record, collect how long each state file took (or
    would have taken, per the final ranking) to reach it, then average
    across state files.

    Arguments
    ---------
    result_format: str
        Desired output format: "number", "fraction" or "percentage".

    Returns
    -------
    dict:
        For each inclusion, key=paper_id, value=avg time.
    """
    labels = self.labels
    inclusion_ids = np.where(labels == 1)[0]
    discovery_times = {rec_id: [] for rec_id in inclusion_ids}

    # Collect one measurement per state file for every inclusion.
    for state in self.states.values():
        # Order in which records were labeled, and the number of priors.
        label_order, n_priors = _get_labeled_order(state)
        # Ranking of the remaining records at the last query.
        proba_order = _get_last_proba_order(state)

        # Scale factor for the requested output format.
        if result_format == "percentage":
            scale = 100 / (len(labels) - n_priors)
        elif result_format == "fraction":
            scale = 1 / (len(labels) - n_priors)
        else:
            scale = 1

        prior_records = label_order[:n_priors]
        # Inclusions that were actually labeled during the review.
        for rank, rec_id in enumerate(label_order[n_priors:], start=1):
            if labels[rec_id] == 1:
                discovery_times[rec_id].append(scale * rank)
        # Inclusions never labeled: position in the final ranking,
        # counted after every labeled record.
        offset = len(label_order) + 1
        for rank, rec_id in enumerate(proba_order):
            if labels[rec_id] == 1 and rec_id not in prior_records:
                discovery_times[rec_id].append(scale * (rank + offset))

    # Merge: average across state files, dropping inclusions with no data.
    return {
        rec_id: np.average(times)
        for rec_id, times in discovery_times.items()
        if len(times) > 0
    }
def avg_time_to_discovery(self, result_format="number"):
    """Get the best/last estimate on how long it takes to find a paper.

    Returns
    -------
    dict:
        For each inclusion, key=paper_id, value=avg time.
    """
    labels = self.labels
    one_labels = np.where(labels == 1)[0]
    # One list of discovery times per inclusion; each logger file appends
    # at most one entry, so after processing file i every list has i+1 items.
    time_results = {label: [] for label in one_labels}
    # Number of initially-labeled records, per logger file (same index order
    # as the appends into time_results).
    n_initial = []

    for i_file, logger in enumerate(self.loggers.values()):
        label_order, n = _get_labeled_order(logger)
        proba_order = _get_last_proba_order(logger)
        n_initial.append(n)
        # Inclusions labeled during the review: position in labeling order.
        for i_time, idx in enumerate(label_order):
            if labels[idx] == 1:
                time_results[idx].append(i_time)
        # Inclusions only present in the final ranking: position after all
        # labeled records. The `<= i_file` guard ensures we only append for
        # inclusions not already recorded for this file.
        for i_time, idx in enumerate(proba_order):
            if labels[idx] == 1 and len(time_results[idx]) <= i_file:
                time_results[idx].append(i_time + len(label_order))
        # Any inclusion still missing for this file gets the worst case:
        # after everything labeled and everything ranked.
        for idx in time_results:
            if len(time_results[idx]) <= i_file:
                time_results[idx].append(
                    len(label_order) + len(proba_order))

    results = {}
    for label in time_results:
        trained_time = []
        for i_file, time in enumerate(time_results[label]):
            # Skip times that fall within that file's initial labels.
            if time >= n_initial[i_file]:
                if result_format == "percentage":
                    time_measure = 100 * time / (len(labels) - n_initial[i_file])
                else:
                    time_measure = time
                trained_time.append(time_measure)
        # Inclusions with no usable measurement default to 0.
        if len(trained_time) == 0:
            results[label] = 0
        else:
            results[label] = np.average(trained_time)
    return results
def _print_logs(self):
    """Build a text summary of the review: labeled records, then the ranking.

    Also stamps the current time into the log dict as the end time.
    Returns an empty string when the stored labels cannot be indexed.
    """
    self._log_dict["time"]["end_time"] = str(datetime.now())
    label_order, _ = _get_labeled_order(self)
    try:
        labels_assigned = self.get("labels")[label_order]
    except (KeyError, IndexError):
        return ""

    # Assemble the report in parts and join once at the end.
    parts = ["Labeled during review:\n\n"]
    for record_idx, assigned in zip(label_order, labels_assigned):
        parts.append(f"{record_idx} => {assigned}\n")

    pool_order = _get_last_proba_order(self)
    if len(pool_order) > 0:
        parts.append("\n\n Most likely included according to ASReview:\n\n")
        for idx in pool_order:
            parts.append(f"{idx}\n")

    return "".join(parts)
def avg_time_to_discovery(self, result_format="number"):
    """Get the best/last estimate on how long it takes to find a paper.

    Arguments
    ---------
    result_format: str
        Desired output format: "number", "fraction" or "percentage".

    Returns
    -------
    dict:
        For each inclusion, key=paper_id, value=avg time.
    """
    labels = self.labels
    relevant = np.where(labels == 1)[0]
    found_at = {rec: [] for rec in relevant}

    for state in self.states.values():
        label_order, n_start = _get_labeled_order(state)
        last_ranking = _get_last_proba_order(state)

        # Normalization factor for the chosen output format.
        pool_size = len(labels) - n_start
        if result_format == "percentage":
            factor = 100 / pool_size
        elif result_format == "fraction":
            factor = 1 / pool_size
        else:
            factor = 1

        starting_set = label_order[:n_start]

        # Relevant records that were labeled: time = 1-based position
        # after the initial set.
        for pos, rec in enumerate(label_order[n_start:]):
            if labels[rec] == 1:
                found_at[rec].append(factor * (pos + 1))

        # Relevant records never labeled: time = position in the final
        # ranking, shifted past every labeled record.
        for pos, rec in enumerate(last_ranking):
            if labels[rec] == 1 and rec not in starting_set:
                found_at[rec].append(factor * (pos + len(label_order) + 1))

    # Average over state files; records without measurements are omitted.
    results = {}
    for rec, times in found_at.items():
        if times:
            results[rec] = np.average(times)
    return results
def limits(self, prob_allow_miss=[0.1], result_format="percentage"):
    """For each query, compute the number of papers for a criterium.

    A criterium is the average number of papers missed. For example,
    with 0.1, the criterium is that after reading x papers, there is
    (about) a 10% chance that one paper is not included. Another
    example, with 2.0, there are on average 2 papers missed after
    reading x papers. The value for x is returned for each query and
    probability by the function.

    Arguments
    ---------
    prob_allow_miss: list, float
        Sets the criterium for how many papers can be missed.
        (A mutable default is safe here: the argument is only rebound,
        never mutated.)
    result_format: str
        "percentage" scales results by the non-initial pool size;
        anything else returns raw counts.

    Returns
    -------
    dict:
        One entry, "x_range" with the number of papers read.
        List, "limits" of results for each probability and
        at # papers read.
    """
    if not isinstance(prob_allow_miss, list):
        prob_allow_miss = [prob_allow_miss]
    state = self.states[self._first_file]
    n_queries = state.n_queries()
    results = {
        "x_range": [],
        "limits": [[] for _ in range(len(prob_allow_miss))],
    }

    n_train = 0
    _, n_initial = _get_labeled_order(state)
    for query_i in range(n_queries):
        new_limits = _get_limits(self.states, query_i, self.labels,
                                 proba_allow_miss=prob_allow_miss)

        # Some queries may not have a train_idx stored; keep the last
        # known training size in that case.
        try:
            new_train_idx = state.get("train_idx", query_i)
        except KeyError:
            new_train_idx = None

        if new_train_idx is not None:
            n_train = len(new_train_idx)

        if new_limits is not None:
            if result_format == "percentage":
                normalizer = 100 / (len(self.labels) - n_initial)
            else:
                normalizer = 1
            results["x_range"].append((n_train - n_initial) * normalizer)
            for i_prob in range(len(prob_allow_miss)):
                results["limits"][i_prob].append(
                    (new_limits[i_prob] - n_initial) * normalizer)

    # BUGFIX: np.float / np.int were deprecated in NumPy 1.20 and removed
    # in NumPy 1.24; they were plain aliases for the builtins, so use the
    # builtin types directly.
    if result_format == "percentage":
        res_dtype = float
    else:
        res_dtype = int
    results["x_range"] = np.array(results["x_range"], dtype=res_dtype)
    for i_prob in range(len(prob_allow_miss)):
        results["limits"][i_prob] = np.array(results["limits"][i_prob],
                                             res_dtype)
    return results