# Standard library and third-party imports used across the functions below.
import json
import logging
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from flask import jsonify, request

# Project-internal helpers (SQLiteLock, get_lock_path, read_pool, write_pool,
# read_label_history, write_label_history, read_proba, write_proba,
# read_data, read_current_labels, ...) are assumed to be importable from the
# surrounding ASReview webapp utility modules.


def get_statistics(project_id):
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        # read the labeling history and the labels of all records
        label_history = read_label_history(project_id)
        current_labels = read_current_labels(
            project_id, label_history=label_history)

    # count the number of irrelevant labels since the last relevant one
    n_since_last_inclusion = 0
    for _, inclusion in reversed(label_history):
        if inclusion == 1:
            break
        n_since_last_inclusion += 1

    n_included = len(np.where(current_labels == 1)[0])
    n_excluded = len(np.where(current_labels == 0)[0])
    n_papers = len(current_labels)
    stats = {
        "n_included": n_included,
        "n_excluded": n_excluded,
        "n_since_last_inclusion": n_since_last_inclusion,
        "n_papers": n_papers,
        "n_pool": n_papers - n_included - n_excluded
    }

    return stats
def move_label_from_labeled_to_pool(project_id, paper_i):

    # load the papers from the pool
    pool_list = read_pool(project_id)

    # load the reviewed papers
    labeled_list = read_label_history(project_id)

    labeled_list_new = []
    paper_i = int(paper_i)

    for item_id, item_label in labeled_list:
        item_id = int(item_id)
        item_label = int(item_label)

        # move the requested paper back to the pool, keep the other labels
        if paper_i == item_id:
            pool_list.append(item_id)
        else:
            labeled_list_new.append([item_id, item_label])

    # write the updated pool and label history back to the project files
    write_pool(project_id, pool_list)
    write_label_history(project_id, labeled_list_new)
def get_statistics(project_id):
    """Get statistics from project files.

    Arguments
    ---------
    project_id: str
        The id of the current project.

    Returns
    -------
    dict:
        Dictionary with statistics.
    """
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(
            fp_lock, blocking=True, lock_name="active",
            project_id=project_id):
        # read the labeled records and the pool of the active iteration
        labeled = read_label_history(project_id)
        pool = read_pool(project_id)

    # compute metrics
    n_included = n_relevant(labeled)
    n_excluded = n_irrelevant(labeled)
    n_pool = len(pool)

    return {
        "n_included": n_included,
        "n_excluded": n_excluded,
        "n_since_last_inclusion": stop_n_since_last_relevant(labeled),
        "n_papers": n_pool + n_included + n_excluded,
        "n_pool": n_pool
    }
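# Both get_statistics implementations report "n_since_last_inclusion": the
# number of consecutive irrelevant labels since the last relevant one, a
# common stopping signal. A minimal, self-contained sketch of that logic on
# a toy label history; it mirrors the explicit loop in the first
# implementation and is an assumption about what stop_n_since_last_relevant
# computes.
toy_history = [[3, 1], [7, 0], [1, 1], [9, 0], [4, 0]]  # [record_id, label]

n_since_last_inclusion = 0
for _, label in reversed(toy_history):
    if label == 1:
        break
    n_since_last_inclusion += 1

print(n_since_last_inclusion)  # 2: two irrelevant labels since record 1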
def api_get_prior(project_id):  # noqa: F401
    """Get all papers classified as prior documents."""
    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
        label_history = read_label_history(project_id)

    indices = [x[0] for x in label_history]

    records = read_data(project_id).record(indices)

    payload = {"result": []}
    for i, record in enumerate(records):
        payload["result"].append({
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": int(label_history[i][1])
        })

    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def api_get_progress_history(project_id):
    """Get the labeling history for the progress plot."""

    # get the label history
    labeled = read_label_history(project_id)
    data = []
    for _, value in labeled:
        data.append(value)

    # create a dataset with the rolling mean over the last 10 papers
    df = pd.DataFrame(data, columns=["Relevant"]) \
        .rolling(10, min_periods=1).mean()
    df["Total"] = df.index + 1

    # transform the rolling mean (a fraction) into counts within the window
    for i in range(0, len(df)):
        if df.loc[i, "Total"] < 10:
            df.loc[i, "Irrelevant"] = \
                (1 - df.loc[i, "Relevant"]) * df.loc[i, "Total"]
            df.loc[i, "Relevant"] = \
                df.loc[i, "Total"] - df.loc[i, "Irrelevant"]
        else:
            df.loc[i, "Irrelevant"] = (1 - df.loc[i, "Relevant"]) * 10
            df.loc[i, "Relevant"] = 10 - df.loc[i, "Irrelevant"]

    df = df.round(1).to_dict(orient="records")

    response = jsonify(df)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
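# A standalone sketch of the rolling-mean transformation above on a toy
# label sequence; pandas only, no project files needed. Using a window of
# min(Total, 10) collapses the two branches of the loop above into one
# expression while computing the same values.
import pandas as pd

data = [1, 0, 0, 1, 0, 1, 1, 0, 0, 0]  # 1 = relevant, 0 = irrelevant

df = pd.DataFrame(data, columns=["Relevant"]) \
    .rolling(10, min_periods=1).mean()
df["Total"] = df.index + 1

for i in range(len(df)):
    window = min(df.loc[i, "Total"], 10)
    df.loc[i, "Irrelevant"] = (1 - df.loc[i, "Relevant"]) * window
    df.loc[i, "Relevant"] = window - df.loc[i, "Irrelevant"]

print(df.round(1))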
def api_get_prior(project_id):  # noqa: F401
    """Get all papers classified as prior documents."""
    subset = request.args.get('subset', default=None, type=str)

    # check if the subset exists
    if subset is not None and subset not in ["included", "excluded"]:
        message = "Unknown subset parameter"
        return jsonify(message=message), 400

    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
        label_history = read_label_history(project_id, subset=subset)

    indices = [x[0] for x in label_history]

    records = read_data(project_id).record(indices)

    payload = {"result": []}
    for i, record in enumerate(records):
        payload["result"].append({
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": int(label_history[i][1])
        })

    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def api_get_progress_efficiency(project_id):
    """Get the cumulative number of inclusions by ASReview/at random."""

    try:
        statistics = get_data_statistics(project_id)
        labeled = read_label_history(project_id)
        data = []
        for _, value in labeled:
            data.append(value)

        # create a dataset with the cumulative number of inclusions
        df = pd.DataFrame(data, columns=["Relevant"]).cumsum()
        df["Total"] = df.index + 1
        df["Random"] = (
            df["Total"] *
            (df["Relevant"][-1:] / statistics["n_rows"]).values).round()

        df = df.round(1).to_dict(orient="records")

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load efficiency plot."), 500

    response = jsonify(df)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
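# The "Random" baseline above is the overall inclusion rate times the number
# of records screened so far. A toy sketch of that computation; n_rows
# stands in for statistics["n_rows"], and the scalar .iloc[-1] replaces the
# equivalent df["Relevant"][-1:].values indexing used above.
import pandas as pd

data = [1, 0, 1, 0, 0, 1, 0, 0]  # toy labels in screening order
n_rows = 20  # assumed total number of records in the dataset

df = pd.DataFrame(data, columns=["Relevant"]).cumsum()
df["Total"] = df.index + 1
df["Random"] = (df["Total"] * (df["Relevant"].iloc[-1] / n_rows)).round()

print(df)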
def export_to_string(project_id, export_type="csv"):

    # read the dataset into an ASReview data object
    as_data = read_data(project_id)

    # set the lock to safely read labeled, pool, and proba
    fp_lock = get_lock_path(project_id)
    with SQLiteLock(
            fp_lock, blocking=True, lock_name="active",
            project_id=project_id):
        proba = read_proba(project_id)
        pool = read_pool(project_id)
        labeled = read_label_history(project_id)

    # get the record_id of the inclusions and exclusions
    inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1]
    exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0]

    # order the pool from high to low proba; without proba, keep pool order
    if proba is not None:
        pool_ordered = proba.loc[pool, :] \
            .sort_values("proba", ascending=False).index.values
    else:
        pool_ordered = pool

    # get the ranking of the 3 subcategories
    ranking = np.concatenate(
        (
            # add the inclusions first
            inclusion_record_id,
            # add the ordered pool second
            pool_ordered,
            # add the exclusions last
            exclusion_record_id
        ),
        axis=None
    )

    # export the data to file
    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labeled, ranking=ranking)

    if export_type == "tsv":
        return as_data.to_csv(
            fp=None, sep="\t", labels=labeled, ranking=ranking)

    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(
            fp=fp_tmp_export, labels=labeled, ranking=ranking)

    raise ValueError("This export type isn't implemented.")
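# A self-contained sketch of the ranking built above: inclusions first, then
# the pool ordered by predicted relevance, exclusions last. Toy data only;
# the proba frame stands in for the one returned by read_proba.
import numpy as np
import pandas as pd

labeled = [[0, 1], [3, 0], [5, 1]]  # [record_id, label]
pool = [1, 2, 4]
proba = pd.DataFrame({"proba": [0.2, 0.9, 0.1, 0.4, 0.7, 0.8]})

inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1]
exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0]

pool_ordered = proba.loc[pool, :] \
    .sort_values("proba", ascending=False).index.values

ranking = np.concatenate(
    (inclusion_record_id, pool_ordered, exclusion_record_id), axis=None)

print(ranking)  # [0 5 1 4 2 3]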
def api_get_prior_stats(project_id):  # noqa: F401
    """Get statistics on the papers classified as prior documents."""
    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
        label_history = read_label_history(project_id)

    counter_prior = Counter([x[1] for x in label_history])

    response = jsonify({
        "n_prior": len(label_history),
        "n_inclusions": counter_prior[1],
        "n_exclusions": counter_prior[0]
    })
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
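# The Counter tally above in isolation: Counter returns 0 for missing keys,
# so an all-relevant or all-irrelevant history needs no special casing.
from collections import Counter

label_history = [[2, 1], [8, 0], [5, 1], [9, 0], [1, 0]]  # toy priors

counter_prior = Counter([x[1] for x in label_history])
print({
    "n_prior": len(label_history),      # 5
    "n_inclusions": counter_prior[1],   # 2
    "n_exclusions": counter_prior[0],   # 3
})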
def move_label_from_pool_to_labeled(project_id, paper_i, label):

    # load the papers from the pool
    pool_idx = read_pool(project_id)

    # Remove the paper from the pool.
    try:
        pool_idx.remove(int(paper_i))
    except (IndexError, ValueError):
        return

    write_pool(project_id, pool_idx)

    # Add the paper to the reviewed papers.
    labeled = read_label_history(project_id)
    labeled.append([int(paper_i), int(label)])
    write_label_history(project_id, labeled)
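# move_label_from_pool_to_labeled and move_label_from_labeled_to_pool are
# inverses. A toy in-memory round trip, with plain lists standing in for
# the read_pool/write_pool and label-history project files:
pool = [1, 2, 3]
labeled = [[0, 1]]  # [record_id, label]

# pool -> labeled: label record 2 as relevant
pool.remove(2)
labeled.append([2, 1])
print(pool, labeled)  # [1, 3] [[0, 1], [2, 1]]

# labeled -> pool: undo the decision on record 2
labeled = [pair for pair in labeled if pair[0] != 2]
pool.append(2)
print(pool, labeled)  # [1, 3, 2] [[0, 1]]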
def api_get_prior_stats(project_id):  # noqa: F401
    """Get statistics on the papers classified as prior documents."""
    try:
        lock_fp = get_lock_path(project_id)
        with SQLiteLock(
                lock_fp, blocking=True, lock_name="active",
                project_id=project_id):
            label_history = read_label_history(project_id)

        counter_prior = Counter([x[1] for x in label_history])

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load prior information."), 500

    response = jsonify({
        "n_prior": len(label_history),
        "n_inclusions": counter_prior[1],
        "n_exclusions": counter_prior[0]
    })
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id):
            # Get all labels since the last run. If there are no new
            # labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)

        reviewer = get_reviewer(dataset=data_fp, mode="minimal",
                                **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history,
                                        old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        # update the pool: keep only queried records still in the pool
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id):
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id):
            # Get all labels since the last run. If there are no new
            # labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        # drop the deprecated abstract_only argument if present
        try:
            del asr_kwargs["abstract_only"]
        except KeyError:
            pass

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal",
                                **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history,
                                         old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and write the probabilities
        # important: the pool is sorted on query order
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id):

            # read the pool
            current_pool = read_pool(project_id)

            # keep only the queried records that are still in the pool
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx
                            if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
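# Both train_model variants only retrain when the stored training history
# and the project's label history diverge. A hypothetical stand-in for
# _get_diff_history, assuming it returns the [record_id, label] pairs not
# yet used for training; the real helper may compute this differently.
def diff_history(new_history, old_history):
    # pairs in the new history that are absent from the old one
    old = {tuple(pair) for pair in old_history}
    return [pair for pair in new_history if tuple(pair) not in old]


old = [[4, 1], [7, 0]]
new = [[4, 1], [7, 0], [2, 1], [9, 0]]
print(diff_history(new, old))  # [[2, 1], [9, 0]]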