def export_to_string(project_id, export_type="csv"): fp_lock = get_lock_path(project_id) as_data = read_data(project_id) with SQLiteLock(fp_lock, blocking=True, lock_name="active"): proba = read_proba(project_id) if proba is None: proba = np.flip(np.arange(len(as_data))) else: proba = np.array(proba) labels = read_current_labels(project_id, as_data=as_data) pool_idx = np.where(labels == LABEL_NA)[0] one_idx = np.where(labels == 1)[0] zero_idx = np.where(labels == 0)[0] proba_order = np.argsort(-proba[pool_idx]) ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx), axis=None) if export_type == "csv": return as_data.to_csv(fp=None, labels=labels, ranking=ranking) if export_type == "excel": get_tmp_path(project_id).mkdir(exist_ok=True) fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx") return as_data.to_excel(fp=fp_tmp_export, labels=labels, ranking=ranking) else: raise ValueError("This export type isn't implemented.")
def remove_dataset_to_project(project_id, file_name):
    """Remove dataset from project."""

    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active",
                    project_id=project_id):

        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # remove the path from the project file
        data_fn = project_dict["dataset_path"]
        del project_dict["dataset_path"]

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # files to remove
        data_path = get_data_file_path(project_id, data_fn)
        pool_path = get_pool_path(project_id)
        labeled_path = get_labeled_path(project_id)

        os.remove(str(data_path))
        os.remove(str(pool_path))
        os.remove(str(labeled_path))
def api_get_prior(project_id): # noqa: F401 """Get all papers classified as prior documents """ subset = request.args.get('subset', default=None, type=str) # check if the subset exists if subset is not None and subset not in ["included", "excluded"]: message = "Unkown subset parameter" return jsonify(message=message), 400 lock_fp = get_lock_path(project_id) with SQLiteLock(lock_fp, blocking=True, lock_name="active"): label_history = read_label_history(project_id, subset=subset) indices = [x[0] for x in label_history] records = read_data(project_id).record(indices) payload = {"result": []} for i, record in enumerate(records): payload["result"].append({ "id": int(record.record_id), "title": record.title, "abstract": record.abstract, "authors": record.authors, "keywords": record.keywords, "included": int(label_history[i][1]) }) response = jsonify(payload) response.headers.add('Access-Control-Allow-Origin', '*') return response
def api_get_prior(project_id): # noqa: F401 """Get all papers classified as prior documents """ lock_fp = get_lock_path(project_id) with SQLiteLock(lock_fp, blocking=True, lock_name="active"): label_history = read_label_history(project_id) indices = [x[0] for x in label_history] records = read_data(project_id).record(indices) payload = {"result": []} for i, record in enumerate(records): payload["result"].append({ "id": int(record.record_id), "title": record.title, "abstract": record.abstract, "authors": record.authors, "keywords": record.keywords, "included": int(label_history[i][1]) }) response = jsonify(payload) response.headers.add('Access-Control-Allow-Origin', '*') return response
def get_statistics(project_id):
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active",
                    project_id=project_id):
        # read the label history and the current labels
        label_history = read_label_history(project_id)
        current_labels = read_current_labels(
            project_id, label_history=label_history)

    # count the number of labels since the last inclusion
    n_since_last_inclusion = 0
    for _, inclusion in reversed(label_history):
        if inclusion == 1:
            break
        n_since_last_inclusion += 1

    n_included = len(np.where(current_labels == 1)[0])
    n_excluded = len(np.where(current_labels == 0)[0])
    n_papers = len(current_labels)

    stats = {
        "n_included": n_included,
        "n_excluded": n_excluded,
        "n_since_last_inclusion": n_since_last_inclusion,
        "n_papers": n_papers,
        "n_pool": n_papers - n_included - n_excluded
    }
    return stats
def label_instance(project_id, paper_i, label, retrain_model=True): """Label a paper after reviewing the abstract. """ paper_i = int(paper_i) label = int(label) fp_lock = get_lock_path(project_id) with SQLiteLock( fp_lock, blocking=True, lock_name="active", project_id=project_id): # get the index of the active iteration if int(label) in [0, 1]: move_label_from_pool_to_labeled(project_id, paper_i, label) else: move_label_from_labeled_to_pool(project_id, paper_i) if retrain_model: # Update the model (if it isn't busy). py_exe = _get_executable() run_command = [py_exe, "-m", "asreview", "web_run_model", project_id] subprocess.Popen(run_command)
def export_to_string(project_id, export_type="csv"): # read the dataset into a ASReview data object as_data = read_data(project_id) # set the lock to safely read labeled, pool, and proba fp_lock = get_lock_path(project_id) with SQLiteLock( fp_lock, blocking=True, lock_name="active", project_id=project_id ): proba = read_proba(project_id) pool = read_pool(project_id) labeled = read_label_history(project_id) # get the record_id of the inclusions and exclusions inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1] exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0] # order the pool from high to low proba if proba is not None: pool_ordered = proba.loc[pool, :] \ .sort_values("proba", ascending=False).index.values else: pool_ordered = pool_ordered # get the ranking of the 3 subcategories ranking = np.concatenate( ( # add the inclusions first inclusion_record_id, # add the ordered pool second pool_ordered, # add the exclusions last exclusion_record_id ), axis=None ) # export the data to file if export_type == "csv": return as_data.to_csv(fp=None, labels=labeled, ranking=ranking) if export_type == "tsv": return as_data.to_csv( fp=None, sep="\t", labels=labeled, ranking=ranking) if export_type == "excel": get_tmp_path(project_id).mkdir(exist_ok=True) fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx") return as_data.to_excel( fp=fp_tmp_export, labels=labeled, ranking=ranking) else: raise ValueError("This export type isn't implemented.")
def api_random_prior_papers(project_id):  # noqa: F401
    """Get a selection of random papers to find exclusions.

    This set of papers is extracted from the pool, but without
    the already labeled items.
    """

    lock_fp = get_lock_path(project_id)
    with SQLiteLock(lock_fp, blocking=True, lock_name="active",
                    project_id=project_id):
        pool = read_pool(project_id)

    #     with open(get_labeled_path(project_id, 0), "r") as f_label:
    #         prior_labeled = json.load(f_label)

    # exclude the already labeled items from the random selection.
    # prior_labeled_index = [int(label) for label in prior_labeled.keys()]
    # pool = [i for i in pool if i not in prior_labeled_index]

    # sample from the pool (the pool is already shuffled when it is
    # initialized, but sampling randomly again does no harm)
    try:
        pool_random = np.random.choice(pool, 1, replace=False)[0]
    except Exception:
        raise ValueError("Not enough random indices to sample from.")

    try:
        record = read_data(project_id).record(pool_random, by_index=False)

        payload = {"result": []}
        payload["result"].append({
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": None
        })

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load random documents."), 500

    response = jsonify(payload)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def add_dataset_to_project(project_id, file_name): """Add file path to the project file. Add file to data subfolder and fill the pool of iteration 0. """ project_file_path = get_project_file_path(project_id) # clean temp project files clean_project_tmp_files(project_id) with SQLiteLock( get_lock_path(project_id), blocking=True, lock_name="active", project_id=project_id ): # open the projects file with open(project_file_path, "r") as f_read: project_dict = json.load(f_read) # add path to dict (overwrite if already exists) project_dict["dataset_path"] = file_name with open(project_file_path, "w") as f_write: json.dump(project_dict, f_write) # fill the pool of the first iteration as_data = read_data(project_id) if as_data.labels is not None: unlabeled = np.where(as_data.labels == LABEL_NA)[0] pool_indices = as_data.record_ids[unlabeled] labeled_indices = np.where(as_data.labels != LABEL_NA)[0] label_indices = list(zip( as_data.record_ids[labeled_indices].tolist(), as_data.labels[labeled_indices].tolist() )) else: pool_indices = as_data.record_ids label_indices = [] np.random.shuffle(pool_indices) write_pool(project_id, pool_indices.tolist()) # make a empty qeue for the items to label write_label_history(project_id, label_indices)
def get_instance(project_id): """Get a new instance to review. Arguments --------- project_id: str The id of the current project. """ fp_lock = get_lock_path(project_id) with SQLiteLock(fp_lock, blocking=True, lock_name="active"): pool_idx = read_pool(project_id) logging.info(f"Requesting {pool_idx[0]} from project {project_id}") return pool_idx[0]
def api_get_prior_stats(project_id): # noqa: F401 """Get all papers classified as prior documents """ lock_fp = get_lock_path(project_id) with SQLiteLock(lock_fp, blocking=True, lock_name="active"): label_history = read_label_history(project_id) counter_prior = Counter([x[1] for x in label_history]) response = jsonify({ "n_prior": len(label_history), "n_inclusions": counter_prior[1], "n_exclusions": counter_prior[0] }) response.headers.add('Access-Control-Allow-Origin', '*') return response
def export_to_string(project_id):
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)

        labels = read_current_labels(project_id, as_data=as_data)

        pool_idx = np.where(labels == LABEL_NA)[0]
        one_idx = np.where(labels == 1)[0]
        zero_idx = np.where(labels == 0)[0]

        proba_order = np.argsort(-proba[pool_idx])
        ranking = np.concatenate(
            (one_idx, pool_idx[proba_order], zero_idx), axis=None)

        return as_data.to_csv(fp=None, labels=labels, ranking=ranking)
def get_instance(project_id): """Get a new instance to review. Arguments --------- project_id: str The id of the current project. """ fp_lock = get_lock_path(project_id) with SQLiteLock( fp_lock, blocking=True, lock_name="active", project_id=project_id): pool_idx = read_pool(project_id) if len(pool_idx) > 0: return pool_idx[0] else: # end of pool return None
def add_dataset_to_project(project_id, file_name): """Add file path to the project file. Add file to data subfolder and fill the pool of iteration 0. """ project_file_path = get_project_file_path(project_id) fp_lock = get_lock_path(project_id) with SQLiteLock(fp_lock, blocking=True, lock_name="active"): # open the projects file with open(project_file_path, "r") as f_read: project_dict = json.load(f_read) # add path to dict (overwrite if already exists) project_dict["dataset_path"] = file_name with open(project_file_path, "w") as f_write: json.dump(project_dict, f_write) # fill the pool of the first iteration as_data = read_data(project_id) if as_data.labels is not None: unlabeled = np.where(as_data.labels == LABEL_NA)[0] pool_indices = as_data.record_ids[unlabeled] label_indices_included = \ [[int(x), 1] for x in np.where(as_data.labels == 1)[0]] label_indices_excluded = \ [[int(x), 0] for x in np.where(as_data.labels == 0)[0]] label_indices = label_indices_included + label_indices_excluded else: pool_indices = as_data.record_ids label_indices = [] np.random.shuffle(pool_indices) write_pool(project_id, pool_indices.tolist()) # make a empty qeue for the items to label write_label_history(project_id, label_indices)
def label_instance(project_id, paper_i, label, retrain_model=True): """Label a paper after reviewing the abstract. """ paper_i = int(paper_i) label = int(label) fp_lock = get_lock_path(project_id) with SQLiteLock(fp_lock, blocking=True, lock_name="active"): # get the index of the active iteration if int(label) in [0, 1]: move_label_from_pool_to_labeled(project_id, paper_i, label) else: move_label_from_labeled_to_pool(project_id, paper_i, label) if retrain_model: # Update the model (if it isn't busy). run_command = f"python -m asreview web_run_model '{project_id}'" subprocess.Popen(shlex.split(run_command))
def get_statistics(project_id): """Get statistics from project files. Arguments --------- project_id: str The id of the current project. Returns ------- dict: Dictonary with statistics. """ fp_lock = get_lock_path(project_id) with SQLiteLock( fp_lock, blocking=True, lock_name="active", project_id=project_id ): # get the index of the active iteration labeled = read_label_history(project_id) pool = read_pool(project_id) # compute metrics n_included = n_relevant(labeled) n_excluded = n_irrelevant(labeled) n_pool = len(pool) return { "n_included": n_included, "n_excluded": n_excluded, "n_since_last_inclusion": stop_n_since_last_relevant(labeled), "n_papers": n_pool + n_included + n_excluded, "n_pool": n_pool }
def api_get_prior_stats(project_id): # noqa: F401 """Get all papers classified as prior documents """ try: lock_fp = get_lock_path(project_id) with SQLiteLock(lock_fp, blocking=True, lock_name="active", project_id=project_id): label_history = read_label_history(project_id) counter_prior = Counter([x[1] for x in label_history]) except Exception as err: logging.error(err) return jsonify(message="Failed to load prior information."), 500 response = jsonify({ "n_prior": len(label_history), "n_inclusions": counter_prior[1], "n_exclusions": counter_prior[0] }) response.headers.add('Access-Control-Allow-Origin', '*') return response