def get_paper_data(project_id,
                   paper_id,
                   return_title=True,
                   return_authors=True,
                   return_abstract=True,
                   return_debug_label=False):
    """Get the title/authors/abstract for a paper."""
    as_data = read_data(project_id)

    record = as_data.record(int(paper_id), by_index=False)

    paper_data = {}
    if return_title and record.title is not None:
        paper_data['title'] = record.title
    if return_authors and record.authors is not None:
        paper_data['authors'] = record.authors
    if return_abstract and record.abstract is not None:
        paper_data['abstract'] = record.abstract

    # return the publication date if available
    pub_time = record.extra_fields.get("publish_time", None)
    paper_data['publish_time'] = pub_time if pd.notnull(pub_time) else None

    # return the doi if available
    doi = record.extra_fields.get("doi", None)
    paper_data['doi'] = doi if pd.notnull(doi) else None

    # return the debug label if available
    debug_label = record.extra_fields.get("debug_label", None)
    paper_data['_debug_label'] = \
        int(debug_label) if pd.notnull(debug_label) else None

    return paper_data
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """
    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        pool_indices = read_data(project_id).record_ids
        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # make an empty queue for the items to label
        write_label_history(project_id, [])
def get_paper_data(project_id,
                   paper_id,
                   return_title=True,
                   return_authors=True,
                   return_abstract=True,
                   return_debug_label=False):
    """Get the title/authors/abstract for a paper."""
    as_data = read_data(project_id)

    record = as_data.record(int(paper_id))

    paper_data = {}
    if return_title and record.title is not None:
        paper_data['title'] = record.title
    if return_authors and record.authors is not None:
        paper_data['authors'] = record.authors
    if return_abstract and record.abstract is not None:
        paper_data['abstract'] = record.abstract

    # return the publication date if available
    if record.extra_fields.get("publish_time", None) is not None:
        paper_data['publish_time'] = \
            record.extra_fields.get("publish_time", None)

    # return the debug label if requested and available
    if return_debug_label and \
            record.extra_fields.get("debug_label", None) is not None:
        paper_data['_debug_label'] = \
            int(record.extra_fields.get("debug_label", None))

    return paper_data
def export_to_string(project_id, export_type="csv"):
    """Export the dataset with labels and ranking."""
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)
        labels = read_current_labels(project_id, as_data=as_data)

    pool_idx = np.where(labels == LABEL_NA)[0]
    one_idx = np.where(labels == 1)[0]
    zero_idx = np.where(labels == 0)[0]

    # rank: inclusions first, then the pool by descending proba,
    # then exclusions
    proba_order = np.argsort(-proba[pool_idx])
    ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx),
                             axis=None)

    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labels, ranking=ranking)

    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(fp=fp_tmp_export,
                                labels=labels,
                                ranking=ranking)

    raise ValueError("This export type isn't implemented.")
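# A minimal, self-contained sketch (not part of the module) of the fallback
# ranking used in export_to_string above: when no model proba exists yet,
# the descending pseudo-proba np.flip(np.arange(n)) keeps the pool in its
# original dataset order. The arrays below are made-up illustration data.
import numpy as np

proba = np.flip(np.arange(5))          # [4, 3, 2, 1, 0]
pool_idx = np.array([0, 2, 4])         # positions of unlabeled records
proba_order = np.argsort(-proba[pool_idx])
print(pool_idx[proba_order])           # [0 2 4]: original order preserved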
def export_to_string(project_id, export_type="csv"):
    """Export the dataset with labels and ranking."""
    # read the dataset into an ASReview data object
    as_data = read_data(project_id)

    # set the lock to safely read labeled, pool, and proba
    fp_lock = get_lock_path(project_id)
    with SQLiteLock(
            fp_lock,
            blocking=True,
            lock_name="active",
            project_id=project_id):
        proba = read_proba(project_id)
        pool = read_pool(project_id)
        labeled = read_label_history(project_id)

    # get the record_id of the inclusions and exclusions
    inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1]
    exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0]

    # order the pool from high to low proba
    if proba is not None:
        pool_ordered = proba.loc[pool, :] \
            .sort_values("proba", ascending=False).index.values
    else:
        pool_ordered = pool

    # get the ranking of the 3 subcategories
    ranking = np.concatenate(
        (
            # add the inclusions first
            inclusion_record_id,
            # add the ordered pool second
            pool_ordered,
            # add the exclusions last
            exclusion_record_id
        ),
        axis=None
    )

    # export the data to file
    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labeled, ranking=ranking)

    if export_type == "tsv":
        return as_data.to_csv(
            fp=None, sep="\t", labels=labeled, ranking=ranking)

    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(
            fp=fp_tmp_export, labels=labeled, ranking=ranking)

    raise ValueError("This export type isn't implemented.")
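# A minimal, self-contained sketch (not part of the module) of the ranking
# construction in export_to_string above. The record ids, proba values, and
# labels are made up; `proba` mimics the DataFrame read_proba is assumed to
# return, with record ids as index and a "proba" column.
import numpy as np
import pandas as pd

proba = pd.DataFrame({"proba": [0.9, 0.2, 0.6]}, index=[10, 11, 12])
pool = [11, 12]                        # unlabeled record ids
labeled = [(10, 1)]                    # (record_id, label) pairs

inclusions = [int(x[0]) for x in labeled if x[1] == 1]
exclusions = [int(x[0]) for x in labeled if x[1] == 0]
pool_ordered = proba.loc[pool, :] \
    .sort_values("proba", ascending=False).index.values

ranking = np.concatenate((inclusions, pool_ordered, exclusions), axis=None)
print(ranking)  # [10 12 11]: inclusions, pool by proba, exclusions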
def search_data(project_id, q, n_max=100):
    """Search the dataset for records matching the query."""
    # read the dataset
    as_data = read_data(project_id)

    # search for the keywords
    paper_ids = as_data.fuzzy_find(q, max_return=n_max, exclude=[])

    # return full information on the records
    return as_data.record(paper_ids)
def get_data_statistics(project_id):
    """Get statistics on the dataset of a project."""
    # read the dataset
    as_data = read_data(project_id)

    result = {
        "n_rows": as_data.df.shape[0],
        "n_cols": as_data.df.shape[1],
    }

    # return the statistics
    return result
def search_data(project_id, q, n_max=100):
    """Search the dataset for records matching the query."""
    # read the dataset
    as_data = read_data(project_id)

    # search for the keywords
    result_idx = fuzzy_find(
        as_data, q, max_return=n_max, exclude=[], by_index=True)

    # return full information on the records
    return as_data.record(result_idx, by_index=True)
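# Hedged usage sketch (not part of the module) for search_data above. The
# project id and query are hypothetical, and this assumes as_data.record
# returns one record object per matched index, each with a .title
# attribute, as the functions above rely on.
matches = search_data("example-project", q="systematic review", n_max=10)
for record in matches:
    print(record.title)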
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """
    project_file_path = get_project_file_path(project_id)

    # clean temp project files
    clean_project_tmp_files(project_id)

    with SQLiteLock(
            get_lock_path(project_id),
            blocking=True,
            lock_name="active",
            project_id=project_id):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        as_data = read_data(project_id)

        if as_data.labels is not None:
            unlabeled = np.where(as_data.labels == LABEL_NA)[0]
            pool_indices = as_data.record_ids[unlabeled]

            labeled_indices = np.where(as_data.labels != LABEL_NA)[0]
            label_indices = list(zip(
                as_data.record_ids[labeled_indices].tolist(),
                as_data.labels[labeled_indices].tolist()
            ))
        else:
            pool_indices = as_data.record_ids
            label_indices = []

        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # seed the label history with any pre-existing labels
        write_label_history(project_id, label_indices)
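# A minimal, self-contained sketch (not part of the module) of the
# pool/label-history split performed in add_dataset_to_project above,
# assuming LABEL_NA == -1. The record ids and labels are made up.
import numpy as np

LABEL_NA = -1
record_ids = np.array([100, 101, 102, 103])
labels = np.array([1, LABEL_NA, 0, LABEL_NA])

unlabeled = np.where(labels == LABEL_NA)[0]
pool_indices = record_ids[unlabeled]

labeled_idx = np.where(labels != LABEL_NA)[0]
label_indices = list(zip(record_ids[labeled_idx].tolist(),
                         labels[labeled_idx].tolist()))

print(pool_indices.tolist())  # [101, 103] go to the pool
print(label_indices)          # [(100, 1), (102, 0)] seed the history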
def get_paper_data(project_id, paper_id, return_debug_label=False):
    """Get the title/authors/abstract for a paper."""
    as_data = read_data(project_id)
    record = as_data.record(int(paper_id), by_index=False)

    paper_data = {}
    paper_data['title'] = record.title
    paper_data['authors'] = record.authors
    paper_data['abstract'] = record.abstract
    paper_data['doi'] = record.doi

    # return the debug label
    debug_label = record.extra_fields.get("debug_label", None)
    paper_data['_debug_label'] = \
        int(debug_label) if pd.notnull(debug_label) else None

    return paper_data
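# Hedged usage sketch (not part of the module) for get_paper_data above.
# The project id and paper id are hypothetical; the returned dict holds
# plain JSON-serializable values (None where a field is missing), so it
# can feed a JSON response directly.
paper = get_paper_data("example-project", paper_id=42)
print(paper["title"])
print(paper["doi"])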
def export_to_string(project_id):
    """Export the dataset with labels and ranking as a CSV string."""
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)
        labels = read_current_labels(project_id, as_data=as_data)

    pool_idx = np.where(labels == LABEL_NA)[0]
    one_idx = np.where(labels == 1)[0]
    zero_idx = np.where(labels == 0)[0]

    # rank: inclusions first, then the pool by descending proba,
    # then exclusions
    proba_order = np.argsort(-proba[pool_idx])
    ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx),
                             axis=None)

    return as_data.to_csv(fp=None, labels=labels, ranking=ranking)
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """
    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        as_data = read_data(project_id)

        if as_data.labels is not None:
            unlabeled = np.where(as_data.labels == LABEL_NA)[0]
            pool_indices = as_data.record_ids[unlabeled]

            label_indices_included = \
                [[int(x), 1] for x in np.where(as_data.labels == 1)[0]]
            label_indices_excluded = \
                [[int(x), 0] for x in np.where(as_data.labels == 0)[0]]
            label_indices = label_indices_included + label_indices_excluded
        else:
            pool_indices = as_data.record_ids
            label_indices = []

        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # seed the label history with any pre-existing labels
        write_label_history(project_id, label_indices)