def api_set_algorithms(project_id):  # noqa: F401
    """Store the algorithm choices submitted by the client for a project.

    Reads the project's kwargs file (falling back to the application's
    setup kwargs when no file exists yet), overwrites only the fields
    that were actually submitted in the form, and writes the result back.
    """
    # Load previously saved kwargs, or start from the setup defaults.
    try:
        with open(get_kwargs_path(project_id), "r") as f_read:
            algo_kwargs = json.load(f_read)
    except FileNotFoundError:
        algo_kwargs = deepcopy(app.config['asr_kwargs'])

    # add the machine learning model to the kwargs
    # TODO@{Jonathan} validate model choice on server side
    # Only overwrite values the client actually submitted (non-empty).
    for form_key in ("model", "query_strategy", "feature_extraction"):
        submitted = request.form.get(form_key, None)
        if submitted:
            algo_kwargs[form_key] = submitted

    # persist the (possibly updated) kwargs
    with open(get_kwargs_path(project_id), "w") as f_write:
        json.dump(algo_kwargs, f_write)

    response = jsonify({'success': True})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def api_get_project_info(project_id):  # noqa: F401
    """Get info on the article"""
    logging.info("get project info")

    try:
        # read the file with project info
        with open(get_project_file_path(project_id), "r") as fp:
            project_info = json.load(fp)

        # check if there is a dataset
        try:
            get_data_file_path(project_id)
            project_info["projectHasDataset"] = True
        except Exception:
            project_info["projectHasDataset"] = False

        # The presence of a kwargs file means a model was configured,
        # i.e. the reviewer passed the prior knowledge screen. Compute
        # it once and use it for both flags (the original duplicated
        # the .exists() call and its comment).
        has_kwargs = get_kwargs_path(project_id).exists()
        project_info["projectHasPriorKnowledge"] = has_kwargs
        project_info["projectHasAlgorithms"] = has_kwargs

        # backwards support <0.10: derive the init flag from the
        # prior-knowledge flag when it is absent.
        if "projectInitReady" not in project_info:
            project_info["projectInitReady"] = \
                project_info["projectHasPriorKnowledge"]

        response = jsonify(project_info)

    except FileNotFoundError as err:
        # project folder or project file missing
        logging.error(err)
        response = jsonify(message="Project not found.")
        return response, 400

    except Exception as err:
        # any other failure is reported as a server error
        logging.error(err)
        response = jsonify(message="Internal Server Error.")
        return response, 500

    return response
def api_start(project_id):  # noqa: F401
    """Start training the model """
    # Begin with a copy of the server-wide default CLI arguments.
    asr_kwargs = deepcopy(app.config['asr_kwargs'])

    # add the machine learning model to the kwargs
    # TODO@{Jonathan} validate model choice on server side
    ml_model = request.form.get("machine_learning_model", None)
    if ml_model:
        asr_kwargs["model"] = ml_model

    # persist the kwargs so the background worker can pick them up
    with open(get_kwargs_path(project_id), "w") as fp:
        json.dump(asr_kwargs, fp)

    # launch model training as a detached subprocess
    py_exe = _get_executable()
    subprocess.Popen([
        py_exe,
        "-m", "asreview",
        "web_run_model",
        project_id,
        "--label_method", "prior",
    ])

    response = jsonify({'success': True})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def api_get_project_info(project_id):  # noqa: F401
    """Get info on the article"""
    try:
        # read the file with project info
        with open(get_project_file_path(project_id), "r") as fp:
            project_info = json.load(fp)

        # check if there is a dataset
        try:
            get_data_file_path(project_id)
            project_info["projectHasDataset"] = True
        except Exception:
            project_info["projectHasDataset"] = False

        # The presence of a kwargs file means a model was configured,
        # i.e. the reviewer passed the prior knowledge screen. Compute
        # it once for both flags (the original duplicated the
        # .exists() call and its comment).
        has_kwargs = get_kwargs_path(project_id).exists()
        project_info["projectHasPriorKnowledge"] = has_kwargs
        project_info["projectHasAlgorithms"] = has_kwargs

        # backwards support <0.10: derive the init flag from the
        # prior-knowledge flag when it is absent.
        if "projectInitReady" not in project_info:
            project_info["projectInitReady"] = \
                project_info["projectHasPriorKnowledge"]

    except FileNotFoundError as err:
        # chain the cause so the original path error stays visible
        raise ProjectNotFoundError() from err

    return jsonify(project_info)
def api_get_algorithms(project_id):  # noqa: F401
    """Return the algorithm settings stored for this project.

    Falls back to a copy of the application's setup kwargs when the
    project has no kwargs file yet.
    """
    kwargs_path = get_kwargs_path(project_id)
    try:
        settings = json.loads(kwargs_path.read_text())
    except FileNotFoundError:
        # no kwargs file yet: serve the setup defaults
        settings = deepcopy(app.config['asr_kwargs'])

    response = jsonify(settings)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def init_project(project_id,
                 project_name=None,
                 project_description=None,
                 project_authors=None):
    """Initialize the necessary files specific to the web app.

    Creates the project directory, a ``data`` subdirectory, the project
    info file, and the kwargs file. On any failure the partially created
    project directory is removed before the error is re-raised.

    Raises
    ------
    ValueError
        If ``project_id`` is not a non-empty string, or if a project
        with this id already exists.
    """
    # The project identifier must be a non-empty string. The original
    # guard used `and` (plus a stray `len(...) >= 3` clause), so it
    # could never fire for real ids and crashed with a TypeError on
    # `len(None)` when project_id was None.
    if not project_id or not isinstance(project_id, str):
        raise ValueError("Project name can't be None or empty string")

    # get the directory with the projects
    project_dir = asreview_path() / project_id

    if project_dir.exists():
        raise ValueError("Project already exists")

    try:
        project_dir.mkdir()

        fp_data = project_dir / "data"
        fp_data.mkdir()

        # create a file with project info
        with open(get_project_file_path(project_id), "w") as fp:
            json.dump(
                {
                    'version': asreview_version,  # todo: Fail without git?
                    'id': project_id,
                    'name': project_name,
                    'description': project_description,
                    'authors': project_authors
                }, fp)

        asr_kwargs = deepcopy(app.config['asr_kwargs'])

        # remove config
        with open(get_kwargs_path(project_id), "w") as fp:
            json.dump(asr_kwargs, fp)

        # make a copy of the arguments to the state file
        # NOTE(review): this mutation happens after the dump above, so it
        # is not persisted here — presumably kept for a later caller.
        asr_kwargs['state_file'] = str(get_state_path(project_id))

    except Exception:
        # remove all generated folders and re-raise with full traceback
        shutil.rmtree(project_dir)
        raise
def api_get_algorithms(project_id):  # noqa: F401
    """Return the algorithm settings for a project.

    If the project has a kwargs file, its stored settings are returned
    unchanged. Only when no file exists yet are the application
    defaults filled in — applying the DEFAULT_* values unconditionally
    (as the original did) would clobber previously saved choices.
    """
    # check if there is a kwargs file
    try:
        # open the projects file
        with open(get_kwargs_path(project_id), "r") as f_read:
            kwargs_dict = json.load(f_read)
    except FileNotFoundError:
        # no kwargs file yet: start from the setup kwargs and fill in
        # the application-wide defaults for every algorithm field
        kwargs_dict = deepcopy(app.config['asr_kwargs'])
        kwargs_dict["model"] = DEFAULT_MODEL
        kwargs_dict["feature_extraction"] = DEFAULT_FEATURE_EXTRACTION
        kwargs_dict["query_strategy"] = DEFAULT_QUERY_STRATEGY
        kwargs_dict["balance_strategy"] = DEFAULT_BALANCE_STRATEGY
        kwargs_dict["n_instances"] = DEFAULT_N_INSTANCES

    response = jsonify(kwargs_dict)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(lock_file, blocking=False, lock_name="training",
                    project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            # fix: the original string lacked the f prefix, so the
            # project id placeholder was logged literally
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(lock_file, blocking=True, lock_name="active",
                        project_id=project_id) as lock:
            # Get the all labels since last run. If no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)

        reviewer = get_reviewer(dataset=data_fp, mode="minimal",
                                **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history,
                                        old_label_history)

        if len(diff_history) == 0:
            # fix: missing f prefix in the original log call
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        # Update the pool under the active lock so the flask backend
        # never reads a half-written pool/proba file.
        with SQLiteLock(lock_file, blocking=True, lock_name="active",
                        project_id=project_id) as lock:
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            # keep query order, drop records no longer in the pool
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file,
            blocking=False,
            lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            # fix: the original string lacked the f prefix, so the
            # project id placeholder was logged literally
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:
            # Get the all labels since last run. If no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        # drop the legacy "abstract_only" flag if present
        # (dict.pop with a default replaces the original try/del/except)
        asr_kwargs.pop("abstract_only", None)

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal",
                                **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history,
                                         old_label_history)

        if len(diff_history) == 0:
            # fix: missing f prefix in the original log call
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # map record ids in the label history to dataset row indices
        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(
                query_idx, inclusions, state, method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and output the proba's
        # important: pool is sorted on query
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:

            # read the pool
            current_pool = read_pool(project_id)

            # diff pool and new_query_ind
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx
                            if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)