def api_get_project_info(project_id):  # noqa: F401
    """Get info on the project"""
    logging.info("get project info")

    try:
        # read the file with project info
        with open(get_project_file_path(project_id), "r") as fp:
            project_info = json.load(fp)

        # check if there is a dataset
        try:
            get_data_file_path(project_id)
            project_info["projectHasDataset"] = True
        except Exception:
            project_info["projectHasDataset"] = False

        # check if there is prior knowledge (i.e. whether a model is set);
        # if this is the case, the reviewer passed the prior knowledge screen.
        project_info["projectHasPriorKnowledge"] = \
            get_kwargs_path(project_id).exists()

        # check if the algorithms are set (the kwargs file is written once
        # the model settings are chosen).
        project_info["projectHasAlgorithms"] = \
            get_kwargs_path(project_id).exists()

        # backwards support <0.10
        if "projectInitReady" not in project_info:
            if project_info["projectHasPriorKnowledge"]:
                project_info["projectInitReady"] = True
            else:
                project_info["projectInitReady"] = False

        response = jsonify(project_info)

    except FileNotFoundError as err:
        logging.error(err)
        response = jsonify(message="Project not found.")

        return response, 400

    except Exception as err:
        logging.error(err)
        response = jsonify(message="Internal Server Error.")

        return response, 500

    return response
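# The JSON returned by api_get_project_info is the content of the project
# file plus the flags computed above. With a hypothetical project file that
# only stores an id and a name, the response could look roughly like:
#
#     {
#         "id": "example-project",
#         "name": "Example project",
#         "projectHasDataset": true,
#         "projectHasPriorKnowledge": true,
#         "projectHasAlgorithms": true,
#         "projectInitReady": true
#     }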
def remove_dataset_to_project(project_id, file_name):
    """Remove dataset from project"""

    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(
            fp_lock, blocking=True, lock_name="active",
            project_id=project_id):

        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # remove the path from the project file
        data_fn = project_dict["dataset_path"]
        del project_dict["dataset_path"]

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # files to remove
        data_path = get_data_file_path(project_id, data_fn)
        pool_path = get_pool_path(project_id)
        labeled_path = get_labeled_path(project_id)

        os.remove(str(data_path))
        os.remove(str(pool_path))
        os.remove(str(labeled_path))
def read_data(project_id, use_cache=True, save_cache=True):
    """Get ASReviewData object from file.

    Parameters
    ----------
    project_id: str, iterable
        The project identifier.
    use_cache: bool
        Use the pickle file if available.
    save_cache: bool
        Save the file to a pickle file if not available.

    Returns
    -------
    ASReviewData:
        The data object for internal use in ASReview.
    """

    # use cache file
    if use_cache:
        try:
            return _read_data_from_cache(project_id)
        except CacheDataError:
            pass

    # load from file
    fp_data = get_data_file_path(project_id)
    data_obj = ASReviewData.from_file(fp_data)

    # save a pickle version
    if save_cache:
        _write_data_to_cache(project_id, data_obj)

    return data_obj
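# A minimal usage sketch for the cached read_data variant above, assuming a
# hypothetical project id "example-project" and the cache helpers
# (_read_data_from_cache, _write_data_to_cache, CacheDataError) defined
# elsewhere in the package.
def _example_read_data_usage():
    # first call: cache miss, the dataset file is parsed and a pickle copy
    # is written next to it
    as_data = read_data("example-project")

    # subsequent calls are served from the pickle cache
    as_data = read_data("example-project")

    # bypass the cache entirely, e.g. right after replacing the dataset file
    as_data = read_data("example-project", use_cache=False, save_cache=False)

    return as_data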
def api_get_project_data(project_id):  # noqa: F401
    """Get statistics of the project dataset"""

    if not is_project(project_id):
        response = jsonify(message="Project not found.")
        return response, 404

    try:
        filename = get_data_file_path(project_id).stem

        # get statistics of the dataset
        statistics = get_data_statistics(project_id)
        statistics["filename"] = filename

    except FileNotFoundError as err:
        logging.error(err)
        statistics = {"filename": None}

    except Exception as err:
        logging.error(err)
        message = f"Failed to get file. {err}"
        return jsonify(message=message), 400

    response = jsonify(statistics)
    response.headers.add('Access-Control-Allow-Origin', '*')

    return response
def api_get_project_info(project_id):  # noqa: F401
    """Get info on the project"""

    try:
        # read the file with project info
        with open(get_project_file_path(project_id), "r") as fp:
            project_info = json.load(fp)

        # check if there is a dataset
        try:
            get_data_file_path(project_id)
            project_info["projectHasDataset"] = True
        except Exception:
            project_info["projectHasDataset"] = False

        # check if there is prior knowledge (i.e. whether a model is set);
        # if this is the case, the reviewer passed the prior knowledge screen.
        project_info["projectHasPriorKnowledge"] = \
            get_kwargs_path(project_id).exists()

        # check if the algorithms are set (the kwargs file is written once
        # the model settings are chosen).
        project_info["projectHasAlgorithms"] = \
            get_kwargs_path(project_id).exists()

        # backwards support <0.10
        if "projectInitReady" not in project_info:
            if project_info["projectHasPriorKnowledge"]:
                project_info["projectInitReady"] = True
            else:
                project_info["projectInitReady"] = False

    except FileNotFoundError:
        raise ProjectNotFoundError()

    return jsonify(project_info)
def read_data(project_id, save_tmp=True):
    """Get ASReviewData object from file.

    Parameters
    ----------
    project_id: str, iterable
        The project identifier.
    save_tmp: bool
        Save the file to a pickle file if not available.

    Returns
    -------
    ASReviewData:
        The data object for internal use in ASReview.
    """

    fp_data = get_data_file_path(project_id)
    fp_data_pickle = Path(fp_data).with_suffix(fp_data.suffix + ".pickle")

    try:
        # get the pickle data
        with open(fp_data_pickle, 'rb') as f_pickle_read:
            data_obj = pickle.load(f_pickle_read)

        return data_obj

    except FileNotFoundError:
        # pickle file not available
        data_obj = ASReviewData.from_file(fp_data)

    except pickle.PickleError:
        # problem loading the pickle file; remove it and reload the data
        os.remove(fp_data_pickle)
        data_obj = ASReviewData.from_file(fp_data)

    # save a pickle version
    if save_tmp:
        logging.info("Store a copy of the data in a pickle file.")
        with open(fp_data_pickle, 'wb') as f_pickle:
            pickle.dump(data_obj, f_pickle)

    return data_obj
def read_data(project_id):
    """Get ASReviewData object of the dataset"""

    dataset = get_data_file_path(project_id)

    return ASReviewData.from_file(dataset)
def _get_cache_data_path(project_id):

    fp_data = get_data_file_path(project_id)

    return get_data_file_path(project_id) \
        .with_suffix(fp_data.suffix + ".pickle")
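# The cached read_data variant above references _read_data_from_cache,
# _write_data_to_cache and CacheDataError, which are not shown in this
# section. A minimal sketch of how they could look on top of
# _get_cache_data_path, assuming CacheDataError is a plain Exception
# subclass; the actual implementation may differ.
class CacheDataError(Exception):
    pass


def _read_data_from_cache(project_id):
    fp_data_pickle = _get_cache_data_path(project_id)

    try:
        # return the previously pickled ASReviewData object
        with open(fp_data_pickle, 'rb') as f_pickle_read:
            return pickle.load(f_pickle_read)
    except FileNotFoundError:
        # no cache file yet
        raise CacheDataError()
    except pickle.PickleError:
        # corrupt cache file, remove it so that it gets rebuilt
        os.remove(fp_data_pickle)
        raise CacheDataError()


def _write_data_to_cache(project_id, data_obj):
    fp_data_pickle = _get_cache_data_path(project_id)

    logging.info("Store a copy of the data in a pickle file.")
    with open(fp_data_pickle, 'wb') as f_pickle:
        pickle.dump(data_obj, f_pickle)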
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id) as lock:
            # Get all the labels since the last run. If there are no new
            # labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        # update the pool and write out the probabilities
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id) as lock:
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id) as lock:
            # Get all the labels since the last run. If there are no new
            # labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        try:
            del asr_kwargs["abstract_only"]
        except KeyError:
            pass

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and write out the probabilities
        # important: pool is sorted on query
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id) as lock:

            # read the pool
            current_pool = read_pool(project_id)

            # diff pool and new_query_idx
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
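# The second train_model variant converts between stable record identifiers
# and positional row indices with convert_id_to_idx/convert_idx_to_id, which
# are not shown in this section. A rough sketch of that mapping, assuming
# as_data.record_ids is an array-like of identifiers in row order (as used
# for the proba index above); the real helpers may differ.
def convert_id_to_idx_sketch(as_data, record_ids):
    # map each record identifier to its row position in the dataset
    id_to_idx = {rid: idx for idx, rid in enumerate(as_data.record_ids)}
    return [id_to_idx[rid] for rid in record_ids]


def convert_idx_to_id_sketch(as_data, row_idx):
    # map row positions back to the stable record identifiers
    record_ids = list(as_data.record_ids)
    return [record_ids[i] for i in row_idx]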