Exemple #1
0
def get_statistics(project_id):
    """Summarise the labeling progress of a project.

    Arguments
    ---------
    project_id:
        Identifier of the project whose label files are read.

    Returns
    -------
    dict:
        Counts stored under the keys ``n_included``, ``n_excluded``,
        ``n_since_last_inclusion``, ``n_papers`` and ``n_pool``.
    """
    # Read both label sources while holding the "active" lock.
    with SQLiteLock(get_lock_path(project_id),
                    blocking=True,
                    lock_name="active"):
        label_history = read_label_history(project_id)
        current_labels = read_current_labels(project_id,
                                             label_history=label_history)

    # Walk the history backwards: the position of the first label equal to 1
    # is the number of entries recorded after it. Without any such label the
    # whole history counts.
    n_since_last_inclusion = next(
        (position
         for position, (_, inclusion) in enumerate(reversed(label_history))
         if inclusion == 1),
        len(label_history),
    )

    included_positions = np.where(current_labels == 1)[0]
    excluded_positions = np.where(current_labels == 0)[0]
    n_included = len(included_positions)
    n_excluded = len(excluded_positions)
    n_papers = len(current_labels)

    # Everything that is neither included nor excluded is counted as pool.
    return {
        "n_included": n_included,
        "n_excluded": n_excluded,
        "n_since_last_inclusion": n_since_last_inclusion,
        "n_papers": n_papers,
        "n_pool": n_papers - n_included - n_excluded
    }
Exemple #2
0
def move_label_from_labeled_to_pool(project_id, paper_i):
    """Move a paper from the label history back into the pool.

    Every entry of the label history whose id equals ``paper_i`` is dropped
    from the history and its id is appended to the pool. Both files are
    written back even when no entry matched.

    Arguments
    ---------
    project_id:
        Identifier of the project whose pool and label history are updated.
    paper_i:
        Id of the paper to move; anything accepted by ``int()``.

    Raises
    ------
    ValueError, TypeError:
        If ``paper_i`` or a stored id/label cannot be converted with
        ``int()``.
    """
    # Convert once up front: the id is invariant while scanning the history.
    paper_i = int(paper_i)

    # Load the current pool and the label history.
    pool_list = read_pool(project_id)
    labeled_list = read_label_history(project_id)

    # Split the history: matching entries go to the pool, the rest is kept.
    labeled_list_new = []
    for item_id, item_label in labeled_list:
        item_id = int(item_id)
        item_label = int(item_label)

        if item_id == paper_i:
            pool_list.append(item_id)
        else:
            labeled_list_new.append([item_id, item_label])

    # Write the extended pool.
    write_pool(project_id, pool_list)

    # Write the label history without the moved paper.
    write_label_history(project_id, labeled_list_new)
Exemple #3
0
def get_statistics(project_id):
    """Collect labeling statistics from the project files.

    Arguments
    ---------
    project_id: str
        The id of the current project.

    Returns
    -------
    dict:
        Dictionary with the keys ``n_included``, ``n_excluded``,
        ``n_since_last_inclusion``, ``n_papers`` and ``n_pool``.
    """
    # Read the label history and the pool under the "active" lock.
    with SQLiteLock(get_lock_path(project_id),
                    blocking=True,
                    lock_name="active",
                    project_id=project_id):
        labeled = read_label_history(project_id)
        pool = read_pool(project_id)

    # Derive the individual metrics from what was read.
    relevant_count = n_relevant(labeled)
    irrelevant_count = n_irrelevant(labeled)
    pool_size = len(pool)
    since_last_relevant = stop_n_since_last_relevant(labeled)

    return dict(
        n_included=relevant_count,
        n_excluded=irrelevant_count,
        n_since_last_inclusion=since_last_relevant,
        n_papers=pool_size + relevant_count + irrelevant_count,
        n_pool=pool_size,
    )
Exemple #4
0
def api_get_prior(project_id):  # noqa: F401
    """Return every record in the label history together with its label.

    The JSON payload has a single key ``result`` holding one entry per
    record with its id, title, abstract, authors, keywords and label.
    """
    # Read the label history under the "active" lock.
    with SQLiteLock(get_lock_path(project_id),
                    blocking=True,
                    lock_name="active"):
        label_history = read_label_history(project_id)

    # The first element of each history entry identifies the record.
    record_ids = [entry[0] for entry in label_history]
    records = read_data(project_id).record(record_ids)

    # The label is looked up by position in the label history.
    result = [
        {
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": int(label_history[position][1])
        }
        for position, record in enumerate(records)
    ]

    response = jsonify({"result": result})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Exemple #5
0
def api_get_progress_history(project_id):
    """Return the rolling labeling progress of a project as JSON.

    For every position in the label history the response holds the number
    of relevant and irrelevant labels within a rolling window of at most
    10 entries (``Relevant``, ``Irrelevant``) and the number of labels seen
    so far (``Total``). Values are rounded to one decimal.
    """
    # Keep only the label of each (id, label) pair in the history.
    labeled = read_label_history(project_id)
    data = [value for _, value in labeled]

    # Rolling mean over the last 10 labels (fewer at the start).
    df = pd.DataFrame(data,
                      columns=["Relevant"]).rolling(10, min_periods=1).mean()
    df["Total"] = df.index + 1

    # Turn the rolling mean (a fraction) into counts. The window holds
    # "Total" labels for the first nine rows and 10 labels afterwards, so
    # the whole conversion can be done column-wise instead of row by row.
    window = df["Total"].clip(upper=10)
    df["Irrelevant"] = (1 - df["Relevant"]) * window
    df["Relevant"] = window - df["Irrelevant"]

    df = df.round(1).to_dict(orient="records")

    response = jsonify(df)
    response.headers.add('Access-Control-Allow-Origin', '*')

    return response
Exemple #6
0
def api_get_prior(project_id):  # noqa: F401
    """Return records from the label history, optionally one subset only.

    The optional query parameter ``subset`` may be ``included`` or
    ``excluded``; any other value is answered with a 400 response. The JSON
    payload has a single key ``result`` holding one entry per record with
    its id, title, abstract, authors, keywords and label.
    """
    subset = request.args.get('subset', default=None, type=str)

    # Reject anything that is not a known subset name.
    if subset not in (None, "included", "excluded"):
        return jsonify(message="Unkown subset parameter"), 400

    # Read the (possibly filtered) label history under the "active" lock.
    with SQLiteLock(get_lock_path(project_id),
                    blocking=True,
                    lock_name="active"):
        label_history = read_label_history(project_id, subset=subset)

    # The first element of each history entry identifies the record.
    record_ids = [entry[0] for entry in label_history]
    records = read_data(project_id).record(record_ids)

    # The label is looked up by position in the label history.
    result = [
        {
            "id": int(record.record_id),
            "title": record.title,
            "abstract": record.abstract,
            "authors": record.authors,
            "keywords": record.keywords,
            "included": int(label_history[position][1])
        }
        for position, record in enumerate(records)
    ]

    response = jsonify({"result": result})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Exemple #7
0
def api_get_progress_efficiency(project_id):
    """Return the cumulative number of inclusions next to a random baseline.

    Each row of the JSON response holds the cumulative sum of the labels
    (``Relevant``), the number of labels so far (``Total``) and a rounded
    baseline (``Random``). Any failure while building the table is logged
    and answered with a 500 response.
    """
    try:
        statistics = get_data_statistics(project_id)
        labeled = read_label_history(project_id)

        # Keep only the label of each (id, label) pair in the history.
        labels = [value for _, value in labeled]

        # Running total of the labels, plus the running number of labels.
        frame = pd.DataFrame(labels, columns=["Relevant"]).cumsum()
        frame["Total"] = frame.index + 1

        # Final cumulative value divided by statistics["n_rows"], kept as an
        # array so it scales every row of "Total".
        final_rate = (frame["Relevant"][-1:] / statistics["n_rows"]).values
        frame["Random"] = (frame["Total"] * final_rate).round()

        rows = frame.round(1).to_dict(orient="records")

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load efficiency plot."), 500

    response = jsonify(rows)
    response.headers.add('Access-Control-Allow-Origin', '*')

    return response
Exemple #8
0
def export_to_string(project_id, export_type="csv"):
    """Export the project data with labels and a ranking.

    The ranking lists the included records first, then the pool (sorted
    from high to low probability when probabilities are available, in
    stored pool order otherwise) and the excluded records last.

    Arguments
    ---------
    project_id:
        Identifier of the project to export.
    export_type: str
        One of ``"csv"``, ``"tsv"`` or ``"excel"``.

    Returns
    -------
    The result of ``to_csv`` (csv/tsv) or ``to_excel`` (excel) of the data
    object returned by ``read_data``.

    Raises
    ------
    ValueError:
        If ``export_type`` is not one of the supported values.
    """
    # read the dataset into a ASReview data object
    as_data = read_data(project_id)

    # set the lock to safely read labeled, pool, and proba
    fp_lock = get_lock_path(project_id)
    with SQLiteLock(
            fp_lock,
            blocking=True,
            lock_name="active",
            project_id=project_id
    ):
        proba = read_proba(project_id)
        pool = read_pool(project_id)
        labeled = read_label_history(project_id)

    # get the record_id of the inclusions and exclusions
    inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1]
    exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0]

    # order the pool from high to low proba
    if proba is not None:
        pool_ordered = proba.loc[pool, :] \
            .sort_values("proba", ascending=False).index.values
    else:
        # No probabilities available: keep the pool in its stored order.
        # (Previously this branch read the not-yet-assigned `pool_ordered`
        # and raised UnboundLocalError.)
        pool_ordered = pool

    # get the ranking of the 3 subcategories; axis=None flattens the three
    # sequences into a single 1-D array
    ranking = np.concatenate(
        (
            # add the inclusions first
            inclusion_record_id,
            # add the ordered pool second
            pool_ordered,
            # add the exclusions last
            exclusion_record_id
        ),
        axis=None
    )

    # export the data to file
    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labeled, ranking=ranking)

    if export_type == "tsv":
        return as_data.to_csv(
            fp=None, sep="\t", labels=labeled, ranking=ranking)

    if export_type == "excel":
        # the excel export is written to a file in the project's tmp folder
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(
            fp=fp_tmp_export, labels=labeled, ranking=ranking)

    raise ValueError("This export type isn't implemented.")
Exemple #9
0
def api_get_prior_stats(project_id):  # noqa: F401
    """Return how many entries the label history holds, split by label.

    The JSON response contains ``n_prior`` (all entries), ``n_inclusions``
    (entries labeled 1) and ``n_exclusions`` (entries labeled 0).
    """
    # Read the label history under the "active" lock.
    with SQLiteLock(get_lock_path(project_id),
                    blocking=True,
                    lock_name="active"):
        label_history = read_label_history(project_id)

    # Tally the labels; a label that never occurs counts as 0.
    label_counts = Counter(entry[1] for entry in label_history)

    summary = {
        "n_prior": len(label_history),
        "n_inclusions": label_counts[1],
        "n_exclusions": label_counts[0]
    }

    response = jsonify(summary)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Exemple #10
0
def move_label_from_pool_to_labeled(project_id, paper_i, label):
    """Move a paper from the pool into the label history.

    Nothing is written when the paper cannot be removed from the pool,
    i.e. when removal raises ``IndexError`` or ``ValueError``.

    Arguments
    ---------
    project_id:
        Identifier of the project whose pool and label history are updated.
    paper_i:
        Id of the paper to move; converted with ``int()``.
    label:
        Label to store for the paper; converted with ``int()``.
    """
    remaining_pool = read_pool(project_id)

    # Take the paper out of the pool; stop silently if that is not possible.
    try:
        remaining_pool.remove(int(paper_i))
    except (IndexError, ValueError):
        return

    write_pool(project_id, remaining_pool)

    # Record the paper with its label at the end of the label history.
    history = read_label_history(project_id)
    new_entry = [int(paper_i), int(label)]
    history.append(new_entry)
    write_label_history(project_id, history)
Exemple #11
0
def api_get_prior_stats(project_id):  # noqa: F401
    """Return how many entries the label history holds, split by label.

    The JSON response contains ``n_prior`` (all entries), ``n_inclusions``
    (entries labeled 1) and ``n_exclusions`` (entries labeled 0). Any
    failure while reading or counting is logged and answered with a 500
    response.
    """
    try:
        # Read the label history under the "active" lock.
        with SQLiteLock(get_lock_path(project_id),
                        blocking=True,
                        lock_name="active",
                        project_id=project_id):
            label_history = read_label_history(project_id)

        # Tally the labels; a label that never occurs counts as 0.
        label_counts = Counter(entry[1] for entry in label_history)

    except Exception as err:
        logging.error(err)
        return jsonify(message="Failed to load prior information."), 500

    summary = {
        "n_prior": len(label_history),
        "n_inclusions": label_counts[1],
        "n_exclusions": label_counts[0]
    }

    response = jsonify(summary)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Exemple #12
0
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    A non-blocking "training" lock ensures only one training run is active
    per project; if it cannot be acquired the function returns immediately.
    The function also returns early when the label history contains nothing
    that the state file has not seen yet. Otherwise the new labels are
    classified, the model is retrained, and the pool and probabilities are
    written back under the "active" lock.

    Arguments
    ---------
    project_id:
        Identifier of the project to train a model for.
    label_method:
        Passed to ``reviewer.classify`` as ``method``.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(lock_file,
                    blocking=False,
                    lock_name="training",
                    project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing and
        # reading to the same files at the same time.
        with SQLiteLock(lock_file,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id):
            # Get the all labels since last run. If no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx,
                              inclusions,
                              state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        with SQLiteLock(lock_file,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id):
            # Keep only queried indices that are still in the pool on disk,
            # preserving the order of the new query.
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
Exemple #13
0
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    A non-blocking "training" lock ensures only one training run is active
    per project; if it cannot be acquired the function returns immediately.
    The function also returns early when the label history contains nothing
    that the state file has not seen yet. Otherwise the new labels are
    classified, the model is retrained, and the pool (as record ids, in
    query order) and the probabilities (a DataFrame indexed by record id)
    are written back under the "active" lock.

    Arguments
    ---------
    project_id:
        Identifier of the project to train a model for.
    label_method:
        Passed to ``reviewer.classify`` as ``method``.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing and
        # reading to the same files at the same time.
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id):
            # Get the all labels since last run. If no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        # "abstract_only" is not forwarded to the reviewer; drop it if present
        asr_kwargs.pop("abstract_only", None)

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # the label history stores record ids; the reviewer is given indices
        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(
                query_idx, inclusions, state, method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and output the proba's
        # important: pool is sorted on query
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id):

            # read the pool
            current_pool = read_pool(project_id)

            # diff pool and new_query_ind; a frozenset keeps the membership
            # test cheap while the list keeps the query order
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)