def get_paper_data(project_id,
                   paper_id,
                   return_title=True,
                   return_authors=True,
                   return_abstract=True,
                   return_debug_label=False):
    """Get the title/authors/abstract for a paper."""
    as_data = read_data(project_id)

    record = as_data.record(int(paper_id), by_index=False)

    paper_data = {}
    if return_title and record.title is not None:
        paper_data['title'] = record.title
    if return_authors and record.authors is not None:
        paper_data['authors'] = record.authors
    if return_abstract and record.abstract is not None:
        paper_data['abstract'] = record.abstract

    # return the publication date if available
    pub_time = record.extra_fields.get("publish_time", None)
    paper_data['publish_time'] = pub_time if pd.notnull(pub_time) else None

    # return the doi if available
    doi = record.extra_fields.get("doi", None)
    paper_data['doi'] = doi if pd.notnull(doi) else None

    # return the debug label if available
    debug_label = record.extra_fields.get("debug_label", None)
    paper_data['_debug_label'] = \
        int(debug_label) if pd.notnull(debug_label) else None

    return paper_data
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """
    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        pool_indices = read_data(project_id).record_ids
        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # make an empty queue for the items to label
        write_label_history(project_id, [])
def get_paper_data(project_id,
                   paper_id,
                   return_title=True,
                   return_authors=True,
                   return_abstract=True,
                   return_debug_label=False):
    """Get the title/authors/abstract for a paper."""
    as_data = read_data(project_id)

    record = as_data.record(int(paper_id))

    paper_data = {}
    if return_title and record.title is not None:
        paper_data['title'] = record.title
    if return_authors and record.authors is not None:
        paper_data['authors'] = record.authors
    if return_abstract and record.abstract is not None:
        paper_data['abstract'] = record.abstract

    # return the publication date if available
    if record.extra_fields.get("publish_time", None) is not None:
        paper_data['publish_time'] = \
            record.extra_fields.get("publish_time", None)

    # return the debug label if requested and available
    if return_debug_label and \
            record.extra_fields.get("debug_label", None) is not None:
        paper_data['_debug_label'] = \
            int(record.extra_fields.get("debug_label", None))

    return paper_data
def export_to_string(project_id, export_type="csv"):
    """Export the dataset with labels and ranking."""
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)
        labels = read_current_labels(project_id, as_data=as_data)

    pool_idx = np.where(labels == LABEL_NA)[0]
    one_idx = np.where(labels == 1)[0]
    zero_idx = np.where(labels == 0)[0]

    # rank: inclusions first, then the pool by descending proba,
    # then exclusions
    proba_order = np.argsort(-proba[pool_idx])
    ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx),
                             axis=None)

    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labels, ranking=ranking)

    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(fp=fp_tmp_export,
                                labels=labels,
                                ranking=ranking)

    raise ValueError("This export type isn't implemented.")
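# A minimal, self-contained sketch (not part of the module) of the fallback
# ranking used in export_to_string above: when no model proba exists yet,
# the descending pseudo-proba np.flip(np.arange(n)) keeps the pool in its
# original dataset order. The arrays below are made-up illustration data.
import numpy as np

proba = np.flip(np.arange(5))          # [4, 3, 2, 1, 0]
pool_idx = np.array([0, 2, 4])         # positions of unlabeled records
proba_order = np.argsort(-proba[pool_idx])
print(pool_idx[proba_order])           # [0 2 4]: original order preserved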
def export_to_string(project_id, export_type="csv"):
    """Export the dataset with labels and ranking."""
    # read the dataset into an ASReview data object
    as_data = read_data(project_id)

    # set the lock to safely read labeled, pool, and proba
    fp_lock = get_lock_path(project_id)
    with SQLiteLock(
            fp_lock,
            blocking=True,
            lock_name="active",
            project_id=project_id):
        proba = read_proba(project_id)
        pool = read_pool(project_id)
        labeled = read_label_history(project_id)

    # get the record_id of the inclusions and exclusions
    inclusion_record_id = [int(x[0]) for x in labeled if x[1] == 1]
    exclusion_record_id = [int(x[0]) for x in labeled if x[1] == 0]

    # order the pool from high to low proba
    if proba is not None:
        pool_ordered = proba.loc[pool, :] \
            .sort_values("proba", ascending=False).index.values
    else:
        pool_ordered = pool

    # get the ranking of the 3 subcategories
    ranking = np.concatenate(
        (
            # add the inclusions first
            inclusion_record_id,
            # add the ordered pool second
            pool_ordered,
            # add the exclusions last
            exclusion_record_id
        ),
        axis=None
    )

    # export the data to file
    if export_type == "csv":
        return as_data.to_csv(fp=None, labels=labeled, ranking=ranking)

    if export_type == "tsv":
        return as_data.to_csv(
            fp=None, sep="\t", labels=labeled, ranking=ranking)

    if export_type == "excel":
        get_tmp_path(project_id).mkdir(exist_ok=True)
        fp_tmp_export = Path(get_tmp_path(project_id), "export_result.xlsx")
        return as_data.to_excel(
            fp=fp_tmp_export, labels=labeled, ranking=ranking)

    raise ValueError("This export type isn't implemented.")
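# A minimal, self-contained sketch (not part of the module) of the ranking
# construction in export_to_string above. The record ids, proba values, and
# labels are made up; `proba` mimics the DataFrame read_proba is assumed to
# return, with record ids as index and a "proba" column.
import numpy as np
import pandas as pd

proba = pd.DataFrame({"proba": [0.9, 0.2, 0.6]}, index=[10, 11, 12])
pool = [11, 12]                        # unlabeled record ids
labeled = [(10, 1)]                    # (record_id, label) pairs

inclusions = [int(x[0]) for x in labeled if x[1] == 1]
exclusions = [int(x[0]) for x in labeled if x[1] == 0]
pool_ordered = proba.loc[pool, :] \
    .sort_values("proba", ascending=False).index.values

ranking = np.concatenate((inclusions, pool_ordered, exclusions), axis=None)
print(ranking)  # [10 12 11]: inclusions, pool by proba, exclusions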
def search_data(project_id, q, n_max=100):
    """Search the dataset for records matching the query."""
    # read the dataset
    as_data = read_data(project_id)

    # search for the keywords
    paper_ids = as_data.fuzzy_find(q, max_return=n_max, exclude=[])

    # return full information on the records
    return as_data.record(paper_ids)
def get_data_statistics(project_id):
    """Get statistics on the dataset of a project."""
    # read the dataset
    as_data = read_data(project_id)

    result = {
        "n_rows": as_data.df.shape[0],
        "n_cols": as_data.df.shape[1],
    }

    # return the statistics
    return result
def search_data(project_id, q, n_max=100):
    """Search the dataset for records matching the query."""
    # read the dataset
    as_data = read_data(project_id)

    # search for the keywords
    result_idx = fuzzy_find(
        as_data, q, max_return=n_max, exclude=[], by_index=True)

    # return full information on the records
    return as_data.record(result_idx, by_index=True)
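# Hedged usage sketch (not part of the module) for search_data above. The
# project id and query are hypothetical, and this assumes as_data.record
# returns one record object per matched index, each with a .title
# attribute, as the functions above rely on.
matches = search_data("example-project", q="systematic review", n_max=10)
for record in matches:
    print(record.title)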
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """
    project_file_path = get_project_file_path(project_id)

    # clean temp project files
    clean_project_tmp_files(project_id)

    with SQLiteLock(
            get_lock_path(project_id),
            blocking=True,
            lock_name="active",
            project_id=project_id):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        as_data = read_data(project_id)

        if as_data.labels is not None:
            unlabeled = np.where(as_data.labels == LABEL_NA)[0]
            pool_indices = as_data.record_ids[unlabeled]

            labeled_indices = np.where(as_data.labels != LABEL_NA)[0]
            label_indices = list(zip(
                as_data.record_ids[labeled_indices].tolist(),
                as_data.labels[labeled_indices].tolist()
            ))
        else:
            pool_indices = as_data.record_ids
            label_indices = []

        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # seed the label history with any pre-existing labels
        write_label_history(project_id, label_indices)
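# A minimal, self-contained sketch (not part of the module) of the
# pool/label-history split performed in add_dataset_to_project above,
# assuming LABEL_NA == -1. The record ids and labels are made up.
import numpy as np

LABEL_NA = -1
record_ids = np.array([100, 101, 102, 103])
labels = np.array([1, LABEL_NA, 0, LABEL_NA])

unlabeled = np.where(labels == LABEL_NA)[0]
pool_indices = record_ids[unlabeled]

labeled_idx = np.where(labels != LABEL_NA)[0]
label_indices = list(zip(record_ids[labeled_idx].tolist(),
                         labels[labeled_idx].tolist()))

print(pool_indices.tolist())  # [101, 103] go to the pool
print(label_indices)          # [(100, 1), (102, 0)] seed the history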
def get_paper_data(project_id, paper_id, return_debug_label=False):
    """Get the title/authors/abstract for a paper."""
    as_data = read_data(project_id)
    record = as_data.record(int(paper_id), by_index=False)

    paper_data = {}
    paper_data['title'] = record.title
    paper_data['authors'] = record.authors
    paper_data['abstract'] = record.abstract
    paper_data['doi'] = record.doi

    # return the debug label
    debug_label = record.extra_fields.get("debug_label", None)
    paper_data['_debug_label'] = \
        int(debug_label) if pd.notnull(debug_label) else None

    return paper_data
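# Hedged usage sketch (not part of the module) for get_paper_data above.
# The project id and paper id are hypothetical; the returned dict holds
# plain JSON-serializable values (None where a field is missing), so it
# can feed a JSON response directly.
paper = get_paper_data("example-project", paper_id=42)
print(paper["title"])
print(paper["doi"])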
def export_to_string(project_id):
    """Export the dataset with labels and ranking as a CSV string."""
    fp_lock = get_lock_path(project_id)
    as_data = read_data(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        proba = read_proba(project_id)
        if proba is None:
            proba = np.flip(np.arange(len(as_data)))
        else:
            proba = np.array(proba)
        labels = read_current_labels(project_id, as_data=as_data)

    pool_idx = np.where(labels == LABEL_NA)[0]
    one_idx = np.where(labels == 1)[0]
    zero_idx = np.where(labels == 0)[0]

    # rank: inclusions first, then the pool by descending proba,
    # then exclusions
    proba_order = np.argsort(-proba[pool_idx])
    ranking = np.concatenate((one_idx, pool_idx[proba_order], zero_idx),
                             axis=None)

    return as_data.to_csv(fp=None, labels=labels, ranking=ranking)
def add_dataset_to_project(project_id, file_name):
    """Add file path to the project file.

    Add file to data subfolder and fill the pool of iteration 0.
    """
    project_file_path = get_project_file_path(project_id)
    fp_lock = get_lock_path(project_id)

    with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
        # open the projects file
        with open(project_file_path, "r") as f_read:
            project_dict = json.load(f_read)

        # add path to dict (overwrite if already exists)
        project_dict["dataset_path"] = file_name

        with open(project_file_path, "w") as f_write:
            json.dump(project_dict, f_write)

        # fill the pool of the first iteration
        as_data = read_data(project_id)

        if as_data.labels is not None:
            unlabeled = np.where(as_data.labels == LABEL_NA)[0]
            pool_indices = as_data.record_ids[unlabeled]

            label_indices_included = \
                [[int(x), 1] for x in np.where(as_data.labels == 1)[0]]
            label_indices_excluded = \
                [[int(x), 0] for x in np.where(as_data.labels == 0)[0]]
            label_indices = label_indices_included + label_indices_excluded
        else:
            pool_indices = as_data.record_ids
            label_indices = []

        np.random.shuffle(pool_indices)
        write_pool(project_id, pool_indices.tolist())

        # seed the label history with any pre-existing labels
        write_label_history(project_id, label_indices)