Exemple #1
0
    def to_file(self, fp, labels=None, ranking=None):
        """Export data object to file.

        RIS, CSV and Excel are supported file formats at the moment.

        Arguments
        ---------
        fp: str
            Filepath to export to.
        labels: list, numpy.ndarray
            Labels to be inserted into the dataframe before export.
        ranking: list, numpy.ndarray
            Optionally, dataframe rows can be reordered.
        """
        if Path(fp).suffix in [".csv", ".CSV"]:
            self.to_csv(fp, labels=labels, ranking=ranking)
        elif Path(fp).suffix in [".tsv", ".TSV", ".tab", ".TAB"]:
            self.to_csv(fp, sep="\t", labels=labels, ranking=ranking)
        elif Path(fp).suffix in [".ris", ".RIS"]:
            self.to_ris(fp, labels=labels, ranking=ranking)
        elif Path(fp).suffix in [".xlsx", ".XLSX"]:
            self.to_excel(fp, labels=labels, ranking=ranking)
        else:
            raise BadFileFormatError(
                f"Unknown file extension: {Path(fp).suffix}.\n"
                f"from file {fp}")
Exemple #2
0
def standardize_dataframe(df, column_spec={}):
    """Creates a ASReview readable dataframe.

    The main purpose is to rename columns with slightly different names;
    'authors' vs 'first_authors', etc. This greatly widens the compatibility
    with different datasets.

    Arguments
    ---------
    df: pd.DataFrame
        Unclean dataframe to be cleaned up.

    Returns
    -------
    pd.DataFrame:
        Cleaned dataframe with proper column names.
    """
    all_column_spec = {}
    col_names = list(df)
    for column_name in col_names:
        data_type = type_from_column_spec(column_name, column_spec)
        if data_type is not None:
            all_column_spec[data_type] = column_name
            continue
        data_type = type_from_column(column_name, COLUMN_DEFINITIONS)
        if data_type is not None:
            all_column_spec[data_type] = column_name

    col_names = list(all_column_spec)
    if "abstract" not in col_names and "title" not in col_names:
        raise BadFileFormatError("File supplied without 'abstract' or 'title'"
                                 " fields.")
    if "abstract" not in col_names:
        logging.warning("Unable to detect abstracts in dataset.")
    if "title" not in col_names:
        logging.warning("Unable to detect titles in dataset.")

    for col in ["title", "abstract", "authors", "keywords"]:
        try:
            df[all_column_spec[col]].fillna("", inplace=True)
        except KeyError:
            pass

    if "final_included" in col_names:
        try:
            col = all_column_spec["final_included"]
            df[col].fillna(LABEL_NA, inplace=True)
            df[col] = pd.to_numeric(df[col])
        except KeyError:
            pass

    if "record_id" in list(df):
        df.set_index('record_id', inplace=True)
    if df.index.name != "record_id":
        df["record_id"] = np.arange(len(df.index))
        df.set_index('record_id', inplace=True)
    df.sort_index(inplace=True)
    return df, all_column_spec
Exemple #3
0
def standardize_dataframe(df, column_spec={}):
    """Creates a ASReview readable dataframe.

    The main purpose is to rename columns with slightly different names;
    'authors' vs 'first_authors', etc. This greatly widens the compatibility
    with different datasets.

    Arguments
    ---------
    df: pd.DataFrame
        Unclean dataframe to be cleaned up.

    Returns
    -------
    pd.DataFrame:
        Cleaned dataframe with proper column names.
    """
    all_column_spec = {}
    col_names = list(df)
    for column_name in col_names:
        # First try the supplied column specifications if supplied.
        data_type = type_from_column_spec(column_name, column_spec)
        if data_type is not None:
            all_column_spec[data_type] = column_name
            continue
        # Then try the standard specifications in ASReview.
        data_type = type_from_column(column_name, COLUMN_DEFINITIONS)
        if data_type is not None:
            all_column_spec[data_type] = column_name

    # Check if we either have abstracts or titles.
    col_names = list(all_column_spec)
    if "abstract" not in col_names and "title" not in col_names:
        raise BadFileFormatError("File supplied without 'abstract' or 'title'"
                                 " fields.")
    if "abstract" not in col_names:
        logging.warning("Unable to detect abstracts in dataset.")
    if "title" not in col_names:
        logging.warning("Unable to detect titles in dataset.")

    # Replace NA values with empty strings.
    for col in ["title", "abstract", "authors", "keywords"]:
        try:
            df[all_column_spec[col]] = df[all_column_spec[col]].astype(str)
            df[all_column_spec[col]].fillna("", inplace=True)
        except KeyError:
            pass

    # Convert labels to integers.
    if "final_included" in col_names:
        try:
            col = all_column_spec["final_included"]
            df[col].fillna(LABEL_NA, inplace=True)
            df[col] = pd.to_numeric(df[col])
        except KeyError:
            pass
        except ValueError:
            logging.warning("Failed to parse label column name, no labels will"
                            " be present.")
            df.rename(columns={"label": "final_included"})
            all_column_spec.pop("final_included")

    # If the we have a record_id (for example from an ASReview export) use it.
    if "record_id" in list(df):
        df.set_index('record_id', inplace=True)
    if df.index.name != "record_id":
        df["record_id"] = np.arange(len(df.index))
        df.set_index('record_id', inplace=True)
    df.sort_index(inplace=True)
    return df, all_column_spec