def to_file(self, fp, labels=None, ranking=None):
    """Export data object to file.

    RIS, CSV, TSV and Excel are supported file formats at the moment. The
    format is selected from the file extension, which is compared
    case-insensitively (".csv", ".CSV" and ".Csv" are all accepted).

    Arguments
    ---------
    fp: str
        Filepath to export to.
    labels: list, numpy.ndarray
        Labels to be inserted into the dataframe before export.
    ranking: list, numpy.ndarray
        Optionally, dataframe rows can be reordered.

    Raises
    ------
    BadFileFormatError
        If the file extension is not one of the supported formats.
    """
    suffix = Path(fp).suffix
    # Normalise the case once so mixed-case extensions are recognised too;
    # the untouched suffix is kept for the error message.
    extension = suffix.lower()

    if extension == ".csv":
        self.to_csv(fp, labels=labels, ranking=ranking)
    elif extension in (".tsv", ".tab"):
        # Tab separated files reuse the CSV writer with another separator.
        self.to_csv(fp, sep="\t", labels=labels, ranking=ranking)
    elif extension == ".ris":
        self.to_ris(fp, labels=labels, ranking=ranking)
    elif extension == ".xlsx":
        self.to_excel(fp, labels=labels, ranking=ranking)
    else:
        raise BadFileFormatError(
            f"Unknown file extension: {suffix}.\n"
            f"from file {fp}")
def standardize_dataframe(df, column_spec=None):
    """Create an ASReview readable dataframe.

    Every column of the dataframe is matched to an ASReview data type
    ('title', 'abstract', 'authors', ...), so that datasets using slightly
    different column names ('authors' vs 'first_authors', etc.) can be
    read. This greatly widens the compatibility with different datasets.

    Arguments
    ---------
    df: pd.DataFrame
        Unclean dataframe to be cleaned up. It is modified in place.
    column_spec: dict
        Optional user supplied column specification. It is consulted (via
        type_from_column_spec) before the standard COLUMN_DEFINITIONS.
        Defaults to an empty specification.

    Returns
    -------
    pd.DataFrame:
        The cleaned dataframe, indexed by and sorted on 'record_id'.
    dict:
        Mapping from detected data type to the matching column name.

    Raises
    ------
    BadFileFormatError
        If neither a title nor an abstract column could be detected.
    """
    if column_spec is None:
        column_spec = {}

    all_column_spec = {}
    for column_name in list(df):
        # The supplied specification takes precedence over the standard
        # ASReview column definitions.
        data_type = type_from_column_spec(column_name, column_spec)
        if data_type is None:
            data_type = type_from_column(column_name, COLUMN_DEFINITIONS)
        if data_type is not None:
            all_column_spec[data_type] = column_name

    # At least one of the two text fields is required.
    has_abstract = "abstract" in all_column_spec
    has_title = "title" in all_column_spec
    if not has_abstract and not has_title:
        raise BadFileFormatError("File supplied without 'abstract' or 'title'"
                                 " fields.")
    if not has_abstract:
        logging.warning("Unable to detect abstracts in dataset.")
    if not has_title:
        logging.warning("Unable to detect titles in dataset.")

    # Replace NA values with empty strings. The result is assigned back to
    # the dataframe: an in-place fillna on a column selection is not
    # guaranteed to reach the parent dataframe (pandas copy-on-write).
    for data_type in ("title", "abstract", "authors", "keywords"):
        if data_type not in all_column_spec:
            continue
        text_column = all_column_spec[data_type]
        df[text_column] = df[text_column].fillna("")

    # Convert labels to numbers, marking missing labels with LABEL_NA.
    if "final_included" in all_column_spec:
        label_column = all_column_spec["final_included"]
        df[label_column] = df[label_column].fillna(LABEL_NA)
        df[label_column] = pd.to_numeric(df[label_column])

    # If we have a record_id column use it as index; otherwise, unless the
    # index is already named 'record_id', number the rows ourselves.
    if "record_id" in df.columns:
        df.set_index("record_id", inplace=True)
    elif df.index.name != "record_id":
        df["record_id"] = np.arange(len(df.index))
        df.set_index("record_id", inplace=True)

    df.sort_index(inplace=True)
    return df, all_column_spec
def standardize_dataframe(df, column_spec=None):
    """Create an ASReview readable dataframe.

    Every column of the dataframe is matched to an ASReview data type
    ('title', 'abstract', 'authors', ...), so that datasets using slightly
    different column names ('authors' vs 'first_authors', etc.) can be
    read. This greatly widens the compatibility with different datasets.

    Arguments
    ---------
    df: pd.DataFrame
        Unclean dataframe to be cleaned up. It is modified in place.
    column_spec: dict
        Optional user supplied column specification. It is consulted (via
        type_from_column_spec) before the standard COLUMN_DEFINITIONS.
        Defaults to an empty specification.

    Returns
    -------
    pd.DataFrame:
        The cleaned dataframe, indexed by and sorted on 'record_id'.
    dict:
        Mapping from detected data type to the matching column name.

    Raises
    ------
    BadFileFormatError
        If neither a title nor an abstract column could be detected.
    """
    if column_spec is None:
        column_spec = {}

    all_column_spec = {}
    for column_name in list(df):
        # First try the supplied column specifications, then fall back to
        # the standard specifications in ASReview.
        data_type = type_from_column_spec(column_name, column_spec)
        if data_type is None:
            data_type = type_from_column(column_name, COLUMN_DEFINITIONS)
        if data_type is not None:
            all_column_spec[data_type] = column_name

    # Check if we either have abstracts or titles.
    has_abstract = "abstract" in all_column_spec
    has_title = "title" in all_column_spec
    if not has_abstract and not has_title:
        raise BadFileFormatError("File supplied without 'abstract' or 'title'"
                                 " fields.")
    if not has_abstract:
        logging.warning("Unable to detect abstracts in dataset.")
    if not has_title:
        logging.warning("Unable to detect titles in dataset.")

    # Replace NA values with empty strings and make the text columns
    # strings. The NA values have to be filled *before* the cast: casting
    # first turns them into the literal text "nan", which fillna can no
    # longer recognise as missing.
    for data_type in ("title", "abstract", "authors", "keywords"):
        if data_type not in all_column_spec:
            continue
        text_column = all_column_spec[data_type]
        df[text_column] = df[text_column].fillna("").astype(str)

    # Convert labels to numbers, marking missing labels with LABEL_NA.
    if "final_included" in all_column_spec:
        label_column = all_column_spec["final_included"]
        try:
            # Assign the filled column back instead of filling in place on
            # a column selection, which is not guaranteed to reach the
            # parent dataframe (pandas copy-on-write).
            df[label_column] = df[label_column].fillna(LABEL_NA)
            df[label_column] = pd.to_numeric(df[label_column])
        except ValueError:
            logging.warning("Failed to parse label column name, no labels will"
                            " be present.")
            # The column itself stays in the dataframe; it is only removed
            # from the detected specification so it is not used as labels.
            all_column_spec.pop("final_included")

    # If we have a record_id (for example from an ASReview export) use it
    # as index; otherwise, unless the index is already named 'record_id',
    # number the rows ourselves.
    if "record_id" in df.columns:
        df.set_index("record_id", inplace=True)
    elif df.index.name != "record_id":
        df["record_id"] = np.arange(len(df.index))
        df.set_index("record_id", inplace=True)

    df.sort_index(inplace=True)
    return df, all_column_spec