Example #1
0
def read_excel(fp):
    """Excel file reader.

    All sheets of the workbook are loaded. The sheet whose lower-cased
    column names overlap most with the names listed in
    ``COLUMN_DEFINITIONS`` is selected (on a tie the first such sheet
    wins) and passed to ``standardize_dataframe``.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the Excel file (.xlsx).

    Returns
    -------
    object:
        Whatever ``standardize_dataframe`` returns for the selected sheet.

    """
    try:
        dfs = pd.read_excel(fp, sheet_name=None)
    except UnicodeDecodeError:
        # NOTE(review): newer pandas releases may no longer accept an
        # ``encoding`` keyword here -- confirm against the pinned version.
        dfs = pd.read_excel(fp, sheet_name=None, encoding="ISO-8859-1")

    # Collect every known column name once; the set is reused per sheet.
    wanted_columns = set()
    for type_list in COLUMN_DEFINITIONS.values():
        wanted_columns.update(type_list)

    best_sheet = None
    sheet_obj_val = -1
    for sheet_name, sheet_df in dfs.items():
        # Column labels are not guaranteed to be strings (numeric or date
        # headers are possible), so convert before lower-casing.
        col_names = {str(col).lower() for col in sheet_df.columns}
        obj_val = len(col_names & wanted_columns)
        # Strict comparison keeps the first sheet among equally good ones.
        if obj_val > sheet_obj_val:
            sheet_obj_val = obj_val
            best_sheet = sheet_name

    return standardize_dataframe(dfs[best_sheet])
Example #2
0
def read_csv(data_fp):
    """CSV file reader.

    The file is parsed with the pandas python engine and ``sep=None`` so
    that the delimiter is detected from the data. UTF-8 is tried first,
    then ISO-8859-1.

    Parameters
    ----------
    data_fp: str, pathlib.Path
        File path to the CSV file.

    Returns
    -------
    object:
        Whatever ``standardize_dataframe`` returns for the parsed data.

    Raises
    ------
    UnicodeDecodeError
        If the file cannot be decoded with any of the tried encodings.

    """
    last_error = None
    for encoding in ("utf-8", "ISO-8859-1"):
        try:
            df = pd.read_csv(data_fp,
                             sep=None,
                             encoding=encoding,
                             engine='python')
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next encoding.
            last_error = err
            continue
        # Standardize outside the try block so that errors raised there
        # are not mistaken for a decoding problem of the file.
        return standardize_dataframe(df)

    # UnicodeDecodeError takes five arguments; reuse the details of the
    # last decoding failure so the exception can actually be constructed.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        "The encoding of the file is not supported.",
    ) from last_error
Example #3
0
def read_pubmed_xml(fp):
    """PubMed XML file reader.

    Every direct child of the XML root is turned into one record with
    the keys ``abstract``, ``title``, ``authors`` and ``keywords``.
    Missing or empty elements yield empty strings instead of raising.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the XML file (.xml).

    Returns
    -------
    object:
        Whatever ``standardize_dataframe`` returns for the DataFrame built
        from the collected records.
    """

    def element_text(elem):
        # ``elem.text`` is None for empty elements and stops at the first
        # nested tag (e.g. <i>, <sub>); itertext() gathers the full text.
        if elem is None:
            return ""
        return "".join(elem.itertext())

    def texts(parent, tag):
        # Non-empty text of every descendant element named ``tag``.
        found = (element_text(elem) for elem in parent.iter(tag))
        return [text for text in found if text]

    tree = ET.parse(fp)
    root = tree.getroot()

    records = []
    for child in root:
        title_elem = child.find('MedlineCitation/Article/ArticleTitle')
        # Square brackets are stripped from the title.
        title = element_text(title_elem).replace('[', '').replace(']', '')

        abstract = " ".join(texts(child, 'AbstractText'))

        authors = []
        for author in child.iter('Author'):
            # Fore names first, then last names, separated by spaces.
            name_parts = texts(author, 'ForeName') + texts(author, 'LastName')
            authors.append(" ".join(name_parts))
        author_str = ", ".join(authors)

        keywords = ", ".join(texts(child, 'Keyword'))

        records.append({
            "abstract": abstract,
            "title": title,
            "authors": author_str,
            "keywords": keywords,
        })
    return standardize_dataframe(pd.DataFrame(records))
Example #4
0
def read_ris(fp):
    """RIS file reader.

    The file is opened with each candidate encoding in turn until
    ``readris`` parses it without a decoding or I/O error. Columns that
    belong to ``LIST_TYPE_TAGS`` are flattened to comma-separated strings.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    object:
        Whatever ``standardize_dataframe`` returns for the parsed entries.

    Raises
    ------
    ValueError
        If the file could not be read with any of the tried encodings.
        The last underlying error is attached as the cause.

    """

    # ISO-8859-1 accepts every byte sequence, so it must come last;
    # otherwise UTF-8 files are silently decoded into garbled text.
    # 'utf-8-sig' reads UTF-8 both with and without a byte order mark.
    encodings = ['utf-8-sig', 'utf-8', 'ISO-8859-1']
    mapping = _tag_key_mapping(reverse=False)
    entries = None
    last_error = None
    for encoding in encodings:
        try:
            with open(fp, 'r', encoding=encoding) as bibliography_file:
                entries = list(readris(bibliography_file, mapping=mapping))
                break
        except UnicodeDecodeError as e:
            last_error = e
        except IOError as e:
            logging.warning(e)
            last_error = e

    if entries is None:
        # Chain the last failure so that e.g. a missing file stays visible.
        raise ValueError(
            "Cannot find proper encoding for data file.") from last_error

    df = pd.DataFrame(entries)

    def converter(x):
        # Values that cannot be joined (for example missing values)
        # become an empty string.
        try:
            return ", ".join(x)
        except TypeError:
            return ""

    for tag in LIST_TYPE_TAGS:
        key = TAG_KEY_MAPPING[tag]
        if key in df:
            df[key] = df[key].apply(converter)
    return standardize_dataframe(df)
Example #5
0
def read_csv(data_fp):
    """CSV file reader.

    Reads the file with the pandas default settings first and retries
    with ISO-8859-1 when that attempt fails with a decoding error.

    Parameters
    ----------
    data_fp: str, pathlib.Path
        File path to the CSV file.

    Returns
    -------
    object:
        Whatever ``standardize_dataframe`` returns for the parsed data.

    """
    try:
        table = pd.read_csv(data_fp)
    except UnicodeDecodeError:
        # Default decoding failed; retry with a single-byte encoding.
        table = pd.read_csv(data_fp, encoding="ISO-8859-1")

    standardized = standardize_dataframe(table)
    return standardized
Example #6
0
def read_ris(fp):
    """RIS file reader.

    The file is opened with each candidate encoding in turn until
    ``readris`` parses it without a decoding or I/O error. A ``keywords``
    column, when present, is flattened to comma-separated strings.

    Parameters
    ----------
    fp: str, pathlib.Path
        File path to the RIS file.

    Returns
    -------
    object:
        Whatever ``standardize_dataframe`` returns for the parsed entries.

    Raises
    ------
    ValueError
        If the file could not be read with any of the tried encodings.
        The last underlying error is attached as the cause.

    """

    # ISO-8859-1 accepts every byte sequence, so it must come last;
    # otherwise UTF-8 files are silently decoded into garbled text.
    # 'utf-8-sig' reads UTF-8 both with and without a byte order mark.
    encodings = ['utf-8-sig', 'utf-8', 'ISO-8859-1']
    mapping = _tag_key_mapping(reverse=False)
    entries = None
    last_error = None
    for encoding in encodings:
        try:
            with open(fp, 'r', encoding=encoding) as bibliography_file:
                entries = list(readris(bibliography_file, mapping=mapping))
                break
        except (UnicodeDecodeError, IOError) as e:
            # Keep the failure instead of discarding it, then try the
            # next encoding.
            last_error = e

    if entries is None:
        # Chain the last failure so that e.g. a missing file stays visible.
        raise ValueError(
            "Cannot find proper encoding for data file.") from last_error

    df = pd.DataFrame(entries)
    if "keywords" in df:

        def converter(x):
            # Values that cannot be joined (for example missing values)
            # become an empty string.
            try:
                return ", ".join(x)
            except TypeError:
                return ""

        df["keywords"] = df["keywords"].apply(converter)
    return standardize_dataframe(df)