Example #1
0
def loader_func(**kwargs):
    """Load an Excel workbook (local path or HTTP/HTTPS URL) into a DataFrame.

    Pops ``path``, ``sheet`` and (for URLs) ``proxy`` from *kwargs*; the
    remaining kwargs are filtered to the keys declared in ``LOADER_PROPS``
    and forwarded to ``pd.read_excel``.

    :raises Exception: if the download fails, the workbook has no data, or
        the requested sheet is missing.
    """
    path = kwargs.pop("path")
    # "xlsx" does not end with "xls", so xlrd is used only for legacy .xls files
    engine = "xlrd" if path.endswith("xls") else "openpyxl"
    sheet_name = kwargs.pop("sheet", None)
    if path.startswith(("http://", "https://")):  # add support for URLs
        proxy = kwargs.pop("proxy", None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs["proxies"] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # explicit check instead of `assert`, which is stripped under `python -O`
        if resp.status_code != 200:
            raise Exception(
                "Failed to load {}. Response status code: {}".format(
                    path, resp.status_code
                )
            )
        path = BytesIO(resp.content) if PY3 else StringIO(resp.content.decode("utf-8"))
    dfs = pd.read_excel(
        path,
        sheet_name=sheet_name,
        engine=engine,
        **{k: v for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)}
    )
    if dfs is None or not len(dfs):
        raise Exception("Failed to load Excel file. Returned no data.")
    if sheet_name:
        if sheet_name not in dfs:
            raise Exception(
                "Excel file loaded but there was no sheet named '{}'.".format(
                    sheet_name
                )
            )
        return dfs[sheet_name]
    # this is required because there is no support for loading multiple datasets at once from the CLI
    # I can add this later...
    return dfs[list(dfs.keys())[0]]
Example #2
0
def loader_func(**kwargs):
    """Load a CSV (local path or URL, resolved by ``handle_path``) via ``pd.read_csv``.

    Only kwargs whose keys appear in ``LOADER_PROPS`` are forwarded to pandas.
    """
    source = handle_path(kwargs.pop("path"), kwargs)
    allowed = loader_prop_keys(LOADER_PROPS)
    read_opts = {key: value for key, value in kwargs.items() if key in allowed}
    return pd.read_csv(source, **read_opts)
Example #3
0
def loader_func(**kwargs):
    """Load a parquet file via ``pd.read_parquet``.

    Verifies that at least one parquet engine (pyarrow or fastparquet) is
    importable before attempting the read; kwargs are filtered to the keys
    declared in ``LOADER_PROPS``.

    :raises ImportError: if neither parquet engine is installed.
    """
    try:
        import pyarrow  # noqa: F401
    except ImportError:
        try:
            import fastparquet  # noqa: F401
        except ImportError:
            raise ImportError(
                "In order to use the parquet loader you must install either pyarrow or fastparquet!"
            )

    file_path = kwargs.pop("path")
    prop_keys = loader_prop_keys(LOADER_PROPS)
    extra_opts = {name: value for name, value in kwargs.items() if name in prop_keys}
    return pd.read_parquet(file_path, **extra_opts)
Example #4
0
def loader_func(**kwargs):
    """Load a CSV (local path or HTTP/HTTPS URL) into a DataFrame.

    Pops ``path`` and (for URLs) ``proxy`` from *kwargs*; the remaining
    kwargs are filtered to the keys declared in ``LOADER_PROPS`` and
    forwarded to ``pd.read_csv``.

    :raises Exception: if the URL download does not return HTTP 200.
    """
    path = kwargs.pop("path")
    if path.startswith(("http://", "https://")):  # add support for URLs
        proxy = kwargs.pop("proxy", None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs["proxies"] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # explicit check instead of `assert`, which is stripped under `python -O`
        if resp.status_code != 200:
            raise Exception(
                "Failed to load {}. Response status code: {}".format(
                    path, resp.status_code
                )
            )
        path = BytesIO(resp.content)
    return pd.read_csv(
        path, **{k: v for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)}
    )
Example #5
0
def loader_func(**kwargs):
    """Load JSON data (local path or HTTP/HTTPS URL) into a DataFrame.

    When ``normalize`` is truthy the parsed JSON is fed through
    ``pd.json_normalize`` (or the pre-1.0 ``pd.io.json.json_normalize``);
    otherwise raw text/path is handed to ``pd.read_json`` with kwargs
    filtered to the keys declared in ``LOADER_PROPS``.

    :raises Exception: if the URL download does not return HTTP 200.
    """
    path = kwargs.pop('path')
    normalize = kwargs.pop('normalize', False)
    if path.startswith(('http://', 'https://')):  # add support for URLs
        proxy = kwargs.pop('proxy', None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs['proxies'] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # explicit check instead of `assert`, which is stripped under `python -O`
        if resp.status_code != 200:
            raise Exception(
                "Failed to load {}. Response status code: {}".format(
                    path, resp.status_code
                )
            )
        # json_normalize wants parsed objects, read_json wants text
        path = resp.json() if normalize else resp.text
    if normalize:
        normalize_func = pd.json_normalize if is_pandas1() else pd.io.json.json_normalize
        return normalize_func(path, **kwargs)
    return pd.read_json(path, **{k: v for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)})
Example #6
0
def load_file(sheet_name=None, **kwargs):
    """Load an Excel workbook via ``pd.read_excel``.

    ``path`` is popped from *kwargs* and resolved through ``handle_path``;
    the remaining kwargs are filtered to the keys declared in
    ``LOADER_PROPS``.

    :param sheet_name: sheet to load, forwarded to ``pd.read_excel``
    :raises Exception: if the workbook contains no data
    """
    file_path = kwargs.pop("path")
    # legacy .xls files need the xlrd engine; everything else uses openpyxl
    if file_path.endswith("xls"):
        excel_engine = "xlrd"
    else:
        excel_engine = "openpyxl"
    file_path = handle_path(file_path, kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    read_opts = {name: value for name, value in kwargs.items() if name in prop_keys}
    dfs = pd.read_excel(file_path, sheet_name=sheet_name, engine=excel_engine, **read_opts)
    if dfs is None or not len(dfs):
        raise Exception("Failed to load Excel file. Returned no data.")
    return dfs
Example #7
0
def loader_func(**kwargs):
    """Load JSON data resolved via ``handle_path`` into a DataFrame.

    When ``normalize`` is truthy the parsed response is run through
    ``pd.json_normalize`` (or the pre-1.0 ``pd.io.json.json_normalize``);
    otherwise the data goes to ``pd.read_json`` with kwargs filtered to the
    keys declared in ``LOADER_PROPS``.
    """
    normalize = kwargs.pop("normalize", False)

    def resp_handler(resp):
        # json_normalize wants parsed objects, read_json wants text
        if normalize:
            return resp.json()
        return resp.text

    data = handle_path(kwargs.pop("path"), kwargs, resp_handler=resp_handler)
    if normalize:
        if is_pandas1():
            normalize_func = pd.json_normalize
        else:
            normalize_func = pd.io.json.json_normalize
        return normalize_func(data, **kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    return pd.read_json(data, **{key: value for key, value in kwargs.items() if key in prop_keys})
Example #8
0
def loader_func(**kwargs):
    """Load a CSV (local path or HTTP/HTTPS URL) into a DataFrame.

    Pops ``path`` and (for URLs) ``proxy`` from *kwargs*; the remaining
    kwargs are filtered to the keys declared in ``LOADER_PROPS`` and
    forwarded to ``pd.read_csv``.  Downloads are wrapped in ``BytesIO`` on
    Python 3 and a decoded ``StringIO`` on Python 2.

    :raises Exception: if the URL download does not return HTTP 200.
    """
    path = kwargs.pop('path')
    if path.startswith(('http://', 'https://')):  # add support for URLs
        proxy = kwargs.pop('proxy', None)
        req_kwargs = {}
        if proxy is not None:
            req_kwargs['proxies'] = dict(http=proxy, https=proxy)
        resp = requests.get(path, **req_kwargs)
        # explicit check instead of `assert`, which is stripped under `python -O`
        if resp.status_code != 200:
            raise Exception(
                "Failed to load {}. Response status code: {}".format(
                    path, resp.status_code
                )
            )
        path = BytesIO(resp.content) if PY3 else StringIO(
            resp.content.decode('utf-8'))
    return pd.read_csv(
        path, **{
            k: v
            for k, v in kwargs.items() if k in loader_prop_keys(LOADER_PROPS)
        })
Example #9
0
def loader_func(**kwargs):
    """Load an Excel workbook resolved via ``handle_path`` into a DataFrame.

    Pops ``path`` and ``sheet`` from *kwargs*; the remaining kwargs are
    filtered to the keys declared in ``LOADER_PROPS`` and forwarded to
    ``pd.read_excel``.  Returns the requested sheet, or the first sheet if
    none was named.

    :raises Exception: if the workbook has no data or the named sheet is
        missing.
    """
    file_path = kwargs.pop("path")
    # legacy .xls files need the xlrd engine; everything else uses openpyxl
    excel_engine = "xlrd" if file_path.endswith("xls") else "openpyxl"
    sheet = kwargs.pop("sheet", None)
    file_path = handle_path(file_path, kwargs)
    prop_keys = loader_prop_keys(LOADER_PROPS)
    read_opts = {name: value for name, value in kwargs.items() if name in prop_keys}
    dfs = pd.read_excel(file_path, sheet_name=sheet, engine=excel_engine, **read_opts)
    if dfs is None or not len(dfs):
        raise Exception("Failed to load Excel file. Returned no data.")
    if sheet:
        if sheet not in dfs:
            raise Exception(
                "Excel file loaded but there was no sheet named '{}'.".format(
                    sheet
                )
            )
        return dfs[sheet]
    # this is required because there is no support for loading multiple datasets at once from the CLI
    # I can add this later...
    return dfs[next(iter(dfs.keys()))]