Example #1
def test_read_json_error():
    with tmpfile('json') as f:
        with pytest.raises(ValueError):
            df.to_json(f, orient='split', lines=True)
        df.to_json(f, orient='split', lines=False)
        with pytest.raises(ValueError):
            dd.read_json(f, orient='split', blocksize=1)
Example #2
def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
    try:
        return dd.read_json(path, **kwargs)
    except IsADirectoryError:
        x = dd.read_json(os.path.join(path, "*.json"), **kwargs)
        print(x.compute())
        return x
Example #3
def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({'x': range(5), 'y': ['a', 'b', 'c', 'd', 'e']})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == 'records'
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]

    if orient == 'values':
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]

    res = dd.read_json(str(tmpdir.join("fil*.json")),
                       orient=orient,
                       meta=meta,
                       lines=lines)
    assert_eq(res, sol)

    if orient == 'records':
        # Also check chunked version
        res = dd.read_json(str(tmpdir.join("fil*.json")),
                           orient=orient,
                           meta=meta,
                           lines=True,
                           blocksize=50)
        assert_eq(res, sol, check_index=False)
Example #4
def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == "records"
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]

    if orient == "values":
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]

    res = dd.read_json(
        str(tmpdir.join("fil*.json")), orient=orient, meta=meta, lines=lines
    )
    assert_eq(res, sol)

    if orient == "records":
        # Also check chunked version
        res = dd.read_json(
            str(tmpdir.join("fil*.json")),
            orient=orient,
            meta=meta,
            lines=True,
            blocksize=50,
        )
        assert_eq(res, sol, check_index=False)
Example #5
def test_read_json_error():
    with tmpfile("json") as f:
        with pytest.raises(ValueError):
            df.to_json(f, orient="split", lines=True)
        df.to_json(f, orient="split", lines=False)
        with pytest.raises(ValueError):
            dd.read_json(f, orient="split", blocksize=1)
Example #6
def test_read_json_path_column_with_duplicate_name_is_error():
    with tmpfile("json") as f:
        df.to_json(f, orient="records", lines=False)
        with pytest.raises(ValueError, match="Files already contain"):
            dd.read_json(f,
                         orient="records",
                         lines=False,
                         include_path_column="x")
Example #7
def from_json(path: Union[str, List[str]],
              flatten: bool = False,
              **params) -> dd.DataFrame:
    """Creates a `dd.DataFrame` from one or several json files.

    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    flatten
        If true, flatten nested data (default false).
    **params
        Extra arguments passed on to `pandas.read_json`

    Returns
    -------
    dataframe
        A `dd.DataFrame`
    """
    def json_engine(*args, **kwargs) -> pd.DataFrame:
        data_frame = pd.read_json(*args, **kwargs)
        return flatten_dataframe(data_frame) if flatten else data_frame

    path_list = _get_file_paths(path)

    dds = []
    for path_name in path_list:
        ddf = dd.read_json(path_name, engine=json_engine, **params)
        ddf[PATH_COLUMN_NAME] = path_name
        dds.append(ddf)

    return dd.concat(dds)
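A minimal usage sketch for from_json above; the glob path and keyword arguments are illustrative, and flatten_dataframe, _get_file_paths, and PATH_COLUMN_NAME are assumed to be defined elsewhere in the same module.

# Hypothetical call: read every JSON-lines file under data/events/ and flatten nested fields.
ddf = from_json("data/events/*.json", flatten=True, orient="records", lines=True)
print(ddf.columns)  # includes the path column added per input file
print(ddf.head())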
Example #8
def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient='records', lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)
Example #9
def usingDASK(file):
    chunks = []
    timing.log("Starting reading-in")

    reader = dd.read_json(
        file,
        lines=True,
        blocksize=2 ** 28,
        meta={"data": object, "message_type": object},
    )
    # t=reader['data'].map_partitions(lambda df: df.apply(lambda x: x.apply(flattenDict, key='', result={}))).to_bag()
    # t=reader.map_partitions(lambda df: df['data'].apply(flattenDict, key='', result={})).to_bag()
    datas = (
        reader["data"]
        .map_partitions(lambda df: df.apply((lambda row: flattenDict(row, "", {}))))
        .to_bag()
    )
    new = datas.to_dataframe()
    new["message_type"] = reader["message_type"]
    new = new.compute()
    dups = new.duplicated(subset="leaf_cert.fingerprint")
    dups = new[dups]
    dups.to_csv("duplicates_DASK.csv")

    """
Example #10
def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, "1.json")
        df.to_json(fn, orient="records", lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)
Example #11
def test_write_json_basic(orient):
    with tmpdir() as path:
        fn = os.path.join(path, "1.json")
        df.to_json(fn, orient=orient, lines=False)
        actual = dd.read_json(fn, orient=orient, lines=False)
        if orient == "values":
            actual.columns = list(df.columns)
        assert_eq(actual, df)
Example #12
    def get_generator(self, path):
        df = dd.read_json(path)
        while True:
            sample_df = df.sample(frac=0.01).compute()
            x500 = np.array([np.array(x) for x in sample_df.one_hot.values])
            x500_t = np.array([np.array(y) for y in sample_df.one_hot_tags.values])
            y500_t = np.array([np.array(y) for y in sample_df.price.values])
            yield {'text_input': x500, 'tags': x500_t}, {'price': y500_t}
Example #13
def test_write_json_basic(orient):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient=orient, lines=False)
        actual = dd.read_json(fn, orient=orient, lines=False)
        out = actual.compute()
        if orient == 'values':
            out.columns = list(df.columns)
        assert_eq(out, df)
Example #14
def test_write_json_basic(orient):
    with tmpdir() as path:
        fn = os.path.join(path, '1.json')
        df.to_json(fn, orient=orient, lines=False)
        actual = dd.read_json(fn, orient=orient, lines=False)
        out = actual.compute()
        if orient == 'values':
            out.columns = list(df.columns)
        assert_eq(out, df)
Example #15
def test_json_compressed(compression):
    if compression == 'xz' and lzma is None:
        pytest.skip(
            "LZMA not available. Please install backports.lzma on Python 2.")

    with tmpdir() as path:
        dd.to_json(ddf, path, compression=compression)
        actual = dd.read_json(os.path.join(path, '*'), compression=compression)
        assert_eq(df, actual.compute(), check_index=False)
Example #16
    def read_using_dask(self):
        """Read json file using dask read_json."""
        t1 = timeit.default_timer()
        ipdf = dd.read_json(self.path,
                            compression=self.compression,
                            encoding=self.encoding)
        print("Time taken : {} seconds for reading json file '{}'".format(
            timeit.default_timer() - t1, self.path))

        return ipdf
Example #17
def test_read_json_basic(orient):
    with tmpfile("json") as f:
        df.to_json(f, orient=orient, lines=False)
        actual = dd.read_json(f, orient=orient, lines=False)
        actual_pd = pd.read_json(f, orient=orient, lines=False)

        assert_eq(actual, actual_pd)
        if orient == "values":
            actual.columns = list(df.columns)
        assert_eq(actual, df)
Example #18
    def get_generator(self, path):
        df = dd.read_json(path)
        while True:
            sample_df = df.sample(frac=0.01).compute()
            x500 = np.array([np.array(x) for x in sample_df.one_hot.values])
            y500_t = np.array(
                [np.array(y) for y in sample_df.one_hot_tags.values])
            y500_c = np.array(
                [np.array(y) for y in sample_df.one_hot_cat.values])
            yield x500, {'tags': y500_t, 'category': y500_c}
Example #19
def test_json_compressed(compression):
    if compression == 'xz' and lzma is None:
        pytest.skip(
            "LZMA not available. Please install backports.lzma on Python 2."
        )

    with tmpdir() as path:
        dd.to_json(ddf, path, compression=compression)
        actual = dd.read_json(os.path.join(path, '*'),
                              compression=compression)
        assert_eq(df, actual.compute(), check_index=False)
Example #20
def test_read_json_basic(orient):
    with tmpfile('json') as f:
        df.to_json(f, orient=orient, lines=False)
        actual = dd.read_json(f, orient=orient, lines=False)
        actual_pd = pd.read_json(f, orient=orient, lines=False)

        out = actual.compute()
        assert_eq(out, actual_pd)
        if orient == 'values':
            out.columns = list(df.columns)
        assert_eq(out, df)
Example #21
def test_read_json_basic(orient):
    with tmpfile('json') as f:
        df.to_json(f, orient=orient, lines=False)
        actual = dd.read_json(f, orient=orient, lines=False)
        actual_pd = pd.read_json(f, orient=orient, lines=False)

        out = actual.compute()
        assert_eq(out, actual_pd)
        if orient == 'values':
            out.columns = list(df.columns)
        assert_eq(out, df)
Example #22
def test_read_json_fkeyword(fkeyword):
    def _my_json_reader(*args, **kwargs):
        if fkeyword == "json":
            return pd.DataFrame.from_dict(json.load(*args))
        return pd.read_json(*args)

    with tmpfile("json") as f:
        df.to_json(f, orient="records", lines=False)
        actual = dd.read_json(f, orient="records", lines=False, engine=_my_json_reader)
        actual_pd = pd.read_json(f, orient="records", lines=False)
        assert_eq(actual, actual_pd)
Example #23
def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({'x': range(5), 'y': ['a', 'b', 'c', 'd', 'e']})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == 'records'
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]

    if orient == 'values':
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]

    res = dd.read_json(str(tmpdir.join("fil*.json")), orient=orient,
                       meta=meta, lines=lines)
    assert_eq(res, sol)

    if orient == 'records':
        # Also check chunked version
        res = dd.read_json(str(tmpdir.join("fil*.json")), orient=orient,
                           meta=meta, lines=True, blocksize=50)
        assert_eq(res, sol, check_index=False)
Example #24
def read_versions(log_path, list_of_keys=None):
    """
    Read version information from log path.

    - Also adds an "n_workers" column with the number of available workers

    Parameters
    ----------
    log_path: str
        Path to read the logging files.
    list_of_keys: list of list of str or int
        Version information is nested. Provide a list of key paths to retrieve specific values.

    Examples
    --------
    >>> # Logs must already exist for this to work. (n_workers not shown)
    >>> import dask_log_server
    >>> list_of_keys = [["scheduler", "host", "OS"], ["scheduler", "packages", "dask"], ["scheduler", "packages", "python"]]
    >>> dask_log_server.read_versions("logs", list_of_keys).compute()
                              datetime   status        client_id                                           versions scheduler-host-OS scheduler-packages-dask scheduler-packages-python
    0 2020-07-13 15:13:55.322711+00:00  running  139859169225248  {'scheduler': {'host': {'python': '3.6.9.final...             Linux                  2.20.0             3.6.9.final.0
    0 2020-07-08 17:18:28.451828+00:00  running  140103390383688  {'scheduler': {'host': {'python': '3.6.9.final...             Linux                  2.20.0             3.6.9.final.0

    Returns
    -------
    dask.dataframe.DataFrame

    """
    if list_of_keys is None:
        list_of_keys = []
    df_versions = dd.read_json(log_path + "/version*.json")
    for keys in list_of_keys:
        column_name = "-".join(keys)
        df_versions[column_name] = df_versions["versions"].map(
            functools.partial(_get_nested, keys=keys), meta=(column_name, str))
    df_versions["n_workers"] = (dd.read_json("logs/info_*.jsonl")["info"].map(
        functools.partial(_get_nested, keys=["workers"]),
        meta=("n_workers", object)).map(len))
    return df_versions
Example #25
    def _json_as_df(self):
        """
        Import json file as Pandas DataFrame

        :return: Pandas DataFrame or dask dataframe
            Content of the json file
        """
        if self.use_dask:
            return dd.read_json(
                url_path=self.full_path,
                orient='records' if self.kwargs.get('orient') is None else
                self.kwargs.get('orient'),
                lines=self.kwargs.get('lines'),
                storage_options=self.kwargs.get('storage_options'),
                blocksize=self.kwargs.get('blocksize'),
                sample=2**20 if self.kwargs.get('sample') is None else
                self.kwargs.get('sample'),
                encoding='utf-8' if self.kwargs.get('encoding') is None else
                self.kwargs.get('encoding'),
                errors='strict' if self.kwargs.get('errors') is None else
                self.kwargs.get('errors'),
                compression='infer' if self.kwargs.get('compression') is None
                else self.kwargs.get('compression'),
                meta=self.kwargs.get('meta'),
                engine=pd.read_json)
        return pd.read_json(
            path_or_buf=self.full_path,
            orient='records' if self.kwargs.get('orient') is None else
            self.kwargs.get('orient'),
            typ='frame',
            dtype=True
            if self.kwargs.get('dtype') is None else self.kwargs.get('dtype'),
            convert_axes=True if self.kwargs.get('convert_axes') is None else
            self.kwargs.get('convert_axes'),
            convert_dates=True if self.kwargs.get('convert_dates') is None else
            self.kwargs.get('convert_dates'),
            keep_default_dates=True
            if self.kwargs.get('keep_default_dates') is None else
            self.kwargs.get('keep_default_dates'),
            numpy=False
            if self.kwargs.get('numpy') is None else self.kwargs.get('numpy'),
            precise_float=False if self.kwargs.get('precise_float') is None
            else self.kwargs.get('precise_float'),
            date_unit=self.kwargs.get('date_unit'),
            encoding='utf-8' if self.kwargs.get('encoding') is None else
            self.kwargs.get('encoding'),
            lines=False
            if self.kwargs.get('lines') is None else self.kwargs.get('lines'),
            chunksize=self.kwargs.get('chunksize'),
            compression=self.kwargs.get('compression'))
Example #26
def test_read_json_with_path_column(orient):
    with tmpfile("json") as f:
        df.to_json(f, orient=orient, lines=False)
        actual = dd.read_json(f,
                              orient=orient,
                              lines=False,
                              include_path_column=True)
        actual_pd = pd.read_json(f, orient=orient, lines=False)
        # The default column name when include_path_column is True is "path"
        # The paths on Windows are converted to forward slash somewhere in the file
        # reading chain in Dask, so we have to do the same here.
        actual_pd["path"] = pd.Series(
            (f.replace(os.sep, "/"), ) * len(actual_pd), dtype="category")
        assert actual.path.dtype == "category"
        assert_eq(actual, actual_pd)
Example #27
    def import_data(path):
        """
        Description:
            Read data from the dataset file (.xlsx/.xls, .json, or .csv) and convert it into a pandas or Dask DataFrame.

        References:
            https://examples.dask.org/dataframes/01-data-access.html
        """
        # input configuration parameters
        # Start._arguments()
        # if not Start.kwargs:
        #     return

        pandas = Start.kwargs['pandas_type']

        try:
            # Delimiter processing
            if (path.endswith('.xlsx')
                    or path.endswith('.xls')) and os.path.isfile(path):
                if pandas:
                    df = pd.read_excel(path)
                else:
                    parts = dask.delayed(pd.read_excel)(path)
                    df = dd.from_delayed(parts)

            elif path.endswith('.json') and os.path.isfile(path):
                if pandas:
                    df = pd.read_json(path)
                else:
                    df = dd.read_json(path)

            elif path.endswith('.csv') and os.path.isfile(path):
                if pandas:
                    df = pd.read_csv(path, low_memory=False)
                else:
                    df = dd.read_csv(path)

            else:
                # print('Unknown format')
                return None

        except (TypeError, OSError, FileNotFoundError):
            print("Wrong Type Format of imported data")
            import sys
            sys.exit(1)

        return df
Example #28
def test_to_json_with_get():
    from dask.multiprocessing import get as mp_get

    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_json(dn, compute_kwargs={"scheduler": my_get})
        assert flag[0]
        result = dd.read_json(os.path.join(dn, "*"))
        assert_eq(result, df, check_index=False)
Example #29
def test_read_json_with_path_converter():
    path_column_name = "filenames"

    def path_converter(x):
        return "asdf.json"

    with tmpfile("json") as f:
        df.to_json(f, orient="records", lines=False)
        actual = dd.read_json(
            f,
            orient="records",
            lines=False,
            include_path_column=path_column_name,
            path_converter=path_converter,
        )
        actual_pd = pd.read_json(f, orient="records", lines=False)
        actual_pd[path_column_name] = pd.Series(
            (path_converter(f), ) * len(actual_pd), dtype="category")
        assert_eq(actual, actual_pd)
Example #30
    def _read_text_json(self, files_path):
        """
        Read json text files stored in files_path

        Parameters
        ----------
        files_path : string | list[string]
            single or multiple files path

        Returns
        -------
        dask.dataframe
        """
        text_ddf = dd.read_json(files_path, encoding=self.encoding)
        try:
            return text_ddf[[self.text_column]]
        except KeyError:
            raise KeyError(
                f"Specified text_column '{self.text_column}' not in file keys")
Example #31
def load_dfs_from_jsons(data_dir='data', useDask=False, n_json_files=100):
    paths = glob(os.path.join(data_dir, 'flightjson/*json'))

    if not useDask:
        dfs = [pd.read_json(path, lines=True) for path in paths[:n_json_files]]
        dfs = pd.concat(dfs)
    else:
        import dask.dataframe as dd

        # A dask.bag based approach (db.read_text(paths).map(json.loads).to_dataframe())
        # would also work here.
        dfs = [dd.read_json(path, lines=True) for path in paths[:n_json_files]]
        dfs = dd.concat(dfs)

    return dfs
Example #32
    def dask_read(option, file_path):

        # Python map for file type pattern
        file_type = {
            'parquet': file_path + '/*.parquet',
            'csv': file_path + '/*.csv',
            'json': file_path + '/*.json',
            'text': file_path + '/*.txt'
        }

        # Define reader by pattern mapping; defer the calls so that only the
        # selected format is actually read
        file_pattern = file_type[option]
        dask_reader = {
            'parquet': lambda: dask_df.read_parquet(file_pattern, engine='pyarrow'),
            'csv': lambda: dask_df.read_csv(file_pattern),
            'json': lambda: dask_df.read_json(file_pattern),
            'text': lambda: dask_df.read_table(file_pattern)
        }

        return dask_reader[option]()
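A hypothetical call of the reader above, assuming dask_read is reachable as a plain function or staticmethod and that /data/events holds one or more *.json files:

ddf = dask_read('json', '/data/events')
print(ddf.npartitions)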
Example #33
    def run(self):
        os.makedirs(DATASET_DIR, exist_ok=True)

        print("Load Data...")
        # Load
        df = dd.read_json(BASE_METADATA_FILE).repartition(npartitions=5)

        # Load train
        df_train = dd.read_csv(
            self.input()[0].path).repartition(npartitions=5).sample(frac=0.1)
        df_train = df_train.reset_index(
        )  #withColumn("idx", F.monotonically_increasing_id())
        df_train = self.add_more_information(df_train)

        # Load test
        df_test = dd.read_csv(
            self.input()[1].path).repartition(npartitions=5).sample(frac=0.1)
        df_test = df_test.reset_index(
        )  #withColumn("idx", F.monotonically_increasing_id())
        df_test = self.add_more_information(df_test)

        # Apply tokenizer
        ## Metadata
        df["title"] = df.title.apply(char_encode, meta=('title', 'object'))

        ## Train
        df_train["event_search"] = df_train.event_search.apply(
            char_encode, meta=('event_search', 'object'))

        ## Test
        df_test["event_search"] = df_test.event_search.apply(
            char_encode, meta=('event_search', 'object'))

        df_train.visualize(filename='df_train_dask.svg')

        # Save
        df_train.compute().sort_values("event_timestamp").to_csv(
            self.output()[0].path, index=False)
        df_test.compute().sort_values("event_timestamp").to_csv(
            self.output()[1].path, index=False)
        df.compute().to_csv(self.output()[2].path, index=False)
Example #34
def test_read_json_multiple_files_with_path_column(blocksize, tmpdir):
    fil1 = str(tmpdir.join("fil1.json")).replace(os.sep, "/")
    fil2 = str(tmpdir.join("fil2.json")).replace(os.sep, "/")
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    orient = "records"
    lines = True
    df.to_json(fil1, orient=orient, lines=lines)
    df2.to_json(fil2, orient=orient, lines=lines)
    path_dtype = pd.CategoricalDtype((fil1, fil2))
    df["path"] = pd.Series((fil1, ) * len(df), dtype=path_dtype)
    df2["path"] = pd.Series((fil2, ) * len(df2), dtype=path_dtype)
    sol = pd.concat([df, df2])
    res = dd.read_json(
        str(tmpdir.join("fil*.json")),
        orient=orient,
        lines=lines,
        include_path_column=True,
        blocksize=blocksize,
    )
    assert_eq(res, sol, check_index=False)
Example #35
    def __init__(self,
                 file_path,
                 block_size=10e6,
                 random_seed=None,
                 lines=True):
        """Initializes the loader.

        Args:
            file_path (str): Path to delimited file.
            block_size (int, optional): Size of a partition in bytes.
                See dask.dataframe.read_json().
                Defaults to 10e6.
            random_seed (int, optional): Random seed. See random.seed().
                Defaults to None.
            lines (bool, optional): Read the file as a json object per line. Defaults to True.
        """

        self.df = dd.read_json(file_path, blocksize=block_size, lines=lines)

        self.random_seed = random_seed
        random.seed(random_seed)
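For reference, a minimal sketch of the read pattern the docstring above describes, outside of any class; "records.jsonl" is a hypothetical line-delimited JSON file.

import dask.dataframe as dd

# One JSON object per line, split into ~10 MB partitions.
df = dd.read_json("records.jsonl", blocksize=int(10e6), lines=True)
print(df.npartitions)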
Example #36
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
Example #37
def test_read_json_inferred_compression():
    with tmpdir() as path:
        fn = os.path.join(path, '*.json.gz')
        dd.to_json(ddf, fn, compression='gzip')
        actual = dd.read_json(fn)
        assert_eq(df, actual.compute(), check_index=False)