def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
    try:
        return dd.read_json(path, **kwargs)
    except IsADirectoryError:
        # The path is a directory: fall back to reading every *.json file in it.
        x = dd.read_json(os.path.join(path, "*.json"), **kwargs)
        print(x.compute())
        return x
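# A minimal usage sketch for _safe_load_json above. The "logs" directory is a
# hypothetical placeholder: passing a directory triggers the IsADirectoryError
# fallback, which re-reads via a "*.json" glob inside it.
def _example_safe_load():
    frame = _safe_load_json("logs", orient="records", lines=True)
    print(frame.npartitions)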
def test_read_json_meta(orient, tmpdir):
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    lines = orient == "records"
    df.to_json(str(tmpdir.join("fil1.json")), orient=orient, lines=lines)
    df2.to_json(str(tmpdir.join("fil2.json")), orient=orient, lines=lines)
    sol = pd.concat([df, df2])
    meta = df2.iloc[:0]
    if orient == "values":
        # orient=values loses column names
        sol.columns = meta.columns = [0, 1]
    res = dd.read_json(
        str(tmpdir.join("fil*.json")), orient=orient, meta=meta, lines=lines
    )
    assert_eq(res, sol)
    if orient == "records":
        # Also check chunked version
        res = dd.read_json(
            str(tmpdir.join("fil*.json")),
            orient=orient,
            meta=meta,
            lines=True,
            blocksize=50,
        )
        assert_eq(res, sol, check_index=False)
def test_read_json_error():
    with tmpfile("json") as f:
        with pytest.raises(ValueError):
            df.to_json(f, orient="split", lines=True)
        df.to_json(f, orient="split", lines=False)
        with pytest.raises(ValueError):
            dd.read_json(f, orient="split", blocksize=1)
def test_read_json_path_column_with_duplicate_name_is_error():
    with tmpfile("json") as f:
        df.to_json(f, orient="records", lines=False)
        with pytest.raises(ValueError, match="Files already contain"):
            dd.read_json(f, orient="records", lines=False, include_path_column="x")
def from_json(
    path: Union[str, List[str]], flatten: bool = False, **params
) -> dd.DataFrame:
    """Creates a `dd.DataFrame` from one or several json files.

    Includes a "path column".

    Parameters
    ----------
    path
        Path to files
    flatten
        If true, flatten nested data (default false).
    **params
        Extra arguments passed on to `pandas.read_json`

    Returns
    -------
    dataframe
        A `dd.DataFrame`
    """

    def json_engine(*args, **kwargs) -> pd.DataFrame:
        data_frame = pd.read_json(*args, **kwargs)
        return flatten_dataframe(data_frame) if flatten else data_frame

    path_list = _get_file_paths(path)
    dds = []
    for path_name in path_list:
        ddf = dd.read_json(path_name, engine=json_engine, **params)
        ddf[PATH_COLUMN_NAME] = path_name
        dds.append(ddf)
    return dd.concat(dds)
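# A hedged usage sketch for from_json above; the file names are hypothetical.
# **params is forwarded to dd.read_json (and through its engine to
# pandas.read_json), so line-delimited records can be requested as usual;
# PATH_COLUMN_NAME is the module constant the function populates.
def _example_from_json():
    ddf = from_json(["data/a.json", "data/b.json"], flatten=True,
                    orient="records", lines=True)
    print(ddf[PATH_COLUMN_NAME].value_counts().compute())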
def usingDASK(file):
    chunks = []
    timing.log("Starting reading-in")
    reader = dd.read_json(
        file,
        lines=True,
        blocksize=2 ** 28,
        meta={"data": object, "message_type": object},
    )
    # t = reader['data'].map_partitions(lambda df: df.apply(lambda x: x.apply(flattenDict, key='', result={}))).to_bag()
    # t = reader.map_partitions(lambda df: df['data'].apply(flattenDict, key='', result={})).to_bag()
    datas = (
        reader["data"]
        .map_partitions(lambda df: df.apply(lambda row: flattenDict(row, "", {})))
        .to_bag()
    )
    new = datas.to_dataframe()
    new["message_type"] = reader["message_type"]
    new = new.compute()
    dups = new.duplicated(subset="leaf_cert.fingerprint")
    dups = new[dups]
    dups.to_csv("duplicates_DASK.csv")
def test_read_chunked(block):
    with tmpdir() as path:
        fn = os.path.join(path, "1.json")
        df.to_json(fn, orient="records", lines=True)
        d = dd.read_json(fn, blocksize=block, sample=10)
        assert (d.npartitions > 1) or (block > 50)
        assert_eq(d, df, check_index=False)
def test_write_json_basic(orient):
    with tmpdir() as path:
        fn = os.path.join(path, "1.json")
        df.to_json(fn, orient=orient, lines=False)
        actual = dd.read_json(fn, orient=orient, lines=False)
        if orient == "values":
            actual.columns = list(df.columns)
        assert_eq(actual, df)
def get_generator(self, path):
    df = dd.read_json(path)
    while True:
        sample_df = df.sample(frac=0.01).compute()
        x500 = np.array([np.array(x) for x in sample_df.one_hot.values])
        x500_t = np.array([np.array(y) for y in sample_df.one_hot_tags.values])
        y500_t = np.array([np.array(y) for y in sample_df.price.values])
        yield {'text_input': x500, 'tags': x500_t}, {'price': y500_t}
def test_json_compressed(compression):
    if compression == 'xz' and lzma is None:
        pytest.skip("LZMA not available. Please install backports.lzma on Python 2.")
    with tmpdir() as path:
        dd.to_json(ddf, path, compression=compression)
        actual = dd.read_json(os.path.join(path, '*'), compression=compression)
        assert_eq(df, actual.compute(), check_index=False)
def read_using_dask(self):
    """Read json file using dask read_json."""
    t1 = timeit.default_timer()
    ipdf = dd.read_json(self.path, compression=self.compression, encoding=self.encoding)
    print("Time taken : {} seconds for reading json file '{}'".format(
        timeit.default_timer() - t1, self.path))
    return ipdf
def test_read_json_basic(orient):
    with tmpfile("json") as f:
        df.to_json(f, orient=orient, lines=False)
        actual = dd.read_json(f, orient=orient, lines=False)
        actual_pd = pd.read_json(f, orient=orient, lines=False)
        assert_eq(actual, actual_pd)
        if orient == "values":
            actual.columns = list(df.columns)
        assert_eq(actual, df)
def get_generator(self, path):
    df = dd.read_json(path)
    while True:
        sample_df = df.sample(frac=0.01).compute()
        x500 = np.array([np.array(x) for x in sample_df.one_hot.values])
        y500_t = np.array([np.array(y) for y in sample_df.one_hot_tags.values])
        y500_c = np.array([np.array(y) for y in sample_df.one_hot_cat.values])
        yield x500, {'tags': y500_t, 'category': y500_c}
def test_read_json_fkeyword(fkeyword):
    def _my_json_reader(*args, **kwargs):
        if fkeyword == "json":
            return pd.DataFrame.from_dict(json.load(*args))
        return pd.read_json(*args)

    with tmpfile("json") as f:
        df.to_json(f, orient="records", lines=False)
        actual = dd.read_json(f, orient="records", lines=False, engine=_my_json_reader)
        actual_pd = pd.read_json(f, orient="records", lines=False)
        assert_eq(actual, actual_pd)
def read_versions(log_path, list_of_keys=None):
    """
    Read version information from log path.

    - Calculate number of available workers

    Parameters
    ----------
    log_path: str
        Path to read the logging files.
    list_of_keys: list of list of int,str
        Version information is nested. Provide a list of lists of keys
        to retrieve this information.

    Examples
    --------
    >>> # Logs must already exist for this to work. (n_workers not shown)
    >>> import dask_log_server
    >>> list_of_keys = [
    ...     ["scheduler", "host", "OS"],
    ...     ["scheduler", "packages", "dask"],
    ...     ["scheduler", "packages", "python"],
    ... ]
    >>> dask_log_server.read_versions("logs", list_of_keys).compute()
                              datetime   status        client_id                                           versions scheduler-host-OS scheduler-packages-dask scheduler-packages-python
    0 2020-07-13 15:13:55.322711+00:00  running  139859169225248  {'scheduler': {'host': {'python': '3.6.9.final...             Linux                  2.20.0             3.6.9.final.0
    0 2020-07-08 17:18:28.451828+00:00  running  140103390383688  {'scheduler': {'host': {'python': '3.6.9.final...             Linux                  2.20.0             3.6.9.final.0

    Returns
    -------
    dask.dataframe.DataFrame
    """
    if list_of_keys is None:
        list_of_keys = []
    df_versions = dd.read_json(log_path + "/version*.json")
    for keys in list_of_keys:
        column_name = "-".join(keys)
        df_versions[column_name] = df_versions["versions"].map(
            functools.partial(_get_nested, keys=keys), meta=(column_name, str)
        )
    df_versions["n_workers"] = (
        dd.read_json(log_path + "/info_*.jsonl")["info"]
        .map(functools.partial(_get_nested, keys=["workers"]), meta=("n_workers", object))
        .map(len)
    )
    return df_versions
def _json_as_df(self):
    """
    Import json file as Pandas DataFrame

    :return: Pandas DataFrame or dask DataFrame
        Content of the json file
    """
    if self.use_dask:
        return dd.read_json(
            url_path=self.full_path,
            orient='records' if self.kwargs.get('orient') is None else self.kwargs.get('orient'),
            lines=self.kwargs.get('lines'),
            storage_options=self.kwargs.get('storage_options'),
            blocksize=self.kwargs.get('blocksize'),
            sample=2 ** 20 if self.kwargs.get('sample') is None else self.kwargs.get('sample'),
            encoding='utf-8' if self.kwargs.get('encoding') is None else self.kwargs.get('encoding'),
            errors='strict' if self.kwargs.get('errors') is None else self.kwargs.get('errors'),
            compression='infer' if self.kwargs.get('compression') is None else self.kwargs.get('compression'),
            meta=self.kwargs.get('meta'),
            engine=pd.read_json,
        )
    return pd.read_json(
        path_or_buf=self.full_path,
        orient='records' if self.kwargs.get('orient') is None else self.kwargs.get('orient'),
        typ='frame',
        dtype=True if self.kwargs.get('dtype') is None else self.kwargs.get('dtype'),
        convert_axes=True if self.kwargs.get('convert_axes') is None else self.kwargs.get('convert_axes'),
        convert_dates=True if self.kwargs.get('convert_dates') is None else self.kwargs.get('convert_dates'),
        keep_default_dates=True if self.kwargs.get('keep_default_dates') is None else self.kwargs.get('keep_default_dates'),
        numpy=False if self.kwargs.get('numpy') is None else self.kwargs.get('numpy'),
        precise_float=False if self.kwargs.get('precise_float') is None else self.kwargs.get('precise_float'),
        date_unit=self.kwargs.get('date_unit'),
        encoding='utf-8' if self.kwargs.get('encoding') is None else self.kwargs.get('encoding'),
        lines=False if self.kwargs.get('lines') is None else self.kwargs.get('lines'),
        chunksize=self.kwargs.get('chunksize'),
        compression=self.kwargs.get('compression'),
    )
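# Design note on _json_as_df above: every "default if self.kwargs.get(k) is
# None else self.kwargs.get(k)" expression can be collapsed with dict.get's
# default argument, assuming absent keys (rather than explicit None values)
# are what signal "use the default". A sketch of that simplification:
def _json_defaults(kwargs):
    return dict(
        orient=kwargs.get('orient', 'records'),
        encoding=kwargs.get('encoding', 'utf-8'),
        compression=kwargs.get('compression', 'infer'),
    )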
def test_read_json_with_path_column(orient):
    with tmpfile("json") as f:
        df.to_json(f, orient=orient, lines=False)
        actual = dd.read_json(f, orient=orient, lines=False, include_path_column=True)
        actual_pd = pd.read_json(f, orient=orient, lines=False)
        # The default column name when include_path_column is True is "path".
        # The paths on Windows are converted to forward slashes somewhere in
        # the file-reading chain in Dask, so we have to do the same here.
        actual_pd["path"] = pd.Series(
            (f.replace(os.sep, "/"),) * len(actual_pd), dtype="category"
        )
        assert actual.path.dtype == "category"
        assert_eq(actual, actual_pd)
def import_data(path):
    """
    Description: read data from the data set (.csv, .json, .xls/.xlsx),
    convert it into a Pandas or Dask DataFrame, and return it
    References: https://examples.dask.org/dataframes/01-data-access.html
    """
    # input configuration parameters
    # Start._arguments()
    # if not Start.kwargs:
    #     return
    pandas = Start.kwargs['pandas_type']
    try:
        # Delimiter processing
        if (path.endswith('.xlsx') or path.endswith('.xls')) and os.path.isfile(path):
            if pandas:
                df = pd.read_excel(path)
            else:
                parts = dask.delayed(pd.read_excel)(path)
                df = dd.from_delayed(parts)
        elif path.endswith('.json') and os.path.isfile(path):
            if pandas:
                df = pd.read_json(path)
            else:
                df = dd.read_json(path)
        elif path.endswith('.csv') and os.path.isfile(path):
            if pandas:
                df = pd.read_csv(path, low_memory=False)
            else:
                df = dd.read_csv(path)
        else:
            # print('Unknown format')
            return None
    except (TypeError, OSError, FileNotFoundError):
        print("Wrong Type Format of imported data")
        import sys
        sys.exit(1)
    return df
def test_to_json_with_get():
    from dask.multiprocessing import get as mp_get

    flag = [False]

    def my_get(*args, **kwargs):
        flag[0] = True
        return mp_get(*args, **kwargs)

    df = pd.DataFrame({"x": ["a", "b", "c", "d"], "y": [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)

    with tmpdir() as dn:
        ddf.to_json(dn, compute_kwargs={"scheduler": my_get})
        assert flag[0]

        result = dd.read_json(os.path.join(dn, "*"))
        assert_eq(result, df, check_index=False)
def test_read_json_with_path_converter():
    path_column_name = "filenames"

    def path_converter(x):
        return "asdf.json"

    with tmpfile("json") as f:
        df.to_json(f, orient="records", lines=False)
        actual = dd.read_json(
            f,
            orient="records",
            lines=False,
            include_path_column=path_column_name,
            path_converter=path_converter,
        )
        actual_pd = pd.read_json(f, orient="records", lines=False)
        actual_pd[path_column_name] = pd.Series(
            (path_converter(f),) * len(actual_pd), dtype="category"
        )
        assert_eq(actual, actual_pd)
def _read_text_json(self, files_path):
    """
    Read json text files stored in files_path

    Parameters
    ----------
    files_path : string | list[string]
        single or multiple file paths

    Returns
    -------
    dask.dataframe
    """
    text_ddf = dd.read_json(files_path, encoding=self.encoding)
    try:
        return text_ddf[[self.text_column]]
    except KeyError:
        raise KeyError(
            f"Specified text_column '{self.text_column}' not in file keys"
        )
def load_dfs_from_jsons(data_dir='data', useDask=False, n_json_files=100):
    paths = glob(os.path.join(data_dir, 'flightjson/*json'))
    if not useDask:
        dfs = [pd.read_json(path, lines=True) for path in paths[:n_json_files]]
        dfs = pd.concat(dfs)
    else:
        import dask.dataframe as dd
        # Unused bag-based alternative:
        # import dask.bag as db
        # mybag = db.read_text(paths).map(json.loads)
        # dfs = mybag.to_dataframe()
        dfs = [dd.read_json(path, lines=True) for path in paths[:n_json_files]]
        dfs = dd.concat(dfs)
    return dfs
def dask_read(option, file_path):
    # Map file type to its glob pattern
    file_type = {
        'parquet': file_path + '/*.parquet',
        'csv': file_path + '/*.csv',
        'json': file_path + '/*.json',
        'text': file_path + '/*.txt',
    }
    file_pattern = file_type[option]
    # Map file type to its reader, deferred behind lambdas so that only the
    # selected reader runs (a plain dict literal would eagerly call all four).
    dask_reader = {
        'parquet': lambda: dask_df.read_parquet(file_pattern, engine='pyarrow'),
        'csv': lambda: dask_df.read_csv(file_pattern),
        'json': lambda: dask_df.read_json(file_pattern),
        'text': lambda: dask_df.read_table(file_pattern),
    }
    return dask_reader[option]()
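# A small usage sketch for dask_read above; "/data/exports" is a hypothetical
# directory. Only the reader matching `option` is invoked, so missing files
# of the other formats no longer matter.
def _example_dask_read():
    frame = dask_read('json', '/data/exports')
    print(frame.head())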
def run(self):
    os.makedirs(DATASET_DIR, exist_ok=True)
    print("Load Data...")

    # Load metadata
    df = dd.read_json(BASE_METADATA_FILE).repartition(npartitions=5)

    # Load train
    df_train = dd.read_csv(self.input()[0].path).repartition(npartitions=5).sample(frac=0.1)
    df_train = df_train.reset_index()  # withColumn("idx", F.monotonically_increasing_id())
    df_train = self.add_more_information(df_train)

    # Load test
    df_test = dd.read_csv(self.input()[1].path).repartition(npartitions=5).sample(frac=0.1)
    df_test = df_test.reset_index()  # withColumn("idx", F.monotonically_increasing_id())
    df_test = self.add_more_information(df_test)

    # Apply tokenizer
    ## Metadata
    df["title"] = df.title.apply(char_encode, meta=('title', 'object'))
    ## Train
    df_train["event_search"] = df_train.event_search.apply(char_encode, meta=('event_search', 'object'))
    ## Test
    df_test["event_search"] = df_test.event_search.apply(char_encode, meta=('event_search', 'object'))

    df_train.visualize(filename='df_train_dask.svg')

    # Save
    df_train.compute().sort_values("event_timestamp").to_csv(self.output()[0].path, index=False)
    df_test.compute().sort_values("event_timestamp").to_csv(self.output()[1].path, index=False)
    df.compute().to_csv(self.output()[2].path, index=False)
def test_read_json_multiple_files_with_path_column(blocksize, tmpdir):
    fil1 = str(tmpdir.join("fil1.json")).replace(os.sep, "/")
    fil2 = str(tmpdir.join("fil2.json")).replace(os.sep, "/")
    df = pd.DataFrame({"x": range(5), "y": ["a", "b", "c", "d", "e"]})
    df2 = df.assign(x=df.x + 0.5)
    orient = "records"
    lines = True
    df.to_json(fil1, orient=orient, lines=lines)
    df2.to_json(fil2, orient=orient, lines=lines)
    path_dtype = pd.CategoricalDtype((fil1, fil2))
    df["path"] = pd.Series((fil1,) * len(df), dtype=path_dtype)
    df2["path"] = pd.Series((fil2,) * len(df2), dtype=path_dtype)
    sol = pd.concat([df, df2])
    res = dd.read_json(
        str(tmpdir.join("fil*.json")),
        orient=orient,
        lines=lines,
        include_path_column=True,
        blocksize=blocksize,
    )
    assert_eq(res, sol, check_index=False)
def __init__(self, file_path, block_size=10e6, random_seed=None, lines=True):
    """Initializes the loader.

    Args:
        file_path (str): Path to delimited file.
        block_size (int, optional): Size of partition in bytes. See
            dask.dataframe.read_json(). Defaults to 10e6.
        random_seed (int, optional): Random seed. See random.seed().
            Defaults to None.
        lines (bool, optional): Read the file as a json object per line.
            Defaults to True.
    """
    self.df = dd.read_json(file_path, blocksize=block_size, lines=lines)
    self.random_seed = random_seed
    random.seed(random_seed)
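# A hedged usage sketch for the __init__ above. The owning class is not shown
# in this snippet, so "Loader" is a hypothetical stand-in for it, and the file
# name is likewise hypothetical. With lines=True, blocksize splits the file
# into partitions of roughly block_size bytes each.
# loader = Loader("events.jsonl", block_size=int(64e6), random_seed=0)
# print(loader.df.npartitions)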
def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")
    if file_type == "csv":
        return dd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return dd.read_parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "hdf":
        return dd.read_hdf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return dd.read_json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "sql_table":
        return dd.read_sql_table(**file_options)
    elif file_type == "table":
        return dd.read_table(path, **dict_without_keys(file_options, "path"))
    elif file_type == "fwf":
        return dd.read_fwf(path, **dict_without_keys(file_options, "path"))
    elif file_type == "orc":
        return dd.read_orc(path, **dict_without_keys(file_options, "path"))
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type)
        )
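# A hedged example of the config shape dataframe_loader above expects: a
# one-item mapping from file type to reader options, where "path" is split
# off and the remaining options are forwarded to the matching dd.read_* call.
# The glob path is hypothetical.
# frame = dataframe_loader(None, {"json": {"path": "data/*.json", "lines": True}})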
def test_read_json_inferred_compression():
    with tmpdir() as path:
        fn = os.path.join(path, '*.json.gz')
        dd.to_json(ddf, fn, compression='gzip')
        actual = dd.read_json(fn)
        assert_eq(df, actual.compute(), check_index=False)