def install_bids(sourcedata_dir: PathLike, bids_filename: PathLike) -> None:
    from pathlib import Path

    from fsspec.implementations.local import LocalFileSystem

    # Normalize to Path early: PathLike inputs may be plain strings, but
    # .parent, .stem and .rglob are needed below.
    sourcedata_dir = Path(sourcedata_dir)
    bids_filename = Path(bids_filename)

    fs = LocalFileSystem(auto_mkdir=True)

    source_file = fs.open(fs.ls(sourcedata_dir)[0], mode="rb")
    target_file = fs.open(bids_filename, mode="wb")
    with source_file as sf, target_file as tf:
        tf.write(sf.read())

    # Apply .stem twice to strip double extensions such as ".nii.gz".
    source_basename = Path(Path(fs.ls(sourcedata_dir)[0]).stem).stem
    target_basename = Path(bids_filename.stem).stem

    # Install the sidecar files (TSV or JSON) sharing the NIfTI's basename.
    # There may not be any sidecars, in which case the loop is a no-op.
    sidecar_dir = sourcedata_dir.parent / "BIDS"
    for source_sidecar in sidecar_dir.rglob(f"{source_basename}*"):
        target_sidecar = bids_filename.parent / f"{target_basename}{source_sidecar.suffix}"
        source_file = fs.open(source_sidecar, mode="rb")
        target_file = fs.open(target_sidecar, mode="wb")
        with source_file as sf, target_file as tf:
            tf.write(sf.read())

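# A minimal usage sketch for install_bids, assuming the sidecar lookup
# convention above (<sourcedata_dir>.parent / "BIDS"). All paths, entity
# names and payloads here are illustrative, not part of the original code.
def _demo_install_bids() -> None:
    import tempfile
    from pathlib import Path

    root = Path(tempfile.mkdtemp())
    source_dir = root / "sourcedata" / "anat"
    source_dir.mkdir(parents=True)
    (source_dir / "scan.nii.gz").write_bytes(b"fake nifti payload")

    # Sidecars are discovered under <sourcedata_dir>.parent / "BIDS".
    sidecar_dir = root / "sourcedata" / "BIDS"
    sidecar_dir.mkdir()
    (sidecar_dir / "scan.json").write_text('{"Modality": "MR"}')

    # Copies scan.nii.gz to sub-01_T1w.nii.gz and scan.json to sub-01_T1w.json.
    install_bids(
        sourcedata_dir=source_dir,
        bids_filename=root / "bids" / "sub-01" / "anat" / "sub-01_T1w.nii.gz",
    )
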
def write_bids(
    to: PathLike,
    participants: DataFrame,
    sessions: DataFrame,
    scans: DataFrame,
) -> List[PathLike]:
    from pathlib import Path

    from fsspec.implementations.local import LocalFileSystem

    to = Path(to)
    fs = LocalFileSystem(auto_mkdir=True)

    # Ensure the BIDS hierarchy is written first.
    with fs.transaction:
        with fs.open(to / "participants.tsv", "w") as participant_file:
            write_to_tsv(participants, participant_file)

        for participant_id, sessions_group in sessions.groupby("participant_id"):
            sessions_group = sessions_group.droplevel("participant_id")
            sessions_filepath = to / participant_id / f"{participant_id}_sessions.tsv"
            with fs.open(sessions_filepath, "w") as sessions_file:
                write_to_tsv(sessions_group, sessions_file)

    # Perform the import of imaging data next.
    for filename, metadata in scans.iterrows():
        if metadata.format == "DCM":
            convert_dicom(sourcedata_dir=metadata.source_dir, bids_filename=to / filename)
        else:
            install_nifti(sourcedata_dir=metadata.source_dir, bids_filename=to / filename)

    return scans.index.to_list()

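# A hedged sketch of the inputs write_bids expects. The frame shapes are
# assumptions inferred from how write_bids consumes them: `scans` carries
# "format" and "source_dir" columns and its index holds BIDS-relative target
# paths; `sessions` is indexed by (participant_id, session_id). The helpers
# write_to_tsv, convert_dicom and install_nifti must already be in scope.
def _demo_write_bids() -> None:
    import tempfile
    from pathlib import Path

    import pandas as pd

    root = Path(tempfile.mkdtemp())
    source_dir = root / "source" / "sub-01" / "anat"
    source_dir.mkdir(parents=True)
    (source_dir / "scan.nii").write_bytes(b"fake nifti payload")

    participants = pd.DataFrame(
        {"participant_id": ["sub-01"], "sex": ["F"]}
    ).set_index("participant_id")
    sessions = pd.DataFrame(
        {"participant_id": ["sub-01"], "session_id": ["ses-M00"], "age": [71]}
    ).set_index(["participant_id", "session_id"])
    scans = pd.DataFrame(
        {"source_dir": [str(source_dir)], "format": ["NII"]},
        index=["sub-01/ses-M00/anat/sub-01_ses-M00_T1w.nii.gz"],
    )

    written = write_bids(
        to=root / "bids",
        participants=participants,
        sessions=sessions,
        scans=scans,
    )
    print(written)
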
def install_nifti(sourcedata_dir: PathLike, bids_filename: PathLike) -> None:
    from fsspec.implementations.local import LocalFileSystem

    fs = LocalFileSystem(auto_mkdir=True)
    source_file = fs.open(fs.ls(sourcedata_dir)[0], mode="rb")
    # Write through a gzip wrapper so the target is compressed (e.g. .nii.gz).
    target_file = fs.open(bids_filename, mode="wb", compression="gzip")
    with source_file as sf, target_file as tf:
        tf.write(sf.read())

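# Minimal sketch for install_nifti, assuming the source directory holds a
# single uncompressed .nii file; paths here are illustrative. The round-trip
# read uses fsspec's transparent decompression to verify the gzip wrapping.
def _demo_install_nifti() -> None:
    import tempfile
    from pathlib import Path

    import fsspec

    root = Path(tempfile.mkdtemp())
    source_dir = root / "source"
    source_dir.mkdir()
    (source_dir / "scan.nii").write_bytes(b"fake nifti payload")

    target = root / "bids" / "sub-01" / "anat" / "sub-01_T1w.nii.gz"
    install_nifti(sourcedata_dir=source_dir, bids_filename=target)

    with fsspec.open(str(target), "rb", compression="gzip") as fh:
        assert fh.read() == b"fake nifti payload"
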
def test_transaction(tmpdir):
    file = str(tmpdir / "test.txt")
    fs = LocalFileSystem()

    with fs.transaction:
        content = "hello world"
        with fs.open(file, "w") as fp:
            fp.write(content)

    with fs.open(file, "r") as fp:
        read_content = fp.read()

    assert content == read_content

def test_csv_equality(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
        schema1 = from_file(f, {"read_headers": True})
        assert(isinstance(schema1, TextSchema))
    with fs.open(from_root('/test/sample_data/csv_sample_2.csv')) as f:
        schema2 = from_file(f, {"read_headers": True})
        assert(isinstance(schema2, TextSchema))

    schema = find_conflicts([schema1, schema2])[0]
    assert(isinstance(schema, SchemaConflict))
    expect = {
        'CountDistinctSchemas': 2,
        'DistinctSchemas': [
            {'SchemaType': 'csv', 'Columns': [
                {'Name': 'type', 'Type': 'object'},
                {'Name': 'price', 'Type': 'float64'}]},
            {'SchemaType': 'csv', 'Columns': [
                {'Name': 'type', 'Type': 'object'},
                {'Name': 'price', 'Type': 'float64'},
                {'Name': 'availabile', 'Type': 'bool'},
                {'Name': 'date', 'Type': 'object'}]}
        ],
        'NonOverlappingColumns': [
            {'name': 'availabile', 'type': 'bool'},
            {'name': 'date', 'type': 'object'}
        ]
    }
    assert(schema.to_dict() == {'SchemaConflicts': expect})

def test_jsonl(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/json_lines.jsonl')) as f:
        schema = from_file(f)
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'field': {'type': 'string'},
                'field2': {'type': 'string'},
                'field3': {'type': 'string'},
                'field4': {'type': 'string'},
                'field5': {'type': 'string'},
                'field6': {'type': 'string'},
                'field7': {'type': 'string'}
            },
            'type': 'object'
        }
        assert(schema.schema == expect)

def test_file_ops(tmpdir):
    tmpdir = str(tmpdir)
    fs = LocalFileSystem()
    with pytest.raises(FileNotFoundError):
        fs.info(tmpdir + "/nofile")
    fs.touch(tmpdir + "/afile")
    i1 = fs.ukey(tmpdir + "/afile")
    assert tmpdir + "/afile" in fs.ls(tmpdir)

    with fs.open(tmpdir + "/afile", "wb") as f:
        f.write(b"data")
    i2 = fs.ukey(tmpdir + "/afile")
    assert i1 != i2  # because file changed

    fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
    assert tmpdir + "/afile2" in fs.ls(tmpdir)

    fs.move(tmpdir + "/afile", tmpdir + "/afile3")
    assert not fs.exists(tmpdir + "/afile")

    fs.rm(tmpdir + "/afile3", recursive=True)
    assert not fs.exists(tmpdir + "/afile3")

    fs.rm(tmpdir, recursive=True)
    assert not fs.exists(tmpdir)

def test_pickle(tmpdir):
    fs = LocalFileSystem()
    tmpdir = str(tmpdir)
    fn0 = os.path.join(tmpdir, "target")
    with open(fn0, "wb") as f:
        f.write(b"data")

    f = fs.open(fn0, "rb")
    f.seek(1)
    f2 = pickle.loads(pickle.dumps(f))
    assert f2.read() == f.read()

    f = fs.open(fn0, "wb")
    with pytest.raises(ValueError):
        pickle.dumps(f)

def test_invalid_json(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/bad_json.json')) as f:
        schema = from_file(f, {})
        assert(isinstance(schema, InvalidSchema))
        message = f"File type not supported for file {from_root('/test/sample_data/bad_json.json')}. Type: ASCII text, with no line terminators"
        assert(message in schema.reason)

def test_csv_no_header(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_no_header.csv')) as f:
        schema = from_file(f)
        assert(isinstance(schema, TextSchema))
        assert(list(map(lambda c: c.name, schema.columns)) == [0, 1])
        assert(list(map(lambda c: c.type, schema.columns)) == ["object", "float64"])

def test_file_not_supported(self):
    logger.set_level("error")
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
        schema = from_file(f)
        assert(isinstance(schema, InvalidSchema))
        assert(schema.reason[0:32] == "File type not supported for file")

def test_complex_json(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/complex_json.json')) as f:
        schema = from_file(f)
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'type': 'object',
            'properties': {
                'data': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'field1': {'type': 'string'},
                            'field2': {'type': ['integer', 'string']},
                            'field3': {'type': 'string'},
                            'field4': {'type': 'string'},
                            'field5': {
                                'type': 'object',
                                'properties': {'some_other_stuff': {'type': 'string'}},
                                'required': ['some_other_stuff']
                            }
                        }
                    }
                }
            },
            'required': ['data']
        }
        assert(schema.schema == expect)

def test_valid_csv(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
        schema = from_file(f, {"read_headers": True})
        assert(isinstance(schema, TextSchema))
        assert(list(map(lambda c: c.name, schema.columns)) == ["type", "price"])
        assert(list(map(lambda c: c.type, schema.columns)) == ["object", "float64"])

def test_commit_discard(tmpdir):
    tmpdir = str(tmpdir)
    fs = LocalFileSystem()
    with fs.transaction:
        with fs.open(tmpdir + "/afile", "wb") as f:
            assert not fs.exists(tmpdir + "/afile")
            f.write(b"data")
        assert not fs.exists(tmpdir + "/afile")

    assert fs.cat(tmpdir + "/afile") == b"data"

    try:
        with fs.transaction:
            with fs.open(tmpdir + "/bfile", "wb") as f:
                f.write(b"data")
            raise KeyboardInterrupt
    except KeyboardInterrupt:
        assert not fs.exists(tmpdir + "/bfile")

def test_infer_compression(tmpdir, opener, ext):
    filename = str(tmpdir / f"test{ext}")
    content = b"hello world"
    with opener(filename, "wb") as fp:
        fp.write(content)

    fs = LocalFileSystem()
    with fs.open(f"file://{filename}", "rb", compression="infer") as fp:
        read_content = fp.read()

    assert content == read_content

def test_valid_json(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/json_simple.json')) as f:
        schema = from_file(f)
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'field': {'type': 'string'},
                'field2': {'type': 'string'},
                'field3': {'type': 'string'}
            },
            'required': ['field', 'field2', 'field3'],
            'type': 'object'
        }
        assert(schema.schema == expect)
        assert(schema.to_dict() == {'Columns': [], 'SchemaType': 'json'})
        assert(schema.to_pd_dict() == {})

def open(self, key: str):
    if key in [
        "s3://crawler-poc/catalog_poc_data/test1.csv",
        "s3://crawler-poc/catalog_poc_data/test2.csv",
    ]:
        fs = LocalFileSystem()
        return fs.open(from_root('/test/sample_data/sample.snappy.parquet'))
    elif key in [
        "s3://tests/in/csv/sample.csv",
        "s3://tests/in/csv/sample2.csv",
    ]:
        fs = LocalFileSystem()
        return fs.open(from_root('/test/sample_data/csv_sample.csv'))
    elif key in [
        "s3://test-data/test-path/test1.usf",
        "s3://test-data/test-path/test2.usf",
    ]:
        fs = LocalFileSystem()
        return fs.open(from_root('/test/sample_data/unsupported_file_type.usf'))
    elif key == "s3://test-data/test-path/sample.snappy.parquet":
        fs = LocalFileSystem()
        return fs.open(from_root('/test/sample_data/sample.snappy.parquet'))
    else:
        raise Exception(f"Unmocked S3 API endpoint: {key}")

def test_seekable(tmpdir):
    fs = LocalFileSystem()
    tmpdir = str(tmpdir)
    fn0 = os.path.join(tmpdir, "target")
    with open(fn0, "wb") as f:
        f.write(b"data")

    f = fs.open(fn0, "rt")
    assert f.seekable(), "file is not seekable"
    f.seek(1)
    assert f.read(1) == "a"
    assert f.tell() == 2

def test_abs_paths(tmpdir):
    tmpdir = str(tmpdir)
    here = os.getcwd()
    os.chdir(tmpdir)
    with open("tmp", "w") as f:
        f.write("hi")
    out = LocalFileSystem().glob("*")
    assert len(out) == 1
    assert "/" in out[0]
    assert "tmp" in out[0]

    fs = LocalFileSystem()
    os.chdir(here)
    with fs.open(out[0], "r") as f:
        res = f.read()
    assert res == "hi"

def test_check_schemas(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/complex_json.json')) as f:
        schema1 = from_file(f)
        assert(isinstance(schema1, JsonSchema))
    with fs.open(from_root('/test/sample_data/complex_json_2.json')) as f:
        schema2 = from_file(f)
        assert(isinstance(schema2, JsonSchema))
    with fs.open(from_root('/test/sample_data/json_simple.json')) as f:
        schema3 = from_file(f)
        assert(isinstance(schema3, JsonSchema))
    with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
        schema4 = from_file(f)
        assert(isinstance(schema4, InvalidSchema))
    with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
        schema5 = from_file(f, {"read_headers": True})
        assert(isinstance(schema5, TextSchema))
    with fs.open(from_root('/test/sample_data/json_lines.jsonl')) as f:
        schema6 = from_file(f)
        assert(isinstance(schema6, JsonSchema))
    with fs.open(from_root('/test/sample_data/json_lines2.jsonl')) as f:
        schema7 = from_file(f)
        assert(isinstance(schema7, JsonSchema))

    schema = find_conflicts([schema1, schema2])[0]
    assert(isinstance(schema, JsonSchema))
    expect = {
        '$schema': 'http://json-schema.org/schema#',
        'properties': {
            'data': {
                'items': {
                    'properties': {
                        'field1': {'type': 'string'},
                        'field2': {'type': ['integer', 'string']},
                        'field3': {'type': 'string'},
                        'field4': {'type': 'string'},
                        'field5': {
                            'properties': {'some_other_stuff': {'type': 'string'}},
                            'required': ['some_other_stuff'],
                            'type': 'object'
                        },
                        'field6': {'type': 'string'}
                    },
                    'type': 'object'
                },
                'type': 'array'
            }
        },
        'required': ['data'],
        'type': 'object'
    }
    assert(schema.schema == expect)

    schema = find_conflicts([schema1, schema2, schema3])[0]
    assert(isinstance(schema, JsonSchema))
    expect = {
        '$schema': 'http://json-schema.org/schema#',
        'properties': {
            'data': {
                'items': {
                    'properties': {
                        'field1': {'type': 'string'},
                        'field2': {'type': ['integer', 'string']},
                        'field3': {'type': 'string'},
                        'field4': {'type': 'string'},
                        'field5': {
                            'properties': {'some_other_stuff': {'type': 'string'}},
                            'required': ['some_other_stuff'],
                            'type': 'object'
                        },
                        'field6': {'type': 'string'}
                    },
                    'type': 'object'
                },
                'type': 'array'
            },
            'field': {'type': 'string'},
            'field2': {'type': 'string'},
            'field3': {'type': 'string'}
        },
        'required': [],
        'type': 'object'
    }
    assert(schema.schema == expect)

    schema = find_conflicts([schema1, schema2, schema3, schema5])[0]
    assert(isinstance(schema, InvalidSchema))
    assert(schema.reason == "Mixed type schemas not supported at this time. Ensure that files are of one type: ['csv', 'json']")

    schema = find_conflicts([schema6, schema7])[0]
    assert(isinstance(schema, JsonSchema))
    expect = {
        '$schema': 'http://json-schema.org/schema#',
        'properties': {
            'field': {'type': 'string'},
            'field2': {'type': 'string'},
            'field3': {'type': 'string'},
            'field4': {'type': 'string'},
            'field5': {'type': 'string'},
            'field6': {'type': 'string'},
            'field7': {'type': 'string'},
            'other': {'type': 'string'},
            'other2': {'type': 'string'},
            'other3': {'type': 'string'}
        },
        'required': ['other'],
        'type': 'object'
    }
    assert(schema.schema == expect)

def test_file_ops(tmpdir):
    tmpdir = make_path_posix(str(tmpdir))
    fs = LocalFileSystem(auto_mkdir=True)
    with pytest.raises(FileNotFoundError):
        fs.info(tmpdir + "/nofile")
    fs.touch(tmpdir + "/afile")
    i1 = fs.ukey(tmpdir + "/afile")
    assert tmpdir + "/afile" in fs.ls(tmpdir)

    with fs.open(tmpdir + "/afile", "wb") as f:
        f.write(b"data")
    i2 = fs.ukey(tmpdir + "/afile")
    assert i1 != i2  # because file changed

    fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
    assert tmpdir + "/afile2" in fs.ls(tmpdir)

    fs.move(tmpdir + "/afile", tmpdir + "/afile3")
    assert not fs.exists(tmpdir + "/afile")

    fs.cp(tmpdir + "/afile3", tmpdir + "/deeply/nested/file")
    assert fs.exists(tmpdir + "/deeply/nested/file")

    fs.rm(tmpdir + "/afile3", recursive=True)
    assert not fs.exists(tmpdir + "/afile3")

    files = [tmpdir + "/afile4", tmpdir + "/afile5"]
    [fs.touch(f) for f in files]

    with pytest.raises(TypeError):
        fs.rm_file(files)
    fs.rm(files)
    assert all(not fs.exists(f) for f in files)

    fs.touch(tmpdir + "/afile6")
    fs.rm_file(tmpdir + "/afile6")
    assert not fs.exists(tmpdir + "/afile6")

    # IsADirectoryError raised on Linux, PermissionError on Windows
    with pytest.raises((IsADirectoryError, PermissionError)):
        fs.rm_file(tmpdir)

    fs.rm(tmpdir, recursive=True)
    assert not fs.exists(tmpdir)

class DeltaLake:
    """An instance containing a Delta Lake.

    This class provides an interface for Delta Lake using Python
    filesystem abstractions provided by fsspec.
    """

    def __init__(
        self,
        path: str,
        filesystem: AbstractFileSystem = None,
        time_travel: datetime = None,
    ):
        """Initializes a Delta Lake.

        Args:
            path: the path to the table on the filesystem
            filesystem: python-like filesystem (if unset, assume local)
            time_travel: set the delta lake to a specific version
        """
        if not filesystem:
            # LocalFileSystem takes no path argument; the table path is
            # kept separately in self.path.
            self.filesystem = LocalFileSystem()
        else:
            self.filesystem = filesystem
        self.path = path
        self._set_timestamp(time_travel)
        self.checkpoint_info = self._get_checkpoint_info()
        self.fileset = set()

    def _set_timestamp(self, time_travel: datetime):
        if not time_travel:
            self.timestamp = None
        else:
            self.timestamp = round(time.mktime(time_travel.timetuple()))

    def _get_checkpoint_info(self) -> Dict:
        try:
            with self.filesystem.open(
                os.path.join(self.path, "_delta_log", "_last_checkpoint")
            ) as last_checkpoint:
                return json.load(last_checkpoint)
        except (FileNotFoundError, OSError):
            return None

    def _replay_log(self, file: TextIO) -> Tuple[Set, Set]:
        actions = ndjson.loads(file.read())
        if not self.timestamp:
            cut_time = round(time.time() * 1000)
        else:
            cut_time = self.timestamp * 1000
        adds = set(
            action["add"]["path"]
            for action in actions
            if "add" in action.keys()
            and action["add"]["modificationTime"] < cut_time
        )
        removes = set(
            action["remove"]["path"]
            for action in actions
            if "remove" in action.keys()
            and action["remove"]["deletionTimestamp"] < cut_time
        )
        return adds, removes

    def _delta_files(self, version: int = None):
        # Generator yielding open handles on the JSON commit logs, starting
        # at `version` and stopping at the first missing log file.
        if not version:
            version = 0
        while True:
            try:
                loc = f"{self.path}/_delta_log/{str(version).zfill(20)}.json"
                file = self.filesystem.open(loc)
                version += 1
                yield file
            except (FileNotFoundError, OSError):
                break

    def _replay_delta_and_update_fileset(self, version: int = 0):
        for file in self._delta_files(version):
            adds, removes = self._replay_log(file)
            self.fileset |= adds
            self.fileset -= removes

    def _get_checkpoint_files(self) -> List[str]:
        if "parts" in self.checkpoint_info.keys():
            checkpoint_files = [
                f"{self.path}/_delta_log/"
                + f"{str(self.checkpoint_info['version']).zfill(20)}"
                + f".checkpoint.{str(i).zfill(10)}"
                + f".{str(self.checkpoint_info['parts']).zfill(10)}.parquet"
                for i in range(1, self.checkpoint_info["parts"] + 1)
            ]
        else:
            checkpoint_files = [
                f"{self.path}/_delta_log/"
                + f"{str(self.checkpoint_info['version']).zfill(20)}.checkpoint.parquet"
            ]
        return checkpoint_files

    def _get_checkpoint(self) -> DataFrame:
        checkpoints = []
        for checkpoint_file in self._get_checkpoint_files():
            with self.filesystem.open(checkpoint_file) as file_handler:
                checkpoints.append(pandas.read_parquet(file_handler))
        return pandas.concat(checkpoints)

    def _replay_checkpoint_and_update_fileset(self):
        checkpoint = self._get_checkpoint()
        self.fileset |= set(
            x["path"] for x in checkpoint[checkpoint["add"].notnull()]["add"]
        )

    def files(self) -> Set:
        """Fetches the parquet file list from the delta lake.

        Provides a list of the parquet files on the delta lake on the date
        specified during instantiation.

        Returns:
            A set of the parquet files on the delta lake.
        """
        if (
            self.timestamp or not self.checkpoint_info
        ):  # time travel needs to replay all
            self._replay_delta_and_update_fileset()
        else:
            self._replay_checkpoint_and_update_fileset()
            self._replay_delta_and_update_fileset(self.checkpoint_info["version"] + 1)
        return self.fileset

def test_valid_csv_crlf_lf(self):
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/csv_crlf_sample.csv')) as f:
        schema = from_file(f, {"read_headers": True})
        assert(isinstance(schema, TextSchema))

def test_snappy_parquet_schema_support(self):
    logger.set_level("info")
    fs = LocalFileSystem()
    with fs.open(from_root('/test/sample_data/sample.snappy.parquet')) as f:
        schema = from_file(f)
        assert(isinstance(schema, ParquetSchema))