    def test_regex(self):
        datasets = self.init_datasets()

        check = [
            FileInfo(
                join(
                    self.refdir,
                    'regex_dataset/NSS.HIRX.NJ.D99127.S0632.E0820.B2241718.WI.gz'
                ),  # noqa
                [
                    datetime.datetime(1999, 5, 7, 6, 32),
                    datetime.datetime(1999, 5, 7, 8, 20)
                ], {
                    'satcode': 'NJ',
                    'B': '2241718',
                    'station': 'WI'
                }),
        ]

        found_file = datasets["regex-HIRS"].find_closest("1999-05-08")
        assert found_file == check[0]
        assert found_file.attr == check[0].attr

        found_files = \
            list(datasets["regex-HIRS"].find("1999-05-07", "1999-05-09"))
        assert found_files == check
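    # The start and end times expected in test_regex are encoded directly in
    # the file name: "D99127" is a two-digit year plus day of year
    # (1999, day 127 = 7 May), and "S0632"/"E0820" are the start/end times as
    # HHMM. Below is a minimal sketch of that decoding with plain stdlib
    # calls; the helper name and its use are illustrative only and not part
    # of the Dataset placeholder machinery:
    @staticmethod
    def _decode_hirs_times_example(name):
        """Illustrative only: parse (start, end) from an NSS.HIRX file name."""
        parts = name.split(".")
        day = datetime.datetime.strptime(parts[3][1:], "%y%j")    # "D99127"
        start = datetime.datetime.strptime(parts[4][1:], "%H%M")  # "S0632"
        end = datetime.datetime.strptime(parts[5][1:], "%H%M")    # "E0820"
        return (day.replace(hour=start.hour, minute=start.minute),
                day.replace(hour=end.hour, minute=end.minute))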
    def test_complicated_subdirs(self):
        """Check whether Dataset can find files in subdirectories that
        contain text and placeholders.
        """
        # The Pinocchio dataset from the cloud toolbox: a folder name contains
        # normal text and a placeholder:
        pinocchio = Dataset(
            join(
                self.refdir,
                "pinocchio_dataset/t{year2}{month}{day}/tm{year2}{month}{day}"
                "{hour}{minute}{second}{millisecond}.jpg",
            ),
        )

        # Find all files:
        files = list(pinocchio)

        check = [
            FileInfo(
                join(self.refdir,
                     'pinocchio_dataset/t171102/tm171102132855573.jpg'),
                [
                    datetime.datetime(2017, 11, 2, 13, 28, 55, 573000),
                    datetime.datetime(2017, 11, 2, 13, 28, 55, 573000)
                ], {}),
        ]

        assert files == check
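    # Reading the expected FileInfo above: every placeholder in the path
    # template is a time field, so "tm171102132855573.jpg" resolves to
    # year2=17, month=11, day=02, hour=13, minute=28, second=55,
    # millisecond=573, i.e. 2017-11-02 13:28:55.573. Because the name holds
    # a single timestamp rather than a period, start and end time are equal.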
    def test_single(self):
        """Test find on the single dataset.

        Returns:
            None
        """
        datasets = self.init_datasets()

        # STANDARD DATASET
        # Should not find anything:
        empty = list(datasets["single"].find(
            "2016-12-31", "2018-01-01", no_files_error=False))
        assert not empty

        check = [
            FileInfo(join(self.refdir, 'dataset_of_single_file.nc'), [
                datetime.datetime(2018, 1, 1, 0, 0),
                datetime.datetime(2018, 1, 3, 0, 0)
            ], {}),
        ]

        found_files = list(datasets["single"].find(
            "2018-01-01", "2018-01-02",
        ))
        assert found_files == check

        found_files = list(datasets["single"].find(
            "2018-01-01", "2018-01-02", bundle="12h",
        ))
        assert found_files == check

        found_files = list(datasets["single"].find(
            "2018-01-01", "2018-01-02", bundle=3,
        ))
        assert found_files == check
    def test_files_overlap_subdirectory(self):
        """A file covers a time period longer than its subdirectory suggests.
        """
        datasets = self.init_datasets()
        datasets["tutorial"].set_placeholders(satellite="SatelliteA")
        found_file = datasets["tutorial"].find_closest("2018-01-03")

        check = FileInfo(
            join(
                self.refdir,
                'tutorial_datasets/SatelliteA/2018/01/02/210000-020000.nc.zip'
            ),
            [
                datetime.datetime(2018, 1, 2, 21, 0),
                datetime.datetime(2018, 1, 3, 2, 0)
            ], {
                'satellite': 'SatelliteA',
                'compression': 'zip'
            })

        assert found_file == check
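    # Why the file above is the right answer: although it sits in the
    # subdirectory for 2018-01-02, "210000-020000.nc.zip" covers 21:00 on
    # 2018-01-02 up to 02:00 on 2018-01-03, so the queried time
    # 2018-01-03 00:00 falls inside its period. find_closest therefore has
    # to look beyond the subdirectory that nominally matches the query date.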
    def test_sequence_placeholder(self):
        """Test find on the sequence-placeholder dataset.

        Returns:
            None
        """
        datasets = self.init_datasets()

        # STANDARD DATASET
        # Should not find anything:
        empty = list(datasets["sequence-placeholder"].find(
            "2016-12-31", "2018-01-01", no_files_error=False))
        assert not empty

        # Should find two files:
        found_files = list(datasets["sequence-placeholder"].find(
            "2018-01-01", "2018-01-02",
        ))
        check = [
            FileInfo(
                join(self.refdir,
                     'sequence_dataset/2018/001/sequence0001.txt'),
                [
                    datetime.datetime(2018, 1, 1, 0, 0),
                    datetime.datetime(2018, 1, 1, 12, 0)
                ], {'id': 1}),
            FileInfo(
                join(self.refdir,
                     'sequence_dataset/2018/001/sequence0002.txt'),
                [
                    datetime.datetime(2018, 1, 1, 12, 0),
                    datetime.datetime(2018, 1, 2, 0, 0)
                ], {'id': 2}),
        ]
        assert found_files == check

        # Should find two files and should return them in two bins:
        found_files = list(datasets["sequence-placeholder"].find(
            "2018-01-01", "2018-01-02", bundle="6h",
        ))
        check = [
            [
                FileInfo(
                    join(self.refdir,
                         'sequence_dataset/2018/001/sequence0001.txt'),
                    [
                        datetime.datetime(2018, 1, 1, 0, 0),
                        datetime.datetime(2018, 1, 1, 12, 0)
                    ], {'id': 1}),
            ],
            [
                FileInfo(
                    join(self.refdir,
                         'sequence_dataset/2018/001/sequence0002.txt'),
                    [
                        datetime.datetime(2018, 1, 1, 12, 0),
                        datetime.datetime(2018, 1, 2, 0, 0)
                    ], {'id': 2}),
            ],
        ]
        assert found_files == check
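    # As read from the expected results here and in test_tutorial below: a
    # string bundle value (e.g. "6h" or "12h") appears to bin the found
    # files by time interval, while an integer bundle value (e.g. 3) appears
    # to bin them by number of files per bin.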
    def test_tutorial(self):
        """Test the dataset examples of the tutorial.

        Returns:
            None
        """
        datasets = self.init_datasets()

        # STANDARD DATASET
        # Should not find anything:
        empty = list(datasets["tutorial"].find(
            "2017-12-31", "2018-01-01", no_files_error=False))
        assert not empty

        # Find the closest file to 2018-01-01
        found_file = datasets["tutorial"].find_closest(
            "2018-01-01 03:00",
            # Limit this to SatelliteB
            filters={"!satellite": ("SatelliteA", "SatelliteC")})

        refdir = join(self.refdir, "tutorial_datasets/SatelliteB")
        check = FileInfo(join(refdir, '2018/01/01/000000-060000.nc.gz'), [
            datetime.datetime(2018, 1, 1, 0, 0),
            datetime.datetime(2018, 1, 1, 6, 0)
        ], {})
        assert found_file == check

        # Limit this dataset to SatelliteB permanently
        datasets["tutorial"].set_placeholders(satellite="SatelliteB")

        # Should find four files:
        found_files = list(datasets["tutorial"].find(
            "2018-01-01", "2018-01-02",
        ))
        check = [
            FileInfo(join(refdir, '2018/01/01/000000-060000.nc.gz'), [
                datetime.datetime(2018, 1, 1, 0, 0),
                datetime.datetime(2018, 1, 1, 6, 0)
            ], {}),
            FileInfo(join(refdir, '2018/01/01/060000-120000.nc.gz'), [
                datetime.datetime(2018, 1, 1, 6, 0),
                datetime.datetime(2018, 1, 1, 12, 0)
            ], {}),
            FileInfo(join(refdir, '2018/01/01/120000-180000.nc.gz'), [
                datetime.datetime(2018, 1, 1, 12, 0),
                datetime.datetime(2018, 1, 1, 18, 0)
            ], {}),
            FileInfo(join(refdir, '2018/01/01/180000-000000.nc.gz'), [
                datetime.datetime(2018, 1, 1, 18, 0),
                datetime.datetime(2018, 1, 2, 0, 0)
            ], {}),
        ]
        assert found_files == check

        # Should find four files and should return them in two bins:
        found_files = list(datasets["tutorial"].find(
            "2018-01-01", "2018-01-02", bundle="12h",
        ))
        check = [
            [
                FileInfo(join(refdir, '2018/01/01/000000-060000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 0, 0),
                    datetime.datetime(2018, 1, 1, 6, 0)
                ], {}),
                FileInfo(join(refdir, '2018/01/01/060000-120000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 6, 0),
                    datetime.datetime(2018, 1, 1, 12, 0)
                ], {}),
            ],
            [
                FileInfo(join(refdir, '2018/01/01/120000-180000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 12, 0),
                    datetime.datetime(2018, 1, 1, 18, 0)
                ], {}),
                FileInfo(join(refdir, '2018/01/01/180000-000000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 18, 0),
                    datetime.datetime(2018, 1, 2, 0, 0)
                ], {}),
            ],
        ]
        assert found_files == check

        # Should find four files and should return them in two bins:
        found_files = list(datasets["tutorial"].find(
            "2018-01-01", "2018-01-02", bundle=3,
        ))
        check = [
            [
                FileInfo(join(refdir, '2018/01/01/000000-060000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 0, 0),
                    datetime.datetime(2018, 1, 1, 6, 0)
                ], {}),
                FileInfo(join(refdir, '2018/01/01/060000-120000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 6, 0),
                    datetime.datetime(2018, 1, 1, 12, 0)
                ], {}),
                FileInfo(join(refdir, '2018/01/01/120000-180000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 12, 0),
                    datetime.datetime(2018, 1, 1, 18, 0)
                ], {}),
            ],
            [
                FileInfo(join(refdir, '2018/01/01/180000-000000.nc.gz'), [
                    datetime.datetime(2018, 1, 1, 18, 0),
                    datetime.datetime(2018, 1, 2, 0, 0)
                ], {}),
            ],
        ]
        assert found_files == check

        for test_method in [Dataset.map, Dataset.imap]:
            # Check map method
            results = list(
                test_method(datasets["tutorial"], "2018-01-01", "2018-01-03",
                            func=TestDataset._tutorial_map))
            check = ['gz', 'gz', 'gz', 'gz', 'gz', 'gz', 'gz', 'gz']
            assert results == check

            # Check map method on content
            results = list(
                test_method(
                    datasets["tutorial"], "2018-01-01", "2018-01-03",
                    func=TestDataset._tutorial_map_content,
                    on_content=True,
                ))
            check = [
                0.25007269785924874, 0.25007269785924874,
                0.25007269785924874, 0.25007269785924874,
                0.25007269785924874, 0.25007269785924874,
                0.25007269785924874, 0.25007269785924874
            ]
            assert np.allclose(results, check)
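    # The callbacks passed to Dataset.map/Dataset.imap in test_tutorial
    # (_tutorial_map and _tutorial_map_content) are defined elsewhere in this
    # test class. The two sketches below are purely illustrative stand-ins
    # whose names, signatures and bodies are assumptions inferred from the
    # expected results ('gz' for every file; one scalar per file when
    # on_content=True); they are not the real implementations:
    @staticmethod
    def _example_map(file_info):
        """Illustrative callback on a FileInfo: return the outer extension."""
        return file_info.path.split(".")[-1]

    @staticmethod
    def _example_map_content(data, *args):
        """Illustrative on_content callback: reduce loaded data to a scalar."""
        return float(np.asarray(data).mean())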
    def test_glob(self):
        files = Dataset(
            join(self.refdir, "tutorial_datasets/{satellite}/*/*/*/*.nc.gz"),
        )

        # Sort this by paths rather than by times (because the times are all
        # equal)
        check = list(
            sorted([
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/02/180000-000000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/02/000000-060000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/02/120000-180000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/02/060000-120000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/01/180000-000000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/01/000000-060000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/01/120000-180000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/01/060000-120000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
                FileInfo(
                    join(
                        self.refdir,
                        'tutorial_datasets/SatelliteB/2018/01/03/000000-060000.nc.gz'
                    ),  # noqa
                    [
                        datetime.datetime(1, 1, 1, 0, 0),
                        datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
                    ], {'satellite': 'SatelliteB'}),
            ], key=lambda x: x.path))

        assert list(sorted(files, key=lambda x: x.path)) == check
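    # Note on the expected times in test_glob: the "*" wildcards carry no
    # temporal information, so every matched file gets the widest possible
    # coverage, from datetime.min (year 1) to datetime.max (year 9999),
    # while the {satellite} placeholder is still filled into the attribute
    # dict.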