Ejemplo n.º 1
0
    def test_regex(self):
        """Test find and find_closest on the "regex-HIRS" dataset.

        Both lookups must return the one reference file, and the closest
        match must carry the expected placeholder attributes.
        """
        hirs = self.init_datasets()["regex-HIRS"]

        relative_path = (
            'regex_dataset/NSS.HIRX.NJ.D99127.S0632.E0820.B2241718.WI.gz'
        )
        coverage = [
            datetime.datetime(1999, 5, 7, 6, 32),
            datetime.datetime(1999, 5, 7, 8, 20),
        ]
        attributes = {'satcode': 'NJ', 'B': '2241718', 'station': 'WI'}
        expected = FileInfo(
            join(self.refdir, relative_path), coverage, attributes)

        # The file closest to 1999-05-08 is the one from the day before.
        closest = hirs.find_closest("1999-05-08")
        assert closest == expected
        # The attributes are asserted explicitly in addition to the FileInfo
        # comparison.
        assert closest.attr == expected.attr

        # The same file must be the only hit within the surrounding period.
        in_period = list(hirs.find("1999-05-07", "1999-05-09"))
        assert in_period == [expected]
Ejemplo n.º 2
0
    def test_complicated_subdirs(self):
        """Find files below a sub directory named with text and placeholders.
        """
        # Path pattern of the Pinocchio dataset from the cloud toolbox: the
        # folder name mixes normal text with placeholders.
        pattern = join(
            self.refdir,
            "pinocchio_dataset/t{year2}{month}{day}/tm{year2}{month}{day}"
            "{hour}{minute}{second}{millisecond}.jpg",
        )
        pinocchio = Dataset(pattern)

        # Iterating over the dataset collects all of its files:
        found = list(pinocchio)

        # The reference file starts and ends at the very same instant.
        instant = datetime.datetime(2017, 11, 2, 13, 28, 55, 573000)
        expected = [
            FileInfo(
                join(self.refdir,
                     'pinocchio_dataset/t171102/tm171102132855573.jpg'),
                [instant, instant],
                {},
            ),
        ]
        assert found == expected
Ejemplo n.º 3
0
    def test_single(self):
        """Test find on the dataset that is made of one single file.

        Returns:
            None
        """
        single = self.init_datasets()["single"]

        # STANDARD DATASET
        # Nothing should be found in this period:
        nothing = list(
            single.find("2016-12-31", "2018-01-01", no_files_error=False))
        assert not nothing

        expected = [
            FileInfo(
                join(self.refdir, 'dataset_of_single_file.nc'),
                [
                    datetime.datetime(2018, 1, 1, 0, 0),
                    datetime.datetime(2018, 1, 3, 0, 0),
                ],
                {},
            ),
        ]

        # The same result is expected without bundling, when bundling by
        # time ("12h") and when bundling by number (3).
        for options in ({}, {"bundle": "12h"}, {"bundle": 3}):
            found = list(single.find("2018-01-01", "2018-01-02", **options))
            assert found == expected
Ejemplo n.º 4
0
    def test_files_overlap_subdirectory(self):
        """Find a file whose time coverage extends beyond its sub directory.
        """
        tutorial = self.init_datasets()["tutorial"]
        tutorial.set_placeholders(satellite="SatelliteA")

        closest = tutorial.find_closest("2018-01-03")

        # The expected file is stored below 2018/01/02 although its coverage
        # only ends on 2018-01-03 at 02:00.
        path = join(
            self.refdir,
            'tutorial_datasets/SatelliteA/2018/01/02/210000-020000.nc.zip',
        )
        coverage = [
            datetime.datetime(2018, 1, 2, 21, 0),
            datetime.datetime(2018, 1, 3, 2, 0),
        ]
        attributes = {'satellite': 'SatelliteA', 'compression': 'zip'}
        expected = FileInfo(path, coverage, attributes)

        assert closest == expected
Ejemplo n.º 5
0
    def test_sequence_placeholder(self):
        """Test find on the "sequence-placeholder" dataset.

        The two reference files are expected to carry their sequence number
        in the ``id`` attribute.

        Returns:
            None
        """
        sequence = self.init_datasets()["sequence-placeholder"]

        # STANDARD DATASET
        # Nothing should be found in this period:
        nothing = list(
            sequence.find("2016-12-31", "2018-01-01", no_files_error=False))
        assert not nothing

        # The two files of 2018-01-01, each covering half of the day:
        first_half = FileInfo(
            join(self.refdir, 'sequence_dataset/2018/001/sequence0001.txt'),
            [
                datetime.datetime(2018, 1, 1, 0, 0),
                datetime.datetime(2018, 1, 1, 12, 0),
            ],
            {'id': 1},
        )
        second_half = FileInfo(
            join(self.refdir, 'sequence_dataset/2018/001/sequence0002.txt'),
            [
                datetime.datetime(2018, 1, 1, 12, 0),
                datetime.datetime(2018, 1, 2, 0, 0),
            ],
            {'id': 2},
        )

        # Without bundling, both files come back as one flat list:
        flat = list(sequence.find("2018-01-01", "2018-01-02"))
        assert flat == [first_half, second_half]

        # With a bundle of "6h", each file should end up in its own bin:
        binned = list(sequence.find("2018-01-01", "2018-01-02", bundle="6h"))
        assert binned == [[first_half], [second_half]]
Ejemplo n.º 6
0
    def test_tutorial(self):
        """Test the dataset examples of the tutorial.

        Exercises find (without bundling and with both bundle variants),
        find_closest with a filter, and the map and imap methods.

        Returns:
            None
        """
        tutorial = self.init_datasets()["tutorial"]

        # STANDARD DATASET
        # Nothing should be found in this period:
        nothing = list(
            tutorial.find("2017-12-31", "2018-01-01", no_files_error=False))
        assert not nothing

        # All expected files belong to SatelliteB; each of them covers six
        # hours of 2018-01-01.
        refdir = join(self.refdir, "tutorial_datasets/SatelliteB")
        hours_00_06 = FileInfo(
            join(refdir, '2018/01/01/000000-060000.nc.gz'),
            [
                datetime.datetime(2018, 1, 1, 0, 0),
                datetime.datetime(2018, 1, 1, 6, 0),
            ],
            {},
        )
        hours_06_12 = FileInfo(
            join(refdir, '2018/01/01/060000-120000.nc.gz'),
            [
                datetime.datetime(2018, 1, 1, 6, 0),
                datetime.datetime(2018, 1, 1, 12, 0),
            ],
            {},
        )
        hours_12_18 = FileInfo(
            join(refdir, '2018/01/01/120000-180000.nc.gz'),
            [
                datetime.datetime(2018, 1, 1, 12, 0),
                datetime.datetime(2018, 1, 1, 18, 0),
            ],
            {},
        )
        hours_18_24 = FileInfo(
            join(refdir, '2018/01/01/180000-000000.nc.gz'),
            [
                datetime.datetime(2018, 1, 1, 18, 0),
                datetime.datetime(2018, 1, 2, 0, 0),
            ],
            {},
        )

        # Find the file closest to 2018-01-01 03:00. The "!satellite" filter
        # presumably excludes SatelliteA and SatelliteC, which limits the
        # search to SatelliteB.
        closest = tutorial.find_closest(
            "2018-01-01 03:00",
            filters={"!satellite": ("SatelliteA", "SatelliteC")},
        )
        assert closest == hours_00_06

        # Limit this dataset to SatelliteB permanently
        tutorial.set_placeholders(satellite="SatelliteB")

        # Should find four files:
        flat = list(tutorial.find("2018-01-01", "2018-01-02"))
        assert flat == [hours_00_06, hours_06_12, hours_12_18, hours_18_24]

        # Should find four files, returned in two bins of twelve hours each:
        by_time = list(
            tutorial.find("2018-01-01", "2018-01-02", bundle="12h"))
        assert by_time == [
            [hours_00_06, hours_06_12],
            [hours_12_18, hours_18_24],
        ]

        # Should find four files, returned in bins of at most three files:
        by_count = list(tutorial.find("2018-01-01", "2018-01-02", bundle=3))
        assert by_count == [
            [hours_00_06, hours_06_12, hours_12_18],
            [hours_18_24],
        ]

        # map and imap are expected to deliver the same results.
        for test_method in [Dataset.map, Dataset.imap]:
            # Apply the function without on_content; eight results are
            # expected for the period of two days.
            results = list(
                test_method(
                    tutorial,
                    "2018-01-01",
                    "2018-01-03",
                    func=TestDataset._tutorial_map,
                ))
            assert results == ['gz'] * 8

            # Apply the function with on_content=True:
            results = list(
                test_method(
                    tutorial,
                    "2018-01-01",
                    "2018-01-03",
                    func=TestDataset._tutorial_map_content,
                    on_content=True,
                ))
            assert np.allclose(results, [0.25007269785924874] * 8)
Ejemplo n.º 7
0
    def test_glob(self):
        """Test a dataset whose path mixes a placeholder with glob wildcards.

        Nine SatelliteB files are expected, each covering the whole
        representable datetime range.
        """
        files = Dataset(
            join(self.refdir, "tutorial_datasets/{satellite}/*/*/*/*.nc.gz"))

        relative_paths = [
            'tutorial_datasets/SatelliteB/2018/01/02/180000-000000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/02/000000-060000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/02/120000-180000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/02/060000-120000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/01/180000-000000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/01/000000-060000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/01/120000-180000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/01/060000-120000.nc.gz',
            'tutorial_datasets/SatelliteB/2018/01/03/000000-060000.nc.gz',
        ]

        # Every expected file has the same time coverage: the earliest and
        # the latest datetime that can be represented.
        earliest = datetime.datetime(1, 1, 1, 0, 0)
        latest = datetime.datetime(9999, 12, 31, 23, 59, 59, 999999)
        expected = [
            FileInfo(
                join(self.refdir, relative_path),
                [earliest, latest],
                {'satellite': 'SatelliteB'},
            )
            for relative_path in relative_paths
        ]

        # Sort by paths rather than by times (because the times are all
        # equal)
        expected.sort(key=lambda info: info.path)
        found = sorted(files, key=lambda info: info.path)

        assert found == expected