Esempio n. 1
0
def dataset():
    """Build a small RandomDataset over four days with two asset-less tags."""
    tags = [SensorTag("Tag 1", None), SensorTag("Tag 2", None)]
    return RandomDataset(
        train_start_date="2017-12-25 06:00:00Z",
        train_end_date="2017-12-29 06:00:00Z",
        tag_list=tags,
    )
Esempio n. 2
0
def test_aggregation_methods():
    """Tests that it works to set aggregation method(s)"""

    tag_names = ("Tag 1", "Tag 2", "Tag 3")
    kwargs = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(name, None) for name in tag_names],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )

    # Default aggregation gives no extra columns
    X, _ = TimeSeriesDataset(**kwargs).get_data()
    assert (83, 3) == X.shape

    # The default single aggregation method gives the tag-names as columns
    assert list(X.columns) == list(tag_names)

    # Using two aggregation methods give a multi-level column with tag-names
    # on top and aggregation_method as second level
    X, _ = TimeSeriesDataset(aggregation_methods=["mean", "max"],
                             **kwargs).get_data()

    assert (83, 6) == X.shape
    assert list(X.columns) == [
        (tag, method) for tag in tag_names for method in ("mean", "max")
    ]
Esempio n. 3
0
def test_lookup_default(legacy_ncs_lookup: NcsLookup, mock_assets_config,
                        threads_count):
    """Lookup over two year partitions resolves only the tags that have files."""
    specs = [("Ásgarðr", "asset"), ("tag1", "asset"), ("tag2", "asset"),
             ("tag4", "asset"), ("tag5", "asset1")]
    tags = [SensorTag(name, asset) for name, asset in specs]
    partitions = [YearPartition(2019), YearPartition(2020)]
    result = list(
        legacy_ncs_lookup.lookup(
            mock_assets_config,
            tags,
            partitions,
            threads_count=threads_count,
        ))
    expected = {
        ("Ásgarðr", YearPartition(2019)): (
            "path/%C3%81sgar%C3%B0r/%C3%81sgar%C3%B0r_2019.csv",
            CsvFileType,
        ),
        ("tag2", YearPartition(2020)): (
            "path/tag2/parquet/tag2_2020.parquet",
            ParquetFileType,
        ),
        ("tag5", YearPartition(2020)): (
            "path1/tag5/parquet/tag5_2020.parquet",
            ParquetFileType,
        ),
    }
    assert reduce_tag_locations(result) == expected
Esempio n. 4
0
def test_tag_locations(parquet_file_type):
    """TagLocations sorts its partitions, supports lookup, and iterates tuples."""
    tag = SensorTag("tag1", "asset")
    location_2018 = Location("path/2018.parquet", parquet_file_type)
    location_2020 = Location("path/2020.parquet", parquet_file_type)
    tag_locations = TagLocations(tag, {
        YearPartition(2020): location_2020,
        YearPartition(2018): location_2018,
    })
    assert tag_locations.available()
    # Partitions come back in ascending order regardless of insertion order
    assert tag_locations.partitions() == [
        YearPartition(2018), YearPartition(2020)
    ]
    assert tag_locations.get_location(2020) is location_2020
    assert tag_locations.get_location(2019) is None
    # Iteration yields (tag, partition, location) triples, ordered by partition
    assert list(tag_locations) == [
        (
            SensorTag(name="tag1", asset="asset"),
            YearPartition(2018),
            Location(path="path/2018.parquet", file_type=parquet_file_type),
        ),
        (
            SensorTag(name="tag1", asset="asset"),
            YearPartition(2020),
            Location(path="path/2020.parquet", file_type=parquet_file_type),
        ),
    ]
Esempio n. 5
0
def test_empty_target_tag_list():
    """target_tags is resolved from tag_loading_metadata when the dataset
    tag_list mixes SensorTag objects and bare strings."""
    loaded_tags = {
        "test": {"name": "test", "asset": "asset"},
        "test1": {"name": "test1", "asset": "asset1"},
    }
    app = Flask(__name__)
    with app.app_context():
        g.metadata = {
            "dataset": {
                "tag_list": [SensorTag("test", "asset"), "test1"]
            },
            "metadata": {
                "build_metadata": {
                    "dataset": {
                        "dataset_meta": {
                            "tag_loading_metadata": {"tags": loaded_tags}
                        }
                    }
                }
            },
        }
        view = BaseModelView()
        assert view.target_tags == [
            SensorTag("test", "asset"),
            SensorTag("test1", "asset1"),
        ]
Esempio n. 6
0
def get_random_data():
    """Return a RandomDataset config dict with two input and two target tags."""
    tags = [SensorTag("Tag 1", None), SensorTag("Tag 2", None)]
    return {
        "type": "RandomDataset",
        "train_start_date": dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        "train_end_date": dateutil.parser.isoparse("2017-12-30 06:00:00Z"),
        "tag_list": tags,
        "target_tag_list": list(tags),
    }
Esempio n. 7
0
def test_legacy_to_dict():
    """A RandomDataset built with legacy positional args serializes its type."""
    tag_list = [SensorTag("Tag 1", None), SensorTag("Tag 2", None)]
    dataset = RandomDataset(
        "2017-12-25 06:00:00Z",
        "2017-12-29 06:00:00Z",
        tag_list,
    )
    assert dataset.to_dict()["type"] == "RandomDataset"
Esempio n. 8
0
def test_load_series_need_asset_hint(dates, ncs_reader):
    """load_series raises ValueError without an asset hint, works with one."""
    start, end = dates[0], dates[1]
    with pytest.raises(ValueError):
        for _ in ncs_reader.load_series(start, end,
                                        [SensorTag("XYZ-123", None)]):
            pass

    tags_with_asset = [SensorTag("XYZ-123", "gordoplatform")]
    for frame in ncs_reader.load_series(start, end, tags_with_asset):
        assert len(frame) == 20
Esempio n. 9
0
def test_get_dataset_with_full_import():
    """A fully qualified dotted "type" path still resolves to RandomDataset."""
    config = {
        "type": "gordo_dataset.datasets.RandomDataset",
        "train_start_date": "2017-12-25 06:00:00Z",
        "train_end_date": "2017-12-29 06:00:00Z",
        "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    }
    dataset = _get_dataset(config)
    assert type(dataset) is RandomDataset
Esempio n. 10
0
 def test_load_from_multiple_providers(self):
     """Two tags, each belonging to different data producers, and both gets loaded"""
     providers = [self.ab_producer, self.containing_b_producer]
     tags = [SensorTag("abba", None), SensorTag("cba", None)]
     series_collection = list(
         load_series_from_multiple_providers(providers, None, None, tags))
     self.assertEqual(series_collection[0].name, "ab.*")
     self.assertEqual(series_collection[1].name, ".*b.*")
Esempio n. 11
0
 def test_load_multiple_raises_with_no_matches(self):
     """If no provider matches a tag then load_series_from_multiple_providers
     raises a ValueError when the generator is realized"""
     providers = [self.ab_producer, self.containing_b_producer]
     tags = [
         SensorTag("ab", None),
         SensorTag("tag_not_matching_any_of_the_regexps", None),
     ]
     with self.assertRaises(ValueError):
         list(load_series_from_multiple_providers(providers, None, None, tags))
Esempio n. 12
0
def test_trigger_tags():
    """Tags referenced only by row_filter are fetched from the provider but
    kept out of both X and y."""
    data_provider = MockDataProvider()
    dataset = TimeSeriesDataset(
        data_provider=data_provider,
        tag_list=[SensorTag("Tag 1", "asset"), SensorTag("Tag 2", "asset")],
        target_tag_list=[SensorTag("Tag 5", "asset")],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        row_filter="`Tag 3` > 0 & `Tag 4` > 1",
        asset="asset",
    )
    X, y = dataset.get_data()
    assert X is not None
    assert y is not None
    # Inputs, target, and the two filter-trigger tags were all requested
    assert set(data_provider.last_tag_list) == {
        SensorTag("Tag 1", "asset"),
        SensorTag("Tag 2", "asset"),
        SensorTag("Tag 3", "asset"),
        SensorTag("Tag 4", "asset"),
        SensorTag("Tag 5", "asset"),
    }
    # ...but only declared input/target tags appear in the returned frames
    assert set(X.columns.values) == {"Tag 1", "Tag 2"}
    assert set(y.columns.values) == {"Tag 5"}
Esempio n. 13
0
def test_lookup_exceptions(legacy_ncs_lookup: NcsLookup, mock_assets_config,
                           threads_count):
    """Realizing lookup() for this tag set raises ConfigException."""
    tags = [SensorTag("Ásgarðr", "asset"), SensorTag("tag1", "asset")]
    partitions = [YearPartition(2019), YearPartition(2020)]
    with pytest.raises(ConfigException):
        list(
            legacy_ncs_lookup.lookup(
                mock_assets_config,
                tags,
                partitions,
                threads_count=threads_count,
            ))
Esempio n. 14
0
def test_time_series_no_resolution():
    """resolution=None keeps more rows than resampling to 10-minute buckets."""
    kwargs = dict(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(name, None)
                  for name in ("Tag 1", "Tag 2", "Tag 3")],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )

    no_resolution, _ = TimeSeriesDataset(resolution=None, **kwargs).get_data()
    wi_resolution, _ = TimeSeriesDataset(resolution="10T", **kwargs).get_data()
    assert len(no_resolution) > len(wi_resolution)
Esempio n. 15
0
def test_from_dict_with_empty_type():
    """from_dict without a "type" key defaults to TimeSeriesDataset and
    keeps the supplied dates and tags."""
    config = {
        "train_start_date": datetime(2020, 1, 1, tzinfo=tzutc()),
        "train_end_date": datetime(2020, 3, 1, tzinfo=tzutc()),
        "tag_list": [SensorTag("tag1", "asset"), SensorTag("tag2", "asset")],
    }
    dataset = GordoBaseDataset.from_dict(config)
    assert type(dataset) is TimeSeriesDataset
    assert dataset.train_start_date == config["train_start_date"]
    assert dataset.train_end_date == config["train_end_date"]
    assert dataset.tag_list == config["tag_list"]
Esempio n. 16
0
def test_to_dict_build_in():
    """to_dict() serializes dates to ISO strings and records the short type."""
    tag_list = [SensorTag("tag1", "asset"), SensorTag("tag2", "asset")]
    dataset = TimeSeriesDataset(
        train_start_date=datetime(2020, 1, 1, tzinfo=tzutc()),
        train_end_date=datetime(2020, 3, 1, tzinfo=tzutc()),
        tag_list=tag_list,
    )
    config = dataset.to_dict()
    assert config["type"] == "TimeSeriesDataset"
    assert config["train_start_date"] == "2020-01-01T00:00:00+00:00"
    assert config["train_end_date"] == "2020-03-01T00:00:00+00:00"
    assert config["tag_list"] == tag_list
Esempio n. 17
0
def test_tag_dirs_lookup(legacy_ncs_lookup: NcsLookup):
    """tag_dirs_lookup yields a directory for located tags, None otherwise."""
    tags = [SensorTag(name, "asset")
            for name in ("Ásgarðr", "tag1", "tag2", "tag4")]
    result = {
        tag.name: path
        for tag, path in legacy_ncs_lookup.tag_dirs_lookup("path", tags)
    }
    assert result == {
        "Ásgarðr": "path/%C3%81sgar%C3%B0r",
        "tag2": "path/tag2",
        "tag1": None,
        "tag4": None,
    }
Esempio n. 18
0
def test_empty_target_tag_list():
    """Without target tags in metadata, target_tags falls back to tag_list."""
    test_tag = SensorTag("test", "asset")
    app = Flask(__name__)
    with app.app_context():
        g.metadata = {"dataset": {"tag_list": [test_tag]}}
        assert BaseModelView().target_tags == [test_tag]
Esempio n. 19
0
def test_monthly_partition_lookup(default_ncs_lookup: NcsLookup,
                                  mock_assets_config):
    """Monthly lookup returns locations only for months that have files."""
    tags = [SensorTag("tag11", "asset")]
    partitions = [MonthPartition(2020, month) for month in (2, 3, 4)]
    locations_list = list(
        default_ncs_lookup.lookup(mock_assets_config, tags, partitions))
    assert len(locations_list) == 1

    locations = locations_list[0]
    # March is absent from the results; only February and April remain
    assert locations.partitions() == [
        MonthPartition(2020, 2),
        MonthPartition(2020, 4)
    ]

    expected_paths = {
        2: "path/tag11/parquet/2020/tag11_202002.parquet",
        4: "path/tag11/parquet/2020/tag11_202004.parquet",
    }
    for month, path in expected_paths.items():
        partition = MonthPartition(2020, month)
        location = locations.get_location(partition)
        assert location is not None
        assert location.path == path
        assert isinstance(location.file_type, ParquetFileType)
        assert location.partition == partition
Esempio n. 20
0
def test_timeseries_dataset_compat():
    """
    There are accepted keywords in the config file when using type: TimeSeriesDataset
    which don't actually match the kwargs of the dataset's __init__; for compatibility
    :func:`gordo_dataset.datasets.compat` should adjust for these differences.
    """
    tags = [SensorTag("Tag 1", None)]
    dataset = TimeSeriesDataset(
        data_provider=MockDataProvider(),
        train_start_date="2017-12-25 06:00:00Z",
        train_end_date="2017-12-29 06:00:00Z",
        tags=tags,  # legacy alias that compat maps onto tag_list
    )
    expected_start = dateutil.parser.isoparse("2017-12-25 06:00:00Z")
    expected_end = dateutil.parser.isoparse("2017-12-29 06:00:00Z")
    assert dataset.train_start_date == expected_start
    assert dataset.train_end_date == expected_end
    assert dataset.tag_list == tags
def test_can_handle_tag_no_asset():
    """A tag without an asset is rejected by IrocReader.can_handle_tag."""
    reader = IrocReader(
        storage=None,
        assets_config=None,
        threads=1,
        storage_name="dataplatformdlsprod",
    )
    tag = SensorTag("UON_EF.xxx", None)
    assert not reader.can_handle_tag(tag)
def test_can_handle_tag_ok(mock_file_system):
    """A tag carrying a configured asset is accepted by can_handle_tag."""
    reader = IrocReader(
        storage=mock_file_system,
        assets_config=load_assets_config(),
        threads=1,
        storage_name="dataplatformdlsprod",
    )
    tag = SensorTag("UON_EF.xxx", "UON_EF")
    assert reader.can_handle_tag(tag)
Esempio n. 23
0
def test_assets_config_wrong_reader(legacy_ncs_lookup: NcsLookup,
                                    mock_assets_config):
    """A tag whose asset is configured for another reader raises ValueError."""
    tags = [SensorTag("tag4", "asset5")]
    with pytest.raises(ValueError):
        list(
            legacy_ncs_lookup.assets_config_tags_lookup(
                mock_assets_config, tags))
Esempio n. 24
0
def _machine(name: str) -> Machine:
    """
    Helper to build a basic Machine, only defining its name
    """
    from gordo_dataset.sensor_tag import SensorTag

    dataset_config = {
        "tag_list": [SensorTag("tag-1", "foo"), SensorTag("tag-2", "foo")],
        "train_start_date": "2016-01-01T00:00:00Z",
        "train_end_date": "2016-01-05T00:00:00Z",
    }
    config = {
        "name": name,
        "dataset": dataset_config,
        "model": {"sklearn.linear_model.LinearRegression": {}},
    }
    return Machine.from_config(config=config, project_name="test-project")
Esempio n. 25
0
def test_assets_config_tags_lookup_exceptions(legacy_ncs_lookup: NcsLookup,
                                              mock_assets_config):
    """Both problematic assets for "tag10" make the lookup raise ValueError."""
    # NOTE(review): "" and "asset10" presumably are an empty and an
    # unconfigured asset respectively — verify against the fixtures.
    for bad_asset in ("", "asset10"):
        tags = [
            SensorTag("Ásgarðr", "asset"),
            SensorTag("tag10", bad_asset),
        ]
        with pytest.raises(ValueError):
            list(
                legacy_ncs_lookup.assets_config_tags_lookup(
                    mock_assets_config, tags))
Esempio n. 26
0
def test_assets_config_tags_lookup(legacy_ncs_lookup: NcsLookup,
                                   mock_assets_config):
    """Resolved tags are yielded with their dirs; unresolved ones with None."""
    specs = [("Ásgarðr", "asset"), ("tag1", "asset"), ("tag2", "asset"),
             ("tag4", "asset"), ("tag5", "asset1")]
    tags = [SensorTag(name, asset) for name, asset in specs]
    result = list(
        legacy_ncs_lookup.assets_config_tags_lookup(mock_assets_config, tags))
    assert result == [
        (SensorTag(name="Ásgarðr", asset="asset"), "path/%C3%81sgar%C3%B0r"),
        (SensorTag(name="tag2", asset="asset"), "path/tag2"),
        (SensorTag(name="tag1", asset="asset"), None),
        (SensorTag(name="tag4", asset="asset"), None),
        (SensorTag(name="tag5", asset="asset1"), "path1/tag5"),
    ]
Esempio n. 27
0
def test_process_metadata():
    """process_metadata=False leaves the dataset's _metadata dict empty
    after get_data()."""
    dataset = TimeSeriesDataset(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
        target_tag_list=[SensorTag("Tag 5", None)],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        row_filter="`Tag 3` > 0 & `Tag 4` > 1",
        process_metadata=False,
        asset="asset",
    )
    dataset.get_data()
    assert dataset._metadata == {}
Esempio n. 28
0
 def test_load_multiple_matches_loads_from_first(self):
     """When a tag can be read from multiple providers it is the first provider in
     the list of providers which gets the job"""
     providers = [self.ab_producer, self.containing_b_producer]
     series_collection = list(
         load_series_from_multiple_providers(
             providers, None, None, [SensorTag("abba", None)]))
     self.assertEqual(series_collection[0].name, "ab.*")
Esempio n. 29
0
def test_can_handle_tag_non_supported_asset_with_base_path(
        ncs_reader, assets_config):
    """An unknown asset is rejected unless the reader has a dl_base_path."""
    tag = SensorTag("WEIRD-123", "UNKNOWN-ASSET")
    assert not ncs_reader.can_handle_tag(tag)

    reader_with_base_path = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        dl_base_path="/this/is/a/base/path",
    )
    assert reader_with_base_path.can_handle_tag(tag)
Esempio n. 30
0
def test_insufficient_data_after_automatic_filtering():
    """
    Test that dataframe after row_filter scenarios raise appropriate
    InsufficientDataError
    """

    dataset = TimeSeriesDataset(
        data_provider=MockDataProvider(),
        tag_list=[SensorTag(name, None)
                  for name in ("Tag 1", "Tag 2", "Tag 3")],
        train_start_date=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        train_end_date=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        n_samples_threshold=84,
        filter_periods={"filter_method": "median"},
    )

    with pytest.raises(InsufficientDataError):
        dataset.get_data()