def test_serverless_ingest():
    init_store()
    key = "patient_id"

    measurements = fs.FeatureSet("measurements",
                                 entities=[Entity(key)],
                                 timestamp_key="timestamp")
    target_path = os.path.relpath(results_dir + "mycsv.csv")
    source = CSVSource("mycsv",
                       path=os.path.relpath(local_dir + "testdata.csv"))
    targets = [CSVTarget("mycsv", path=target_path)]
    if os.path.exists(target_path):
        os.remove(target_path)

    run_ingestion_job(
        measurements,
        source,
        targets,
        name="test_ingest",
        infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
        parameters={},
        function=None,
        local=True,
    )
    assert os.path.exists(target_path), "result file was not generated"
    features = sorted(measurements.spec.features.keys())
    stats = sorted(measurements.status.stats.keys())
    print(features)
    print(stats)
    stats.remove("timestamp")
    assert features == stats, "didnt infer stats for all features"

    print(measurements.to_yaml())
Exemple #2
0
    def test_serverless_ingest(self):
        key = "patient_id"
        measurements = fs.FeatureSet("measurements",
                                     entities=[Entity(key)],
                                     timestamp_key="timestamp")
        target_path = os.path.relpath(str(self.results_path / "mycsv.csv"))
        source = CSVSource("mycsv",
                           path=os.path.relpath(
                               str(self.assets_path / "testdata.csv")))
        targets = [CSVTarget("mycsv", path=target_path)]
        if os.path.exists(target_path):
            os.remove(target_path)

        fs.ingest(
            measurements,
            source,
            targets,
            infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
            run_config=fs.RunConfig(local=True),
        )
        assert os.path.exists(target_path), "result file was not generated"
        features = sorted(measurements.spec.features.keys())
        stats = sorted(measurements.status.stats.keys())
        print(features)
        print(stats)
        stats.remove("timestamp")
        assert features == stats, "didnt infer stats for all features"
Exemple #3
0
    def test_read_csv(self):
        from storey import CSVSource, ReduceToDataFrame, build_flow

        csv_path = str(self.results_path / _generate_random_name() / ".csv")
        targets = [CSVTarget("mycsv", path=csv_path)]
        stocks_set = fs.FeatureSet(
            "tests", entities=[Entity("ticker", ValueType.STRING)]
        )
        fs.ingest(
            stocks_set, stocks, infer_options=fs.InferOptions.default(), targets=targets
        )

        # reading csv file
        controller = build_flow([CSVSource(csv_path), ReduceToDataFrame()]).run()
        termination_result = controller.await_termination()

        expected = pd.DataFrame(
            {
                0: ["ticker", "MSFT", "GOOG", "AAPL"],
                1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
                2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
            }
        )

        assert termination_result.equals(
            expected
        ), f"{termination_result}\n!=\n{expected}"
        os.remove(csv_path)
Exemple #4
0
    def test_csv_time_columns(self):
        df = pd.DataFrame(
            {
                "key": ["key1", "key2"],
                "time_stamp": [
                    datetime(2020, 11, 1, 17, 33, 15),
                    datetime(2020, 10, 1, 17, 33, 15),
                ],
                "another_time_column": [
                    datetime(2020, 9, 1, 17, 33, 15),
                    datetime(2020, 8, 1, 17, 33, 15),
                ],
            }
        )

        csv_path = "/tmp/multiple_time_columns.csv"
        df.to_csv(path_or_buf=csv_path, index=False)
        source = CSVSource(
            path=csv_path, time_field="time_stamp", parse_dates=["another_time_column"]
        )

        measurements = fs.FeatureSet(
            "fs", entities=[Entity("key")], timestamp_key="time_stamp"
        )
        try:
            resp = fs.ingest(measurements, source)
            df.set_index("key", inplace=True)
            assert_frame_equal(df, resp)
        finally:
            os.remove(csv_path)
Exemple #5
0
    def test_purge(self):
        key = "patient_id"
        fset = fs.FeatureSet("purge",
                             entities=[Entity(key)],
                             timestamp_key="timestamp")
        path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        source = CSVSource(
            "mycsv",
            path=path,
            time_field="timestamp",
        )
        targets = [
            CSVTarget(),
            CSVTarget(name="specified-path",
                      path="v3io:///bigdata/csv-purge-test.csv"),
            ParquetTarget(partitioned=True, partition_cols=["timestamp"]),
            NoSqlTarget(),
        ]
        fset.set_targets(
            targets=targets,
            with_defaults=False,
        )
        fs.ingest(fset, source)

        verify_purge(fset, targets)

        fs.ingest(fset, source)

        targets_to_purge = targets[:-1]
        verify_purge(fset, targets_to_purge)
Exemple #6
0
 def test_ingest_with_timestamp(self):
     key = "patient_id"
     measurements = fs.FeatureSet("measurements",
                                  entities=[Entity(key)],
                                  timestamp_key="timestamp")
     source = CSVSource(
         "mycsv",
         path=os.path.relpath(str(self.assets_path / "testdata.csv")),
         time_field="timestamp",
     )
     resp = fs.ingest(measurements, source)
     assert resp["timestamp"].head(
         n=1)[0] == datetime.fromisoformat("2020-12-01 17:24:15.906352")
Exemple #7
0
    def test_non_partitioned_target_in_dir(self):
        source = CSVSource(
            "mycsv", path=os.path.relpath(str(self.assets_path / "testdata.csv"))
        )
        path = str(self.results_path / _generate_random_name())
        target = ParquetTarget(path=path)

        fset = fs.FeatureSet(
            name="test", entities=[Entity("patient_id")], timestamp_key="timestamp"
        )
        fs.ingest(fset, source, targets=[target])

        list_files = os.listdir(path)
        assert len(list_files) == 1 and not os.path.isdir(path + "/" + list_files[0])
        os.remove(path + "/" + list_files[0])
Exemple #8
0
    def test_ingest_partitioned_by_key_and_time(
        self, key_bucketing_number, partition_cols, time_partitioning_granularity
    ):
        key = "patient_id"
        name = f"measurements_{uuid.uuid4()}"
        measurements = fs.FeatureSet(name, entities=[Entity(key)])
        source = CSVSource(
            "mycsv",
            path=os.path.relpath(str(self.assets_path / "testdata.csv")),
            time_field="timestamp",
        )
        measurements.set_targets(
            targets=[
                ParquetTarget(
                    partitioned=True,
                    key_bucketing_number=key_bucketing_number,
                    partition_cols=partition_cols,
                    time_partitioning_granularity=time_partitioning_granularity,
                )
            ],
            with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp = fs.get_offline_features(vector)
        resp2 = resp.to_dataframe()

        assert resp1.to_dict() == resp2.to_dict()

        file_system = fsspec.filesystem("v3io")
        kind = TargetTypes.parquet
        path = f"{get_default_prefix_for_target(kind)}/sets/{name}-latest"
        path = path.format(name=name, kind=kind, project="system-test-project")
        dataset = pq.ParquetDataset(path, filesystem=file_system,)
        partitions = [key for key, _ in dataset.pieces[0].partition_keys]

        if key_bucketing_number is None:
            expected_partitions = []
        elif key_bucketing_number == 0:
            expected_partitions = ["igzpart_key"]
        else:
            expected_partitions = [f"igzpart_hash{key_bucketing_number}_key"]
        expected_partitions += partition_cols or []
        if all(
            value is None
            for value in [
                key_bucketing_number,
                partition_cols,
                time_partitioning_granularity,
            ]
        ):
            time_partitioning_granularity = "hour"
        if time_partitioning_granularity:
            for unit in ["year", "month", "day", "hour"]:
                expected_partitions.append(f"igzpart_{unit}")
                if unit == time_partitioning_granularity:
                    break

        assert partitions == expected_partitions

        resp = fs.get_offline_features(
            vector,
            start_time=datetime(2020, 12, 1, 17, 33, 15),
            end_time=datetime(2020, 12, 1, 17, 33, 16),
            entity_timestamp_column="timestamp",
        )
        resp2 = resp.to_dataframe()
        assert len(resp2) == 10