Python DataTransferServiceの例、cognite.data_transfer_service.DataTransferService Pythonの例

コード例 #1

0

ファイルを表示

ファイル: model.py プロジェクト: ojjiojji/cognite-sdk-python

    def train(file_io, data_spec, api_key, project, **kwargs):
        """
        Fit a least-squares model and store its coefficients as CSV.

        file_io:
            The train method must accept a file_io argument. It is a function
            that behaves like the builtin open(), except that it reads from
            and writes to the root of a special cloud storage location that
            belongs to the current model version.
        data_spec:
            An argument we pass in ourselves when we initiate the training.
        api_key, project:
            Optional arguments that Model Hosting passes in automatically for
            your convenience. The API key is the one that was used to
            initiate this training routine through the Model Hosting HTTP
            API.
        """
        transfer = DataTransferService(data_spec, api_key=api_key, project=project)
        features = pd.read_csv(transfer.get_file("data"))
        target = pd.read_csv(transfer.get_file("target"))

        # Prepend a feature of constant value 1.
        features.insert(0, "f0", 1)

        # Least squares via the normal equations: (X'X)^-1 X'y
        gram_inverse = np.linalg.inv(features.T.dot(features))
        beta_hat = gram_inverse.dot(features.T).dot(target)
        coefficients = pd.DataFrame(beta_hat, columns=["beta_hat"])

        # Persist the result in the model version's storage.
        with file_io("coefficients.csv", "w") as out:
            coefficients.to_csv(out, index=False)

コード例 #2

0

ファイルを表示

ファイル: model.py プロジェクト: ojjiojji/cognite-sdk-python

    def train(file_io, data_spec, api_key, project, **kwargs):
        """
        Train a random forest regressor and pickle it to model storage.

        file_io:
            The train method must accept a file_io argument. It is a function
            that behaves like the builtin open(), except that it reads from
            and writes to the root of a special cloud storage location that
            belongs to the current model version.
        data_spec:
            An argument we pass in ourselves when we initiate the training.
        api_key, project:
            Optional arguments that Model Hosting passes in automatically for
            your convenience. The API key is the one that was used to
            initiate this training routine through the Model Hosting HTTP
            API.
        """
        transfer = DataTransferService(data_spec, api_key=api_key, project=project)
        frame = transfer.get_dataframe().dropna()

        feature_columns = ["temp", "pressure", "rpm"]
        features = frame[feature_columns].values
        target = frame["production_rate"].values

        # Apart from these two settings the regressor uses its defaults.
        forest_options = {"n_estimators": 10, "min_samples_split": 100}
        model = RandomForestRegressor(**forest_options)
        model.fit(features, target)

        # Persist the fitted regressor in the model version's storage.
        with file_io("regressor.pickle", "wb") as out:
            pickle.dump(model, out)

コード例 #3

0

ファイルを表示

    def test_get_dataframes(self, ts_data_spec_dtos):
        """Check that get_dataframes() yields a DataFrame under "ds1" and "ds2"."""
        spec = DataSpec(time_series_data_specs=ts_data_spec_dtos)
        frames = DataTransferService(spec).get_dataframes()

        for label in ("ds1", "ds2"):
            assert isinstance(frames.get(label), pd.DataFrame)

コード例 #4

0

ファイルを表示

 def test_json_dumps_after_used_by_dts(self, ts_data_spec_dtos, files_data_spec_dto):
     """Check that fetching dataframes leaves the data spec's JSON form unchanged."""
     spec = DataSpec(time_series_data_specs=ts_data_spec_dtos, files_data_spec=files_data_spec_dto)
     snapshot_before = spec.to_JSON()

     service = DataTransferService(spec)
     service.get_dataframes()

     snapshot_after = spec.to_JSON()
     assert snapshot_before == snapshot_after

コード例 #5

0

ファイルを表示

    def test_get_files(self):
        """Fetch the file labelled "test" and compare it with its known content."""
        expected_content = b'import os\n\nfrom cognite.config import configure_session\nfrom cognite.v05 import files\n\nconfigure_session(os.getenv("COGNITE_TEST_API_KEY"), "mltest")\n\n\nres = files.upload_file("test.py", "./test.py")\n\nprint(res)\n'
        file_spec = FilesDataSpec(file_ids={"test": 7725800487412823})
        service = DataTransferService(DataSpec(files_data_spec=file_spec))

        fetched = service.get_file("test")

        assert isinstance(fetched, BytesIO)
        assert fetched.getvalue() == expected_content

コード例 #6

0

ファイルを表示

    def test_get_dataframes_w_column_mapping(self, time_series_in_cdp):
        """Check that the labels given to the time series become the column names."""
        labelled_series = [
            TimeSeries(id=time_series_in_cdp[0], aggregates=["avg"], label="cavg"),
            TimeSeries(id=time_series_in_cdp[0], aggregates=["cv"], label="ccv"),
            TimeSeries(id=time_series_in_cdp[1], aggregates=["avg"], label="sinavg"),
        ]
        spec = TimeSeriesDataSpec(
            time_series=labelled_series, aggregates=["avg"], granularity="1h", start="300d-ago"
        )

        service = DataTransferService(DataSpec([spec]))
        frames = service.get_dataframes()

        column_names = list(frames["default"].columns.values)
        assert ["timestamp", "cavg", "ccv", "sinavg"] == column_names

コード例 #7

0

ファイルを表示

    def test_get_dataframes_column_mapping_no_drop_agg_suffix(self, data_spec):
        """Check the column names produced when drop_agg_suffix is False."""
        expected_columns = [
            "timestamp",
            "ts1|average",
            "ts1|min",
            "ts2|continuousvariance",
            "ts3|max",
            "ts3|count",
            "ts4|stepinterpolation",
        ]
        service = DataTransferService(data_spec, num_of_workers=3)

        frames = service.get_dataframes(drop_agg_suffix=False)

        column_names = list(frames["default"].columns.values)
        assert column_names == expected_columns

コード例 #8

0

ファイルを表示

    def test_get_dataframes_column_mapping_drop_agg_suffixes(self, data_spec):
        """Check the column names produced when drop_agg_suffix is True."""
        expected_columns = [
            "timestamp",
            "ts1|average",
            "ts1|min",
            "ts2",
            "ts3|max",
            "ts3|count",
            "ts4",
        ]
        service = DataTransferService(data_spec, num_of_workers=3)

        frames = service.get_dataframes(drop_agg_suffix=True)

        column_names = list(frames["default"].columns.values)
        assert column_names == expected_columns

コード例 #9

0

ファイルを表示

ファイル: model.py プロジェクト: ojjiojji/cognite-sdk-python

    def predict(self, instance, api_key, project, **kwargs):
        """
        Predict the production rate for the data described by a data spec.

        instance:
            Since we're doing scheduled prediction, this is a data spec
            describing the data we should do prediction on.

        Note that api_key and project can also be taken in as optional
        arguments here, the same way as in train().
        """
        transfer = DataTransferService(instance, api_key=api_key, project=project)
        frame = transfer.get_dataframe().dropna()

        features = frame[["temp", "pressure", "rpm"]].values
        frame["production_rate"] = self.regressor.predict(features)

        # Scheduled prediction expects output in the format:
        # {
        #   "timestamp": [t0, t1, t2, ...],
        #   "production_rate": [p0, p1, p2, ...]
        # }
        # Calling to_dict(orient="list") on the pandas DataFrame produces
        # exactly that format.
        output = frame[["timestamp", "production_rate"]]
        return output.to_dict(orient="list")

コード例 #10

0

ファイルを表示

ファイル: test_data_transfer_service.py プロジェクト: boyeah/cognite-sdk-python

    def test_dict_dto_equal(self, ts_data_spec_dicts, ts_data_spec_dtos):
        """Check that dict-based and DTO-based data specs yield equal dataframes.

        The two results are compared label by label, so a missing, extra or
        reordered entry fails the test instead of being skipped.
        """
        data_spec_dtos = DataSpec(time_series_data_specs=ts_data_spec_dtos)
        data_spec_dicts = DataSpec(time_series_data_specs=ts_data_spec_dicts)
        service = DataTransferService(data_spec_dicts)
        service2 = DataTransferService(data_spec_dtos)
        dataframes_by_dicts = service.get_dataframes()
        dataframes_by_dtos = service2.get_dataframes()

        # zip() over .values() stops at the shorter result and pairs frames by
        # position only; require identical labels and compare per label.
        assert set(dataframes_by_dtos) == set(dataframes_by_dicts)
        for label, df_by_dtos in dataframes_by_dtos.items():
            pd.testing.assert_frame_equal(df_by_dtos, dataframes_by_dicts[label])

コード例 #11

0

ファイルを表示

def _build_time_series(tags):
    """Create one TimeSeries per tag, picking aggregate and gap filling by tag kind."""
    series = []
    for tag in tags:
        # Tags containing "ESV" or "18HV" use step/ffill; all others use
        # avg/linearInterpolation.
        is_step_tag = "ESV" in tag or "18HV" in tag
        aggregate = "step" if is_step_tag else "avg"
        missing_data_strategy = "ffill" if is_step_tag else "linearInterpolation"
        series.append(
            TimeSeries(name=tag,
                       missing_data_strategy=missing_data_strategy,
                       aggregates=[aggregate])
        )
    return series


def _build_time_series_data_spec(time_series, label, start):
    """Wrap time series in a 10s-granularity, avg-aggregated TimeSeriesDataSpec."""
    return TimeSeriesDataSpec(
        time_series=time_series,
        aggregates=["avg"],
        granularity="10s",
        start=start,
        label=label,
    )


def main():
    """Collect tags from the CSV files under ../tags and export their data.

    Tags are read from the known tag files, filtered by EXCLUDE_TAGS, and
    split by placement into a D02 set and a D03 set (both also include the
    shared placements). Duplicated tags are dropped, one time series data
    spec is built per set, and each resulting dataframe is written to
    ../data/<label>.csv.
    """
    configure_session(api_key=os.getenv("COGNITE_API_KEY"),
                      project="akerbp",
                      debug=True)

    tag_files = ("well_tags.csv", "routing.csv", "output.csv",
                 "riser_tags.csv", "template_tags.csv")
    # These do not depend on the file being read, so build them once.
    placements = ["T3 WGM", "Template", "Riser"]
    placements_d03 = ["WellD03"] + placements
    placements_d02 = ["WellD02"] + placements

    tags_d03 = []
    tags_d02 = []

    for root, _subdirs, files in os.walk("../tags"):
        for file_name in files:
            if file_name not in tag_files:
                continue
            # Keep the file open only for as long as it takes to read it.
            with open(os.path.join(root, file_name)) as f:
                df = pd.read_csv(f)

            df = df[~df["tag"].isin(EXCLUDE_TAGS)]

            tags_d03.append(df[df["placement"].isin(placements_d03)])
            tags_d02.append(df[df["placement"].isin(placements_d02)])

    tags_d02_concat = pd.concat(tags_d02, ignore_index=True)
    tags_d03_concat = pd.concat(tags_d03, ignore_index=True)

    tags_d02_concat = tags_d02_concat.drop_duplicates(subset="tag")
    tags_d03_concat = tags_d03_concat.drop_duplicates(subset="tag")

    d02_input_time_series = _build_time_series(tags_d02_concat["tag"])
    d03_input_time_series = _build_time_series(tags_d03_concat["tag"])

    # Both specs start at the same instant, expressed in epoch milliseconds.
    start = int(datetime(2017, 3, 1).timestamp() * 1e3)
    d02_tsds = _build_time_series_data_spec(d02_input_time_series, "d2", start)
    d03_tsds = _build_time_series_data_spec(d03_input_time_series, "d3", start)

    data_spec = DataSpec(time_series_data_specs=[d02_tsds, d03_tsds])

    dts = DataTransferService(data_spec, num_of_processes=10)

    print(data_spec.to_JSON())

    df_dict = dts.get_dataframes()

    for label, df in df_dict.items():
        df.to_csv(f"../data/{label}.csv")
        print(df.shape)

コード例 #12

0

ファイルを表示

def main():
    """Run an endless fetch -> predict -> post loop for the "d2" inputs.

    Each round fetches the "d2" dataframe starting at the last processed
    timestamp, repeating until the input columns (everything except the
    output columns) contain no NaNs. The inputs are then sent to the hosted
    model and the predictions are posted as datapoints. The first batch is
    posted in one call; later batches are posted one datapoint at a time
    with a 5 second pause between them.
    """
    output_columns = [
        "SKAP_18FI381-VFlLGas/Y/10sSAMP|average",
        "SKAP_18FI381-VFlLH2O/Y/10sSAMP|average",
        "SKAP_18FI381-VFlLOil/Y/10sSAMP|average",
    ]
    router = "SKAP_18HV3806/BCH/10sSAMP|stepinterpolation"
    target_series = "SKAP_18FI381-VFlLGas/Y/10sSAMP_calc_D02_2"
    one_hour_ago = datetime.now() - timedelta(0, 3600)
    # Epoch milliseconds.
    last_processed_timestamp = int(one_hour_ago.timestamp() * 1e3)

    is_first = True

    while True:
        input_has_nans = True
        while input_has_nans:
            ds = generate_data_spec(last_processed_timestamp)
            dts = DataTransferService(data_spec=ds)
            while True:
                try:
                    d2_inputs = dts.get_dataframes()["d2"]
                    break
                except Exception:
                    # Retry on any fetch error, but let KeyboardInterrupt and
                    # SystemExit propagate so the script can be stopped.
                    time.sleep(2)

            # NaN mask over the input columns only, computed once per fetch.
            nan_mask = d2_inputs.drop(output_columns, axis=1).isna()
            any_nans_per_column = nan_mask.any()
            all_nans_per_column = nan_mask.all()

            print(any_nans_per_column)
            print(all_nans_per_column)

            # Some column has gaps but no column is entirely empty: move the
            # start of the window back by 10000 ms before fetching again.
            if any_nans_per_column.any() and not all_nans_per_column.any():
                last_processed_timestamp -= 10000

            print(datetime.fromtimestamp(last_processed_timestamp * 1e-3))
            time.sleep(2)
            input_has_nans = any_nans_per_column.any()

        last_ts = d2_inputs["timestamp"].iloc[-1]

        print(d2_inputs[output_columns[0]].values.tolist())
        d2_inputs_formatted = (
            d2_inputs.drop("timestamp", axis=1).drop(router, axis=1).drop(output_columns, axis=1).values.tolist()
        )
        timestamps = d2_inputs["timestamp"]
        res = models.online_predict(
            model_id=3885574571413770, version_id=4299054386152423, instances=[d2_inputs_formatted]
        )

        predictions = res["predictions"][0]
        formatted_predictions = [int(pred[0]) for pred in predictions]
        last_processed_timestamp = int(last_ts)

        dps = [Datapoint(ts, value) for ts, value in zip(timestamps.values.tolist(), formatted_predictions)]
        print([dp.value for dp in dps])
        if is_first:
            post_datapoints(name=target_series, datapoints=dps)
            is_first = False
        else:
            for dp in dps:
                post_datapoints(name=target_series, datapoints=[dp])
                time.sleep(5)

コード例 #13

0

ファイルを表示

 def test_get_timeseries_name(self, data_spec):
     """Check that each of the labels ts1-ts4 resolves to TEST_TS_1_NAME."""
     service = DataTransferService(data_spec, num_of_workers=3)
     for label in ("ts1", "ts2", "ts3", "ts4"):
         resolved_name = service.get_time_series_name(label)
         assert resolved_name == TEST_TS_1_NAME