def test_json_dumps_after_used_by_dts(self, ts_data_spec_dtos, files_data_spec_dto):
    data_spec = DataSpec(time_series_data_specs=ts_data_spec_dtos, files_data_spec=files_data_spec_dto)
    json_repr = data_spec.to_JSON()
    dts = DataTransferService(data_spec)
    dts.get_dataframes()
    json_repr_after_dts = data_spec.to_JSON()
    assert json_repr == json_repr_after_dts
def test_dict_dto_equal(self, ts_data_spec_dicts, ts_data_spec_dtos):
    data_spec_dtos = DataSpec(time_series_data_specs=ts_data_spec_dtos)
    data_spec_dicts = DataSpec(time_series_data_specs=ts_data_spec_dicts)
    service = DataTransferService(data_spec_dicts)
    service2 = DataTransferService(data_spec_dtos)
    dataframes_by_dicts = service.get_dataframes()
    dataframes_by_dtos = service2.get_dataframes()

    for df1, df2 in zip(dataframes_by_dtos.values(), dataframes_by_dicts.values()):
        pd.testing.assert_frame_equal(df1, df2)
def test_get_dataframes(self, ts_data_spec_dtos):
    data_spec = DataSpec(time_series_data_specs=ts_data_spec_dtos)
    service = DataTransferService(data_spec)
    dataframes = service.get_dataframes()
    assert isinstance(dataframes.get("ds1"), pd.DataFrame)
    assert isinstance(dataframes.get("ds2"), pd.DataFrame)
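# A minimal sketch (not the project's real conftest fixture) of the shape the
# ts_data_spec_dtos fixture is assumed to have in the tests above: a list of
# TimeSeriesDataSpec DTOs whose labels ("ds1", "ds2") become the keys of the dict
# returned by get_dataframes(). The ids, aggregates, granularity and start values
# below are illustrative assumptions, not values taken from the test suite.
def _example_ts_data_spec_dtos():
    ts1 = TimeSeries(id=1234, aggregates=["avg"])
    ts2 = TimeSeries(id=5678, aggregates=["avg"])
    return [
        TimeSeriesDataSpec(time_series=[ts1], aggregates=["avg"], granularity="1h", start="30d-ago", label="ds1"),
        TimeSeriesDataSpec(time_series=[ts2], aggregates=["avg"], granularity="1h", start="30d-ago", label="ds2"),
    ]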
def test_get_dataframes_w_column_mapping(self, time_series_in_cdp):
    ts1 = TimeSeries(id=time_series_in_cdp[0], aggregates=["avg"], label="cavg")
    ts2 = TimeSeries(id=time_series_in_cdp[0], aggregates=["cv"], label="ccv")
    ts3 = TimeSeries(id=time_series_in_cdp[1], aggregates=["avg"], label="sinavg")

    tsds = TimeSeriesDataSpec(time_series=[ts1, ts2, ts3], aggregates=["avg"], granularity="1h", start="300d-ago")
    dts = DataTransferService(DataSpec([tsds]))
    dfs = dts.get_dataframes()

    expected = ["timestamp", "cavg", "ccv", "sinavg"]
    assert expected == list(dfs["default"].columns.values)
def test_get_dataframes_column_mapping_no_drop_agg_suffix(self, data_spec):
    dts = DataTransferService(data_spec, num_of_workers=3)
    dfs = dts.get_dataframes(drop_agg_suffix=False)
    assert list(dfs["default"].columns.values) == [
        "timestamp",
        "ts1|average",
        "ts1|min",
        "ts2|continuousvariance",
        "ts3|max",
        "ts3|count",
        "ts4|stepinterpolation",
    ]
def test_get_dataframes_column_mapping_drop_agg_suffixes(self, data_spec):
    # With drop_agg_suffix=True the aggregate suffix is dropped only for labels that
    # map to a single aggregate (ts2, ts4); labels with several aggregates keep it.
    dts = DataTransferService(data_spec, num_of_workers=3)
    dfs = dts.get_dataframes(drop_agg_suffix=True)
    assert list(dfs["default"].columns.values) == [
        "timestamp",
        "ts1|average",
        "ts1|min",
        "ts2",
        "ts3|max",
        "ts3|count",
        "ts4",
    ]
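# A minimal sketch (an assumption, not the project's real data_spec fixture) of the
# spec the two column-mapping tests above rely on: four labelled time series whose
# aggregates match the expected columns (ts1: average+min, ts2: continuousvariance,
# ts3: max+count, ts4: stepinterpolation). The ids, granularity and start below are
# illustrative placeholders.
def _example_column_mapping_data_spec():
    ts1 = TimeSeries(id=1, aggregates=["avg", "min"], label="ts1")
    ts2 = TimeSeries(id=2, aggregates=["cv"], label="ts2")
    ts3 = TimeSeries(id=3, aggregates=["max", "count"], label="ts3")
    ts4 = TimeSeries(id=4, aggregates=["step"], label="ts4")
    tsds = TimeSeriesDataSpec(
        time_series=[ts1, ts2, ts3, ts4], aggregates=["avg"], granularity="1h", start="300d-ago"
    )
    return DataSpec(time_series_data_specs=[tsds])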
def main(): configure_session(api_key=os.getenv("COGNITE_API_KEY"), project="akerbp", debug=True) tags_d03 = [] tags_d02 = [] for root, subdirs, files in os.walk("../tags"): for file in files: if file in ("well_tags.csv", "routing.csv", "output.csv", "riser_tags.csv", "template_tags.csv"): with open(os.path.join(root, file)) as f: df = pd.read_csv(f) placements = ["T3 WGM", "Template", "Riser"] placements_d03 = ["WellD03"] + placements placements_d02 = ["WellD02"] + placements df = df[~df["tag"].isin(EXCLUDE_TAGS)] tags_d03.append(df[df["placement"].isin(placements_d03)]) tags_d02.append(df[df["placement"].isin(placements_d02)]) tags_d02_concat = pd.concat(tags_d02, ignore_index=True) tags_d03_concat = pd.concat(tags_d03, ignore_index=True) tags_d02_concat = tags_d02_concat.drop_duplicates(subset="tag") tags_d03_concat = tags_d03_concat.drop_duplicates(subset="tag") d02_input_time_series = [] d03_input_time_series = [] for tag in tags_d02_concat["tag"]: aggregate = "step" if ("ESV" in tag or "18HV" in tag) else "avg" missing_data_strategy = "ffill" if ( "ESV" in tag or "18HV" in tag) else "linearInterpolation" ts = TimeSeries(name=tag, missing_data_strategy=missing_data_strategy, aggregates=[aggregate]) d02_input_time_series.append(ts) for tag in tags_d03_concat["tag"]: aggregate = "step" if ("ESV" in tag or "18HV" in tag) else "avg" missing_data_strategy = "ffill" if ( "ESV" in tag or "18HV" in tag) else "linearInterpolation" ts = TimeSeries(name=tag, missing_data_strategy=missing_data_strategy, aggregates=[aggregate]) d03_input_time_series.append(ts) d02_tsds = TimeSeriesDataSpec( time_series=d02_input_time_series, aggregates=["avg"], granularity="10s", start=int(datetime(2017, 3, 1).timestamp() * 1e3), label="d2", ) d03_tsds = TimeSeriesDataSpec( time_series=d03_input_time_series, aggregates=["avg"], granularity="10s", start=int(datetime(2017, 3, 1).timestamp() * 1e3), label="d3", ) data_spec = DataSpec(time_series_data_specs=[d02_tsds, d03_tsds]) dts = DataTransferService(data_spec, num_of_processes=10) print(data_spec.to_JSON()) df_dict = dts.get_dataframes() for label, df in df_dict.items(): df.to_csv(f"../data/{label}.csv") print(df.shape)
def main():
    output_columns = [
        "SKAP_18FI381-VFlLGas/Y/10sSAMP|average",
        "SKAP_18FI381-VFlLH2O/Y/10sSAMP|average",
        "SKAP_18FI381-VFlLOil/Y/10sSAMP|average",
    ]
    router = "SKAP_18HV3806/BCH/10sSAMP|stepinterpolation"

    one_hour_ago = datetime.now() - timedelta(0, 3600)
    last_processed_timestamp = int(one_hour_ago.timestamp() * 1e3)

    is_first = True
    while True:
        d2_inputs = pd.DataFrame([[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]])
        d2_inputs.columns = ["hoho", "blaa", "hgi"] + output_columns

        # Keep fetching until the input columns contain no missing values.
        input_has_nans = True
        while input_has_nans:
            ds = generate_data_spec(last_processed_timestamp)
            dts = DataTransferService(data_spec=ds)
            while True:
                try:
                    d2_inputs = dts.get_dataframes()["d2"]
                    break
                except Exception:
                    time.sleep(2)

            any_nans_per_column = d2_inputs.drop(output_columns, axis=1).isna().any()
            all_nans_per_column = d2_inputs.drop(output_columns, axis=1).isna().all()
            print(any_nans_per_column)
            print(all_nans_per_column)

            # If some (but not all) values in a column are missing, move the window
            # start ten seconds back and try again.
            if any_nans_per_column.any() and not all_nans_per_column.any():
                last_processed_timestamp -= 10000
            print(datetime.fromtimestamp(last_processed_timestamp * 1e-3))
            time.sleep(2)
            input_has_nans = d2_inputs.drop(output_columns, axis=1).isna().any().any()

        last_ts = d2_inputs["timestamp"].iloc[-1]
        print(d2_inputs[output_columns[0]].values.tolist())

        # Strip timestamp, router and output columns before sending the rows to the model.
        d2_inputs_formatted = (
            d2_inputs.drop("timestamp", axis=1).drop(router, axis=1).drop(output_columns, axis=1).values.tolist()
        )
        timestamps = d2_inputs["timestamp"]

        res = models.online_predict(
            model_id=3885574571413770, version_id=4299054386152423, instances=[d2_inputs_formatted]
        )
        predictions = res["predictions"][0]
        formatted_predictions = [int(pred[0]) for pred in predictions]
        last_processed_timestamp = int(last_ts)

        dps = [Datapoint(ts, value) for ts, value in zip(timestamps.values.tolist(), formatted_predictions)]
        print([dp.value for dp in dps])

        # Post the first batch in one request, then stream subsequent predictions one
        # datapoint at a time.
        if is_first:
            post_datapoints(name="SKAP_18FI381-VFlLGas/Y/10sSAMP_calc_D02_2", datapoints=dps)
            is_first = False
        else:
            for dp in dps:
                post_datapoints(name="SKAP_18FI381-VFlLGas/Y/10sSAMP_calc_D02_2", datapoints=[dp])
        time.sleep(5)