def test_defaults_produce_clusters(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # mark grouping key
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        "https://metadata.datadrivendiscovery.org/types/GroupingKey",
    )

    # create the clustering primitive and cluster
    hyperparams_class = Sloth.metadata.query()["primitive_code"][
        "class_type_arguments"
    ]["Hyperparams"]
    sloth = Sloth(hyperparams=hyperparams_class.defaults())
    result = sloth.produce_clusters(inputs=dataframe).value

    # check that the grouping key columns match
    self.assertListEqual(
        result.iloc[:, 0].tolist(), ["alpha", "bravo", "charlie", "delta"]
    )

    # check that the first two keys are each in their own cluster, and the
    # last two are in the same cluster
    clusters_by_key = result[["key", "__cluster"]].drop_duplicates()
    self.assertNotEqual(clusters_by_key.iloc[0, 1], clusters_by_key.iloc[1, 1])
    self.assertNotEqual(clusters_by_key.iloc[1, 1], clusters_by_key.iloc[2, 1])
    self.assertEqual(clusters_by_key.iloc[2, 1], clusters_by_key.iloc[3, 1])

    # check metadata is correct
    column_metadata = result.metadata.query_column(0)
    self.assertEqual(column_metadata["structural_type"], str)
    column_metadata = result.metadata.query_column(1)
    self.assertEqual(column_metadata["structural_type"], np.int64)
def test_vector_parse_twice(self) -> None:
    dataset = test_utils.load_dataset(self._image_dataset_path)
    df = test_utils.get_dataframe(dataset, "learningData")

    hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
    cpp = ColumnParserPrimitive(
        hyperparams=hyperparams_class.defaults().replace(
            {
                "parsing_semantics": [
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                ]
            }
        )
    )

    target_coords = [
        20.999598,
        63.488694,
        20.999598,
        63.499462,
        21.023702,
        63.499462,
        21.023702,
        63.488694,
    ]

    # parsing once should convert the string column into a float vector
    result_df = cpp.produce(inputs=df).value
    result_coords = result_df["coordinates"][0]
    self.assertEqual(len(result_coords), len(target_coords))
    for a, b in zip(target_coords, result_coords):
        self.assertAlmostEqual(a, b, 5)

    # parsing a second time should leave the already-parsed column unchanged
    result_2_df = cpp.produce(inputs=result_df).value
    result_2_coords = result_2_df["coordinates"][0]
    self.assertEqual(len(result_2_coords), len(target_coords))
    for a, b in zip(target_coords, result_2_coords):
        self.assertAlmostEqual(a, b, 5)
def _test_set_training_data(dataset_name, target_col, group_compose=False, split_train=False):
    dataset = test_utils.load_dataset(
        f'/datasets/seed_datasets_current/{dataset_name}/TRAIN/dataset_TRAIN')
    df = test_utils.get_dataframe(dataset, 'learningData', target_col)

    # sort rows chronologically while preserving the original string-typed
    # time values
    time_col = df.metadata.list_columns_with_semantic_types((
        "https://metadata.datadrivendiscovery.org/types/Time",
        "http://schema.org/DateTime",
    ))[0]
    original_times = df.iloc[:, time_col]
    df.iloc[:, time_col] = pd.to_datetime(
        df.iloc[:, time_col], format=datetime_format_strs[dataset_name])
    df = df.sort_values(by=df.columns[time_col])
    df.iloc[:, time_col] = original_times

    # hold out the final 10% of rows as a validation split
    train_split = int(0.9 * df.shape[0])
    train = df.iloc[:train_split, :].reset_index(drop=True)
    val = df.iloc[train_split:, :].reset_index(drop=True)
    df = df.reset_index(drop=True)

    preprocess = PreProcessPipeline(group_compose=group_compose)
    preprocess.fit(train)
    train_inputs, train_outputs = preprocess.produce(train)
    val_inputs, _ = preprocess.produce(val)
    all_inputs, all_outputs = preprocess.produce(df)

    deepar_hp = DeepArPrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']
    pred_length_idx = 1 if split_train else 0
    deepar = DeepArPrimitive(hyperparams=deepar_hp(
        deepar_hp.defaults(),
        epochs=1,
        steps_per_epoch=1,
        number_samples=10,
        prediction_length=min_pred_lengths[dataset_name][pred_length_idx] + 5,
        context_length=min_pred_lengths[dataset_name][pred_length_idx] - 5,
        quantiles=(0.1, 0.9),
        output_mean=False))

    if split_train:
        deepar.set_training_data(inputs=train_inputs, outputs=train_outputs)
    else:
        deepar.set_training_data(inputs=all_inputs, outputs=all_outputs)

    if group_compose:
        assert deepar._grouping_columns == [train_inputs.shape[1] - 1]
    else:
        assert grouping_cols[dataset_name] == deepar._grouping_columns
    assert freqs[dataset_name] == deepar._freq
    assert real_cols[dataset_name] == deepar._real_columns
    assert isinstance(deepar._deepar_dataset.get_distribution_type(),
                      distr[dataset_name])

    deepar.fit()
    return deepar, preprocess, train_inputs, val_inputs, all_inputs
def _test_set_training_data(dataset_name, target_col, group_compose=False, split_train=False):
    dataset = test_utils.load_dataset(
        f'/datasets/seed_datasets_current/{dataset_name}/TRAIN/dataset_TRAIN')
    df = test_utils.get_dataframe(dataset, 'learningData', target_col)

    # sort rows chronologically while preserving the original string-typed
    # time values
    time_col = df.metadata.list_columns_with_semantic_types((
        "https://metadata.datadrivendiscovery.org/types/Time",
        "http://schema.org/DateTime",
    ))[0]
    original_times = df.iloc[:, time_col]
    df.iloc[:, time_col] = pd.to_datetime(
        df.iloc[:, time_col], format=datetime_format_strs[dataset_name])
    df = df.sort_values(by=df.columns[time_col])
    df.iloc[:, time_col] = original_times

    # hold out the final 10% of rows as a validation split
    train_split = int(0.9 * df.shape[0])
    train = df.iloc[:train_split, :].reset_index(drop=True)
    val = df.iloc[train_split:, :].reset_index(drop=True)
    df = df.reset_index(drop=True)

    preprocess = PreProcessPipeline(group_compose=group_compose)
    preprocess.fit(train)
    train_inputs, train_outputs = preprocess.produce(train)
    val_inputs, _ = preprocess.produce(val)
    all_inputs, all_outputs = preprocess.produce(df)

    nbeats_hp = NBEATSPrimitive.metadata.query(
    )['primitive_code']['class_type_arguments']['Hyperparams']
    pred_length_idx = 1 if split_train else 0
    nbeats = NBEATSPrimitive(hyperparams=nbeats_hp(
        nbeats_hp.defaults(),
        epochs=1,
        steps_per_epoch=1,
        num_estimators=1,
        prediction_length=min_pred_lengths[dataset_name][pred_length_idx] + 5,
        # quantiles=(0.1, 0.9),
    ))

    # start from a clean weights directory
    if os.path.isdir(nbeats.hyperparams['weights_dir']):
        shutil.rmtree(nbeats.hyperparams['weights_dir'])

    if split_train:
        nbeats.set_training_data(inputs=train_inputs, outputs=train_outputs)
    else:
        nbeats.set_training_data(inputs=all_inputs, outputs=all_outputs)

    if group_compose:
        assert nbeats._grouping_columns == [train_inputs.shape[1] - 1]
    else:
        assert grouping_cols[dataset_name] == nbeats._grouping_columns
    assert freqs[dataset_name] == nbeats._freq

    nbeats.fit()
    return nbeats, preprocess, train_inputs, val_inputs, all_inputs
def test_produce_no_fit(self) -> None:
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe.drop(columns=["delta", "echo"], inplace=True)

    hyperparams_class = RankedLinearSVCPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    hyperparams = hyperparams_class.defaults()
    ranked_lsvc = RankedLinearSVCPrimitive(hyperparams=hyperparams)
    ranked_lsvc.set_training_data(
        inputs=dataframe[["alpha", "bravo"]],
        outputs=pd.DataFrame({"charlie": dataframe["charlie"].astype(int)}),
    )
    results = ranked_lsvc.produce(inputs=dataframe[["alpha", "bravo"]]).value

    expected_labels = [1, 1, 1, 0, 0, 0, 0, 0, 0]
    expected_rank = [8, 8, 8, 5, 5, 5, 2, 2, 2]
    expected_confidence = [
        0.729, 0.729, 0.729,
        0.268, 0.268, 0.268,
        0.051, 0.051, 0.051,
    ]
    self.assertListEqual(list(results["charlie"]), expected_labels)
    self.assertListEqual(list(results["rank"]), expected_rank)
    np.testing.assert_almost_equal(list(results["confidence"]),
                                   expected_confidence, decimal=3)

    self.assertListEqual(
        results.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Score",)),
        [1],
    )
    self.assertListEqual(
        results.metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )),
        [0, 1, 2],
    )
    self.assertListEqual(
        results.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Rank",)),
        [2],
    )
def generate_dataset(inputFile, outputFile, n):
    """
    Generates a random tab-delimited .txt file with column names as in the
    input file and random values drawn from the possible values of each
    column in the input file.

    :param inputFile: path to the file listing column names and their possible values
    :param outputFile: name of the output file, written under the Data/ directory
    :param n: number of rows in the output file
    """
    column_name_values = get_dataframe(inputFile)
    data_set = generate_randomDF(column_name_values, n)
    data_set.to_csv("Data/" + outputFile, sep="\t", index_label="foundid")
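# A minimal usage sketch for generate_dataset, assuming the column-definition
# file used by test_matching below and an existing Data/ directory; the
# output file name "random_sample.txt" is a hypothetical placeholder.
def example_generate_dataset():
    generate_dataset(
        inputFile="Constants/ColumnsAndValuesData.txt",
        outputFile="random_sample.txt",  # written to Data/random_sample.txt
        n=100,  # generate 100 random rows
    )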
def test_defaults(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the replacer
    hyperparams_class = ReplaceSingletonsPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    replacer = ReplaceSingletonsPrimitive(
        hyperparams=hyperparams_class.defaults())
    result = replacer.produce(inputs=dataframe).value
    self.assertEqual(result["alpha"].iloc[4], utils.SINGLETON_INDICATOR)
def _test_ts(dataset_name, target_col, group_compose=False, split_train=False):
    nbeats, preprocess, inputs_train, inputs_val, inputs_all = _test_set_training_data(
        dataset_name,
        target_col,
        group_compose=group_compose,
        split_train=split_train,
    )
    # _test_produce_train_data(nbeats, inputs_train, inputs_val, inputs_all)
    dataset = test_utils.load_dataset(
        f"/datasets/seed_datasets_current/{dataset_name}/TEST/dataset_TEST/"
    )
    df = test_utils.get_dataframe(dataset, "learningData", target_col)
    inputs_test, _ = preprocess.produce(df)
    _test_produce_test_data(nbeats, inputs_test)
def test_no_missing(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the imputer
    hyperparams_class = CategoricalImputerPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    imputer = CategoricalImputerPrimitive(
        hyperparams=hyperparams_class.defaults().replace({
            "strategy": "most_frequent",
            "use_columns": [3],
        }))
    result = imputer.produce(inputs=dataframe).value
    self.assertEqual(result["charlie"].iloc[2], "whiskey")
def test_defaults(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe = ListEncoderPrimitiveTestCase._convert_lists(dataframe)

    # create the encoder
    hyperparams_class = ListEncoderPrimitive.metadata.query()["primitive_code"][
        "class_type_arguments"
    ]["Hyperparams"]
    encoder = ListEncoderPrimitive(hyperparams=hyperparams_class.defaults())
    encoder.set_training_data(inputs=dataframe)
    encoder.fit()
    result = encoder.produce(inputs=dataframe).value
    self._assert_result(result)
def _test_ts(dataset_name, target_col, group_compose=False, split_train=False):
    deepar, preprocess, inputs_train, inputs_val, inputs_all = _test_set_training_data(
        dataset_name,
        target_col,
        group_compose=group_compose,
        split_train=split_train,
    )
    _test_produce_train_data(deepar, inputs_train, inputs_val, inputs_all)

    dataset = test_utils.load_dataset(
        f'/datasets/seed_datasets_current/{dataset_name}/TEST/dataset_TEST/')
    df = test_utils.get_dataframe(dataset, 'learningData', target_col)
    inputs_test, _ = preprocess.produce(df)

    _test_produce_test_data(deepar, inputs_test)
    _test_produce_confidence_intervals(deepar, inputs_all)
    _test_produce_confidence_intervals(deepar, inputs_test)
def test_basic(self) -> None:
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe.drop(columns=["delta", "echo"], inplace=True)

    hyperparams_class = IsolationForestPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    hyperparams = hyperparams_class.defaults().replace({"n_jobs": -1})
    isp = IsolationForestPrimitive(hyperparams=hyperparams)
    isp.set_training_data(inputs=dataframe[["alpha", "bravo"]])
    isp.fit()
    results = isp.produce(inputs=dataframe[["alpha", "bravo"]]).value
    self.assertListEqual(list(results["outlier_label"]),
                         [-1, -1, -1, -1, -1, -1, -1, -1, -1])
def test_band_mapping_replace(self) -> None:
    dataset = test_utils.load_dataset(self._dataset_path)

    # mark the grouping key, file name, and location polygon columns, and
    # point the file name column at the test media directory
    dataset.metadata = dataset.metadata.add_semantic_type(
        ("learningData", metadata_base.ALL_ELEMENTS, 2),
        "https://metadata.datadrivendiscovery.org/types/GroupingKey",
    )
    dataset.metadata = dataset.metadata.add_semantic_type(
        ("learningData", metadata_base.ALL_ELEMENTS, 1),
        "https://metadata.datadrivendiscovery.org/types/FileName",
    )
    dataset.metadata = dataset.metadata.add_semantic_type(
        ("learningData", metadata_base.ALL_ELEMENTS, 5),
        "https://metadata.datadrivendiscovery.org/types/FloatVector",
    )
    dataset.metadata = dataset.metadata.update(
        ("0",), {"location_base_uris": self._media_path})
    dataset.metadata = dataset.metadata.update(
        ("learningData", metadata_base.ALL_ELEMENTS, 1),
        {"location_base_uris": [self._media_path]},
    )
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    hyperparams_class = DataFrameSatelliteImageLoaderPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    hyperparams = hyperparams_class.defaults().replace({
        "return_result": "replace",
        "n_jobs": -1,
    })
    loader = DataFrameSatelliteImageLoaderPrimitive(hyperparams=hyperparams)
    result_dataframe = loader.produce(inputs=dataframe).value

    # verify the output
    self.assertListEqual(list(result_dataframe.shape), [2, 7])
    self.assertListEqual(list(result_dataframe["image_file"][0].shape),
                         [12, 120, 120])
    self.assertEqual(result_dataframe["d3mIndex"][0], "1")
    self.assertEqual(result_dataframe["group_id"][0],
                     "S2A_MSIL2A_20170613T101031_0_49")
    self.assertEqual(result_dataframe["d3mIndex"][1], "2")
    self.assertEqual(result_dataframe["group_id"][1], "2")
    self.assertEqual(
        result_dataframe.metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/LocationPolygon",
        )),
        [5],
    )
def test_defaults(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the imputer
    hyperparams_class = CategoricalImputerPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    imputer = CategoricalImputerPrimitive(
        hyperparams=hyperparams_class.defaults())
    result = imputer.produce(inputs=dataframe).value
    self.assertEqual(result["alpha"].iloc[2], "whiskey")
    self.assertEqual(result["bravo"].iloc[2], "whiskey")
    self.assertEqual(result["charlie"].iloc[2], "whiskey")
    self.assertEqual(result["delta"].iloc[2], utils.MISSING_VALUE_INDICATOR)
def test_classification_singleton_label(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe = dataframe.iloc[0:6]

    # create the encoder
    hyperparams_class = TextEncoderPrimitive.metadata.query()["primitive_code"][
        "class_type_arguments"
    ]["Hyperparams"]
    encoder = TextEncoderPrimitive(hyperparams=hyperparams_class.defaults())
    encoder.set_training_data(
        inputs=dataframe.iloc[:, [0, 1]], outputs=dataframe[["bravo"]]
    )

    # should fail in this case because we have a label with a cardinality of 1
    self.assertRaises(ValueError, encoder.fit)
def load_entity_table(label):
    '''
    Description: Allows the user to analyze the individual words within each
        category of entity.
    Params: Selected category of entity from dropdown.
    Returns: HTML table of entities within the entity category.
    '''
    global interface_obj
    df = utils.get_dataframe(label, interface_obj.entity_dic)
    columns = df.columns
    return html.Table(
        [html.Tr([html.Th(label)])]
        + [html.Tr([html.Th(col) for col in columns])]
        + [
            html.Tr([html.Td(df.iloc[i][col]) for col in columns])
            for i in range(len(df))
        ],
        id='entity-table')
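# A hedged sketch of how load_entity_table might be wired into a Dash app.
# The app instance and the component ids ('entity-dropdown',
# 'entity-table-div') are assumptions for illustration, not from the source.
import dash
from dash.dependencies import Input, Output

app = dash.Dash(__name__)

@app.callback(
    Output('entity-table-div', 'children'),
    [Input('entity-dropdown', 'value')],
)
def update_entity_table(label):
    # re-render the entity table whenever a new category is selected
    return load_entity_table(label)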
def test_basic(self) -> None:
    dataset = test_utils.load_dataset(self._tabular_dataset_path)
    df = test_utils.get_dataframe(dataset, "learningData")
    df.metadata = df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1), "http://schema.org/Integer"
    )
    df.metadata = df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 2), "http://schema.org/Float"
    )

    hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
    cpp = ColumnParserPrimitive(hyperparams=hyperparams_class.defaults())
    result_df = cpp.produce(inputs=df).value

    self.assertEqual(result_df["d3mIndex"].dtype, np.dtype("int64"))
    self.assertEqual(result_df["alpha"].dtype, np.dtype("int64"))
    self.assertEqual(result_df["bravo"].dtype, np.dtype("float64"))
    self.assertEqual(result_df["charlie"].dtype, np.dtype("int64"))
    self.assertEqual(result_df["delta"].dtype, np.dtype("object"))
    self.assertEqual(result_df["echo"].dtype, np.dtype("float64"))
def get_model_cds_X_test(split, SI, param=Parameters.standard):
    """
    Train a model on the rows dated before the split date and hold out the
    rows on and after that date as the testing set.

    :param split: date dividing the training set from the testing set
    :param SI: identifier of the instrument to load via get_dataframe
    :return: the trained model, the CDS container, and the test features
    """
    df = get_dataframe(SI)
    cds = CDS(df.index, df.CLOSE, df.TURN, SI)
    # train from scratch rather than loading a pickled model from disk
    model, X_test, y_test = prepare_model(cds,
                                          split_date=split,
                                          load_from_disk=False,
                                          save_to_disk=False,
                                          evaluate=True,
                                          **param)
    return model, cds, X_test
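# A minimal usage sketch for get_model_cds_X_test; the split date and the SI
# value below are hypothetical placeholders, and Parameters.standard is used
# by default.
def example_get_model_cds_X_test():
    model, cds, X_test = get_model_cds_X_test(
        split="2018-01-01",  # rows before this date train the model
        SI="000001",         # hypothetical instrument identifier
    )
    return model, cds, X_test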
def test_single_row(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the encoder
    hyperparams_class = BinaryEncoderPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    encoder = BinaryEncoderPrimitive(
        hyperparams=hyperparams_class.defaults().replace({"min_binary": 3}))
    encoder.set_training_data(inputs=dataframe)
    encoder.fit()
    result = encoder.produce(inputs=dataframe.head(1)).value
    self.assertEqual(len(result.index), 1)
    self.assertEqual(
        result.metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )),
        [1, 2, 3, 4, 5, 6, 7],
    )
def test_normalized(self) -> None:
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe.drop(columns=["delta", "echo"], inplace=True)

    hyperparams_class = RankedLinearSVCPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    hyperparams = hyperparams_class.defaults().replace(
        {"scaling": "standardize"})
    ranked_lsvc = RankedLinearSVCPrimitive(hyperparams=hyperparams)

    # this is here because CalibratedClassifierCV fails if the predicted
    # labels do not contain at least one of every possible label
    dataframe["charlie"][1] = 1.0
    dataframe["charlie"][8] = 1.0

    ranked_lsvc.set_training_data(
        inputs=dataframe[["alpha", "bravo"]],
        outputs=pd.DataFrame({"charlie": dataframe["charlie"].astype(int)}),
    )
    ranked_lsvc.fit()
    results = ranked_lsvc.produce(inputs=dataframe[["alpha", "bravo"]]).value

    expected_labels = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    expected_confidence = [
        0.807, 0.807, 0.807,
        0.218, 0.218, 0.218,
        0.923, 0.923, 0.923,
    ]
    expected_rank = [5, 5, 5, 2, 2, 2, 8, 8, 8]
    self.assertListEqual(list(results["charlie"]), expected_labels)
    np.testing.assert_almost_equal(list(results["confidence"]),
                                   expected_confidence, decimal=3)
    self.assertListEqual(list(results["rank"]), expected_rank)
def _load_data(self):
    # note: this helper returns the parsed dataframe, so the original
    # "-> None" annotation was incorrect and has been dropped
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 2),
        "https://metadata.datadrivendiscovery.org/types/FloatVector",
    )

    hyperparam_class = ColumnParserPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    cpp = ColumnParserPrimitive(
        hyperparams=hyperparam_class.defaults().replace({
            "parsing_semantics": (
                "http://schema.org/Boolean",
                "http://schema.org/Integer",
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            )
        }))
    return cpp.produce(inputs=dataframe).value
def test_constant(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the imputer
    hyperparams_class = CategoricalImputerPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    imputer = CategoricalImputerPrimitive(
        hyperparams=hyperparams_class.defaults().replace({
            "strategy": "constant",
            "fill_value": "empty",
            "use_columns": [1, 2, 3, 4],
        }))
    result = imputer.produce(inputs=dataframe).value
    self.assertEqual(result["alpha"].iloc[2], "empty")
    self.assertEqual(result["bravo"].iloc[2], "empty")
    self.assertEqual(result["charlie"].iloc[2], "whiskey")
    self.assertEqual(result["delta"].iloc[2], "empty")
def test_defaults_produce(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # mark grouping key
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        "https://metadata.datadrivendiscovery.org/types/GroupingKey",
    )

    # create the clustering primitive and cluster
    hyperparams_class = Sloth.metadata.query()["primitive_code"][
        "class_type_arguments"
    ]["Hyperparams"]
    sloth = Sloth(hyperparams=hyperparams_class.defaults())
    result = sloth.produce(inputs=dataframe).value

    # check that the first four columns match the original
    pd.testing.assert_frame_equal(result.iloc[:, :-1], dataframe)

    # check that the first two keys are each in their own cluster, and the
    # last two are in the same cluster
    clusters_by_key = result[["key", "__cluster"]].drop_duplicates()
    self.assertNotEqual(clusters_by_key.iloc[0, 1], clusters_by_key.iloc[1, 1])
    self.assertNotEqual(clusters_by_key.iloc[1, 1], clusters_by_key.iloc[2, 1])
    self.assertEqual(clusters_by_key.iloc[2, 1], clusters_by_key.iloc[3, 1])

    # check metadata is correct for new column
    column_metadata = result.metadata.query_column(4)
    self.assertListEqual(
        list(column_metadata["semantic_types"]),
        [
            "https://metadata.datadrivendiscovery.org/types/Attribute",
            "https://metadata.datadrivendiscovery.org/types/ConstructedAttribute",
            "http://schema.org/Integer",
        ],
    )
    self.assertEqual(column_metadata["structural_type"], np.int64)
def test_buildFromCode(self):
    deck = get_dataframe()
    testDeck = deck['cardCode'].value_counts().to_dict()

    # encode each card as "<count>:<cardCode>"
    cardList = [f'{value}:{key}' for key, value in testDeck.items()]

    # round-trip the deck through its encoded form
    testDeck = LoRDeck(cardList)
    code = testDeck.encode()
    testDataframe = buildFromCode(code)

    valid = deck['cardCode'].unique().tolist()
    test = testDataframe['cardCode'].unique().tolist()
    valid.sort()
    test.sort()
    for x, y in zip(valid, test):
        self.assertEqual(x, y)
def _load_data(
    self,
    dataframe_name,
    date_time_index=None,
    value_indices=None,
    parsing_hyperparams=None,
):
    # avoid a mutable default argument for the list of value columns
    if value_indices is None:
        value_indices = []

    dataset = test_utils.load_dataset(self._dataset_path)
    timeseries_df = test_utils.get_dataframe(dataset, dataframe_name)
    self._load_semantics_into_data(
        timeseries_df,
        group_index=1,
        date_time_index=date_time_index,
        value_indices=value_indices,
    )

    hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
    if parsing_hyperparams:
        cpp = ColumnParserPrimitive(
            hyperparams=hyperparams_class.defaults().replace(parsing_hyperparams))
    else:
        cpp = ColumnParserPrimitive(hyperparams=hyperparams_class.defaults())
    return cpp.produce(inputs=timeseries_df).value
def test_matching():
    input_file = "Constants/ColumnsAndValuesData.txt"
    column_name_values = get_dataframe(input_file)
    data_set = generate_randomDF(column_name_values, 100)

    # build a single "lost" record whose columns mirror the generated
    # "found" records, with foundid replaced by lostid
    column_names = list(data_set.columns)
    x_column_names = column_names
    x_column_names.remove("foundid")
    x_column_names.insert(0, "lostid")
    x_values = list(dict(data_set.iloc[0]).values())
    x_dict = {x_column_names[i]: x_values[i] for i in range(len(x_column_names))}
    x = pd.DataFrame(x_dict, index=[0])

    x_dataset = Dataset('lost', 'single', x)
    y_dataset = Dataset('found', 'multiple', data_set)

    print(data_set.head())
    print(x.head())
    print(type(x))

    Matching.do_matching(x_dataset, y_dataset, 5)
def test_get_set_params(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the encoder
    hyperparams_class = BinaryEncoderPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    encoder = BinaryEncoderPrimitive(
        hyperparams=hyperparams_class.defaults().replace({"min_binary": 3}))
    encoder.set_training_data(inputs=dataframe)
    encoder.fit()

    # round-trip the fitted params through a new encoder instance
    hyperparams = encoder.hyperparams
    params = encoder.get_params()
    encoder = BinaryEncoderPrimitive(hyperparams=hyperparams)
    encoder.set_params(params=params)

    result = encoder.produce(inputs=dataframe).value
    self.assertEqual(len(result.index), 5)
    self.assertSequenceEqual(
        list(result.columns),
        [
            "d3mIndex",
            "charlie",
            "delta",
            "__binary_0",
            "__binary_1",
            "__binary_2",
            "__binary_3",
            "__binary_4",
        ],
    )
    self.assertSequenceEqual(
        result.dtypes.tolist(),
        [object, object, object, int, int, int, int, int])
def test_datetime(self) -> None:
    dataset = test_utils.load_dataset(self._dataset_path)
    df = test_utils.get_dataframe(dataset, "0")
    df.metadata = df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 4), "http://schema.org/DateTime"
    )

    hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams()
    cpp = ColumnParserPrimitive(
        hyperparams=hyperparams_class.defaults().replace(
            {
                "parsing_semantics": [
                    "http://schema.org/DateTime",
                ]
            }
        )
    )
    result_df = cpp.produce(inputs=df).value

    self.assertListEqual(
        list(result_df["sierra"]),
        [
            common_utils.parse_datetime_to_float(date, fuzzy=True)
            for date in df["sierra"]
        ],
    )
def test_basic(self) -> None:
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")
    dataframe.drop(columns=["delta", "echo"], inplace=True)

    hyperparams_class = RankedLinearSVCPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    hyperparams = hyperparams_class.defaults()
    ranked_lsvc = RankedLinearSVCPrimitive(hyperparams=hyperparams)
    ranked_lsvc.set_training_data(
        inputs=dataframe[["alpha", "bravo"]],
        outputs=pd.DataFrame({"charlie": dataframe["charlie"].astype(int)}),
    )
    ranked_lsvc.fit()
    results = ranked_lsvc.produce(inputs=dataframe[["alpha", "bravo"]]).value

    expected_labels = [1, 1, 1, 0, 0, 0, 0, 0, 0]
    expected_confidence = [
        0.73, 0.73, 0.73,
        0.269, 0.269, 0.269,
        0.052, 0.052, 0.052,
    ]
    expected_rank = [8, 8, 8, 5, 5, 5, 2, 2, 2]
    self.assertListEqual(list(results["charlie"]), expected_labels)
    np.testing.assert_almost_equal(list(results["confidence"]),
                                   expected_confidence, decimal=3)
    self.assertListEqual(list(results["rank"]), expected_rank)
def test_defaults(self) -> None:
    # load test data into a dataframe
    dataset = test_utils.load_dataset(self._dataset_path)
    dataframe = test_utils.get_dataframe(dataset, "learningData")

    # create the encoder
    hyperparams_class = OneHotEncoderPrimitive.metadata.query(
    )["primitive_code"]["class_type_arguments"]["Hyperparams"]
    encoder = OneHotEncoderPrimitive(hyperparams=hyperparams_class.defaults())
    encoder.set_training_data(inputs=dataframe)
    encoder.fit()
    result = encoder.produce(inputs=dataframe).value
    self.assertEqual(len(result.index), 5)
    self.assertEqual(
        result.metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )),
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    )
    self.assertSequenceEqual(
        list(result.columns),
        ["d3mIndex"] + [f"__onehot_{i}" for i in range(10)])
    self.assertSequenceEqual(
        result.dtypes.tolist(),
        [object] + [float] * 10)