# Example script: augment a D3M dataset via the ISI Datamart.
# NOTE(review): `Datamart`, `D3MDatasetLoader`, `DenormalizePrimitive` and
# `hyper_denormalize` are not imported in this snippet — presumably imported
# elsewhere (datamart / d3m / common_primitives packages); confirm before running.
from d3m.base import utils as d3m_utils
import os
import pandas as pd

# load the ISI datamart, currently the url is here, may change in the future
isi_datamart_url = "http://dsbox02.isi.edu:9999/blazegraph/namespace/datamart3/sparql"
a = Datamart(connection_url=isi_datamart_url)

# load the D3M dataset, here we use "DA_poverty_estimation" as example,
# please change to your dataset path
loader = D3MDatasetLoader()
path = "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json"
json_file = os.path.abspath(path)
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)

# run denormalize primitive
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams = denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs = all_dataset).value

"""
start search, run search with data function.
Here because the dataset do not have any "Text" semantic type columns,
the system will said that no columns can be augment
"""
search_res = a.search_with_data(query=None, supplied_data=all_dataset)

"""
run get next page, we will get real search results,
it will only have 2 wikidata search results
Explain: here we do not find any "Qnodes" semantic type columns, so we will
try to run wikifier before searching in wikidata database
Then, We will generate 2 Q nodes columns for FIPS and State.
"""
def test_1(self):
    """Cluster the apple-products sentiment dataset with BagOfWords + KMeans.

    Trains on the TRAIN split, predicts on the SCORE split, and prints the
    misclassification rate against the `sentiment` column.
    """
    print('\n')
    # Fixed: message previously said "test-2" although this method is test_1.
    print('running test-1..............')
    # Loading training dataset.
    base_path = "/ubc_primitives/datasets/seed_datasets_current/LL1_TXT_CLS_apple_products_sentiment"
    dataset_doc_path = os.path.join(base_path,
                                    'TRAIN/dataset_TRAIN',
                                    'datasetDoc.json')
    dataset = Dataset.load('file://{dataset_doc_path}'.format(
        dataset_doc_path=dataset_doc_path))
    # Step 0: Denormalize primitive
    denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
    denormalize_primitive = DenormalizePrimitive(
        hyperparams=denormalize_hyperparams_class.defaults())
    denormalized_dataset = denormalize_primitive.produce(inputs=dataset)
    print(denormalized_dataset.value)
    print('------------------------')
    # Step 1: Dataset to DataFrame
    dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    dataframe_primitive = DatasetToDataFramePrimitive(
        hyperparams=dataframe_hyperparams_class.defaults())
    dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
    print(dataframe.value)
    print('------------------------')
    # Step 2: DataFrame to bag-of-words features
    bow_hyperparams_class = BagOfWords.metadata.get_hyperparams()
    bow_primitive = BagOfWords(hyperparams=bow_hyperparams_class.defaults())
    bow_primitive_out = bow_primitive.produce(inputs=dataframe.value)
    # Step 3: Fit KMeans on the bag-of-words features
    kmeans_hyperparams_class = KMeansClusteringPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    kmeans_hyperparams = kmeans_hyperparams_class.defaults().replace({
        'n_clusters': 4,
        'n_init': 10,
        'max_iter': 1000,
    })
    kmeans_primitive = KMeansClusteringPrimitive(hyperparams=kmeans_hyperparams)
    kmeans_primitive.set_training_data(inputs=bow_primitive_out.value)
    kmeans_primitive.fit()
    # -----------------------------------------------------------------------
    # Loading Testing dataset.
    dataset_doc_path2 = os.path.join(base_path,
                                     'SCORE/dataset_SCORE',
                                     'datasetDoc.json')
    dataset2 = Dataset.load('file://{dataset_doc_path}'.format(
        dataset_doc_path=dataset_doc_path2))
    # Step 0: Denormalize primitive
    score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
    score_denormalize_primitive = DenormalizePrimitive(
        hyperparams=score_denormalize_hyperparams_class.defaults())
    score_denormalized_dataset = score_denormalize_primitive.produce(
        inputs=dataset2)
    # Fixed copy-paste bug: previously printed the TRAIN denormalized dataset here.
    print(score_denormalized_dataset.value)
    print('------------------------')
    # Step 1: Dataset to DataFrame
    score_dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    score_dataframe_primitive = DatasetToDataFramePrimitive(
        hyperparams=score_dataframe_hyperparams_class.defaults())
    score_dataframe = score_dataframe_primitive.produce(
        inputs=score_denormalized_dataset.value)
    print(score_dataframe.value)
    print('------------------------')
    # Step 2: Bag-of-words features for the SCORE split (reuses fitted vocabulary)
    score_bow_dataframe = bow_primitive.produce(inputs=score_dataframe.value)
    print(score_bow_dataframe.value)
    print('------------------------')
    # Predict cluster assignments on the SCORE split.
    score = kmeans_primitive.produce(inputs=score_bow_dataframe.value)
    score = score.value
    print(score)
    print('------------------------')
    for col in range(score.shape[1]):
        col_dict = dict(score.metadata.query((metadata_base.ALL_ELEMENTS, col)))
        print('Meta-data - {}'.format(col), col_dict)
    # Compute error.
    # Fixed: np.float was removed in NumPy 1.24 — use the builtin float instead.
    ground_truth = ((score_dataframe.value['sentiment']).to_numpy()).astype(float)
    predictions = ((score.iloc[:, -1]).to_numpy()).astype(float)
    print('------------------------')
    print('Predictions')
    print(predictions)
    print('------------------------')
    print('Ground Truth')
    print(ground_truth)
    print('------------------------')
    print('------------------------')
    print('MLP Test missclassification rate (lower better): ',
          (100 * (1 - np.mean(ground_truth == predictions))))
    print('------------------------')
def test_1(self):
    """
    Feature extraction only and Testing on seed dataset from D3M datasets.

    Trains a MobileNet-based CNN regressor end-to-end on 22_handgeometry and
    prints the mean squared error against the `WRISTBREADTH` target.
    """
    print('\n')
    print('########################')
    print('#--------TEST-1--------#')
    print('########################')
    # Get volumes: map every pretrained-weight file under ./static to its path.
    all_weights = os.listdir('./static')
    all_weights = {w: os.path.join('./static', w) for w in all_weights}
    # Loading dataset.
    path1 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json'))
    dataset = Dataset.load(dataset_uri=path1)
    # Get dataset paths
    path2 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/SCORE/dataset_TEST/datasetDoc.json'))
    score_dataset = Dataset.load(dataset_uri=path2)
    # Step 0: Denormalize primitive
    denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
    denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
    denormalized_dataset = denormalize_primitive.produce(inputs=dataset)
    print(denormalized_dataset.value)
    print('------------------------')
    print('Loading Training Dataset....')
    # Step 1: Dataset to DataFrame
    dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
    dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
    print(dataframe.value)
    print('------------------------')
    print('Loading Testing Dataset....')
    # Step 0: Denormalize primitive (SCORE split)
    score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
    score_denormalize_primitive = DenormalizePrimitive(hyperparams=score_denormalize_hyperparams_class.defaults())
    score_denormalized_dataset = score_denormalize_primitive.produce(inputs=score_dataset)
    print(score_denormalized_dataset.value)
    print('------------------------')
    # Step 1: Dataset to DataFrame (SCORE split)
    score_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    score_primitive = DatasetToDataFramePrimitive(hyperparams=score_hyperparams_class.defaults())
    score = score_primitive.produce(inputs=score_denormalized_dataset.value)
    print(score.value)
    print('------------------------')
    # Extract the image file-name column from the training frame.
    extractA_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    extractA_hyperparams_class = extractA_hyperparams_class.defaults().replace(
        {
            'semantic_types': ('https://metadata.datadrivendiscovery.org/types/FileName',)
        }
    )
    extractA_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractA_hyperparams_class)
    extractA = extractA_primitive.produce(inputs=dataframe.value)
    print(extractA.value)
    print('------------------------')
    # Extract the suggested-target column from the training frame.
    extractP_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    extractP_hyperparams = extractP_hyperparams_class.defaults().replace(
        {
            'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)
        }
    )
    extractP_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractP_hyperparams)
    extractP = extractP_primitive.produce(inputs=dataframe.value)
    print(extractP.value)
    print('------------------------')
    # Call primitives: fit the CNN end-to-end (not feature-extract-only).
    hyperparams_class = ConvolutionalNeuralNetwork.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    hyperparams_class = hyperparams_class.defaults().replace(
        {
            'feature_extract_only': False,
            'cnn_type': 'mobilenet',
            'num_iterations': 150,
            'output_dim': 1
        }
    )
    primitive = ConvolutionalNeuralNetwork(hyperparams=hyperparams_class, volumes=all_weights)
    primitive.set_training_data(inputs = dataframe.value, outputs = extractP.value)
    test_out = primitive.fit()
    test_out = primitive.produce(inputs=score.value)
    test_out = test_out.value
    print(test_out)
    print('------------------------')
    for col in range(test_out.shape[1]):
        col_dict = dict(test_out.metadata.query((metadata_base.ALL_ELEMENTS, col)))
        print('Meta-data - {}'.format(col), col_dict)
    # Compute error.
    # Fixed: np.float was removed in NumPy 1.24 — use the builtin float instead.
    ground_truth = ((score.value['WRISTBREADTH']).to_numpy()).astype(float)
    predictions = (test_out.iloc[:, -1]).to_numpy()
    print(ground_truth)
    print(predictions)
    print('------------------------')
    print('Mean squared error (lower better): ',
          (np.mean((predictions - ground_truth)**2)))
    print('------------------------')
# Setup script: load the bibtex multi-label dataset and prepare it for the
# SPEN MLClassifier (denormalize -> DataFrame -> extract columns).
# NOTE(review): this snippet is truncated — the `hyper_Ext({...})` call below
# is never closed in the visible source; the remainder lives outside this chunk.
from sklearn.metrics import hamming_loss
from d3m.container.dataset import D3MDatasetLoader, Dataset, CSVLoader
from common_primitives.denormalize import DenormalizePrimitive, Hyperparams as hyper_Den
from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive, Hyperparams as hyper_Dat
from common_primitives.extract_columns_semantic_types import ExtractColumnsBySemanticTypesPrimitive, Hyperparams as hyper_Ext
from dsbox.spen.application.MLPClassifier import MLCHyperparams, Params, MLClassifier
from dsbox.datapreprocessing.cleaner.to_numeric import ToNumeric, Hyperparams as hyper_Nu
from dsbox.datapreprocessing.cleaner.encoder import Encoder, EncHyperparameter as hyper_En

# Default hyperparameters for denormalize and dataset-to-dataframe.
h0 = hyper_Den.defaults()
h1 = hyper_Dat.defaults()
primitive_0 = DenormalizePrimitive(hyperparams=h0)
primitive_1 = DatasetToDataFramePrimitive(hyperparams=h1)

# Load the training dataset from its datasetDoc.json.
dataset_train_file_path = 'bibtex_dataset/bibtex_dataset/datasetDoc.json'
dataset = D3MDatasetLoader()
dataset_train = dataset.load('file://{dataset_doc_path}'.format(
    dataset_doc_path=os.path.abspath(dataset_train_file_path)))

# Denormalize, then convert to a DataFrame.
dataset_org = primitive_0.produce(inputs=dataset_train)
res_df = primitive_1.produce(inputs=dataset_org.value)

# Extract primary-key and attribute columns by semantic type.
# (Truncated here: the dict/call is not closed within this chunk.)
h2 = hyper_Ext({
    'semantic_types': (
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        'https://metadata.datadrivendiscovery.org/types/Attribute',
    ),
def test_2(self):
    """
    Training and Testing on seed dataset from D3M datasets.

    Uses the CNN as a fixed feature extractor (include_top=False), fits an
    XGBoost regressor on the extracted features, and prints the mean squared
    error against the `WRISTBREADTH` target on the SCORE split.
    """
    print('\n')
    print('########################')
    print('#--------TEST-2--------#')
    print('########################')
    # Get volumes: map every pretrained-weight file under ./static to its path.
    all_weights = os.listdir('./static')
    all_weights = {w: os.path.join('./static', w) for w in all_weights}
    # Loading dataset.
    path1 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json'))
    dataset = Dataset.load(dataset_uri=path1)
    # Get dataset paths
    path2 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/SCORE/dataset_TEST/datasetDoc.json'))
    score_dataset = Dataset.load(dataset_uri=path2)
    # Step 0: Denormalize primitive
    denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
    denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
    denormalized_dataset = denormalize_primitive.produce(inputs=dataset)
    print(denormalized_dataset.value)
    print('------------------------')
    print('Loading Training Dataset....')
    # Step 1: Dataset to DataFrame
    dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
    dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
    print(dataframe.value)
    print('------------------------')
    print('Loading Testing Dataset....')
    # Step 0: Denormalize primitive (SCORE split)
    score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
    score_denormalize_primitive = DenormalizePrimitive(hyperparams=score_denormalize_hyperparams_class.defaults())
    score_denormalized_dataset = score_denormalize_primitive.produce(inputs=score_dataset)
    print(score_denormalized_dataset.value)
    print('------------------------')
    # Step 1: Dataset to DataFrame (SCORE split)
    score_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
    score_primitive = DatasetToDataFramePrimitive(hyperparams=score_hyperparams_class.defaults())
    score = score_primitive.produce(inputs=score_denormalized_dataset.value)
    print(score.value)
    print('------------------------')
    # Call primitives: CNN as feature extractor (no classification head).
    hyperparams_class = ConvolutionalNeuralNetwork.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    hyperparams_class = hyperparams_class.defaults().replace(
        {
            'include_top': False,
            'cnn_type': 'mobilenet',
            'output_dim': 1,
        }
    )
    primitive = ConvolutionalNeuralNetwork(hyperparams=hyperparams_class, volumes=all_weights)
    test_out = primitive.produce(inputs=dataframe.value)
    print(test_out)
    print('------------------------')
    # Extract the attribute (feature) columns from the CNN output.
    extractA_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    extractA_hyperparams_class = extractA_hyperparams_class.defaults().replace(
        {
            'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)
        }
    )
    extractA_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractA_hyperparams_class)
    extractA = extractA_primitive.produce(inputs=test_out.value)
    print(extractA.value)
    print('------------------------')
    # Extract the suggested-target column from the original training frame.
    extractP_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    extractP_hyperparams = extractP_hyperparams_class.defaults().replace(
        {
            'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)
        }
    )
    extractP_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractP_hyperparams)
    extractP = extractP_primitive.produce(inputs=dataframe.value)
    extractP = extractP.value
    # Update Metadata from SuggestedTarget to TrueTarget so XGBoost can train.
    for col in range((extractP).shape[1]):
        col_dict = dict(extractP.metadata.query((metadata_base.ALL_ELEMENTS, col)))
        col_dict['structural_type'] = type(1.0)
        col_dict['name'] = "WRISTBREADTH"
        col_dict["semantic_types"] = ("http://schema.org/Float",
                                      "https://metadata.datadrivendiscovery.org/types/TrueTarget",)
        extractP.metadata = extractP.metadata.update((metadata_base.ALL_ELEMENTS, col), col_dict)
    print(extractP)
    print('------------------------')
    # Call primitives: extract features for the SCORE split, then fit XGBoost.
    score_out = primitive.produce(inputs=score.value)
    XGB_hyperparams_class = XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
    XGB_primitive = XGBoostGBTreeRegressorPrimitive(hyperparams=XGB_hyperparams_class.defaults())
    XGB_primitive.set_training_data(inputs=test_out.value, outputs=extractP)
    XGB_primitive.fit()
    test_out_xgb = XGB_primitive.produce(inputs=score_out.value)
    test_out_xgb = test_out_xgb.value
    print('Predictions')
    print(test_out_xgb)
    print('------------------------')
    # Compute error.
    # Fixed: np.float was removed in NumPy 1.24 — use the builtin float instead.
    ground_truth = ((score.value['WRISTBREADTH']).to_numpy()).astype(float)
    predictions = (test_out_xgb.iloc[:, -1]).to_numpy()
    print(ground_truth)
    print(predictions)
    print('------------------------')
    print('Mean squared error (lower better): ',
          (np.mean((predictions - ground_truth)**2)))
    print('------------------------')