Example #1
from d3m.base import utils as d3m_utils
from d3m.container.dataset import D3MDatasetLoader
from common_primitives.denormalize import DenormalizePrimitive, Hyperparams as hyper_denormalize
# NOTE: the Datamart entry point is assumed to come from the ISI datamart package.
from datamart_isi.entries import Datamart
import os
import pandas as pd

# Load the ISI Datamart. The URL below is current, but it may change in the future.
isi_datamart_url = "http://dsbox02.isi.edu:9999/blazegraph/namespace/datamart3/sparql"
a = Datamart(connection_url=isi_datamart_url)
# Load the D3M dataset. Here we use "DA_poverty_estimation" as an example; please change the path to your own dataset.
loader = D3MDatasetLoader()
path = "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json"
json_file = os.path.abspath(path)
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)
# Run the denormalize primitive.
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value


"""
start search, run search with data function.
Here because the dataset do not have any "Text" semantic type columns,
the system will said that no columns can be augment
"""
search_res = a.search_with_data(query=None, supplied_data=all_dataset)

"""
run get next page, we will get real search results, it will only have 2 wikidata search results
Explain:
here we do not find any "Qnodes" semantic type columns, so we will try to run wikifier before searching in wikidata database
Then, We will generate 2 Q nodes columns for FIPS and State. 
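# A minimal sketch of the step described above, assuming the cursor returned by
# search_with_data() exposes get_next_page() as in the D3M datamart API; the
# variable names below are illustrative.
search_results = search_res.get_next_page()
for res in search_results:
    print(res)  # inspect each wikidata search result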
Example #2
    def test_1(self):
        print('\n')
        print('running test-1..............')
        # Loading training dataset.
        base_path = "/ubc_primitives/datasets/seed_datasets_current/LL1_TXT_CLS_apple_products_sentiment"
        dataset_doc_path = os.path.join(base_path,
                                        'TRAIN/dataset_TRAIN',
                                        'datasetDoc.json')
        dataset = Dataset.load('file://{dataset_doc_path}'.format(
            dataset_doc_path=dataset_doc_path))

        # Step 0: Denormalize primitive
        denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        denormalize_primitive = DenormalizePrimitive(
            hyperparams=denormalize_hyperparams_class.defaults())
        denormalized_dataset = denormalize_primitive.produce(inputs=dataset)

        print(denormalized_dataset.value)
        print('------------------------')

        # Step 1: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        dataframe_primitive = DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(
            inputs=denormalized_dataset.value)

        print(dataframe.value)
        print('------------------------')

        # Step 2: DataFrame to features
        bow_hyperparams_class = BagOfWords.metadata.get_hyperparams()
        bow_primitive = BagOfWords(
            hyperparams=bow_hyperparams_class.defaults())
        bow_primitive_out = bow_primitive.produce(inputs=dataframe.value)

        # Step 3: KMeans clustering
        kmeans_hyperparams_class = KMeansClusteringPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        kmeans_hyperparams = kmeans_hyperparams_class.defaults().replace({
            'n_clusters': 4,
            'n_init': 10,
            'max_iter': 1000
        })
        kmeans_primitive = KMeansClusteringPrimitive(
            hyperparams=kmeans_hyperparams)
        kmeans_primitive.set_training_data(inputs=bow_primitive_out.value)
        kmeans_primitive.fit()

        #-----------------------------------------------------------------------
        # Loading Testing dataset.
        dataset_doc_path2 = os.path.join(base_path,
                                         'SCORE/dataset_SCORE',
                                         'datasetDoc.json')
        dataset2 = Dataset.load('file://{dataset_doc_path}'.format(
            dataset_doc_path=dataset_doc_path2))

        # Step 0: Denormalize primitive
        score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        score_denormalize_primitive = DenormalizePrimitive(
            hyperparams=score_denormalize_hyperparams_class.defaults())
        score_denormalized_dataset = score_denormalize_primitive.produce(
            inputs=dataset2)

        print(score_denormalized_dataset.value)
        print('------------------------')

        # Step 1: Dataset to DataFrame
        score_dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        score_dataframe_primitive = DatasetToDataFramePrimitive(
            hyperparams=score_dataframe_hyperparams_class.defaults())
        score_dataframe = score_dataframe_primitive.produce(
            inputs=score_denormalized_dataset.value)

        print(score_dataframe.value)
        print('------------------------')

        # Step 2: DataFrame to bag-of-words features
        score_bow_dataframe = bow_primitive.produce(
            inputs=score_dataframe.value)

        print(score_bow_dataframe.value)
        print('------------------------')

        score = kmeans_primitive.produce(inputs=score_bow_dataframe.value)
        score = score.value

        print(score)
        print('------------------------')

        for col in range(score.shape[1]):
            col_dict = dict(
                score.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)

        # Compute error.
        ground_truth = score_dataframe.value['sentiment'].to_numpy().astype(float)
        predictions = score.iloc[:, -1].to_numpy().astype(float)
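        # NOTE (assumption): KMeans cluster IDs are arbitrary, so comparing them directly
        # to the sentiment labels only yields a meaningful error rate if produce() already
        # maps clusters to label values; otherwise, align clusters to labels first.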
        print('------------------------')
        print('Predictions')
        print(predictions)
        print('------------------------')
        print('Ground Truth')
        print(ground_truth)
        print('------------------------')

        print('------------------------')
        print('KMeans test misclassification rate (lower is better): ',
              (100 * (1 - np.mean(ground_truth == predictions))))
        print('------------------------')
Example #3
    def test_1(self):
        """
        Feature extraction only and Testing on seed dataset from D3M datasets
        """
        print('\n')
        print('########################')
        print('#--------TEST-1--------#')
        print('########################')

        # Get volumes:
        all_weights = os.listdir('./static')
        all_weights = {w: os.path.join('./static', w) for w in all_weights}

        # Loading dataset.
        path1 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json'))
        dataset = Dataset.load(dataset_uri=path1)

        # Load the scoring dataset
        path2 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/SCORE/dataset_TEST/datasetDoc.json'))
        score_dataset = Dataset.load(dataset_uri=path2)

        # Step 0: Denormalize primitive
        denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
        denormalized_dataset  = denormalize_primitive.produce(inputs=dataset)
        print(denormalized_dataset.value)
        print('------------------------')

        print('Loading Training Dataset....')
        # Step 1: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
        print(dataframe.value)
        print('------------------------')

        print('Loading Testing Dataset....')
        # Step 0: Denormalize primitive
        score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        score_denormalize_primitive = DenormalizePrimitive(hyperparams=score_denormalize_hyperparams_class.defaults())
        score_denormalized_dataset  = score_denormalize_primitive.produce(inputs=score_dataset)
        print(score_denormalized_dataset.value)
        print('------------------------')

        score_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        score_primitive = DatasetToDataFramePrimitive(hyperparams=score_hyperparams_class.defaults())
        score = score_primitive.produce(inputs=score_denormalized_dataset.value)
        print(score.value)
        print('------------------------')

        extractA_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractA_hyperparams = extractA_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/FileName',)
                }
        )
        extractA_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractA_hyperparams)
        extractA = extractA_primitive.produce(inputs=dataframe.value)
        print(extractA.value)
        print('------------------------')

        extractP_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractP_hyperparams = extractP_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)
                }
        )
        extractP_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractP_hyperparams)
        extractP = extractP_primitive.produce(inputs=dataframe.value)
        print(extractP.value)
        print('------------------------')

        # Set up and train the CNN primitive end-to-end
        hyperparams_class = ConvolutionalNeuralNetwork.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        hyperparams_class = hyperparams_class.defaults().replace(
                {
                'feature_extract_only': False,
                'cnn_type': 'mobilenet',
                'num_iterations': 150,
                'output_dim': 1
                }
        )
        primitive = ConvolutionalNeuralNetwork(hyperparams=hyperparams_class, volumes=all_weights)
        primitive.set_training_data(inputs=dataframe.value, outputs=extractP.value)
        primitive.fit()
        test_out = primitive.produce(inputs=score.value)
        test_out = test_out.value

        print(test_out)
        print('------------------------')
        for col in range(test_out.shape[1]):
            col_dict = dict(test_out.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)

        # Compute error.
        ground_truth = score.value['WRISTBREADTH'].to_numpy().astype(float)
        predictions = test_out.iloc[:, -1].to_numpy()

        print(ground_truth)
        print(predictions)
        print('------------------------')

        print('Mean squared error (lower is better): ', np.mean((predictions - ground_truth)**2))
        print('------------------------')
Example #4
import os

from sklearn.metrics import hamming_loss

from d3m.container.dataset import D3MDatasetLoader, Dataset, CSVLoader

from common_primitives.denormalize import DenormalizePrimitive, Hyperparams as hyper_Den
from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive, Hyperparams as hyper_Dat
from common_primitives.extract_columns_semantic_types import ExtractColumnsBySemanticTypesPrimitive, Hyperparams as hyper_Ext

from dsbox.spen.application.MLPClassifier import MLCHyperparams, Params, MLClassifier
from dsbox.datapreprocessing.cleaner.to_numeric import ToNumeric, Hyperparams as hyper_Nu
from dsbox.datapreprocessing.cleaner.encoder import Encoder, EncHyperparameter as hyper_En

h0 = hyper_Den.defaults()
h1 = hyper_Dat.defaults()
primitive_0 = DenormalizePrimitive(hyperparams=h0)
primitive_1 = DatasetToDataFramePrimitive(hyperparams=h1)

dataset_train_file_path = 'bibtex_dataset/bibtex_dataset/datasetDoc.json'
dataset = D3MDatasetLoader()

dataset_train = dataset.load('file://{dataset_doc_path}'.format(
    dataset_doc_path=os.path.abspath(dataset_train_file_path)))
dataset_org = primitive_0.produce(inputs=dataset_train)
res_df = primitive_1.produce(inputs=dataset_org.value)

# Build the extraction hyperparams with the defaults().replace(...) pattern used in the
# other examples in this listing (the source example is truncated here, so the closing
# of this call is reconstructed).
h2 = hyper_Ext.defaults().replace({
    'semantic_types': (
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        'https://metadata.datadrivendiscovery.org/types/Attribute',
    ),
})
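
# Hedged continuation (sketch): applying the extraction primitive would follow the same
# pattern as the other examples in this listing; 'attributes_df' is an illustrative name.
primitive_2 = ExtractColumnsBySemanticTypesPrimitive(hyperparams=h2)
attributes_df = primitive_2.produce(inputs=res_df.value)
print(attributes_df.value)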
Example #5
    def test_2(self):
        """
        Training and Testing on seed dataset from D3M datasets
        """
        print('\n')
        print('########################')
        print('#--------TEST-2--------#')
        print('########################')

        # Get volumes:
        all_weights = os.listdir('./static')
        all_weights = {w: os.path.join('./static', w) for w in all_weights}

        # Loading dataset.
        path1 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json'))
        dataset = Dataset.load(dataset_uri=path1)

        # Load the scoring dataset
        path2 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/SCORE/dataset_TEST/datasetDoc.json'))
        score_dataset = Dataset.load(dataset_uri=path2)

        # Step 0: Denormalize primitive
        denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
        denormalized_dataset  = denormalize_primitive.produce(inputs=dataset)
        print(denormalized_dataset.value)
        print('------------------------')

        print('Loading Training Dataset....')
        # Step 1: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
        print(dataframe.value)
        print('------------------------')

        print('Loading Testing Dataset....')
        # Step 0: Denormalize primitive
        score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        score_denormalize_primitive = DenormalizePrimitive(hyperparams=score_denormalize_hyperparams_class.defaults())
        score_denormalized_dataset  = score_denormalize_primitive.produce(inputs=score_dataset)
        print(score_denormalized_dataset.value)
        print('------------------------')

        score_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        score_primitive = DatasetToDataFramePrimitive(hyperparams=score_hyperparams_class.defaults())
        score = score_primitive.produce(inputs=score_denormalized_dataset.value)
        print(score.value)
        print('------------------------')

        # Run the CNN primitive as a feature extractor
        hyperparams_class = ConvolutionalNeuralNetwork.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        hyperparams_class = hyperparams_class.defaults().replace(
                {
                'include_top': False,
                'cnn_type': 'mobilenet',
                'output_dim': 1,
                }
        )
        primitive = ConvolutionalNeuralNetwork(hyperparams=hyperparams_class, volumes=all_weights)
        test_out  = primitive.produce(inputs=dataframe.value)

        print(test_out)
        print('------------------------')

        extractA_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractA_hyperparams = extractA_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)
                }
        )
        extractA_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractA_hyperparams)
        extractA = extractA_primitive.produce(inputs=test_out.value)
        print(extractA.value)
        print('------------------------')

        extractP_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractP_hyperparams = extractP_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)
                }
        )
        extractP_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractP_hyperparams)
        extractP = extractP_primitive.produce(inputs=dataframe.value)
        extractP = extractP.value
        # Update Metadata from SuggestedTarget to TrueTarget
        for col in range(extractP.shape[1]):
            col_dict = dict(extractP.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name']            = "WRISTBREADTH"
            col_dict["semantic_types"]  = ("http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/TrueTarget",)
            extractP.metadata           = extractP.metadata.update((metadata_base.ALL_ELEMENTS, col), col_dict)

        print(extractP)
        print('------------------------')

        # Extract CNN features for the scoring set
        score_out = primitive.produce(inputs=score.value)

        XGB_hyperparams_class = XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        XGB_primitive = XGBoostGBTreeRegressorPrimitive(hyperparams=XGB_hyperparams_class.defaults())
        XGB_primitive.set_training_data(inputs=test_out.value, outputs=extractP)
        XGB_primitive.fit()
        test_out_xgb = XGB_primitive.produce(inputs=score_out.value)
        test_out_xgb = test_out_xgb.value

        print('Predictions')
        print(test_out_xgb)
        print('------------------------')

        # Compute error.
        ground_truth = score.value['WRISTBREADTH'].to_numpy().astype(float)
        predictions = test_out_xgb.iloc[:, -1].to_numpy()

        print(ground_truth)
        print(predictions)
        print('------------------------')

        print('Mean squared error (lower is better): ', np.mean((predictions - ground_truth)**2))
        print('------------------------')