Example #1
def load_dataset(root_path, phase, inner_phase=None):
    inner_phase = inner_phase or phase
    path = os.path.join(root_path, phase, 'dataset_' + inner_phase,
                        'datasetDoc.json')
    if os.path.exists(path):
        return Dataset.load(dataset_uri='file://' + os.path.abspath(path))
    else:
        path = os.path.join(root_path, phase, 'dataset_' + phase,
                            'datasetDoc.json')
        # Fall back to the outer phase name and build a file URI, as above.
        return Dataset.load(dataset_uri='file://' + os.path.abspath(path))
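A minimal usage sketch for this helper, assuming the usual D3M layout <root>/<PHASE>/dataset_<PHASE>/datasetDoc.json; the import path and the dataset root below are illustrative assumptions:

import os

from d3m.container.dataset import Dataset  # assumed import for this sketch

# Hypothetical dataset root; '185_baseball' is only an illustrative name.
train_dataset = load_dataset('/datasets/185_baseball', 'TRAIN')
test_dataset = load_dataset('/datasets/185_baseball', 'TEST')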
Example #2
 def load_dataset(cls,
                  d3m_dataset: Dataset) -> typing.Optional[pd.DataFrame]:
     entry_id = '0'
     for resource_id in d3m_dataset.keys():
         if "https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint" in \
                 d3m_dataset.metadata.query((resource_id,))['semantic_types']:
             entry_id = resource_id
             break
     if isinstance(d3m_dataset[entry_id], pd.DataFrame):
         return d3m_dataset[entry_id]
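The lookup above keys off the DatasetEntryPoint semantic type to find the main table and falls back to resource id '0'. A standalone sketch of the same idea (the function name and import path are assumptions):

from d3m.container.dataset import Dataset  # assumed import for this sketch

def find_entry_resource_id(d3m_dataset: Dataset) -> str:
    # Return the resource id tagged as the dataset entry point, defaulting to '0'.
    entry_point = 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'
    for resource_id in d3m_dataset.keys():
        semantic_types = d3m_dataset.metadata.query((resource_id,)).get('semantic_types', ())
        if entry_point in semantic_types:
            return resource_id
    return '0'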
Example #3
 def load_data(
         cls, data: typing.Union[pd.DataFrame, Dataset, str]
 ) -> typing.Optional[pd.DataFrame]:
     if isinstance(data, pd.DataFrame):
         return data
     if isinstance(data, Dataset):
         return cls.load_dataset(data)
     if isinstance(data, str):
         if data.endswith('.csv'):
             return pd.read_csv(data)
         else:
             return cls.load_dataset(
                 Dataset.load('file://{dataset_doc_path}'.format(
                     dataset_doc_path=os.path.abspath(data))))
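A usage sketch of the three accepted input types (the class name DataLoader and the paths are hypothetical):

import pandas as pd

# 1) Already a DataFrame: returned unchanged.
df = DataLoader.load_data(pd.DataFrame({'a': [1, 2, 3]}))

# 2) A CSV path: read with pandas.
df = DataLoader.load_data('/data/learningData.csv')

# 3) A datasetDoc.json path: loaded as a D3M Dataset and reduced to its entry-point table.
df = DataLoader.load_data('/data/TRAIN/dataset_TRAIN/datasetDoc.json')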
Example #4
def get_dataset(input_data, target_index=-2, index_column=-1, semantic_types=None, parse=False, media_dir=None):
    """
    A function that takes a dataframe as input and generates a D3M dataset.

    Parameters
    ----------
    input_data : pd.DataFrame
        The dataframe to be converted to d3m Dataset.
    target_index : int
        The index of the target column in the input dataframe; if not present, it is ignored.
    index_column : int
        The index of the index column; if not provided, the function looks for a d3mIndex
        column and generates one if it is missing.
    semantic_types : Sequence[Sequence[str]]
        A list of semantic types to be applied. The sequence must have the same length as
        the dataframe columns.
    parse : bool
        A flag that determines whether the dataset will contain parsed columns. By default it is
        set to False to keep it compatible with most of the current D3M infrastructure.
    media_dir : str
        The absolute path of the directory containing the image/video/csv files; if not provided,
        it is ignored.

    Returns
    -------
    A D3M dataset.
    """
    data = make_unique_columns(input_data.copy(deep=True))
    if semantic_types is None:
        semantic_types = [[] for i in range(len(data.columns))]
        for i, _type in enumerate(input_data.dtypes):
            if _type == float:
                semantic_types[i].append('http://schema.org/Float')
            elif _type == int:
                semantic_types[i].append('http://schema.org/Integer')

    resources = {}

    if 'd3mIndex' in data.columns:
        index_column = list(data.columns).index("d3mIndex")
    else:
        if index_column == -1:
            data.insert(0, 'd3mIndex', range(len(data)))
            semantic_types.insert(0, [])
            target_index += 1
            index_column = 0

    data = container_pandas.DataFrame(data)

    # When parsing is disabled, keep every value as a string.
    if not parse:
        data = data.astype(str)
    metadata = metadata_base.DataMetadata()

    resources['learningData'] = data

    metadata = metadata.update(('learningData',), {
        'structural_type': type(data),
        'semantic_types': [
            'https://metadata.datadrivendiscovery.org/types/Table',
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint',
        ],
        'dimension': {
            'name': 'rows',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
            'length': len(data),
        },
    })

    metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), {
        'dimension': {
            'name': 'columns',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
            'length': len(data.columns),
        },
    })

    for i, column_name in enumerate(data.columns):
        if i == index_column:
            metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                'name': column_name,
                'structural_type': numpy.int64,
                'semantic_types': [
                    'http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
                ],
            })
        else:
            _structural_type = str
            if semantic_types[i]:
                _semantic_types = semantic_types[i]
                if 'http://schema.org/Float' in _semantic_types:
                    _structural_type = numpy.float64
                elif 'http://schema.org/Integer' in _semantic_types:
                    _structural_type = numpy.int64
            else:
                _semantic_types = ['https://metadata.datadrivendiscovery.org/types/UnknownType']

            if not parse:
                _structural_type = str
            if i == target_index:
                _semantic_types += ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget']
            else:
                _semantic_types += ['https://metadata.datadrivendiscovery.org/types/Attribute']

            # Add media dir if any
            if media_dir is not None and i != target_index:
                # Check the type of the first path
                first_file_path = data.iloc[0, i]
                suffix = first_file_path.split('.')[-1]
                if suffix in ['png', 'jpg']:
                    media_type = 'image'
                elif suffix in ['mp4', 'avi']:
                    media_type = 'video'
                else:
                    media_type = 'text'

                _semantic_types += ["https://metadata.datadrivendiscovery.org/types/FileName"]
                metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                    'name': column_name,
                    'structural_type': str,
                    'semantic_types': _semantic_types,
                    "location_base_uris": [pathlib.Path(media_dir).as_uri()+'/'],
                    "media_types": [
                        media_type+"/"+suffix
                    ],
                })
            else:
                metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                    'name': column_name,
                    'structural_type': _structural_type,
                    'semantic_types': _semantic_types,
                })

    dataset_id = str(uuid.uuid4())
    dataset_metadata = {
        'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        'structural_type': Dataset,
        'id': dataset_id,
        'name': dataset_id,
        'digest': str(uuid.uuid4()),
        'dimension': {
            'name': 'resources',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
            'length': len(resources),
        },
    }

    metadata = metadata.update((), dataset_metadata)

    dataset = Dataset(resources, metadata)
    return dataset
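A minimal usage sketch for get_dataset; the values are illustrative, and target_index refers to the column position in the original dataframe (the generated d3mIndex column is accounted for internally):

import pandas as pd

df = pd.DataFrame({
    'text': ['good', 'bad', 'fine'],
    'label': [1, 0, 1],
})

# Mark the 'label' column (position 1 in the original frame) as the suggested target.
dataset = get_dataset(df, target_index=1)
print(dataset['learningData'].head())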
Example #5
    def search(self, problem, timeout=None, budget=None):

        self.setup_search(timeout)

        dataset_id = problem['inputs'][0]['dataset_id']
        dataset = Dataset.load(self.datasets[dataset_id])
        metric = problem['problem']['performance_metrics'][0]['metric']

        LOGGER.info("Loading the template and the tuner")
        template_name = self._get_template(dataset, problem)
        template, tunables, defaults = load_template(template_name)
        tuner = GP(tunables, r_minimum=10)

        best_pipeline = None
        best_score = None
        best_normalized = 0

        if budget is not None:
            iterator = range(budget)
        else:
            iterator = itertools.count()   # infinite range

        try:
            proposal = defaults
            for i in iterator:
                self.check_stop()
                pipeline = self._new_pipeline(template, proposal)

                params = '\n'.join('{}: {}'.format(k, v) for k, v in proposal.items())
                LOGGER.info("Scoring pipeline %s: %s\n%s", i + 1, pipeline.id, params)
                try:
                    score = self.score_pipeline(dataset, problem, pipeline)
                    normalized_score = metric.normalize(score)
                except Exception:
                    LOGGER.exception("Error scoring pipeline %s", pipeline.id)
                    score = None
                    normalized_score = 0.0

                try:
                    self._save_pipeline(pipeline, normalized_score)
                except Exception:
                    LOGGER.exception("Error saving pipeline %s", pipeline.id)

                tuner.add(proposal, normalized_score)
                LOGGER.info("Pipeline %s score: %s - %s", pipeline.id, score, normalized_score)

                if normalized_score > best_normalized:
                    LOGGER.info("New best pipeline found! %s > %s", score, best_score)
                    best_pipeline = pipeline.id
                    best_score = score
                    best_normalized = normalized_score

                proposal = tuner.propose(1)

        except StopSearch:
            pass

        self.done = True
        return {
            'pipeline': best_pipeline,
            'score': best_score,
            'template': template_name,
            'data_modality': self._detect_data_modality(dataset),
            'task_type': problem['problem']['task_type'].name.lower(),
            'task_subtype': problem['problem']['task_subtype'].name.lower(),
            'tuning_iterations': i
        }
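The core of this method is a propose/score/add loop against the tuner. A stripped-down sketch of that loop, using the same tuner calls as above (score_fn is a hypothetical stand-in for score_pipeline plus metric normalization):

import itertools

def tuning_loop(tuner, defaults, score_fn, budget=None):
    # Propose/score/record loop; returns the best (proposal, score) pair seen.
    best = (None, 0.0)
    proposal = defaults
    iterator = range(budget) if budget is not None else itertools.count()
    for _ in iterator:
        score = score_fn(proposal)
        tuner.add(proposal, score)
        if score > best[1]:
            best = (proposal, score)
        proposal = tuner.propose(1)
    return best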
Example #6
def create_partials_datasets(configuration, workspace_id):
    """Create partials datasets"""
    print(configuration)

    try:
        workspace = UserWorkspace.objects.get(pk=workspace_id)
    except UserWorkspace.DoesNotExist:
        return {
            KEY_SUCCESS: False,
            KEY_DATA: f' UserWorkspace not found for id {workspace_id}.'
        }

    MAX_DATASET_SIZE = 50
    MAX_DOMAIN_SIZE = 100
    # load dataframe and dataset schema
    if 'dataset_schema_path' in configuration:
        with open(configuration['dataset_schema_path'], 'r') as schema_file:
            dataset_schema = json.load(schema_file)
        dataset = Dataset.load(
            f'file://{configuration["dataset_schema_path"]}')
    elif 'dataset' in configuration:
        dataset_schema = configuration['dataset_schema']
        dataframe = pd.DataFrame(configuration['dataset'])
    else:
        return {KEY_SUCCESS: False, KEY_DATA: 'no dataset supplied'}

    resource_schema = next(i for i in dataset_schema['dataResources']
                           if i['resType'] == 'table')

    if 'dataset_schema_path' in configuration:
        dataframe = dataset[resource_schema['resID']]

    domains = configuration['domains']
    # METADATA OF SCHEMA:
    # {variable: [domain], ...}

    if len(dataframe) > MAX_DATASET_SIZE:
        return {
            KEY_SUCCESS: False,
            KEY_DATA: 'initial dataset too large to expand into partials'
        }

    def write_dataset(name, writable_dataframe):
        dest_dir_info = create_destination_directory(workspace, name=name)
        if not dest_dir_info[KEY_SUCCESS]:
            return dest_dir_info

        dest_directory = dest_dir_info[KEY_DATA]
        csv_path = os.path.join(dest_directory, resource_schema['resPath'])
        shutil.rmtree(dest_directory)
        shutil.copytree(workspace.d3m_config.training_data_root,
                        dest_directory)
        os.remove(csv_path)
        writable_dataframe.to_csv(csv_path, index=False)

        return {
            KEY_SUCCESS: True,
            KEY_DATA: (os.path.join(dest_directory, 'datasetDoc.json'), csv_path)
        }

    dataset_schemas = {}
    dataset_paths = {}

    new_column_names = list(dataframe.columns.values)
    if 'd3mIndex' in new_column_names:
        d3mIndexIndex = new_column_names.index('d3mIndex')
        new_column_names[d3mIndexIndex] = str(
            new_column_names[d3mIndexIndex]) + 'Original'

    union_datasets = []
    for predictor in domains:
        synthetic_data = []
        predictor_idx = new_column_names.index(predictor)
        for row_idx in range(len(dataframe)):
            row = dataframe.iloc[row_idx].tolist()

            for support_member in domains[predictor][:MAX_DOMAIN_SIZE]:
                row_copy = list(row)
                row_copy[predictor_idx] = support_member
                synthetic_data.append(row_copy)

        synthetic_data = pd.DataFrame(synthetic_data, columns=new_column_names)

        if configuration['separate_variables']:
            synthetic_data.insert(0, 'd3mIndex',
                                  list(range(len(synthetic_data))))
            dataset_name = configuration['name'] + predictor

            result_write = write_dataset(dataset_name, synthetic_data)
            if not result_write[KEY_SUCCESS]:
                return result_write
            dataset_schema, dataset_path = result_write[KEY_DATA]

            dataset_schemas[dataset_name] = dataset_schema
            dataset_paths[dataset_name] = dataset_path
        else:
            union_datasets.append(synthetic_data)

    if union_datasets:
        synthetic_data = pd.concat(union_datasets)
        synthetic_data.insert(0, 'd3mIndex', list(range(len(synthetic_data))))

        result_write = write_dataset(configuration['name'], synthetic_data)
        if not result_write[KEY_SUCCESS]:
            return result_write
        dataset_schema, dataset_path = result_write[KEY_DATA]

        dataset_schemas[configuration['name']] = dataset_schema
        dataset_paths[configuration['name']] = dataset_path

    return {
        KEY_SUCCESS: True,
        KEY_DATA: {
            'dataset_schemas': dataset_schemas,
            'dataset_paths': dataset_paths
        }
    }
Example #7
    def search(self, problem, timeout=None, budget=None, template_names=None):

        self.timeout = timeout
        best_pipeline = None
        best_score = None
        best_normalized = 0
        best_template_name = None
        template_names = template_names or list()
        data_modality = None
        task_type = None
        task_subtype = None
        iteration = 0
        errors = list()

        dataset_name, dataset_path = self._get_dataset_details(problem)
        dataset = Dataset.load(dataset_path)
        metric = problem['problem']['performance_metrics'][0]['metric']

        data_modality = detect_data_modality(dataset_path[7:])
        task_type = problem['problem']['task_type'].name.lower()
        task_subtype = problem['problem']['task_subtype'].name.lower()

        data_augmentation = self.get_data_augmentation(dataset, problem)

        LOGGER.info("Searching dataset %s: %s/%s/%s", dataset_name,
                    data_modality, task_type, task_subtype)

        try:
            self.setup_search()

            self.score_pipeline(dataset, problem, self.fallback)
            self.fallback.normalized_score = metric.normalize(
                self.fallback.score)
            self._save_pipeline(self.fallback)
            best_pipeline = self.fallback.id
            best_score = self.fallback.score
            best_template_name = FALLBACK_PIPELINE
            best_normalized = self.fallback.normalized_score

            LOGGER.info("Fallback pipeline score: %s - %s",
                        self.fallback.score, self.fallback.normalized_score)

            LOGGER.info("Loading the template and the tuner")
            if not template_names:
                template_names = self._get_templates(data_modality, task_type)

            if budget is not None:
                iterator = range(budget)
            else:
                iterator = itertools.count()  # infinite range

            selector_tuner = SelectorTuner(template_names, data_augmentation)

            for iteration in iterator:
                self.check_stop()
                template_name, template, proposal, defaults = selector_tuner.propose()
                pipeline = self._new_pipeline(template, proposal)

                params = '\n'.join('{}: {}'.format(k, v)
                                   for k, v in proposal.items())
                LOGGER.warn("Scoring pipeline %s - %s: %s\n%s", iteration + 1,
                            template_name, pipeline.id, params)
                try:
                    self.score_pipeline(dataset, problem, pipeline)
                    pipeline.normalized_score = metric.normalize(
                        pipeline.score)
                    # raise Exception("This won't work")
                except Exception as ex:
                    LOGGER.exception(
                        "Error scoring pipeline %s for dataset %s",
                        pipeline.id, dataset_name)

                    if defaults:
                        error = '{}: {}'.format(type(ex).__name__, ex)
                        errors.append(error)
                        max_errors = min(len(selector_tuner.template_names),
                                         budget or np.inf)
                        if len(errors) >= max_errors:
                            raise Exception(errors)

                    pipeline.score = None
                    pipeline.normalized_score = 0.0

                try:
                    self._save_pipeline(pipeline)
                except Exception:
                    LOGGER.exception("Error saving pipeline %s", pipeline.id)

                selector_tuner.add(template_name, proposal,
                                   pipeline.normalized_score)
                LOGGER.info("Pipeline %s score: %s - %s", pipeline.id,
                            pipeline.score, pipeline.normalized_score)

                if pipeline.normalized_score > best_normalized:
                    LOGGER.warning(
                        "New best pipeline found: %s! %s is better than %s",
                        template_name, pipeline.score, best_score)
                    best_pipeline = pipeline.id
                    best_score = pipeline.score
                    best_normalized = pipeline.normalized_score
                    best_template_name = template_name

        except KeyboardInterrupt:
            pass
        except Exception:
            LOGGER.exception("Error processing dataset %s", dataset)

        finally:
            if self.timeout and self.hard_timeout:
                signal.alarm(0)

        self.done = True
        iterations = iteration - len(template_names) + 1
        if iterations <= 0:
            iterations = None

        return {
            'pipeline': best_pipeline,
            'cv_score': best_score,
            'template': best_template_name,
            'data_modality': data_modality,
            'task_type': task_type,
            'task_subtype': task_subtype,
            'tuning_iterations': iterations,
            'error': errors or None
        }
Example #8
    def test_1(self):
        print('\n')
        print('running test-2..............')
        # Loading training dataset.
        base_path = "/ubc_primitives/datasets/seed_datasets_current/LL1_TXT_CLS_apple_products_sentiment"
        dataset_doc_path = os.path.join(base_path,
                                        'TRAIN/dataset_TRAIN',
                                        'datasetDoc.json')
        dataset = Dataset.load('file://{dataset_doc_path}'.format(
            dataset_doc_path=dataset_doc_path))

        # Step 0: Denormalize primitive
        denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams(
        )
        denormalize_primitive = DenormalizePrimitive(
            hyperparams=denormalize_hyperparams_class.defaults())
        denormalized_dataset = denormalize_primitive.produce(inputs=dataset)

        print(denormalized_dataset.value)
        print('------------------------')

        # Step 1: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams(
        )
        dataframe_primitive = DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(
            inputs=denormalized_dataset.value)

        print(dataframe.value)
        print('------------------------')

        # Step 2: DataFrame to features
        bow_hyperparams_class = BagOfWords.metadata.get_hyperparams()
        bow_primitive = BagOfWords(
            hyperparams=bow_hyperparams_class.defaults())
        bow_primitive_out = bow_primitive.produce(inputs=dataframe.value)

        # Step 3: Dataset to DataFrame
        kmeans_hyperparams_class = KMeansClusteringPrimitive.metadata.query()[
            'primitive_code']['class_type_arguments']['Hyperparams']
        kmeans_hyperparams = kmeans_hyperparams_class.defaults().replace({
            'n_clusters': 4,
            'n_init': 10,
            'max_iter': 1000,
        })
        kmeans_primitive = KMeansClusteringPrimitive(
            hyperparams=kmeans_hyperparams)
        kmeans_primitive.set_training_data(inputs=bow_primitive_out.value)
        kmeans_primitive.fit()

        #-----------------------------------------------------------------------
        # Loading Testing dataset.
        dataset_doc_path2 = os.path.join(base_path,
                                         'SCORE/dataset_SCORE',
                                         'datasetDoc.json')
        dataset2 = Dataset.load('file://{dataset_doc_path}'.format(
            dataset_doc_path=dataset_doc_path2))

        # Step 0: Denormalize primitive
        score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams(
        )
        score_denormalize_primitive = DenormalizePrimitive(
            hyperparams=score_denormalize_hyperparams_class.defaults())
        score_denormalized_dataset = score_denormalize_primitive.produce(
            inputs=dataset2)

        print(denormalized_dataset.value)
        print('------------------------')

        # Step 1: Dataset to DataFrame
        score_dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams(
        )
        score_dataframe_primitive = DatasetToDataFramePrimitive(
            hyperparams=score_dataframe_hyperparams_class.defaults())
        score_dataframe = score_dataframe_primitive.produce(
            inputs=score_denormalized_dataset.value)

        print(score_dataframe.value)
        print('------------------------')

        # Step 2: Read images to DataFrame
        score_bow_dataframe = bow_primitive.produce(
            inputs=score_dataframe.value)

        print(score_bow_dataframe.value)
        print('------------------------')

        score = kmeans_primitive.produce(inputs=score_bow_dataframe.value)
        score = score.value

        print(score)
        print('------------------------')

        for col in range(score.shape[1]):
            col_dict = dict(
                score.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)

        # Compute the error.
        ground_truth = score_dataframe.value['sentiment'].to_numpy().astype(float)
        predictions = score.iloc[:, -1].to_numpy().astype(float)
        print('------------------------')
        print('Predictions')
        print(predictions)
        print('------------------------')
        print('Ground Truth')
        print(ground_truth)
        print('------------------------')

        print('------------------------')
        print('MLP Test misclassification rate (lower better):  ',
              (100 * (1 - np.mean(ground_truth == predictions))))
        print('------------------------')
Example #9
def load_dataset(root_path, phase):
    path = os.path.join(root_path, phase, 'dataset_' + phase,
                        'datasetDoc.json')
    return Dataset.load(dataset_uri='file://' + os.path.abspath(path))
Example #10
    def test_1(self):
        """
        Dataset test
        """
        print('\n')
        print('########################')
        print('#--------TEST-1--------#')
        print('########################')

        # Loading dataset.
        path1 = 'file://{uri}'.format(uri=os.path.abspath(
            '/ubc_primitives/datasets/seed_datasets_current/LL1_736_stock_market/SCORE/dataset_SCORE/datasetDoc.json'
        ))
        dataset = Dataset.load(dataset_uri=path1)

        # # Step 0: Denormalize primitive
        # denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        # denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
        # denormalized_dataset  = denormalize_primitive.produce(inputs=dataset)
        # denormalized_dataset  = denormalized_dataset.value
        # print(denormalized_dataset)
        # print('------------------------')

        print('Loading Training Dataset....')
        # Step 0: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams(
        )
        dataframe_primitive = DatasetToDataFramePrimitive(
            hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(inputs=dataset)
        dataframe = dataframe.value
        print(dataframe)

        for col in range(dataframe.shape[1]):
            col_dict = dict(
                dataframe.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)
        print('------------------------')

        # Step 1: Profiler
        print('Profiler')
        profiler_hyperparams_class = SimpleProfilerPrimitive.metadata.get_hyperparams(
        )
        profiler_primitive = SimpleProfilerPrimitive(
            hyperparams=profiler_hyperparams_class.defaults())
        profiler_primitive.set_training_data(inputs=dataframe)
        profiler_primitive.fit()
        profiler_dataframe = profiler_primitive.produce(inputs=dataframe)
        profiler_dataframe = profiler_dataframe.value
        print(profiler_dataframe)

        for col in range(profiler_dataframe.shape[1]):
            col_dict = dict(
                profiler_dataframe.metadata.query(
                    (metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)
        print('------------------------')

        # Step 2: Column parser
        print('Column parser')
        parser_hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams(
        )
        parser_hyperparams = parser_hyperparams_class.defaults().replace({
            'parse_semantic_types': [
                "http://schema.org/Boolean", "http://schema.org/Integer",
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
                "http://schema.org/DateTime"
            ]
        })
        parser_primitive = ColumnParserPrimitive(
            hyperparams=parser_hyperparams)
        parser_dataframe = parser_primitive.produce(inputs=profiler_dataframe)
        parser_dataframe = parser_dataframe.value
        print(parser_dataframe)
        print('------------------------')

        for col in range(parser_dataframe.shape[1]):
            col_dict = dict(
                parser_dataframe.metadata.query(
                    (metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)

        # Step 4: Extract dataframe
        print('Extract dataframe')
        extract_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams(
        )
        extract_hyperparams = extract_hyperparams_class.defaults().replace({
            'semantic_types':
            ['https://metadata.datadrivendiscovery.org/types/Attribute']
        })
        extract_primitive = ExtractColumnsBySemanticTypesPrimitive(
            hyperparams=extract_hyperparams)
        extract_dataframe = extract_primitive.produce(inputs=parser_dataframe)
        extract_dataframe = extract_dataframe.value
        print(extract_dataframe)
        print('------------------------')

        # Step 5: Extract target
        print('Extract target')
        extract_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams(
        )
        extract_hyperparams = extract_hyperparams_class.defaults().replace({
            'semantic_types':
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget']
        })
        extract_primitive = ExtractColumnsBySemanticTypesPrimitive(
            hyperparams=extract_hyperparams)
        extract_targets = extract_primitive.produce(inputs=parser_dataframe)
        extract_targets = extract_targets.value
        print(extract_targets)
        print('------------------------')

        print('DMM Primitive....')
        dmm_hyperparams_class = DeepMarkovModelPrimitive.metadata.query(
        )['primitive_code']['class_type_arguments']['Hyperparams']
        dmm_hyperparams = dmm_hyperparams_class.defaults()
        dmm_primitive = DeepMarkovModelPrimitive(hyperparams=dmm_hyperparams)
        dmm_primitive.set_training_data(inputs=extract_dataframe,
                                        outputs=extract_targets)
        print(dmm_primitive._training_inputs)
        dmm_primitive.fit()
Example #11
    def test_1(self):
        """
        Feature extraction only and Testing on seed dataset from D3M datasets
        """
        print('\n')
        print('########################')
        print('#--------TEST-1--------#')
        print('########################')

        # Get volumes:
        all_weights = os.listdir('./static')
        all_weights = {w: os.path.join('./static', w) for w in all_weights}

        # Loading dataset.
        path1 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json'))
        dataset = Dataset.load(dataset_uri=path1)

        # Get dataset paths
        path2 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/SCORE/dataset_TEST/datasetDoc.json'))
        score_dataset = Dataset.load(dataset_uri=path2)

        # Step 0: Denormalize primitive
        denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
        denormalized_dataset  = denormalize_primitive.produce(inputs=dataset)
        print(denormalized_dataset.value)
        print('------------------------')

        print('Loading Training Dataset....')
        # Step 1: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
        print(dataframe.value)
        print('------------------------')

        print('Loading Testing Dataset....')
        # Step 0: Denormalize primitive
        score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        score_denormalize_primitive = DenormalizePrimitive(hyperparams=score_denormalize_hyperparams_class.defaults())
        score_denormalized_dataset  = score_denormalize_primitive.produce(inputs=score_dataset)
        print(score_denormalized_dataset.value)
        print('------------------------')

        score_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        score_primitive = DatasetToDataFramePrimitive(hyperparams=score_hyperparams_class.defaults())
        score = score_primitive.produce(inputs=score_denormalized_dataset.value)
        print(score.value)
        print('------------------------')

        extractA_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractA_hyperparams = extractA_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/FileName',)
                }
        )
        extractA_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractA_hyperparams)
        extractA = extractA_primitive.produce(inputs=dataframe.value)
        print(extractA.value)
        print('------------------------')

        extractP_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractP_hyperparams = extractP_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)
                }
        )
        extractP_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractP_hyperparams)
        extractP = extractP_primitive.produce(inputs=dataframe.value)
        print(extractP.value)
        print('------------------------')

        # Call primitives
        hyperparams_class = ConvolutionalNeuralNetwork.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        hyperparams_class = hyperparams_class.defaults().replace(
                {
                'feature_extract_only': False,
                'cnn_type': 'mobilenet',
                'num_iterations': 150,
                'output_dim': 1
                }
        )
        primitive = ConvolutionalNeuralNetwork(hyperparams=hyperparams_class, volumes=all_weights)
        primitive.set_training_data(inputs = dataframe.value, outputs = extractP.value)
        test_out  = primitive.fit()
        test_out  = primitive.produce(inputs=score.value)
        test_out  = test_out.value

        print(test_out)
        print('------------------------')
        for col in range(test_out.shape[1]):
            col_dict = dict(test_out.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            print('Meta-data - {}'.format(col), col_dict)

        # Compute the error.
        ground_truth = (score.value['WRISTBREADTH']).to_numpy().astype(float)
        predictions  = (test_out.iloc[:, -1]).to_numpy()

        print(ground_truth)
        print(predictions)
        print('------------------------')

        print('Mean squared error (lower better): ', (np.mean((predictions - ground_truth)**2)))
        print('------------------------')
Example #12
    def test_2(self):
        """
        Training and Testing on seed dataset from D3M datasets
        """
        print('\n')
        print('########################')
        print('#--------TEST-2--------#')
        print('########################')

        # Get volumes:
        all_weights = os.listdir('./static')
        all_weights = {w: os.path.join('./static', w) for w in all_weights}

        # Loading dataset.
        path1 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json'))
        dataset = Dataset.load(dataset_uri=path1)

        # Get dataset paths
        path2 = 'file://{uri}'.format(uri=os.path.abspath('/ubc_primitives/datasets/seed_datasets_current/22_handgeometry/SCORE/dataset_TEST/datasetDoc.json'))
        score_dataset = Dataset.load(dataset_uri=path2)

        # Step 0: Denormalize primitive
        denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults())
        denormalized_dataset  = denormalize_primitive.produce(inputs=dataset)
        print(denormalized_dataset.value)
        print('------------------------')

        print('Loading Training Dataset....')
        # Step 1: Dataset to DataFrame
        dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        dataframe_primitive = DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
        dataframe = dataframe_primitive.produce(inputs=denormalized_dataset.value)
        print(dataframe.value)
        print('------------------------')

        print('Loading Testing Dataset....')
        # Step 0: Denormalize primitive
        score_denormalize_hyperparams_class = DenormalizePrimitive.metadata.get_hyperparams()
        score_denormalize_primitive = DenormalizePrimitive(hyperparams=score_denormalize_hyperparams_class.defaults())
        score_denormalized_dataset  = score_denormalize_primitive.produce(inputs=score_dataset)
        print(score_denormalized_dataset.value)
        print('------------------------')

        score_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams()
        score_primitive = DatasetToDataFramePrimitive(hyperparams=score_hyperparams_class.defaults())
        score = score_primitive.produce(inputs=score_denormalized_dataset.value)
        print(score.value)
        print('------------------------')

        # Call primitives
        hyperparams_class = ConvolutionalNeuralNetwork.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        hyperparams_class = hyperparams_class.defaults().replace(
                {
                'include_top': False,
                'cnn_type': 'mobilenet',
                'output_dim': 1,
                }
        )
        primitive = ConvolutionalNeuralNetwork(hyperparams=hyperparams_class, volumes=all_weights)
        test_out  = primitive.produce(inputs=dataframe.value)

        print(test_out)
        print('------------------------')

        extractA_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractA_hyperparams = extractA_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)
                }
        )
        extractA_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractA_hyperparams)
        extractA = extractA_primitive.produce(inputs=test_out.value)
        print(extractA.value)
        print('------------------------')

        extractP_hyperparams_class = ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        extractP_hyperparams = extractP_hyperparams_class.defaults().replace(
                {
                'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)
                }
        )
        extractP_primitive = ExtractColumnsBySemanticTypesPrimitive(hyperparams=extractP_hyperparams)
        extractP = extractP_primitive.produce(inputs=dataframe.value)
        extractP = extractP.value
        # Update Metadata from SuggestedTarget to TrueTarget
        for col in range((extractP).shape[1]):
            col_dict = dict(extractP.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name']            = "WRISTBREADTH"
            col_dict["semantic_types"]  = ("http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/TrueTarget",)
            extractP.metadata           = extractP.metadata.update((metadata_base.ALL_ELEMENTS, col), col_dict)

        print(extractP)
        print('------------------------')

        # Call primitives
        score_out = primitive.produce(inputs=score.value)

        XGB_hyperparams_class = XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
        XGB_primitive = XGBoostGBTreeRegressorPrimitive(hyperparams=XGB_hyperparams_class.defaults())
        XGB_primitive.set_training_data(inputs=test_out.value, outputs=extractP)
        XGB_primitive.fit()
        test_out_xgb = XGB_primitive.produce(inputs=score_out.value)
        test_out_xgb = test_out_xgb.value

        print('Predictions')
        print(test_out_xgb)
        print('------------------------')

        # Compute the error.
        ground_truth = (score.value['WRISTBREADTH']).to_numpy().astype(float)
        predictions  = (test_out_xgb.iloc[:, -1]).to_numpy()

        print(ground_truth)
        print(predictions)
        print('------------------------')

        print('Mean squared error (lower better): ', (np.mean((predictions - ground_truth)**2)))
        print('------------------------')