def produce(self,
                *,
                inputs: container.List,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:
        # build the list of dataframes from the list of inputs
        dataframes = []
        metadata = None
        for input in inputs:
            if isinstance(input, container.DataFrame):
                dataframes.append(input)
                continue  # a bare dataframe has no tabular resource to extract
            try:
                _, main_dr = d3m_base_utils.get_tabular_resource(input, None)
                dataframes.append(main_dr)
                metadata = input.metadata
            except ValueError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Failure to find tabular resource in dataset") from error

        if self.hyperparams["column_overlap"] == "exact":
            columns_to_handle = dataframes[0].columns
            if np.sum(
                    np.array([
                        np.all(df.columns == columns_to_handle)
                        for df in dataframes
                    ])) != len(dataframes):
                raise exceptions.InvalidArgumentValueError(
                    "Dataframes don't have same columns, cannot exact concat")
            concated = pd.concat(dataframes, ignore_index=True)
        elif self.hyperparams["column_overlap"] == "union":
            concated = pd.concat(dataframes, ignore_index=True)
        elif self.hyperparams["column_overlap"] == "intersection":
            concated = pd.concat(dataframes, join="inner", ignore_index=True)

        if self.hyperparams["remove_duplicate_rows"]:
            concated.drop_duplicates(subset="d3mIndex",
                                     keep="first",
                                     inplace=True,
                                     ignore_index=True)

        if metadata is None:
            metadata = container.Dataset(
                {"learningData": concated.head(1)},
                generate_metadata=True).metadata
        outputs = container.Dataset({"learningData": concated}, metadata)
        outputs.metadata = outputs.metadata.update(
            (metadata_base.ALL_ELEMENTS, ),
            {"dimension": {
                "length": concated.shape[0]
            }})

        return base.CallResult(outputs)
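
A minimal standalone sketch of the concat semantics used above, in plain pandas (the frames here are illustrative, not part of the primitive):

import pandas as pd

df_a = pd.DataFrame({"d3mIndex": [0, 1], "x": [1.0, 2.0]})
df_b = pd.DataFrame({"d3mIndex": [1, 2], "y": ["a", "b"]})

union = pd.concat([df_a, df_b], ignore_index=True)  # all columns, NaN-filled
intersection = pd.concat([df_a, df_b], join="inner", ignore_index=True)  # shared columns only
deduped = union.drop_duplicates(subset="d3mIndex", keep="first", ignore_index=True)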
Example #2
    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:

        main_resource_index = self.hyperparams['main_resource_index']
        if main_resource_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no main resource specified')

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata,
                                            main_resource_index, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) +
                    ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column contains csv file names')

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_index,
                                        file_index)
        output_data = []
        for idx, tRow in inputs[main_resource_index].iterrows():
            # read the timeseries data
            csv_path = os.path.join(base_path, tRow[file_index])
            timeseries_row = pd.read_csv(csv_path)

            # add the timeseries id
            # Series.append was removed in pandas 2.0; concat achieves the same result
            tRow = pd.concat([tRow, pd.Series({'series_id': int(idx)})])

            # combine the timeseries data with the value row
            output_data.extend([
                pd.concat([tRow, vRow])
                for vIdx, vRow in timeseries_row.iterrows()
            ])

        # build the long-form frame, one row per time step
        # (DataFrame.append was removed in pandas 2.0; the constructor accepts a list of series)
        timeseries_dataframe = pd.DataFrame(output_data)

        # join the metadata from the 2 data resources
        timeseries_dataframe = container.DataFrame(timeseries_dataframe)

        # wrap as a D3M container
        #return base.CallResult(container.Dataset({'0': timeseries_dataframe}, metadata))
        return base.CallResult(
            container.Dataset({'0': timeseries_dataframe},
                              generate_metadata=True))
Example #3
    def time_classification_scores(self, rows):
        # This has been cut-and-paste from test_compute_scores.py
        truth = container.DataFrame({
            'd3mIndex': range(rows),
            'col0': (1, ) * rows
        })

        truth_dataset = container.Dataset({'learningData': truth},
                                          generate_metadata=True)
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(
            ('learningData', metadata_base.ALL_ELEMENTS, 0),
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(
            ('learningData', metadata_base.ALL_ELEMENTS, 1),
            'https://metadata.datadrivendiscovery.org/types/Target')
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(
            ('learningData', metadata_base.ALL_ELEMENTS, 1),
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')

        # predictions are identical to truth, should have no impact on performance.
        predictions = truth

        # configure primitive
        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
        primitive = compute_scores.ComputeScoresPrimitive(
            hyperparams=hyperparams_class.defaults().replace({
                'metrics': [
                    metrics_class({
                        'metric': 'ACCURACY',
                        'pos_label': None,
                        'k': None,
                    }),
                    metrics_class({
                        'metric': 'F1_MICRO',
                        'pos_label': None,
                        'k': None,
                    }),
                    metrics_class({
                        'metric': 'F1_MACRO',
                        'pos_label': None,
                        'k': None,
                    })
                ],
            }))

        # run scoring.
        scores = primitive.produce(inputs=predictions,
                                   score_dataset=truth_dataset).value
Example #4
    def produce(self, *,
                left: Inputs,  # type: ignore
                right: Inputs,  # type: ignore
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = utils.get_tabular_resource(left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError("Failure to find tabular resource in left dataset") from error

        try:
            right_resource_id, right_df = utils.get_tabular_resource(right, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError("Failure to find tabular resource in right dataset") from error

        accuracy = self.hyperparams['accuracy']
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError('accuracy of ' + str(accuracy) + ' is out of range')

        left_col = self.hyperparams['left_col']
        right_col = self.hyperparams['right_col']

        # perform join based on semantic type
        join_type = self._get_join_semantic_type(left, left_resource_id, left_col, right, right_resource_id, right_col)
        joined: pd.DataFrame = None
        if join_type in self._STRING_JOIN_TYPES:
            joined = self._join_string_col(left_df, left_col, right_df, right_col, accuracy)
        elif join_type in self._NUMERIC_JOIN_TYPES:
            joined = self._join_numeric_col(left_df, left_col, right_df, right_col, accuracy)
        elif join_type in self._DATETIME_JOIN_TYPES:
            joined = self._join_datetime_col(left_df, left_col, right_df, right_col, accuracy)
        else:
            raise exceptions.InvalidArgumentValueError('join not supported on type ' + str(join_type))

        # create a new dataset to hold the joined data
        resource_map = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource
        result_dataset = container.Dataset(resource_map)

        return base.CallResult(result_dataset)
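
The _join_string_col/_join_numeric_col/_join_datetime_col helpers are not shown; a plausible sketch of the numeric case, assuming accuracy scales the match window relative to the key range, can be built on pd.merge_asof (which pairs each left key with the nearest right key within a tolerance; both frames must be sorted on the key):

import pandas as pd

def fuzzy_numeric_join(left_df, left_col, right_df, right_col, accuracy):
    # hypothetical stand-in, not the primitive's actual implementation
    left_sorted = left_df.sort_values(left_col)
    right_sorted = right_df.sort_values(right_col)
    key_range = left_sorted[left_col].max() - left_sorted[left_col].min()
    tolerance = (1.0 - accuracy) * key_range  # assumed interpretation of accuracy
    return pd.merge_asof(left_sorted, right_sorted,
                         left_on=left_col, right_on=right_col,
                         tolerance=tolerance, direction="nearest")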
Example #5
    def produce(
            self,
            *,
            inputs: container.Dataset,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:

        main_resource_index = self.hyperparams['main_resource_index']
        if main_resource_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no main resource specified')

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata,
                                            main_resource_index, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) +
                    ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column contains csv file names')

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_index,
                                        file_index)

        csv_paths = [
            os.path.join(base_path, f)
            for f in inputs[main_resource_index].iloc[:, file_index]
        ]
        ts_values = [pd.read_csv(path) for path in csv_paths]
        for ts, val in zip(ts_values, inputs[main_resource_index].values):
            ts[list(inputs[main_resource_index])] = pd.DataFrame(
                [list(val)], index=ts.index)
        timeseries_dataframe = pd.concat(ts_values)
        timeseries_dataframe = container.DataFrame(timeseries_dataframe)
        return base.CallResult(
            container.Dataset({'0': timeseries_dataframe},
                              generate_metadata=True))
Example #6
    def test_classification_non_d3mindex(self):
        truth = container.DataFrame([
            [1, 'happy-pleased'],
            [2, 'amazed-suprised'],
            [3, 'sad-lonely'],
            [4, 'relaxing-calm'],
        ], columns=['non_d3mIndex', 'class_label']) # Score dataset has a non-d3mIndex

        truth_dataset = container.Dataset({'learningData': truth}, generate_metadata=True)

        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Target')
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')

        predictions = container.DataFrame([
            [1, 'happy-pleased'],
            [2, 'amazed-suprised'],
            [3, 'relaxing-calm'],
            [4, 'sad-lonely'],
        ], columns=['d3mIndex', 'class_label'], generate_metadata=True)

        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'ACCURACY',
                'pos_label': None,
                'k': None,
            })],
        }))

        scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value

        self.assertEqual(scores.values.tolist(), [
            ['ACCURACY', 0.5, 0.5],
        ])
Example #7
    def produce(
        self,
        *,
        left: Inputs,  # type: ignore
        right: Inputs,  # type: ignore
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in left dataset"
            ) from error

        try:
            right_resource_id, right_df = d3m_base_utils.get_tabular_resource(
                right, None
            )
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in right dataset"
            ) from error

        accuracy = self.hyperparams["accuracy"]
        absolute_accuracy = self.hyperparams["absolute_accuracy"]

        # hyperparams may arrive as tuples, and floats may arrive as ints
        # when a round number is passed in
        if isinstance(accuracy, collections.abc.Iterable):
            accuracy = [float(a) for a in accuracy]
        else:
            accuracy = float(accuracy)
        if isinstance(absolute_accuracy, collections.abc.Iterable):
            absolute_accuracy = list(absolute_accuracy)

        if type(accuracy) == float and not type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only one accuracy value provided, but multiple absolute_accuracy values provided"
            )
        if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only one absolute_accuracy value provided, but multiple accuracy values provided"
            )
        if type(accuracy) == float and not absolute_accuracy:
            if accuracy <= 0.0 or accuracy > 1.0:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy) + " is out of range"
                )
        elif type(accuracy) == list and type(absolute_accuracy) == list:
            if not len(accuracy) == len(absolute_accuracy):
                raise exceptions.InvalidArgumentValueError(
                    "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
                )
            for i in range(len(accuracy)):
                if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                    raise exceptions.InvalidArgumentValueError(
                        "accuracy of " + str(acc) + " is out of range"
                    )

        left_col = self.hyperparams["left_col"]
        right_col = self.hyperparams["right_col"]

        # error if the column types differ, or if they are lists whose lengths
        # disagree with each other or with a list-valued accuracy
        if type(left_col) != type(right_col) or (
            type(left_col) == list
            and (len(left_col) != len(right_col)
                 or (type(accuracy) == list and len(accuracy) != len(left_col)))
        ):
            raise exceptions.InvalidArgumentTypeError(
                "both left_col and right_col need to have same data type and if they are lists, the same list lengths"
            )
        if type(left_col) == str:
            left_col = [left_col]
            right_col = [right_col]
            accuracy = [accuracy]
            absolute_accuracy = [absolute_accuracy]

        join_types = [
            self._get_join_semantic_type(
                left,
                left_resource_id,
                left_col[i],
                right,
                right_resource_id,
                right_col[i],
            )
            for i in range(len(left_col))
        ]

        num_splits = 32
        joined_split = [None for i in range(num_splits)]
        left_df_split = np.array_split(left_df, num_splits)
        jobs = [delayed(self._produce_threaded)(
            index=i,
            left_df_full=left_df,
            left_dfs=left_df_split,
            right_df=right_df,
            join_types=join_types,
            left_col=left_col,
            right_col=right_col,
            accuracy=accuracy,
            absolute_accuracy=absolute_accuracy
        ) for i in range(num_splits)]
        joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"], backend="loky", verbose=10)(jobs)

        # joined data needs to maintain order to mimic none split joining
        for i, d in joined_data:
            joined_split[i] = d
        joined = pd.concat(joined_split, ignore_index=True)

        # create a new dataset to hold the joined data
        resource_map = {}
        float_vector_columns = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                for column in joined.columns:
                    # need to avoid bug in container.Dataset, it doesn't like vector columns
                    if type(joined[column].iloc[0]) == np.ndarray:
                        float_vector_columns[column] = joined[column]
                        joined[column] = np.nan  # np.NAN was removed in NumPy 2.0
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource

        # Generate metadata for the dataset using only the first row of the resource for speed -
        # metadata generation runs over each cell in the dataframe, but we only care about column
        # level generation.  Once that's done, set the actual dataframe value.
        result_dataset = container.Dataset(
            {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
        )
        for k, v in resource_map.items():
            result_dataset[k] = v
            result_dataset.metadata = result_dataset.metadata.update(
                (k,), {"dimension": {"length": v.shape[0]}}
            )

        for key in float_vector_columns.keys():
            df = result_dataset[left_resource_id]
            df[key] = float_vector_columns[key]
            float_vec_loc = df.columns.get_loc(key)
            float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
            )
            if float_vec_loc not in float_vec_col_indices:
                df.metadata = df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, float_vec_loc),
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                )

        return base.CallResult(result_dataset)
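
The split/parallel/reassemble pattern above, shown in isolation (the worker is an illustrative stand-in for _produce_threaded, which is not part of this excerpt):

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def _work(index, chunk):
    # a real worker would join its chunk against the right-hand frame
    return index, chunk

df = pd.DataFrame({"a": range(100)})
chunks = np.array_split(df, 8)
results = Parallel(n_jobs=4, backend="loky")(
    delayed(_work)(i, chunk) for i, chunk in enumerate(chunks))
ordered = [None] * 8
for i, chunk in results:  # restore the original chunk order
    ordered[i] = chunk
out = pd.concat(ordered, ignore_index=True)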
Example #8
    jsonCall = json.load(inputFile)

# Load the problem description schema
with open(path.join(jsonCall['train_data'], 'problem_TRAIN', 'problemDoc.json'), 'r') as inputFile:
    problemSchema = json.load(inputFile)

# Load the json dataset description file
with open(path.join(jsonCall['train_data'], 'dataset_TRAIN', 'datasetDoc.json'), 'r') as inputFile:
    datasetSchema = json.load(inputFile)

# Load dataset
ds_uri = 'file://' + path.join(jsonCall['train_data'], 'dataset_TRAIN', 'datasetDoc.json')
ds = container.Dataset.load(ds_uri)  # Dataset.load is a classmethod that reads datasetDoc.json

# Profile dataset
param = Hyperparams.sample()
prof = Profiler(hyperparams=param)
ds2 = prof.produce(inputs=ds).value  # produce returns a CallResult; the dataset is in .value

# Get resource Ids, return ['0'] for this dataset
print(ds.metadata.get_elements( () ))

# Get available columns, returns [0, 1, 2, ..., 30] for 38_sick dataset
print(ds.metadata.get_elements(('0', ALL_ELEMENTS)))

# Metadata for column 1
column_one_metadata = ds.metadata.query(('0', ALL_ELEMENTS, 1))
Example #9
def handler(arguments):
    random_state = numpy.random.RandomState(arguments.random_seed)

    resources = {}
    generate_main_resources(random_state, resources, arguments.size)

    if arguments.dataset_type == DatasetType.COUNTS_PER_USER:
        generate_learning_data_counts_per_user(random_state, resources)

    elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST:
        generate_learning_data_comments_per_post(random_state, resources)

    elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST:
        generate_learning_data_has_user_made_comment_on_post(
            random_state, resources)

    else:
        raise ValueError(
            f"Unknown dataset type: {arguments.dataset_type.name}")

    dataset = container.Dataset(resources, generate_metadata=True)
    update_metadata_main_resources(dataset, arguments.dataset_id,
                                   arguments.dataset_type.name, arguments.size,
                                   arguments.random_seed)

    if arguments.dataset_type == DatasetType.COUNTS_PER_USER:
        update_metadata_counts_per_user(dataset)

    elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST:
        update_metadata_comments_per_post(dataset)

    elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST:
        update_metadata_has_user_made_comment_on_post(dataset)

    else:
        raise ValueError(
            f"Unknown dataset type: {arguments.dataset_type.name}")

    dataset_output_uri = 'file://' + os.path.join(
        os.path.abspath(arguments.output_dir), arguments.dataset_id,
        'datasetDoc.json')

    dataset.save(dataset_output_uri)

    os.makedirs(
        os.path.join(os.path.abspath(arguments.output_dir),
                     arguments.problem_id))

    with open(os.path.join(os.path.abspath(arguments.output_dir),
                           arguments.problem_id, 'problemDoc.json'),
              'x',
              encoding='utf8') as problem_file:
        if arguments.dataset_type == DatasetType.COUNTS_PER_USER:
            task_keywords = ['regression', 'multivariate']
            metric = {
                'metric': 'rootMeanSquaredError',
            }
            targets = [
                {
                    'targetIndex': 0,
                    'resID': 'learningData',
                    'colIndex': 2,
                    'colName': 'posts_count',
                },
                {
                    'targetIndex': 1,
                    'resID': 'learningData',
                    'colIndex': 3,
                    'colName': 'comments_count',
                },
            ]
        elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST:
            task_keywords = ['regression', 'univariate']
            metric = {
                'metric': 'rootMeanSquaredError',
            }
            targets = [
                {
                    'targetIndex': 0,
                    'resID': 'learningData',
                    'colIndex': 2,
                    'colName': 'comments_count',
                },
            ]
        elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST:
            task_keywords = ['classification', 'binary']
            metric = {
                'metric': 'f1',
                'posLabel': 'yes',
            }
            targets = [
                {
                    'targetIndex': 0,
                    'resID': 'learningData',
                    'colIndex': 3,
                    'colName': 'made_comment',
                },
            ]

        json.dump(
            {
                'about': {
                    'problemID': arguments.problem_id,
                    'problemName':
                    f"Database problem of type {arguments.dataset_type.name}",
                    'taskKeywords': task_keywords,
                    'problemSchemaVersion': '4.0.0',
                },
                'inputs': {
                    'data': [
                        {
                            'datasetID': arguments.dataset_id,
                            'targets': targets,
                        },
                    ],
                    'performanceMetrics': [
                        metric,
                    ],
                },
                'expectedOutputs': {
                    'predictionsFile': 'predictions.csv',
                    'scoresFile': 'scores.csv',
                },
            },
            problem_file,
            indent=2)
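
A minimal sketch of the generate-then-save flow the handler performs (paths and IDs here are illustrative; setting an id on the top-level metadata mirrors what update_metadata_main_resources does above):

import os
from d3m import container

resources = {'learningData': container.DataFrame({'d3mIndex': [0, 1], 'y': [1.0, 2.0]})}
dataset = container.Dataset(resources, generate_metadata=True)
dataset.metadata = dataset.metadata.update((), {'id': 'example_dataset', 'name': 'Example dataset'})
dataset.save('file://' + os.path.abspath('example_dataset/datasetDoc.json'))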
Example #10
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # If only one resource is in the dataset, we do not have anything to do.
        if inputs.metadata.query(())['dimension']['length'] == 1:
            return base.CallResult(inputs)

        main_resource_id = self.hyperparams['starting_resource']

        if main_resource_id is None:
            for resource_id in inputs.keys():
                if 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint' in inputs.metadata.query(
                    (resource_id, )).get('semantic_types', []):
                    main_resource_id = resource_id
                    break

        if main_resource_id is None:
            raise ValueError(
                "A Dataset with multiple resources without an entry point and no starting resource specified as a hyper-parameter."
            )

        main_data = inputs[main_resource_id]
        main_columns_length = inputs.metadata.query(
            (main_resource_id,
             metadata_base.ALL_ELEMENTS))['dimension']['length']

        # There is only one resource now.
        top_level_metadata = dict(inputs.metadata.query(()))
        top_level_metadata['dimension'] = dict(top_level_metadata['dimension'])
        top_level_metadata['dimension']['length'] = 1

        # !!! changed part: remove unloaded metadata to pass the check function
        metadata = inputs.metadata.clear(
            top_level_metadata, source=self).set_for_value(None, source=self)
        other_keys = [*inputs]
        other_keys.remove(main_resource_id)
        for each_key in other_keys:
            metadata = metadata.remove(selector=(each_key, ), recursive=True)
        # changed finished

        #metadata = inputs.metadata.clear(top_level_metadata, source=self).set_for_value(None, source=self)

        # Resource is not anymore an entry point.
        entry_point_metadata = dict(inputs.metadata.query(
            (main_resource_id, )))
        entry_point_metadata['semantic_types'] = [
            semantic_type
            for semantic_type in entry_point_metadata['semantic_types']
            if semantic_type !=
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'
        ]
        metadata = metadata.update((main_resource_id, ),
                                   entry_point_metadata,
                                   source=self)

        data = None

        for column_index in range(main_columns_length):
            column_metadata = inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS, column_index))

            if 'foreign_key' not in column_metadata:
                # We just copy over data and metadata.
                data, metadata = self._add_column(
                    main_resource_id, data, metadata,
                    self._get_column(main_data, column_index), column_metadata)
            else:
                assert column_metadata['foreign_key'][
                    'type'] == 'COLUMN', column_metadata

                if 'column_index' in column_metadata['foreign_key']:
                    data, metadata = self._join_by_index(
                        main_resource_id,
                        inputs,
                        column_index,
                        data,
                        metadata,
                        column_metadata['foreign_key']['resource_id'],
                        column_metadata['foreign_key']['column_index'],
                    )
                elif 'column_name' in column_metadata['foreign_key']:
                    data, metadata = self._join_by_name(
                        main_resource_id,
                        inputs,
                        column_index,
                        data,
                        metadata,
                        column_metadata['foreign_key']['resource_id'],
                        column_metadata['foreign_key']['column_name'],
                    )
                else:
                    assert False, column_metadata

        resources = {}
        resources[main_resource_id] = data

        # Number of columns had changed.
        all_rows_metadata = dict(
            inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS)))
        all_rows_metadata['dimension'] = dict(all_rows_metadata['dimension'])
        all_rows_metadata['dimension']['length'] = data.shape[1]
        metadata = metadata.update(
            (main_resource_id, metadata_base.ALL_ELEMENTS),
            all_rows_metadata,
            for_value=resources,
            source=self)

        # !!! changed part: load all dataset to resources
        '''
        other_keys = [*inputs]
        other_keys.remove(main_resource_id)
        for each_key in other_keys:
            metadata = metadata.remove(selector = (each_key,),recursive = True, source = resources)
        '''
        '''
        # this change only works for d3m v2018.6.5, for v2018.7.10, even the "metadata.remove" will check the resouces and metadata relationship: so we have to load all data to the resources before check/remove
        # !!! changed part: remove unloaded metadata to pass the check function
        other_keys = [*inputs]
        other_keys.remove(main_resource_id)
        for each_key in other_keys:
            metadata = metadata.remove(selector = (each_key,),recursive = True, source = resources)
        # changed finished
        '''
        metadata.check(resources)

        dataset = container.Dataset(resources, metadata)

        return base.CallResult(dataset)
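
In plain pandas terms, the foreign-key branch above amounts to a left merge of the main resource with the referenced resource (illustrative frames; the real primitive also rewrites the accompanying d3m metadata):

import pandas as pd

main = pd.DataFrame({"d3mIndex": [0, 1], "author_id": ["a", "b"]})
users = pd.DataFrame({"id": ["a", "b"], "name": ["Ann", "Bob"]})
denormalized = main.merge(users, left_on="author_id", right_on="id", how="left")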
Example #11
    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:

        # if this is a single resource dataset we don't need to reformat it
        if len(inputs) < 2:
            return base.CallResult(inputs)

        # find the main resource if supplied, infer if not
        main_resource_id, main_resource = base_utils.get_tabular_resource(
            inputs, self.hyperparams["main_resource_id"])
        if main_resource_id is None:
            raise exceptions.InvalidArgumentValueError(
                "no main resource specified")

        # find the csv file column resource if supplied, infer if not
        file_index = self.hyperparams["file_col_index"]
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, main_resource_id,
                                            file_index):
                raise exceptions.InvalidArgumentValueError(
                    "column idx=" + str(file_index) +
                    " from does not contain csv file names")
        else:
            file_index = self._find_csv_file_column(inputs.metadata,
                                                    main_resource_id)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    "no column from contains csv file names")

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_id,
                                        file_index)
        csv_paths = [
            os.path.join(base_path, local_path)
            for local_path in inputs[main_resource_id].iloc[:, file_index]
        ]
        new_dfs = [pd.read_csv(path) for path in csv_paths]
        original_dfs = [
            pd.DataFrame(
                np.tile(row, (df.shape[0], 1)),
                columns=inputs[main_resource_id].columns,
                index=df.index,
            ) for row, df in zip(inputs[main_resource_id].values, new_dfs)
        ]
        combined_dfs = [
            original_df.join(new_df)
            for original_df, new_df in zip(original_dfs, new_dfs)
        ]
        output_data = pd.concat(combined_dfs)
        timeseries_dataframe = container.DataFrame(output_data)
        timeseries_dataframe.reset_index(drop=True, inplace=True)

        # make sure that all timeseries have the same length, most downstream tasks will appreciate this.
        if self.hyperparams["equal_length"]:
            min_length = timeseries_dataframe.groupby(
                timeseries_dataframe.columns[file_index]).count().min().values[0]
            group_count = timeseries_dataframe.groupby(
                timeseries_dataframe.columns[file_index]).cumcount()
            timeseries_dataframe = timeseries_dataframe.assign(
                group_count=group_count)
            timeseries_dataframe = timeseries_dataframe[
                timeseries_dataframe["group_count"] < min_length]
            timeseries_dataframe = timeseries_dataframe.drop(["group_count"],
                                                             axis=1)

        # create a dataset to hold the result
        timeseries_dataset = container.Dataset(
            {self._resource_id: timeseries_dataframe}, generate_metadata=True)
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (), {"id": inputs.metadata.query(())["id"]})
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (), {"digest": inputs.metadata.query(())["digest"]})

        # copy main resource column metadata to timeseries dataframe
        num_main_resource_cols = inputs.metadata.query(
            (main_resource_id,
             metadata_base.ALL_ELEMENTS))["dimension"]["length"]
        for i in range(num_main_resource_cols):
            source = inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS, i))
            timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                i, source, at=(self._resource_id, ))

        # remove the foreign key entry from the filename column if it exists
        metadata = dict(
            timeseries_dataset.metadata.query(
                (self._resource_id, metadata_base.ALL_ELEMENTS, file_index)))
        metadata["foreign_key"] = metadata_base.NO_VALUE
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
            metadata)

        # copy timeseries column metadata to the timeseries if it's available in the metadata (which is not necessarily true anymore)
        source = self._find_timeseries_metadata(inputs)
        i = 0
        start_idx = 0
        if source is not None:
            for col_info in source["file_columns"]:
                timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                    i + num_main_resource_cols,
                    col_info,
                    at=(self._resource_id, ))
                i += 1
            # flag all other columns as attributes
            start_idx = i + num_main_resource_cols
        else:
            # loop over the appended time series columns
            start_idx = original_dfs[0].shape[1]

        for i in range(start_idx, timeseries_dataframe.shape[1]):
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )
            struct_type = timeseries_dataset.metadata.query(
                (self._resource_id, metadata_base.ALL_ELEMENTS,
                 i))["structural_type"]
            if struct_type == np.float64:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Float",
                    ))
            elif struct_type == np.int64:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Integer",
                    ))
            else:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Text",
                    ))

        # mark the filename column as a grouping key
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
            "https://metadata.datadrivendiscovery.org/types/GroupingKey",
        )

        # mark the d3mIndex as a primary multi-key since there are now multiple instances of the value present
        primary_index_col = (
            timeseries_dataset.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/PrimaryKey",
                 ),
                at=(self._resource_id, ),
            ))
        timeseries_dataset.metadata = timeseries_dataset.metadata.remove_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS,
             primary_index_col[0]),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS,
             primary_index_col[0]),
            "https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey",
        )

        return base.CallResult(timeseries_dataset)
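
The equal-length trimming used above, in isolation: cumcount numbers the rows within each group, so filtering on it keeps only the first min-length rows of every series (column names illustrative):

import pandas as pd

df = pd.DataFrame({"series": ["a", "a", "a", "b", "b"], "v": range(5)})
min_len = df.groupby("series")["v"].count().min()  # length of the shortest series
trimmed = df[df.groupby("series").cumcount() < min_len]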
Example #12
    def test_complex_value(self):
        self.maxDiff = None

        dataset = container.Dataset({
            '0': container.DataFrame({
                'A': [
                    container.ndarray(numpy.array(['a', 'b', 'c'])),
                    container.ndarray(numpy.array([1, 2, 3], dtype=numpy.int64)),
                    container.ndarray(numpy.array([1.0, 2.0, 3.0])),
                ],
                'B': [
                    container.List(['a', 'b', 'c']),
                    container.List([1, 2, 3]),
                    container.List([1.0, 2.0, 3.0]),
                ],
            }),
        }, generate_metadata=False)

        dataset_metadata = dataset.metadata.generate(dataset, compact=True)

        self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.dataset.Dataset',
                'dimension': {
                    'name': 'resources',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                    'length': 1,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'd3m.container.pandas.DataFrame',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'dimension': {
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                    'length': 3,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                    'length': 2,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'length': 3
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'structural_type': 'd3m.container.numpy.ndarray',
                'name': 'A',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'structural_type': 'd3m.container.list.List',
                'name': 'B',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 0, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.str_',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 0, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'str',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 1, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 1, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'int',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 2, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.float64',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 2, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'float',
            }
        }])

        dataset_metadata = dataset.metadata.generate(dataset, compact=False)

        self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'dimension': {
                    'length': 1,
                    'name': 'resources',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                },
               'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json',
               'structural_type': 'd3m.container.dataset.Dataset',
            },
        }, {
            'selector': ['0'],
            'metadata': {
                'dimension': {
                    'length': 3,
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                },
               'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
               'structural_type': 'd3m.container.pandas.DataFrame',
            },
        },
        {
            'selector': ['0', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'length': 2,
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                },
            },
        },
        {
            'selector': ['0', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'name': 'A',
            },
        },
        {
            'selector': ['0', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'name': 'B',
            },
        },
        {
            'selector': ['0', 0, 0],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        },
        {
            'selector': ['0', 0, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.str_'
            },
        },
        {
            'selector': ['0', 0, 1],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.list.List',
            },
        }, {
            'selector': ['0', 0, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'str',
            },
        }, {
            'selector': ['0', 1, 0],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        }, {
            'selector': ['0', 1, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['0', 1, 1],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.list.List',
            },
        }, {
            'selector': ['0', 1, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'int',
            },
        }, {
            'selector': ['0', 2, 0],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        }, {
            'selector': ['0', 2, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.float64',
            },
        },
        {
            'selector': ['0', 2, 1],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.list.List',
            },
        }, {
            'selector': ['0', 2, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'float',
            },
        }])
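
Note the shape difference between the two expected structures: with compact=True the generator merges selectors into __ALL_ELEMENTS__ wildcards wherever values agree across resources, rows, or columns, while compact=False enumerates concrete selectors such as ['0', 0, 1]. Both describe the same metadata.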
Example #13
    def test_dataset(self):
        dataframe = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
        dataframe.A = dataframe.A.astype(numpy.int64, copy=False)
        dataset = container.Dataset({'0': dataframe}, generate_metadata=False)

        compact_metadata = dataset.metadata.generate(dataset, compact=True)
        noncompact_metadata = dataset.metadata.generate(dataset, compact=False)

        self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.dataset.Dataset',
                'dimension': {
                    'name': 'resources',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                    'length': 1,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'd3m.container.pandas.DataFrame',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'dimension': {
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                    'length': 3,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                    'length': 2,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'name': 'A',
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'name': 'B',
                'structural_type': 'str',
            },
        }])

        self.assertEqual(utils.to_json_structure(noncompact_metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.dataset.Dataset',
                'dimension': {
                    'name': 'resources',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                    'length': 1,
                },
            },
        }, {
            'selector': ['0'],
            'metadata': {
                'structural_type': 'd3m.container.pandas.DataFrame',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'dimension': {
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                    'length': 3,
                },
            },
        }, {
            'selector': ['0', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                    'length': 2,
                },
            },
        }, {
            'selector': ['0', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'name': 'A',
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['0', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'name': 'B',
                'structural_type': 'str',
            },
        }])
Example #14
    def test_all_labels(self):
        truth = container.DataFrame([
            [3, 'happy-pleased'],
            [3, 'relaxing-calm'],
            [7, 'amazed-suprised'],
            [7, 'happy-pleased'],
            [13, 'quiet-still'],
            [13, 'sad-lonely'],
        ], columns=['d3mIndex', 'class_label'])

        truth_dataset = container.Dataset({'learningData': truth}, generate_metadata=True)

        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Target')
        truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')

        predictions = container.DataFrame([
            [3, 'happy-pleased'],
            [3, 'sad-lonely'],
            [7, 'amazed-suprised'],
            [7, 'happy-pleased'],
            [13, 'quiet-still'],
            [13, 'happy-pleased'],
        ], columns=['d3mIndex', 'class_label'], generate_metadata=True)

        hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
        metrics_class = hyperparams_class.configuration['metrics'].elements
        all_labels_class = hyperparams_class.configuration['all_labels'].elements
        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'HAMMING_LOSS',
                'pos_label': None,
                'k': None,
            })],
        }))

        scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value
        self.assertEqual(scores.values.tolist(), [
            ['HAMMING_LOSS', 0.26666666666666666, 0.7333333333333334],
        ])

        self.assertEqual(scores.metadata.query_column(0)['name'], 'metric')
        self.assertEqual(scores.metadata.query_column(1)['name'], 'value')
        self.assertEqual(scores.metadata.query_column(2)['name'], 'normalized')

        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'HAMMING_LOSS',
                'pos_label': None,
                'k': None,
            })],
            'all_labels': [all_labels_class({
                'column_name': 'class_label',
                'labels': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar'],
            })],
        }))

        scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value
        self.assertEqual(scores.values.tolist(), [
            ['HAMMING_LOSS', 0.2222222222222222, 0.7777777777777778],
        ])

        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'HAMMING_LOSS',
                'pos_label': None,
                'k': None,
            })],
            'all_labels': [all_labels_class({
                'column_name': 'class_label',
                'labels': ['happy-pleased', 'relaxing-calm', 'amazed-suprised'],
            })],
        }))

        with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'):
            primitive.produce(inputs=predictions, score_dataset=truth_dataset)

        truth_dataset.metadata = truth_dataset.metadata.update_column(1, {
            'all_distinct_values': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar'],
        }, at=('learningData',))

        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'HAMMING_LOSS',
                'pos_label': None,
                'k': None,
            })],
        }))

        scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value
        self.assertEqual(scores.values.tolist(), [
            ['HAMMING_LOSS', 0.2222222222222222, 0.7777777777777778],
        ])

        truth_dataset.metadata = truth_dataset.metadata.update_column(1, {
            'all_distinct_values': ['happy-pleased', 'relaxing-calm', 'amazed-suprised'],
        }, at=('learningData',))

        primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
            'metrics': [metrics_class({
                'metric': 'HAMMING_LOSS',
                'pos_label': None,
                'k': None,
            })],
        }))

        with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'):
            primitive.produce(inputs=predictions, score_dataset=truth_dataset)
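
Why the scores differ: HAMMING_LOSS here behaves as the fraction of mismatched cells in the binarized label matrix. Indices 3 and 13 each contribute two label mismatches, giving 4 mismatched cells over 3 samples; with the 5 labels that occur in the data that is 4 / (3 * 5) = 0.2667, and declaring the extra unused label 'foobar' widens the matrix to 6 columns, giving 4 / (3 * 6) = 0.2222.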