Example #1
    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata,
            hyperparams) -> List[OrderedDict]:
        """
        Output metadata of selected columns.
        Args:
            outputs_metadata: metadata_base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(
                outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set()
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
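
A minimal usage sketch (assumed, not part of the example): the returned per-column metadata is typically written back onto the predictions DataFrame one column at a time.

    # Hedged sketch; `outputs` is assumed to be a d3m container.DataFrame of
    # predictions and `target_columns_metadata` the list returned above.
    outputs_metadata = outputs.metadata
    for column_index, column_metadata in enumerate(target_columns_metadata):
        outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
    outputs.metadata = outputs_metadata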
Example #2
    def _update_metadata(
        self, metadata: metadata_base.DataMetadata,
        resource_id: metadata_base.SelectorSegment
    ) -> metadata_base.DataMetadata:
        resource_metadata = dict(metadata.query((resource_id, )))

        if 'structural_type' not in resource_metadata or not issubclass(
                resource_metadata['structural_type'], container.DataFrame):
            raise TypeError(
                "The Dataset resource is not a DataFrame, but \"{type}\".".
                format(type=resource_metadata.get('structural_type', None), ))

        resource_metadata.update(
            {
                'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            }, )

        new_metadata = metadata_base.DataMetadata(resource_metadata)

        new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

        # Resource is not anymore an entry point.
        new_metadata = new_metadata.remove_semantic_type(
            (),
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

        return new_metadata
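
A minimal calling sketch, assuming `inputs` is a d3m container.Dataset and 'learningData' is the id of the main table resource (both names are assumptions, not taken from the example):

    # Pull the resource out of the Dataset and re-root its metadata.
    resource_id = 'learningData'
    dataframe = inputs[resource_id]
    dataframe.metadata = self._update_metadata(inputs.metadata, resource_id)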
Example #3
    def _get_target_columns_metadata(
            self,
            outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(
                outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = list(column_metadata.get('semantic_types', []))
            if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types:
                semantic_types.append(
                    'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
                )
            semantic_types = [
                semantic_type for semantic_type in semantic_types
                if semantic_type !=
                'https://metadata.datadrivendiscovery.org/types/TrueTarget'
            ]
            column_metadata['semantic_types'] = semantic_types

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
Example #4
    def _copy_elements_metadata(
            cls,
            source_metadata: metadata_base.DataMetadata,
            selector_prefix: metadata_base.Selector,
            selector: metadata_base.Selector,
            target_metadata: metadata_base.DataMetadata,
            *,
            source: typing.Any = None) -> metadata_base.DataMetadata:

        if source is None:
            source = cls

        elements = source_metadata.get_elements(
            list(selector_prefix) + list(selector))

        for element in elements:
            new_selector = list(selector) + [element]
            metadata = source_metadata.query(
                list(selector_prefix) + new_selector)

            target_metadata = target_metadata.update(new_selector,
                                                     metadata,
                                                     source=source)
            target_metadata = cls._copy_elements_metadata(source_metadata,
                                                          selector_prefix,
                                                          new_selector,
                                                          target_metadata,
                                                          source=source)

        return target_metadata

    def _update_metadata(
        cls,
        metadata: metadata_base.DataMetadata,
        resource_id: metadata_base.SelectorSegment,
    ) -> metadata_base.DataMetadata:
        resource_metadata = dict(metadata.query((resource_id, )))

        if "structural_type" not in resource_metadata or not issubclass(
                resource_metadata["structural_type"], container.DataFrame):
            raise TypeError(
                'The Dataset resource is not a DataFrame, but "{type}".'.
                format(type=resource_metadata.get("structural_type", None), ))

        resource_metadata.update(
            {
                "schema": metadata_base.CONTAINER_SCHEMA_VERSION,
            }, )

        new_metadata = metadata_base.DataMetadata(resource_metadata)

        new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

        # Resource is not anymore an entry point.
        new_metadata = new_metadata.remove_semantic_type(
            (),
            "https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint")

        return new_metadata
Example #6
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        self._fitted = True
        categorical_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                ]
            )

        all_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
            )

        self._s_cols = container.List(set(all_attributes).intersection(categorical_attributes))
        _logger.debug("%d of categorical attributes found." % (len(self._s_cols)))

        if len(self._s_cols) > 0:
            # temp_model = defaultdict(LabelEncoder)
            # self._training_data.iloc[:, self._s_cols].apply(lambda x: temp_model[x.name].fit(x))
            # self._model = dict(temp_model)
            self._model = {}
            for col_index in self._s_cols:
                self._model[col_index] = self._training_data.iloc[:, col_index].dropna().unique()

        return CallResult(None, has_finished=True)
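
One plausible produce-side counterpart (a sketch under assumptions, not from the example): the per-column arrays of known categories stored in self._model can be used to flag values never seen during fit.

    def _replace_unseen(self, inputs):
        # Hedged sketch: keep values seen during fit; Series.where replaces
        # entries failing the condition with NaN by default. A real primitive
        # might instead introduce an explicit "unseen" category.
        outputs = inputs.copy()
        for col_index, known_values in self._model.items():
            column = outputs.iloc[:, col_index]
            outputs.iloc[:, col_index] = column.where(column.isin(known_values))
        return outputs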
Example #7
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
                                        outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        # If outputs has more columns than the given input indices, add the return semantic type to all remaining columns.
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata,
            hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(
                outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([
                "https://metadata.datadrivendiscovery.org/types/TrueTarget",
                "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            ])
            add_semantic_types = set([
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            ])
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
Example #9
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:

        if self._fitted:
            return CallResult(None)

        if self._input_data is None:
            raise ValueError('Missing training (fitting) data.')

        # Look at attribute columns only
        data = self._input_data.copy()
        all_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = DataMetadata.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]

        self._empty_columns = []
        _logger.debug(f'Numeric columns: {numeric}')
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
                if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                    _logger.debug(f'Empty numeric str column: {element}')
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i] and i not in self._empty_columns:
                _logger.debug(f'Empty column: {i}')
                self._empty_columns.append(i)

        _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))

        data = container.DataFrame.remove_columns(data, self._empty_columns)

        categorical_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata,
                                                                        semantic_types=[
                                                                            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                                                                            "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
        self._cat_columns = data.columns[self._cat_col_index].tolist()

        _logger.debug('Encoding columns: {}'.format(self._cat_columns))

        mapping = {}
        for column_name in self._cat_columns:
            col = data[column_name]
            temp = self._trim_features(col, self.hyperparams['n_limit'])
            if temp:
                mapping[temp[0]] = temp[1]
        self._mapping = mapping
        self._fitted = True
        return CallResult(None, has_finished=True)
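
A produce-side sketch (an assumption; the full primitive would also apply self._mapping): the empty columns learned in fit can be dropped from new data with the same helper used above.

    # Hedged sketch; `inputs` is assumed to be a container.DataFrame.
    outputs = container.DataFrame.remove_columns(inputs, self._empty_columns)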
Example #10
    def __get_fitted(self):
        attribute = DataMetadata.list_columns_with_semantic_types(
            self._train_x.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])

        # Mean for numerical columns

        self._numeric_columns = DataMetadata.list_columns_with_semantic_types(
            self._train_x.metadata,
            ['http://schema.org/Integer', 'http://schema.org/Float'])
        self._numeric_columns = [
            x for x in self._numeric_columns if x in attribute
        ]

        _logger.debug('numeric columns %s', str(self._numeric_columns))

        # Convert selected columns to_numeric, then compute column mean, then convert to_dict
        self.mean_values = self._train_x.iloc[:, self._numeric_columns].apply(
            lambda col: pd.to_numeric(col, errors='coerce')).mean(
                axis=0).to_dict()

        for name in self.mean_values.keys():
            if pd.isnull(self.mean_values[name]):
                self.mean_values[name] = 0.0

        # Mode for categorical columns
        self._categoric_columns = DataMetadata.list_columns_with_semantic_types(
            self._train_x.metadata, [
                'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                'http://schema.org/Boolean'
            ])
        self._categoric_columns = [
            x for x in self._categoric_columns if x in attribute
        ]

        _logger.debug('categorical columns %s', str(self._categoric_columns))

        mode_values = self._train_x.iloc[:, self._categoric_columns].mode(
            axis=0).iloc[0].to_dict()
        for name in mode_values.keys():
            if pd.isnull(mode_values[name]):
                # mode is nan
                rest = self._train_x[name].dropna()
                if rest.shape[0] == 0:
                    # every value is nan
                    mode = 0
                else:
                    mode = rest.mode().iloc[0]
                mode_values[name] = mode
        self.mean_values.update(mode_values)

        if self._verbose:
            import pprint
            print('mean imputation:')
            pprint.pprint(self.mean_values)

        _logger.debug('Mean values:')
        for name, value in self.mean_values.items():
            _logger.debug('  %s %s', name, str(value))

    def _get_base_path(self, inputs_metadata: metadata_base.DataMetadata,
                       res_id: str, column_index: int) -> str:
        # get the base uri from the referenced column
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        ref_col_index = column_metadata['foreign_key']['column_index']
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return inputs_metadata.query((ref_res_id, metadata_base.ALL_ELEMENTS,
                                      ref_col_index))['location_base_uris'][0]

    def _add_target_semantic_types(cls, metadata: metadata_base.DataMetadata,
                                   source: typing.Any, target_names: typing.List = None) -> metadata_base.DataMetadata:
        for column_index in range(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']):
            metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index),
                                                  'https://metadata.datadrivendiscovery.org/types/Target',
                                                  source=source)
            metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index),
                                                  'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
                                                  source=source)
            if target_names:
                metadata = metadata.update((metadata_base.ALL_ELEMENTS, column_index), {
                    'name': target_names[column_index],
                }, source=source)
        return metadata
Example #13
    def _produce_column_metadata(
        self, inputs_metadata: metadata_base.DataMetadata, column_index: int, read_files: typing.Sequence[typing.Any],
    ) -> metadata_base.DataMetadata:
        column_metadata = inputs_metadata.select_columns([column_index])
        column_metadata = column_metadata.update_column(0, {
            'structural_type': self._file_structural_type,
            # Clear metadata useful for filename columns.
            'location_base_uris': metadata_base.NO_VALUE,
            'media_types': metadata_base.NO_VALUE,
        })

        # It is not a filename anymore.
        column_metadata = column_metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/FileName')

        # At least one semantic type from listed semantic types should be set.
        semantic_types = column_metadata.query_column(0).get('semantic_types', [])
        if not set(semantic_types) & set(self._file_semantic_types):
            # Add the first one.
            column_metadata = column_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), self._file_semantic_types[0])

        for row_index, file in enumerate(read_files):
            # Copy metadata only if we have a container type.
            if isinstance(file, types.Container):
                column_metadata = file.metadata.copy_to(column_metadata, (), (row_index, 0))

        column_metadata = column_metadata.compact(['name', 'structural_type', 'media_types', 'location_base_uris', 'semantic_types'])

        return column_metadata
Example #14
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        numerical_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=["http://schema.org/Float", "http://schema.org/Integer"])

        all_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        self._s_cols = list(set(all_attributes).intersection(numerical_attributes))
        # print(" %d columns scaled" % (len(self._s_cols)))
        if len(self._s_cols) > 0:
            self._model.fit(self._training_data.iloc[:, self._s_cols])
            self._fitted = True
        else:
            self._fitted = False
        return CallResult(None, has_finished=True, iterations_done=1)
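
A matching produce-side sketch (assumed, not shown in the example; self._model is whatever scikit-learn scaler the primitive wraps):

    # Hedged sketch: transform only the numeric attribute columns selected in fit.
    if self._fitted and len(self._s_cols) > 0:
        inputs.iloc[:, self._s_cols] = self._model.transform(inputs.iloc[:, self._s_cols])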
Example #15
    def _reassign_boundaries(self, inputs_metadata: metadata_base.DataMetadata, columns: typing.List[int]) -> metadata_base.DataMetadata:
        """
        Moves metadata about boundaries from the filename column to the image object column.
        """

        outputs_metadata = inputs_metadata
        columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        for column_index in range(columns_length):
            column_metadata = outputs_metadata.query_column(column_index)

            if 'boundary_for' not in column_metadata:
                continue

            # TODO: Support also "column_name" boundary metadata.
            if 'column_index' not in column_metadata['boundary_for']:
                continue

            try:
                i = columns.index(column_metadata['boundary_for']['column_index'])
            except ValueError:
                continue

            outputs_metadata = outputs_metadata.update_column(column_index, {
                'boundary_for': {
                    # We know that "columns" were appended at the end.
                    'column_index': columns_length - len(columns) + i,
                }
            })

        return outputs_metadata
Example #16
    def set_training_data(self, *, inputs: Input) -> None:
        """
        Sets training data of this primitive.

        Parameters
        ----------
        inputs : Input
            The inputs.
        """
        attribute = DataMetadata.list_columns_with_semantic_types(
            inputs.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        nan_sum = 0
        for col in attribute:
            if str(inputs.dtypes[inputs.columns[col]]) != "object":
                nan_sum += inputs.iloc[:, col].isnull().sum()
            else:
                for i in range(inputs.shape[0]):
                    if inputs.iloc[i, col] == "" or pd.isnull(
                            inputs.iloc[i, col]):
                        nan_sum += 1
        if nan_sum == 0:  # no missing values exist
            if self._verbose:
                _logger.info('no missing values in the training dataset')

        self._train_x = inputs
        self._is_fitted = False
Example #17
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:

        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (str, )
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False
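
A hedged sketch of how such a predicate is usually driven (variable names are illustrative; `inputs` is assumed to be a container.DataFrame):

    columns_length = inputs.metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
    columns_to_produce = [
        column_index for column_index in range(columns_length)
        if cls._can_produce_column(inputs.metadata, column_index, hyperparams)
    ]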
Example #18
    def _add_column(
        self, main_resource_id: str, data: pandas.DataFrame,
        metadata: metadata_base.DataMetadata, column_data: pandas.DataFrame,
        column_metadata: typing.Dict
    ) -> typing.Tuple[pandas.DataFrame, metadata_base.DataMetadata]:

        assert column_data.shape[1] == 1

        if data is None:
            data = column_data
        else:
            # Align the new column to the existing row index before concatenating.
            column_data = column_data.set_index(data.index)
            data = pandas.concat([data, column_data], axis=1)

        metadata = metadata.update(
            (main_resource_id, metadata_base.ALL_ELEMENTS, data.shape[1] - 1),
            column_metadata,
            source=self)

        return data, metadata
Example #19
    def _add_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            # column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)

            # column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
Example #20
    def _can_use_column(
        self, inputs_metadata: metadata_base.DataMetadata, column_index: int
    ) -> bool:
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """

        column_metadata = inputs_metadata.query_column(column_index)

        semantic_types = column_metadata.get("semantic_types", [])

        # We detect only on columns which have no semantic types or where it is explicitly set as unknown.
        if (
            not semantic_types
            or "https://metadata.datadrivendiscovery.org/types/UnknownType"
            in semantic_types
        ):
            return True

        # A special case to handle setting "https://metadata.datadrivendiscovery.org/types/TrueTarget".
        if (
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget"
            in semantic_types
        ):
            return True

        return False

    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict()
            semantic_types = []
            semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = semantic_types
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
Example #22
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            bool
        """
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False
Example #23
    def _split_column(self, inputs):
        """
            Inner function to sample part of the column of the input dataset
        """
        input_dataset_shape = inputs[self._main_resource_id].shape
        # Find the target columns; we should not split these columns.
        target_column = DataMetadata.list_columns_with_semantic_types(
            self._training_inputs.metadata,
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
            at=(self._main_resource_id, ))
        if not target_column:
            self._logger.warn("No target column found from the input dataset.")
        index_column = DataMetadata.get_index_columns(
            self._training_inputs.metadata, at=(self._main_resource_id, ))
        if not index_column:
            self._logger.warn("No index column found from the input dataset.")

        outputs = copy.copy(inputs)
        if self._status is Status.TRAIN:
            # Check again using only the number of attribute columns;
            # we only need to sample when it exceeds the threshold.
            attribute_column_length = (input_dataset_shape[1] -
                                       len(index_column) - len(target_column))
            if attribute_column_length > self._threshold_column_length:
                attribute_column = set(range(input_dataset_shape[1]))
                for each_target_column in target_column:
                    attribute_column.remove(each_target_column)
                for each_index_column in index_column:
                    attribute_column.remove(each_index_column)

                # Randomly sample the column indices to keep, then sort them.
                self._column_remained = random.sample(
                    list(attribute_column), self._threshold_column_length)
                self._column_remained.extend(target_column)
                self._column_remained.extend(index_column)
                self._column_remained.sort()

        if len(self._column_remained) > 0:
            # Just to make sure.
            outputs.metadata = copy.deepcopy(inputs.metadata)
            outputs[self._main_resource_id] = inputs[
                self._main_resource_id].iloc[:, self._column_remained]
            outputs.metadata = self._select_columns_metadata(
                outputs.metadata, self._main_resource_id,
                self._column_remained)

        return outputs

    def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                              res_id: str) -> typing.Optional[int]:
        indices = inputs_metadata.list_columns_with_semantic_types(
            cls._semantic_types, at=(res_id, ))
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, res_id, i):
                return i
        return None

    def _get_date_cols(data):
        dates = DataMetadata.list_columns_with_semantic_types(
            data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Time"
            ])

        return dates
Example #26
    def _can_use_outputs_column(self,
                                outputs_metadata: metadata_base.DataMetadata,
                                column_index: int) -> bool:
        column_metadata = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get(
            'semantic_types', [])

    def _get_ref_resource(self, inputs_metadata: metadata_base.DataMetadata,
                          res_id: str, column_index: int) -> str:
        # get the referenced resource from the referenced column
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return ref_res_id

    def _parse_metadata(cls, *, metadata: metadata_module.DataMetadata):
        flatten = lambda l: [item for sublist in l for item in sublist]

        mdlu = cls._init_metadata_lookup()

        num_res = metadata.query(())['dimension']['length']
        resources = [str(x) for x in range(num_res - 1)]
        resources.append('learningData')
        primary_key = [[
            (res_id, metadata_module.ALL_ELEMENTS, col_id) for col_id in range(
                metadata.query((
                    res_id,
                    metadata_module.ALL_ELEMENTS))['dimension']['length'])
            if 'd3mIndex' == metadata.query((res_id,
                                             metadata_module.ALL_ELEMENTS,
                                             col_id))['name']
        ] for res_id in resources]
        primary_key = flatten(primary_key)
        if len(primary_key) != 1:
            raise Exception('Exactly one primary key is supported.')
        cls._update_metadata_lookup(mdlu, 'primary_key', primary_key[0])
        cls._update_metadata_lookup(mdlu, 'primary_resource_id',
                                    (primary_key[0][0], ))

        primary_resource_cols = metadata.query(
            (mdlu['primary_resource_id']['selector'][0],
             metadata_module.ALL_ELEMENTS))
        for col_id in range(primary_resource_cols['dimension']['length']):
            cmd = metadata.query((mdlu['primary_resource_id']['selector'][0],
                                  metadata_module.ALL_ELEMENTS, col_id))
            col_name = cmd['name']
            if 'semantic_types' in cmd:
                st = cmd['semantic_types']
                if 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in st:
                    cls._update_metadata_lookup(
                        mdlu,
                        'targets',
                        (mdlu['primary_resource_id']['selector'][0],
                         metadata_module.ALL_ELEMENTS, col_id))

        return mdlu if cls._valid_metadata_lookup(mdlu) else None
Example #29
    def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata,
                        column_index: int) -> bool:
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))
        semantic_type = column_metadata.get('semantic_types', None)

        if semantic_type is None:
            return False

        return 'http://schema.org/Integer' in semantic_type or 'http://schema.org/Float' in semantic_type
Example #30
    def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[OrderedDict]:

        target_columns_metadata: List[OrderedDict] = []
        for column_index in column_indices:
            column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata