Code Example #1
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        categorical_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            ])

        all_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ])

        self._s_cols = container.List(
            set(all_attributes).intersection(categorical_attributes))
        print("[INFO] %d of categorical attributes found." %
              (len(self._s_cols)))

        if len(self._s_cols) > 0:
            temp_model = defaultdict(LabelEncoder)
            self._training_data.iloc[:, self._s_cols].apply(
                lambda x: temp_model[x.name].fit(x))
            self._model = dict(temp_model)
            self._fitted = True
        else:
            self._fitted = False

        return CallResult(None, has_finished=True)
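
The core of this fit step is the defaultdict(LabelEncoder) idiom: applying fit column by column creates one encoder per column name. A minimal stand-alone sketch of the same pattern on a plain pandas DataFrame (the column names and values below are made up for illustration; the D3M metadata handling is omitted):

from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# toy frame standing in for self._training_data.iloc[:, self._s_cols]
df = pd.DataFrame({"color": ["red", "blue", "red"],
                   "size": ["S", "M", "M"]})

encoders = defaultdict(LabelEncoder)
# apply() passes each column as a Series; x.name keys the per-column encoder
df.apply(lambda x: encoders[x.name].fit(x))
model = dict(encoders)

# later, transform with the stored encoders
encoded = df.apply(lambda x: model[x.name].transform(x))
print(encoded)
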
Code Example #2
File: labler.py  Project: usc-isi-i2/dsbox-cleaning
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        self._fitted = True
        categorical_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            ])

        all_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ])

        self._s_cols = container.List(
            set(all_attributes).intersection(categorical_attributes))
        _logger.debug("%d of categorical attributes found." %
                      (len(self._s_cols)))

        if len(self._s_cols) > 0:
            # temp_model = defaultdict(LabelEncoder)
            # self._training_data.iloc[:, self._s_cols].apply(lambda x: temp_model[x.name].fit(x))
            # self._model = dict(temp_model)
            self._model = {}
            for col_index in self._s_cols:
                self._model[
                    col_index] = self._training_data.iloc[:, col_index].dropna(
                    ).unique()

        return CallResult(None, has_finished=True)
Code Example #3
    def __get_fitted(self):
        attribute = utils.list_columns_with_semantic_types(
            self._train_x.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])

        # Mean for numerical columns

        self._numeric_columns = utils.list_columns_with_semantic_types(
            self._train_x.metadata,
            ['http://schema.org/Integer', 'http://schema.org/Float'])
        self._numeric_columns = [
            x for x in self._numeric_columns if x in attribute
        ]

        _logger.debug('numeric columns %s', str(self._numeric_columns))

        # Convert selected columns to_numeric, then compute column mean, then convert to_dict
        self.mean_values = self._train_x.iloc[:, self._numeric_columns].apply(
            lambda col: pd.to_numeric(col, errors='coerce')).mean(
                axis=0).to_dict()

        for name in self.mean_values.keys():
            if pd.isnull(self.mean_values[name]):
                self.mean_values[name] = 0.0

        # Mode for categorical columns
        self._categoric_columns = utils.list_columns_with_semantic_types(
            self._train_x.metadata, [
                'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                'http://schema.org/Boolean'
            ])
        self._categoric_columns = [
            x for x in self._categoric_columns if x in attribute
        ]

        _logger.debug('categorical columns %s', str(self._categoric_columns))

        mode_values = self._train_x.iloc[:, self._categoric_columns].mode(
            axis=0).iloc[0].to_dict()
        for name in mode_values.keys():
            if pd.isnull(mode_values[name]):
                # mode is nan
                rest = self._train_x[name].dropna()
                if rest.shape[0] == 0:
                    # every value is nan
                    mode = 0
                else:
                    mode = rest.mode().iloc[0]
                mode_values[name] = mode
        self.mean_values.update(mode_values)

        if self._verbose:
            import pprint
            print('mean imputation:')
            pprint.pprint(self.mean_values)

        _logger.debug('Mean values:')
        for name, value in self.mean_values.items():
            _logger.debug('  %s %s', name, str(value))
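
The fill values collected here are the per-column mean for numeric attributes and the per-column mode for categorical ones, with all-NaN means falling back to 0.0. A small sketch of that computation with plain pandas (toy column names, no D3M metadata lookups):

import pandas as pd

df = pd.DataFrame({"age": [25, None, 40],
                   "height": [1.7, 1.6, None],
                   "city": ["NY", None, "NY"]})

numeric_cols = ["age", "height"]      # would come from the numeric semantic types
categoric_cols = ["city"]             # would come from the categorical semantic types

fill_values = (df[numeric_cols]
               .apply(lambda col: pd.to_numeric(col, errors="coerce"))
               .mean(axis=0)
               .to_dict())
# columns whose values are all missing get a NaN mean; fall back to 0.0
fill_values = {k: (0.0 if pd.isnull(v) else v) for k, v in fill_values.items()}

# mode() ignores NaN by default, so the first row holds the most frequent value
fill_values.update(df[categoric_cols].mode(axis=0).iloc[0].to_dict())
print(fill_values)   # {'age': 32.5, 'height': ~1.65, 'city': 'NY'}
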
Code Example #4
File: unfold.py  Project: byu-dml/dsbox-primitives
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        primary_key_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"]
        )

        unfold_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=self.hyperparams["unfold_semantic_types"]
        )

        if not primary_key_cols:
            warnings.warn("Did not find primary key column for grouping. Will not unfold")
            return CallResult(inputs)

        if not unfold_cols:
            warnings.warn("Did not find any column to unfold. Will not unfold")
            return CallResult(inputs)

        primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
        unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

        if self.hyperparams["use_pipeline_id_semantic_type"]:
            pipeline_id_cols = common_utils.list_columns_with_semantic_types(
                metadata=inputs.metadata,
                semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"]
            )

            if len(pipeline_id_cols) >= 2:
                warnings.warn("Multiple pipeline id columns found. Will use first.")

            if pipeline_id_cols:
                inputs = inputs.sort_values(primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
                self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
            else:
                warnings.warn(
                    "No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'")

        new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

        groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
            lambda x: container.List(x)).reset_index(drop=False)

        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata
        ret_df = self._update_metadata_dimension(df=ret_df)

        split_col_names = [inputs.columns[pos] for pos in unfold_cols]

        ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
        ret_df = common_utils.remove_columns(
            inputs=ret_df,
            column_indices=[ret_df.columns.get_loc(name) for name in split_col_names]
        )

        return CallResult(ret_df)
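
Stripped of the metadata bookkeeping, the unfold itself is a groupby on the primary key that collects each unfolded column into a list per group. A hedged sketch of just that aggregation with plain pandas and plain lists instead of container.List (the later _split_aggregated step that spreads those lists into separate columns is project-specific and not reproduced here):

import pandas as pd

df = pd.DataFrame({"d3mIndex": [0, 0, 1, 1],
                   "prediction": ["a", "b", "c", "d"]})

# one row per primary-key value; the unfolded column becomes a list
grouped = (df.groupby("d3mIndex")["prediction"]
             .aggregate(lambda x: list(x))
             .reset_index(drop=False))
print(grouped)
#    d3mIndex prediction
# 0         0     [a, b]
# 1         1     [c, d]
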
Code Example #5
File: ensemble_voting.py  Project: RqS/dsbox-cleaning
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        index_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
            ])
        if not index_col:
            warnings.warn(
                "Did not find primary key column. Cannot vote; returning the input unchanged")
            return CallResult(inputs)

        predict_target_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget"
            ])
        if not predict_target_col:
            warnings.warn(
                "Did not find PredictedTarget column. Cannot vote; returning the input unchanged"
            )
            return CallResult(inputs)

        df = inputs.copy()
        new_df = self._get_index_and_target_df(inputs=df,
                                               use_cols=index_col +
                                               predict_target_col)

        if self.hyperparams["ensemble_method"] == 'majority':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'max':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).max().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'min':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).min().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        return CallResult(self._update_metadata(df=ret_df))
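
The 'majority' branch relies on value_counts() returning values sorted by descending frequency, so .index[0] is the most common prediction within each primary-key group. A minimal sketch of that voting step on toy data, without the D3M containers and metadata:

import pandas as pd

# predictions from three pipelines for the same two rows
df = pd.DataFrame({"d3mIndex": [0, 0, 0, 1, 1, 1],
                   "prediction": ["cat", "cat", "dog", "dog", "dog", "cat"]})

majority = (df.groupby("d3mIndex")
              .agg(lambda x: x.value_counts().index[0])
              .reset_index(drop=False))
print(majority)
#    d3mIndex prediction
# 0         0        cat
# 1         1        dog
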
Code Example #6
File: encoder.py  Project: liangmuxin/dsbox-cleaning
    def fit(self, *, timeout: float = None, iterations: int = None) -> None:

        if self._fitted:
            return

        if self._input_data is None:
            raise ValueError('Missing training(fitting) data.')

        # Look at attribute columns only
        # print('fit in', self._input_data.columns)
        data = self._input_data.copy()
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
                if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i]:
                self._empty_columns.append(i)

        _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))

        data = utils.remove_columns(data, self._empty_columns, source='ISI DSBox Data Encoder')

        categorical_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata,
                                                                        semantic_types=[
                                                                            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                                                                            "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
        self._cat_columns = data.columns[self._cat_col_index].tolist()

        _logger.debug('Encoding columns: {}'.format(self._cat_columns))

        mapping = {}
        for column_name in self._cat_columns:
            col = data[column_name]
            temp = self._trim_features(col, self.hyperparams['n_limit'])
            if temp:
                mapping[temp[0]] = temp[1]
        self._mapping = mapping
        self._fitted = True
Code Example #7
    def set_training_data(self, *, inputs: Input) -> None:
        """
        Sets training data of this primitive.

        Parameters
        ----------
        inputs : Input
            The inputs.
        """
        attribute = utils.list_columns_with_semantic_types(
            inputs.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        nan_sum = 0
        for col in attribute:
            if str(inputs.dtypes[inputs.columns[col]]) != "object":
                nan_sum += inputs.iloc[:, col].isnull().sum()
            else:
                for i in range(inputs.shape[0]):
                    if inputs.iloc[i, col] == "" or pd.isnull(
                            inputs.iloc[i, col]):
                        nan_sum += 1
        if nan_sum == 0:  # no missing value exists
            if self._verbose:
                print("Warning: no missing value in train dataset")
                _logger.info('no missing value in train dataset')

        self._train_x = inputs
        self._is_fitted = False
Code Example #8
    def _get_date_cols(data):
        dates = common_utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Time"
            ])

        return dates
Code Example #9
 def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                           res_id: int) -> typing.Optional[int]:
     indices = utils.list_columns_with_semantic_types(inputs_metadata,
                                                      cls._semantic_types,
                                                      at=(res_id, ))
     for i in indices:
         if cls._is_csv_file_column(inputs_metadata, res_id, i):
             return i
     return None
Code Example #10
File: vertical_concat.py  Project: RqS/dsbox-cleaning
 def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
     new_df = pd.concat([x for x in inputs], ignore_index=self.hyperparams["ignore_index"])
     if self.hyperparams["sort_on_primary_key"]:
         primary_key_col = common_utils.list_columns_with_semantic_types(metadata=new_df.metadata, semantic_types=[
             "https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
         if not primary_key_col:
             warnings.warn("No PrimaryKey column found. Will not sort on PrimaryKey")
             return CallResult(self._update_metadata(new_df))
         new_df = new_df.sort_values([new_df.columns[pos] for pos in primary_key_col])
     return CallResult(self._update_metadata(new_df))
Code Example #11
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        numerical_attributes = utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "http://schema.org/Float", "http://schema.org/Integer"
            ])

        all_attributes = utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ])
        self._s_cols = list(
            set(all_attributes).intersection(numerical_attributes))
        # print(" %d columns scaled" % (len(self._s_cols)))
        if len(self._s_cols) > 0:
            self._model.fit(self._training_data.iloc[:, self._s_cols])
            self._fitted = True
        else:
            self._fitted = False

        return CallResult(None, has_finished=True)
Code Example #12
File: splitter.py  Project: usc-isi-i2/dsbox-cleaning
    def _split_column(self, inputs):
        """
            Inner function to sample part of the column of the input dataset
        """
        input_dataset_shape = inputs[self._main_resource_id].shape
        # find target column, we should not split these column
        target_column = utils.list_columns_with_semantic_types(self._training_inputs.metadata, ['https://metadata.datadrivendiscovery.org/types/TrueTarget'], at=(self._main_resource_id,))
        if not target_column:
            self._logger.warning("No target column found in the input dataset.")
        index_column = utils.get_index_columns(self._training_inputs.metadata, at=(self._main_resource_id,))
        if not index_column:
            self._logger.warning("No index column found in the input dataset.")

        outputs = copy.copy(inputs)
        if self._status is Status.TRAIN:
            # check again on the number of attribute columns only;
            # we only need to sample when the attribute column count exceeds the threshold
            attribute_column_length = (input_dataset_shape[1] - len(index_column) - len(target_column))
            if attribute_column_length > self._threshold_column_length:
                attribute_column = set(range(input_dataset_shape[1]))
                for each_target_column in target_column:
                    attribute_column.remove(each_target_column)
                for each_index_column in index_column:
                    attribute_column.remove(each_index_column)

                # randomly choose which attribute columns to keep, then sort the indices
                self._column_remained = random.sample(sorted(attribute_column), self._threshold_column_length)
                self._column_remained.extend(target_column)
                self._column_remained.extend(index_column)
                self._column_remained.sort()
            # use common primitive's RemoveColumnsPrimitive inner function to finish sampling

        if len(self._column_remained) > 0: 
            # Just to make sure.
            outputs.metadata = inputs.metadata.set_for_value(outputs, generate_metadata=False)
            outputs[self._main_resource_id] = inputs[self._main_resource_id].iloc[:, self._column_remained]
            outputs.metadata = RemoveColumnsPrimitive._select_columns_metadata(outputs.metadata, self._main_resource_id, self._column_remained)

        return outputs
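
The sampling logic reduces to: take all column indices, set aside the index and target columns, randomly keep at most the threshold number of attribute columns, then add the index and target columns back and sort. A self-contained sketch of that selection (the column counts, roles, and threshold below are made-up values):

import random

n_columns = 10
index_columns = [0]       # e.g. the d3mIndex column
target_columns = [9]      # e.g. the TrueTarget column
threshold = 4             # keep at most this many attribute columns

attribute_columns = set(range(n_columns)) - set(index_columns) - set(target_columns)

if len(attribute_columns) > threshold:
    kept = random.sample(sorted(attribute_columns), threshold)
else:
    kept = list(attribute_columns)

kept.extend(target_columns)
kept.extend(index_columns)
kept.sort()
print(kept)   # e.g. [0, 2, 3, 5, 7, 9]
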
Code Example #13
    def __iterativeRegress(self, data, iterations):
        '''
        init with simple imputation, then apply regression to impute iteratively
        '''
        # for now, cancel the evaluation part for iterativeRegress
        # is_eval = False
        # if (label_col_name==None or len(label_col_name)==0):
        #     is_eval = False
        # else:
        #     is_eval = True

        # indices for numeric attribute columns only
        attribute = utils.list_columns_with_semantic_types(
            data.metadata, ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in attribute]

        keys = data.keys()
        missing_col_id = []
        numeric_data = data.iloc[:, numeric].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))
        data = mvp.df2np(numeric_data, missing_col_id, self._verbose)

        # Impute numerical attributes only
        missing_col_id = [x for x in missing_col_id if x in numeric]
        missing_col_data = data[:, missing_col_id]

        # If all values in a column are missing, set that column to zero
        all_missing = np.sum(np.isnan(missing_col_data), axis=0) == missing_col_data.shape[0]
        for col, col_missing in enumerate(all_missing):
            if col_missing:
                missing_col_data[:, col] = 0

        imputed_data = np.zeros([data.shape[0], len(missing_col_id)])
        imputed_data_lastIter = missing_col_data
        # coeff_matrix = np.zeros([len(missing_col_id), data.shape[1]-1]) #coefficient vector for each missing value column
        model_list = [None] * len(missing_col_id)     # store the regression model
        epoch = iterations
        counter = 0
        # mean init all missing-value columns
        init_imputation = ["mean"] * len(missing_col_id)
        next_data = mvp.imputeData(data, missing_col_id, init_imputation, self._verbose)

        while (counter < epoch):
            for i in range(len(missing_col_id)):
                target_col = missing_col_id[i]
                next_data[:, target_col] = missing_col_data[:, i]  # recover the column that to be imputed

                data_clean, model_list[i] = mvp.bayeImpute(next_data, target_col, self._verbose)
                next_data[:, target_col] = data_clean[:, target_col]    # update bayesian imputed column
                imputed_data[:, i] = data_clean[:, target_col]    # add the imputed data

                # if (is_eval):
                #     self.__evaluation(data_clean, label)

            # if (counter > 0):
            #     distance = np.square(imputed_data - imputed_data_lastIter).sum()
            #     if self._verbose: print("changed distance: {}".format(distance))
            imputed_data_lastIter = np.copy(imputed_data)
            counter += 1
        data[:, missing_col_id] = imputed_data_lastIter
        # convert model_list to dict
        model_dict = {}
        for i in range(len(model_list)):
            model_dict[keys[missing_col_id[i]]] = model_list[i]

        return data, model_dict
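
The loop above is a hand-rolled form of iterative model-based imputation: initialise the missing entries with column means, then repeatedly regress each incomplete column on the others. scikit-learn ships the same idea as IterativeImputer; the rough analogue below is not the mvp.bayeImpute code itself, just an illustration of the technique on a toy array:

import numpy as np
# IterativeImputer is still flagged experimental, hence the enabling import
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

X = np.array([[1.0, 2.0, np.nan],
              [3.0, np.nan, 6.0],
              [5.0, 6.0, 9.0],
              [7.0, 8.0, 12.0]])

# mean initialisation plus per-column Bayesian ridge regression, iterated
imputer = IterativeImputer(estimator=BayesianRidge(),
                           initial_strategy="mean",
                           max_iter=10,
                           random_state=0)
print(imputer.fit_transform(X))
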
Code Example #14
 def _find_real_vector_column(
         cls, inputs_metadata: metadata_base.DataMetadata
 ) -> typing.Optional[int]:
     indices = utils.list_columns_with_semantic_types(
         inputs_metadata, cls._semantic_types)
     return indices[0] if len(indices) > 0 else None
Code Example #15
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        precond: run fit() before

        Parameters:
        ----------
        data: pandas dataframe
        """

        if (not self._is_fitted):
            # todo: specify a NotFittedError, like in sklearn
            raise ValueError("Calling produce before fitting.")

        # if (pd.isnull(inputs).sum().sum() == 0):    # no missing value exists
        #     if self._verbose: print ("Warning: no missing value in test dataset")
        #     self._has_finished = True
        #     return CallResult(inputs, self._has_finished, self._iterations_done)

        if (timeout is None):
            timeout = 2**31 - 1

        if isinstance(inputs, pd.DataFrame):
            data = inputs.copy()
        else:
            data = inputs[0].copy()

        # setup the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

            # start completing data...
            if self._verbose:
                print("=========> impute by mean value of the attribute:")

            data.iloc[:, self._numeric_columns] = data.iloc[:, self._numeric_columns].apply(
                lambda col: pd.to_numeric(col, errors='coerce'))

            # assume the features of testing data are same with the training data
            # therefore, only use the mean_values to impute, should get a clean dataset
            attribute = utils.list_columns_with_semantic_types(
                data.metadata,
                ['https://metadata.datadrivendiscovery.org/types/Attribute'])
            for col in attribute:
                if str(data.dtypes[data.columns[col]]) != "object":
                    data.iloc[:, col].fillna(
                        self.mean_values[data.columns[col]], inplace=True)
                else:
                    for i in range(data.shape[0]):
                        if data.iloc[i, col] == "" or pd.isnull(data.iloc[i, col]):
                            data.iloc[i, col] = self.mean_values[data.columns[col]]
            data_clean = data

            # Update metadata
            for col in self._numeric_columns:
                old_metadata = dict(
                    data_clean.metadata.query((mbase.ALL_ELEMENTS, col)))
                dtype = data_clean.iloc[:, col].dtype
                if str(dtype).lower().startswith("int"):
                    if "http://schema.org/Integer" not in old_metadata['semantic_types']:
                        old_metadata['semantic_types'] += ("http://schema.org/Integer",)
                    old_metadata["structural_type"] = int
                elif str(dtype).lower().startswith("float"):
                    if "http://schema.org/Float" not in old_metadata['semantic_types']:
                        old_metadata['semantic_types'] += ("http://schema.org/Float",)
                    old_metadata["structural_type"] = float

                data_clean.metadata = data_clean.metadata.update(
                    (mbase.ALL_ELEMENTS, col), old_metadata)

        value = None
        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._has_finished = True
            self._iterations_done = True
            value = data_clean
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            _logger.warn('Produce timed out')
            self._has_finished = False
            self._iterations_done = False
        return CallResult(value, self._has_finished, self._iterations_done)
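
The timeout handling in this produce follows the stopit pattern used throughout these primitives: run the work inside a ThreadingTimeout context, then inspect its state to decide whether the call finished. A minimal sketch of just that scaffolding (the body of the context is a placeholder, not the imputation code):

import time

import stopit

def run_with_timeout(timeout_seconds=5.0):
    result = None
    with stopit.ThreadingTimeout(timeout_seconds) as to_ctx_mgr:
        assert to_ctx_mgr.state == to_ctx_mgr.EXECUTING
        time.sleep(0.1)          # placeholder for the real work
        result = "imputed data"

    if to_ctx_mgr.state == to_ctx_mgr.EXECUTED:
        return result, True      # finished in time
    return None, False           # timed out (or was interrupted)

print(run_with_timeout())
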
Code Example #16
File: greedy.py  Project: usc-isi-i2/dsbox-cleaning
    def __imputationGreedySearch(self, data, label):
        """
        running greedy search for imputation combinations
        """

        # indices for numeric attribute columns only
        attribute = utils.list_columns_with_semantic_types(
            data.metadata, ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        d3m_index = utils.list_columns_with_semantic_types(
            data.metadata, ['https://metadata.datadrivendiscovery.org/types/PrimaryKey'])
        numeric = [x for x in numeric if x in attribute]

        col_names = data.keys()
        # 1. convert to np array and get missing value column id
        missing_col_id = []
        data = mvp.df2np(data, missing_col_id, self._verbose)

        # Impute numerical attribute columns only. Should consider imputing categorical attributes too.
        missing_col_id = [x for x in missing_col_id if x in numeric]

        label = label.values

        # init for the permutation
        permutations = [0] * len(missing_col_id)   # same length as missing_col_id; each value is an index into self._imputation_strategies
        pos = len(permutations) - 1
        min_score = float("inf")
        max_score = -float("inf")
        max_strategy_id = 0
        best_combo = [0] * len(missing_col_id)  # init for best combo

        # greedy search for the best permutation
        iteration = 1
        while (iteration > 0):
            for i in range(len(permutations)):
                max_strategy_id = permutations[i]

                for strategy in range(len(self._imputation_strategies)):
                    permutations[i] = strategy
                    imputation_list = [self._imputation_strategies[x] for x in permutations]

                    data_clean = mvp.imputeData(data, missing_col_id, imputation_list, self._verbose)
                    if self._verbose:
                        print("for the missing value imputation combination: {} ".format(permutations))
                    score = self.__evaluation(data_clean, label)
                    if (score > max_score):
                        max_score = score
                        max_strategy_id = strategy
                        best_combo = list(permutations)
                    min_score = min(score, min_score)

                permutations[i] = max_strategy_id

            iteration -= 1

        if self._verbose:
            print("max score is {}, min score is {}\n".format(max_score, min_score))
            print("and the best score is given by the imputation combination: ")

        best_imputation = {}    # key: col_name; value: imputation strategy
        for i in range(len(best_combo)):
            best_imputation[col_names[missing_col_id[i]]] = self._imputation_strategies[best_combo[i]]
            if self._verbose:
                print(self._imputation_strategies[best_combo[i]] + " for the column {}".format(col_names[missing_col_id[i]]))

        return best_imputation
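
The greedy search sweeps one column at a time, tries every imputation strategy for that column while the others are held at their current best, keeps the highest-scoring choice, and moves on. A compact sketch of the same loop built on SimpleImputer and an arbitrary scoring callback (the scorer and data here are placeholders, not the mvp/evaluation code from this project):

import numpy as np
from sklearn.impute import SimpleImputer

strategies = ["mean", "median", "most_frequent"]

def greedy_impute_search(X, missing_cols, score_fn):
    """Pick one strategy per missing column, greedily maximising score_fn."""
    best = [0] * len(missing_cols)                  # strategy index per column
    for i, col in enumerate(missing_cols):
        best_score = -np.inf
        for s in range(len(strategies)):
            candidate = list(best)
            candidate[i] = s
            X_filled = X.copy()
            for j, c in enumerate(missing_cols):
                imp = SimpleImputer(strategy=strategies[candidate[j]])
                X_filled[:, [c]] = imp.fit_transform(X[:, [c]])
            score = score_fn(X_filled)
            if score > best_score:
                best_score = score
                best[i] = s
    return {col: strategies[s] for col, s in zip(missing_cols, best)}

# toy usage with a stand-in scorer (negative spread around the column means)
X = np.array([[1.0, np.nan], [2.0, 4.0], [np.nan, 6.0], [4.0, 8.0]])
best_imputation = greedy_impute_search(
    X, missing_cols=[0, 1],
    score_fn=lambda Xf: -np.square(Xf - Xf.mean(axis=0)).sum())
print(best_imputation)
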
Code Example #17
    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        """
        precond: run fit() before

        Completes the data based on the learned parameters. Supports:
        -> greedy search

        It also supports the untrainable methods:
        -> iterative regression
        -> other

        Parameters:
        ----------
        data: pandas dataframe
        label: pandas series, used for the evaluation of imputation

        TODO:
        ----------
        1. add evaluation part for __simpleImpute()

        """

        # inputs = inputs.convert_objects(convert_numeric=True)
        attribute = utils.list_columns_with_semantic_types(
            inputs.metadata, ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        numeric = utils.list_columns_with_semantic_types(
            inputs.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in attribute]

        # keys = data.keys()
        # missing_col_id = []
        inputs = inputs.iloc[:, numeric].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))
        # data = mvp.df2np(numeric_data, missing_col_id, self._verbose)

        for i in numeric:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            old_metadata["structural_type"] = inputs.iloc[:, i].values.dtype.type
            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, i), old_metadata)

        # Impute numerical attributes only

        if (not self._is_fitted):
            # todo: specify a NotFittedError, like in sklearn
            raise ValueError("Calling produce before fitting.")

        if (pd.isnull(inputs).sum().sum() == 0):    # no missing value exists
            if self._verbose:
                print("Warning: no missing value in test dataset")
            self._has_finished = True
            return CallResult(inputs, self._has_finished, self._iterations_done)

        if (timeout is None):
            timeout = 2**31 - 1
        if (iterations is None):
            self._iterations_done = True
            iterations = 30  # only works for iteratively_regre method

        data = inputs.copy()
        # record keys:
        keys = data.keys()
        index = data.index

        # setup the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

            # start completing data...
            if self._verbose:
                print("=========> iteratively regress method:")
            data_clean = self.__regressImpute(data, self._best_imputation, iterations)
        value = None
        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._is_fitted = True
            self._has_finished = True
            value = pd.DataFrame(data_clean, index, keys)
            value = container.DataFrame(value)
            value.metadata = data.metadata
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            print("Timed Out...")
            self._is_fitted = False
            self._has_finished = False
            self._iterations_done = False
        return CallResult(value, self._has_finished, self._iterations_done)
Code Example #18
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Need training data from set_training_data first.
        The encoder records the columns to encode and their column values so that
        they can be unary encoded later in the produce step.
        """
        if self._fitted:
            return CallResult(None, has_finished=True, iterations_done=1)

        if self._training_inputs is None:
            raise ValueError('Missing training(fitting) data.')

        data = self._training_inputs.copy()
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ())==str:
                if pd.isnull(pd.to_numeric(data.iloc[:,element], errors='coerce')).sum() == data.shape[0]:
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i]:
                self._empty_columns.append(i)
        self._empty_columns = list(set(self._empty_columns))
        self._empty_columns.reverse()
        self._empty_columns = container.List(self._empty_columns)
        data = utils.remove_columns(data, self._empty_columns)
        # print('fit', data.shape)

        categorical_attributes = utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                ]
            )
        all_attributes = utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
            )
        self._cat_col_index = container.List(set(all_attributes).intersection(numeric))
        self._cat_columns = container.List(data.columns[self._cat_col_index].tolist())
        numerical_values = data.iloc[:, self._cat_col_index].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))

        self._all_columns = set(data.columns)

        # mapping
        idict = {}
        for name in self._cat_columns:
            col = numerical_values[name]
            idict[name] = sorted(col.unique())
        self._mapping = idict

        if self._text2int:
            texts = data.drop(self._mapping.keys(),axis=1)
            texts = texts.select_dtypes(include=[object])
            le = Label_encoder()
            le.fit_pd(texts)
            self._textmapping = le.get_params()

        # determine whether to run unary encoder on the given column or not
        data_enc = data.iloc[:, self._cat_col_index].apply(lambda col: pd.to_numeric(col, errors='coerce'))
        for column_name in data_enc:
            col = data_enc[column_name]
            col.is_copy = False
            # only apply the unary encoder when the column has at most 12 unique numerical values
            if col.unique().shape[0] < 13:
                self._requirement[column_name] = True
            else:
                self._requirement[column_name] = False

        self._fitted = True

        return CallResult(None, has_finished=True, iterations_done=1)
Code Example #19
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        # make sure the target column is of a valid type
        target_idx = self.hyperparams['target_col_index']
        if not self._can_use_column(inputs.metadata, target_idx):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(target_idx) + ' from ' +
                str(inputs.columns) +
                ' does not contain continuous or discrete type')

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)['semantic_types']
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:, target_idx]

        # drop features that are not compatible with ranking
        feature_indices = set(
            utils.list_columns_with_semantic_types(inputs.metadata,
                                                   self._semantic_types))
        role_indices = set(
            utils.list_columns_with_semantic_types(inputs.metadata,
                                                   self._roles))
        feature_indices = feature_indices.intersection(role_indices)

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(feature_indices)
        skipped_indices.add(target_idx)  # drop the target too
        for i, v in enumerate(skipped_indices):
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        discrete_indices = utils.list_columns_with_semantic_types(
            inputs.metadata, self._discrete_types)
        discrete_flags = [False] * feature_df.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in feature_df:
                col_idx = feature_df.columns.get_loc(col_name)
                discrete_flags[col_idx] = True

        target_np = target_df.values
        feature_np = feature_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = None
        if discrete:
            ranked_features_np = mutual_info_classif(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)
        else:
            ranked_features_np = mutual_info_regression(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df)

        cols = ['idx', 'name', 'rank']
        results = container.DataFrame(data=data, columns=cols)
        results = results.sort_values(by=['rank'],
                                      ascending=False).reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(results)
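
The ranking itself is scikit-learn's mutual information estimate: discrete_features flags which columns are discrete, and the target type picks mutual_info_classif or mutual_info_regression. A small stand-alone sketch with made-up feature names and synthetic data:

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

rng = np.random.RandomState(0)
features = pd.DataFrame({
    "informative": rng.randint(0, 2, size=200),   # discrete, drives the target
    "noise": rng.normal(size=200),                # continuous, unrelated
})
# the target mostly copies the first feature, with 10% label noise
flip = rng.rand(200) < 0.1
target = np.where(flip, 1 - features["informative"].values, features["informative"].values)

scores = mutual_info_classif(features.values, target,
                             discrete_features=[True, False],
                             random_state=0)
ranking = (pd.DataFrame({"name": features.columns, "rank": scores})
             .sort_values(by="rank", ascending=False)
             .reset_index(drop=True))
print(ranking)   # "informative" should rank above "noise"
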
Code Example #20
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        index_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
            ])
        if not index_col:
            warnings.warn(
                "Did not find primary key column. Cannot vote; returning the input unchanged")
            return CallResult(inputs)

        predict_target_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget"
            ])
        if not predict_target_col:
            warnings.warn(
                "Did not find PredictedTarget column. Cannot vote; returning the input unchanged"
            )
            return CallResult(inputs)

        df = inputs.copy()
        # temporary fix for index type problem
        # fix data type to be correct here
        for each_col in index_col:
            col_semantic_type = df.metadata.query(
                (ALL_ELEMENTS, each_col))['semantic_types']
            if 'http://schema.org/Integer' in col_semantic_type and df[
                    df.columns[each_col]].dtype == 'O':
                df[df.columns[each_col]] = df[df.columns[each_col]].astype(int)

        new_df = self._get_index_and_target_df(inputs=df,
                                               use_cols=index_col +
                                               predict_target_col)

        if self.hyperparams["ensemble_method"] == 'majority':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'max':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).max().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'min':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).min().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        return CallResult(self._update_metadata(df=ret_df))
Code Example #21
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
            inputs,
            self.hyperparams["dataframe_resource"])  # get attribute columns

        hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()
            ["primitive_code"]["class_type_arguments"]["Hyperparams"])
        primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults())

        dataframe_meta = primitive.produce(inputs=inputs).value

        attributes = list_columns_with_semantic_types(
            metadata=dataframe_meta.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ],
        )

        base_file_path = "/".join(
            inputs.metadata._current_metadata.metadata["location_uris"]
            [0].split("/")[:-1])
        edge_list = pd.read_csv(os.path.join(base_file_path, "graphs",
                                             "edgeList.csv"),
                                index_col=0)
        if len(edge_list.columns) > 2:
            graph = nx.from_pandas_edgelist(
                edge_list,
                source=edge_list.columns[0],
                target=edge_list.columns[1],
                edge_attr=edge_list.columns[2],
            )
        else:
            graph = nx.from_pandas_edgelist(edge_list,
                                            source=edge_list.columns[0],
                                            target=edge_list.columns[1])

        if len(attributes) > 1:
            # add attributes to the nodes
            attribute_node_map = dataframe_meta[
                dataframe_meta.columns[attributes]]
            attribute_node_map["nodeID"] = attribute_node_map["nodeID"].astype(
                int)
            attribute_node_map.index = attribute_node_map["nodeID"]
            attribute_cols = attribute_node_map.columns
            attribute_node_map.drop(["nodeID"], axis=1)
            attribute_node_map = attribute_node_map.to_dict(orient="index")

            for i in graph.nodes:
                default = {attribute: 0 for attribute in attribute_cols}
                default["nodeID"] = i
                graph.nodes[i].update(attribute_node_map.get(i, default))

        else:
            # featurizer expects at a minimum nodeids to be present
            for i in graph.nodes:
                default = {}
                default["nodeID"] = i
                graph.nodes[i].update(default)
        # int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
        # graph = nx.relabel_nodes(graph, mapping=int2str_map)

        dataframe.metadata = self._update_metadata(inputs.metadata,
                                                   dataframe_resource_id)

        assert isinstance(dataframe, container.DataFrame), type(dataframe)

        U_train = {"graph": graph}
        y_train = self.produce_target(inputs=inputs).value
        X_train = dataframe  # TODO use attribute in vertex classification

        X_train = self._typify_dataframe(X_train)
        X_train.value = pd.DataFrame(X_train.value["nodeID"])
        return base.CallResult([X_train, y_train, U_train])
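
The graph construction here is standard networkx: from_pandas_edgelist builds the graph from the first two columns of edgeList.csv, optionally carrying a third column as an edge attribute, and node attributes are then pushed in from a per-node dict. A self-contained sketch of that pattern with a toy edge list and attribute table:

import networkx as nx
import pandas as pd

edge_list = pd.DataFrame({"source": [0, 0, 1],
                          "target": [1, 2, 2],
                          "weight": [0.5, 1.0, 0.7]})

# the third column becomes an edge attribute, mirroring the len(columns) > 2 branch
graph = nx.from_pandas_edgelist(edge_list,
                                source="source",
                                target="target",
                                edge_attr="weight")

# per-node attributes keyed by nodeID, with a zero default for nodes missing from the table
attributes = pd.DataFrame({"nodeID": [0, 1], "some_feature": [3, 1]}).set_index("nodeID")
attribute_map = attributes.to_dict(orient="index")
for node in graph.nodes:
    graph.nodes[node].update(attribute_map.get(node, {"some_feature": 0}))
    graph.nodes[node]["nodeID"] = node

print(graph.nodes(data=True))
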