Example no. 1
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        self._fitted = True
        categorical_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                ]
            )

        all_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
            )

        self._s_cols = container.List(set(all_attributes).intersection(categorical_attributes))
        _logger.debug("%d of categorical attributes found." % (len(self._s_cols)))

        if len(self._s_cols) > 0:
            # temp_model = defaultdict(LabelEncoder)
            # self._training_data.iloc[:, self._s_cols].apply(lambda x: temp_model[x.name].fit(x))
            # self._model = dict(temp_model)
            self._model = {}
            for col_index in self._s_cols:
                self._model[col_index] = self._training_data.iloc[:, col_index].dropna().unique()

        return CallResult(None, has_finished=True)
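For reference, a minimal standalone sketch of the call pattern this fit() relies on, assuming the d3m core package is available; the toy DataFrame and the semantic types attached to it are purely illustrative.

import pandas as pd
from d3m import container
from d3m.metadata import base as metadata_base

# Build a small container DataFrame; generate_metadata=True creates per-column metadata.
df = container.DataFrame(
    pd.DataFrame({"color": ["red", "blue", "red"], "size": [1.0, 2.0, 3.0]}),
    generate_metadata=True)

# Mark both columns as attributes and the first one as categorical.
for col in (0, 1):
    df.metadata = df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, col),
        "https://metadata.datadrivendiscovery.org/types/Attribute")
df.metadata = df.metadata.add_semantic_type(
    (metadata_base.ALL_ELEMENTS, 0),
    "https://metadata.datadrivendiscovery.org/types/CategoricalData")

# List the columns carrying a given semantic type, as the fit() above does.
categorical = df.metadata.list_columns_with_semantic_types(
    ["https://metadata.datadrivendiscovery.org/types/CategoricalData"])
print(categorical)  # -> [0]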
Example no. 2
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:

        if self._fitted:
            return CallResult(None)

        if self._input_data is None:
            raise ValueError('Missing training (fitting) data.')

        # Look at attribute columns only
        # print('fit in', self._input_data.columns)
        data = self._input_data.copy()
        all_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = DataMetadata.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]

        self._empty_columns = []
        _logger.debug(f'Numeric columns: {numeric}')
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
                if pd.isnull(pd.to_numeric(data.iloc[:, element])).sum() == data.shape[0]:
                    _logger.debug(f'Empty numeric str column: {element}')
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i] and i not in self._empty_columns:
                _logger.debug(f'Empty numeric column: {i}')
                self._empty_columns.append(i)

        _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))

        data = container.DataFrame.remove_columns(data, self._empty_columns)

        categorical_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata,
                                                                        semantic_types=[
                                                                            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                                                                            "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
        self._cat_columns = data.columns[self._cat_col_index].tolist()

        _logger.debug('Encoding columns: {}'.format(self._cat_columns))

        mapping = {}
        for column_name in self._cat_columns:
            col = data[column_name]
            temp = self._trim_features(col, self.hyperparams['n_limit'])
            if temp:
                mapping[temp[0]] = temp[1]
        self._mapping = mapping
        self._fitted = True
        return CallResult(None, has_finished=True)
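The _trim_features helper is not shown on this page; judging from how its result is used above (mapping[temp[0]] = temp[1]), it returns a (column name, kept categories) pair or None. A hedged sketch of one possible implementation, not the original:

    @staticmethod
    def _trim_features(col, n_limit):
        # Hypothetical reconstruction: keep at most n_limit of the most frequent
        # categories of the column; return None when there is nothing worth encoding.
        counts = col.dropna().value_counts()
        if len(counts) <= 1:
            return None
        kept = counts.index[:n_limit].tolist()
        return col.name, kept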
Example no. 3
    def __get_fitted(self):
        attribute = DataMetadata.list_columns_with_semantic_types(
            self._train_x.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])

        # Mean for numerical columns

        self._numeric_columns = DataMetadata.list_columns_with_semantic_types(
            self._train_x.metadata,
            ['http://schema.org/Integer', 'http://schema.org/Float'])
        self._numeric_columns = [
            x for x in self._numeric_columns if x in attribute
        ]

        _logger.debug('numeric columns %s', str(self._numeric_columns))

        # Convert selected columns to_numeric, then compute column mean, then convert to_dict
        self.mean_values = self._train_x.iloc[:, self._numeric_columns].apply(
            lambda col: pd.to_numeric(col, errors='coerce')).mean(
                axis=0).to_dict()

        for name in self.mean_values.keys():
            if pd.isnull(self.mean_values[name]):
                self.mean_values[name] = 0.0

        # Mode for categorical columns
        self._categoric_columns = DataMetadata.list_columns_with_semantic_types(
            self._train_x.metadata, [
                'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                'http://schema.org/Boolean'
            ])
        self._categoric_columns = [
            x for x in self._categoric_columns if x in attribute
        ]

        _logger.debug('categorical columns %s', str(self._categoric_columns))

        mode_values = self._train_x.iloc[:, self._categoric_columns].mode(
            axis=0).iloc[0].to_dict()
        for name in mode_values.keys():
            if pd.isnull(mode_values[name]):
                # mode is nan
                rest = self._train_x[name].dropna()
                if rest.shape[0] == 0:
                    # every value is nan
                    mode = 0
                else:
                    mode = rest.mode().iloc[0]
                mode_values[name] = mode
        self.mean_values.update(mode_values)

        if self._verbose:
            import pprint
            print('mean imputation:')
            pprint.pprint(self.mean_values)

        _logger.debug('Mean values:')
        for name, value in self.mean_values.items():
            _logger.debug('  %s %s', name, str(value))
Example no. 4
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        numerical_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=["http://schema.org/Float", "http://schema.org/Integer"])

        all_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self._training_data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        self._s_cols = list(set(all_attributes).intersection(numerical_attributes))
        # print(" %d columns scaled" % (len(self._s_cols)))
        if len(self._s_cols) > 0:
            self._model.fit(self._training_data.iloc[:, self._s_cols])
            self._fitted = True
        else:
            self._fitted = False
        return CallResult(None, has_finished=True, iterations_done=1)
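The matching produce() is not included here; a hedged sketch of what it would typically look like for a scaler primitive of this shape, reusing self._model, self._s_cols and self._fitted from the fit() above, with everything else assumed:

    def produce(self, *, inputs: Input, timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        # Hypothetical counterpart to the fit() above: apply the fitted scaler to
        # the same numeric attribute columns and leave all other columns untouched.
        outputs = inputs.copy()
        if self._fitted and len(self._s_cols) > 0:
            outputs.iloc[:, self._s_cols] = self._model.transform(
                inputs.iloc[:, self._s_cols])
        return CallResult(outputs, has_finished=True, iterations_done=1)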
Example no. 5
    def set_training_data(self, *, inputs: Input) -> None:
        """
        Sets training data of this primitive.

        Parameters
        ----------
        inputs : Input
            The inputs.
        """
        attribute = DataMetadata.list_columns_with_semantic_types(
            inputs.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        nan_sum = 0
        for col in attribute:
            if str(inputs.dtypes[inputs.columns[col]]) != "object":
                nan_sum += inputs.iloc[:, col].isnull().sum()
            else:
                for i in range(inputs.shape[0]):
                    if inputs.iloc[i, col] == "" or pd.isnull(
                            inputs.iloc[i, col]):
                        nan_sum += 1
        if nan_sum == 0:  # no missing value exists
            if self._verbose:
                _logger.info('no missing value in train dataset')

        self._train_x = inputs
        self._is_fitted = False
Example no. 6
    def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                              res_id: str) -> typing.Optional[int]:
        indices = inputs_metadata.list_columns_with_semantic_types(
            cls._semantic_types, at=(res_id, ))
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, res_id, i):
                return i
        return None
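The at=(res_id,) argument scopes the listing to a single resource when the metadata describes a whole Dataset rather than a single DataFrame. A minimal sketch, assuming a d3m container Dataset named dataset whose main table uses the conventional 'learningData' resource id:

file_name_columns = dataset.metadata.list_columns_with_semantic_types(
    ["https://metadata.datadrivendiscovery.org/types/FileName"],
    at=("learningData",))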
Example no. 7
    def _get_date_cols(data):
        dates = DataMetadata.list_columns_with_semantic_types(
            data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Time"
            ])

        return dates
Example no. 8
    def _get_floatvector_column(self,
                                inputs_metadata: metadata_base.DataMetadata):
        fv_column = self.hyperparams["column"]
        if fv_column:
            return fv_column
        fv_columns = inputs_metadata.list_columns_with_semantic_types(
            self._floatvector_semantic)
        if len(fv_columns) > 0:
            return fv_columns[0]
        logger.warning(
            "inputs provided contain no specified FloatVector column and lack columns with the FloatVector semantic type"
        )
        return None
Example no. 9
    def _split_column(self, inputs):
        """
            Inner function to sample a subset of the columns of the input dataset
        """
        input_dataset_shape = inputs[self._main_resource_id].shape
        # find the target columns; we should not split these columns
        target_column = DataMetadata.list_columns_with_semantic_types(
            self._training_inputs.metadata,
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
            at=(self._main_resource_id, ))
        if not target_column:
            self._logger.warn("No target column found from the input dataset.")
        index_column = DataMetadata.get_index_columns(
            self._training_inputs.metadata, at=(self._main_resource_id, ))
        if not index_column:
            self._logger.warn("No index column found from the input dataset.")

        outputs = copy.copy(inputs)
        if self._status is Status.TRAIN:
            # check the number of attribute columns again:
            # we only need to sample when it is larger than the threshold
            attribute_column_length = (input_dataset_shape[1] -
                                       len(index_column) - len(target_column))
            if attribute_column_length > self._threshold_column_length:
                attribute_column = set(range(input_dataset_shape[1]))
                for each_target_column in target_column:
                    attribute_column.remove(each_target_column)
                for each_index_column in index_column:
                    attribute_column.remove(each_index_column)

                # randomly choose the column indices to keep and sort them
                self._column_remained = random.sample(
                    attribute_column, self._threshold_column_length)
                self._column_remained.extend(target_column)
                self._column_remained.extend(index_column)
                self._column_remained.sort()

        if len(self._column_remained) > 0:
            # Just to make sure.
            outputs.metadata = copy.deepcopy(inputs.metadata)
            outputs[self._main_resource_id] = inputs[
                self._main_resource_id].iloc[:, self._column_remained]
            outputs.metadata = self._select_columns_metadata(
                outputs.metadata, self._main_resource_id,
                self._column_remained)

        return outputs
Example no. 10
    def _process_files(self, inputs: Input):
        fn_attributes = DataMetadata.list_columns_with_semantic_types(self=inputs.metadata, \
            semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(self=inputs.metadata, \
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        fn_columns = list(set(all_attributes).intersection(fn_attributes))

        # if no file name columns are detected, default to regular behavior
        if len(fn_columns) == 0:
            return inputs

        # create an empty DataFrame of the required size
        processed_cols = pd.DataFrame("", index = copy.deepcopy(inputs.index), \
            columns = ['text_files_' + str(i) for i in range(len(fn_columns))])

        # for column_index in range(len(fn_columns)):
        for column_index in fn_columns:
            curr_column = copy.deepcopy(inputs.iloc[:, column_index])

            file_loc = inputs.metadata.query(
                (ALL_ELEMENTS, column_index))['location_base_uris']
            file_loc = file_loc[0]  # take the first elem of the tuple
            file_loc = file_loc[7:]  # get rid of 'file://' prefix

            for row_index in range(curr_column.shape[0]):
                text_file = curr_column.iloc[row_index]
                file_path = file_loc + text_file

                with open(file_path, 'rb') as file:
                    doc = file.read()
                doc = "".join(map(chr, doc))
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(
                    doc)  # list of strings

                processed_cols.iloc[row_index,
                                    fn_columns.index(column_index)] = " ".join(
                                        doc_tokens)

        # construct metadata for the newly generated columns
        processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)

        for column_index in range(processed_cols.shape[1]):
            col_dict = dict(
                processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type("text")
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'processed_file_' + str(inputs.shape[1] +
                                                       column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Text',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            processed_cols.metadata = processed_cols.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate the input with the newly created columns
        updated_inputs = utils.append_columns(inputs, processed_cols)

        # remove the initial FileName columns from the df; if we did this before concatenating we might get an empty dataset error
        updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

        return updated_inputs
Example no. 11
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        # if already fitted, do nothing
        if self.fitted:
            return CallResult(None, True, 1)

        self.training_data = self._process_files(self.training_data)

        text_attributes = DataMetadata.list_columns_with_semantic_types(self=self.training_data.metadata,\
            semantic_types=["http://schema.org/Text"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(self=self.training_data.metadata,\
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        categorical_attributes = DataMetadata.list_columns_with_semantic_types(self=self.training_data.metadata,\
            semantic_types=["https://metadata.datadrivendiscovery.org/types/CategoricalData"])

        # want text columns that are attributes
        self.text_columns = set(all_attributes).intersection(text_attributes)

        # but, don't want to edit categorical columns
        self.text_columns = set(
            self.text_columns) - set(categorical_attributes)

        # and, we want the text columns as a list
        self.text_columns = list(self.text_columns)

        # if no text columns are present don't do anything
        self.do_nothing = False
        if len(self.text_columns) == 0:
            self.fitted = True

            self.model = None
            self.bow = None
            self.do_nothing = True
            self.text_columns = None
            self.latent_factors = None
            self.max_iter = None

            return CallResult(None, True, 1)

        # instantiate a corex model and a bag of words model
        self.model = Corex(n_hidden=self.hyperparams['n_hidden'],
                           max_iter=iterations,
                           seed=self.random_seed)
        self.bow = TfidfVectorizer(decode_error='ignore',
                                   max_df=self.hyperparams['max_df'],
                                   min_df=self.hyperparams['min_df'])

        # set the number of iterations (for wrapper and underlying Corex model)
        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(
                    self.training_data.iloc[:, column_index], sep=" ")
            else:
                concat_cols = copy.deepcopy(
                    self.training_data.iloc[:, column_index])

        try:
            bow = self.bow.fit_transform(
                map(self._get_ngrams, concat_cols.ravel()))
        except ValueError:
            self.bow = TfidfVectorizer(decode_error='ignore',
                                       max_df=self.hyperparams['max_df'],
                                       min_df=0)
            bow = self.bow.fit_transform(
                map(self._get_ngrams, concat_cols.ravel()))

            print("[WARNING] Setting min_df to 0 to avoid ValueError")

        # choose between CorEx and the TfIdf matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.fit_transform(bow)
        else:
            # just use the bag of words representation
            self.latent_factors = pd.DataFrame(bow.todense())

        self.fitted = True

        return CallResult(None, True, 1)
Example no. 12
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Train the imputation parameters. Currently supported:
        -> greedySearch

        For methods that are not trainable, do nothing:
        -> iterative regression
        -> other

        Parameters:
        ----------
        data: pandas dataframe
        label: pandas series, used for the trainable methods
        """

        # if already fitted on current dataset, do nothing
        if self._is_fitted:
            return CallResult(None, self._has_finished, self._iterations_done)

        if (timeout is None):
            timeout = 2**31 - 1
        if (iterations is None):
            self._iterations_done = True
            iterations = 30
        # import pdb
        # pdb.set_trace()
        # setup the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

            data = self._train_x.copy()

            # start fitting
            if self._verbose:
                _logger.info("=========> iteratively regress method:")

            attribute = DataMetadata.list_columns_with_semantic_types(
                data.metadata,
                ['https://metadata.datadrivendiscovery.org/types/Attribute'])
            numeric = DataMetadata.list_columns_with_semantic_types(
                data.metadata,
                ['http://schema.org/Integer', 'http://schema.org/Float'])
            numeric = [x for x in numeric if x in attribute]
            numeric_data = data.iloc[:, numeric].apply(
                lambda col: pd.to_numeric(col, errors='coerce'))

            data_clean, self._best_imputation = self.__iterativeRegress(
                numeric_data, iterations)
            self._numeric_column_indices = numeric

        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._is_fitted = True
            self._iterations_done = True
            self._has_finished = True
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            self._is_fitted = False
            self._iterations_done = False
            self._has_finished = False
        return CallResult(None, self._has_finished, self._iterations_done)
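The stopit-based timeout handling in the fit() above (and in the produce() of Example no. 14 below) can be reduced to a small reusable pattern. A sketch assuming the stopit package; run_with_timeout and the work callable are hypothetical names, not part of these primitives:

import stopit

def run_with_timeout(work, timeout_seconds):
    # Run work() under a thread-based timeout and report whether it finished.
    result = None
    with stopit.ThreadingTimeout(timeout_seconds) as ctx:
        assert ctx.state == ctx.EXECUTING
        result = work()
    if ctx.state == ctx.TIMED_OUT:
        return None, False   # interrupted before completion
    return result, True      # ctx.state == ctx.EXECUTED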
Example no. 13
    def _get_band_column(self, inputs_metadata: metadata_base.DataMetadata):
        return inputs_metadata.list_columns_with_semantic_types(
            self._band_semantic_types)
Example no. 14
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        Precondition: fit() must have been called.

        Parameters:
        ----------
        data: pandas dataframe
        """

        if (not self._is_fitted):
            # todo: specify a NotFittedError, like in sklearn
            raise ValueError("Calling produce before fitting.")

        # if (pd.isnull(inputs).sum().sum() == 0):    # no missing value exists
        #     if self._verbose: print ("Warning: no missing value in test dataset")
        #     self._has_finished = True
        #     return CallResult(inputs, self._has_finished, self._iterations_done)

        if (timeout is None):
            timeout = 2**31 - 1

        if isinstance(inputs, pd.DataFrame):
            data = inputs.copy()
        else:
            data = inputs[0].copy()

        # setup the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

            # start completing data...
            if self._verbose:
                print("=========> impute by mean value of the attribute:")

            data.iloc[:, self._numeric_columns] = data.iloc[:, self._numeric_columns].apply(
                lambda col: pd.to_numeric(col, errors='coerce'))

            # assume the features of testing data are same with the training data
            # therefore, only use the mean_values to impute, should get a clean dataset
            attribute = DataMetadata.list_columns_with_semantic_types(
                data.metadata,
                ['https://metadata.datadrivendiscovery.org/types/Attribute'])

            # try:
            for col in attribute:
                if str(inputs.dtypes[inputs.columns[col]]) != "object":
                    if data.iloc[:, col].isnull().sum() != 0:
                        data.iloc[:, col].fillna(
                            self.mean_values[data.columns[col]], inplace=True)
                else:
                    for i in range(data.shape[0]):
                        if data.iloc[i, col] == "" or pd.isnull(
                                data.iloc[i, col]):
                            data.iloc[i, col] = self.mean_values[
                                data.columns[col]]
            # except:
            #     import pdb
            #     pdb.set_trace()
            data_clean = data

            # Update metadata
            for col in self._numeric_columns:
                old_metadata = dict(
                    data_clean.metadata.query((mbase.ALL_ELEMENTS, col)))
                dtype = data_clean.iloc[:, col].dtype
                if str(dtype).lower().startswith("int"):
                    if "http://schema.org/Integer" not in old_metadata[
                            'semantic_types']:
                        old_metadata['semantic_types'] += (
                            "http://schema.org/Integer", )
                    old_metadata["structural_type"] = type(10)
                elif str(dtype).lower().startswith("float"):
                    if "http://schema.org/Float" not in old_metadata[
                            'semantic_types']:
                        old_metadata['semantic_types'] += (
                            "http://schema.org/Float", )
                    old_metadata["structural_type"] = type(10.2)

                data_clean.metadata = data_clean.metadata.update(
                    (mbase.ALL_ELEMENTS, col), old_metadata)

        value = None
        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._has_finished = True
            self._iterations_done = True
            value = data_clean
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            _logger.warn('Produce timed out')
            self._has_finished = False
            self._iterations_done = False
        return CallResult(value, self._has_finished, self._iterations_done)