Code Example #1
File: Labler.py Project: cybergla/dsbox-cleaning
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, True, 1)
        # apply the per-column encoders stored in self._d
        temp = pd.DataFrame(self._training_data.iloc[:, self._s_cols].apply(
            lambda x: self._d[x.name].transform(x)))
        outputs = self._training_data.copy()

        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]

        lookup = {
            "int": ('http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
        }

        #new_dtype = temp.dtypes

        for index in self._s_cols:
            old_metadata = dict(
                outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update(
                (mbase.ALL_ELEMENTS, index), old_metadata)

        if outputs.shape == inputs.shape:
            print("output:", outputs.head(5))
            return CallResult(d3m_DataFrame(outputs), True, 1)
        else:
            return CallResult(inputs, True, 1)
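
For context, here is a minimal plain-pandas sketch of the per-column encoding pattern used above; the toy frame, its column names, and the encoders dict (standing in for self._d) are hypothetical:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# one fitted LabelEncoder per column, mirroring the self._d[x.name] lookup
df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
encoders = {col: LabelEncoder().fit(df[col]) for col in df.columns}
encoded = df.apply(lambda x: encoders[x.name].transform(x))
print(encoded)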
Code Example #2
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:

        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        if len(self._training_outputs.shape) == 1:
            self._training_outputs = np.expand_dims(self._training_outputs,
                                                    axis=1)
        binaryoutputs = self.__map_labels_to_binary(self._training_outputs)

        (self._weights, _) = tm_fit(self._training_inputs,
                                    binaryoutputs,
                                    'bc',
                                    self.hyperparams['r'],
                                    self.hyperparams['q'],
                                    self.hyperparams['gamma'],
                                    self.hyperparams['solver'],
                                    self.hyperparams['epochs'],
                                    self.hyperparams['alpha'],
                                    seed=self._seed)

        self._fitted = True

        return CallResult(None)
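
The __map_labels_to_binary helper is not shown above. A hypothetical sketch of what such a mapping typically does for a two-class target (the function name and the -1/+1 convention are assumptions):

import numpy as np

def map_labels_to_binary(y: np.ndarray) -> np.ndarray:
    # map the two observed class labels to -1/+1, the encoding many
    # binary-classification solvers expect
    classes = np.unique(y)
    assert len(classes) == 2, "expected exactly two class labels"
    return np.where(y == classes[0], -1, 1)

print(map_labels_to_binary(np.array(["cat", "dog", "cat"])))  # [-1  1 -1]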
Code Example #3
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Arguments
            - inputs: List( # Data
                         List( # Segments
                            [ deg, num_feats ], ...
                         )
                       ),
        """
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None:
            raise Exception('Missing training data')

        with stopit.ThreadingTimeout(timeout) as timer:
            inputs_curve_fitting = self._training_inputs
            num_data = sum([len(x) for x in inputs_curve_fitting])  # number of segments; each segment is formed by multiple data samples
            deg, num_feats = inputs_curve_fitting[0][0].shape
            betas = np.vstack([
                        np.array([
                            segment.flatten() for segment in cinput
                        ]) for cinput in inputs_curve_fitting if len(cinput) > 0
                    ])

            self._model.fit(betas)
            self._fitted = True

        if timer.state == timer.EXECUTED:
            return CallResult(None)
        else:
            raise TimeoutError('ClusterCurveFittingKMeans exceeded time limit')
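
The timeout idiom above comes from the third-party stopit package: run the work inside a ThreadingTimeout context manager, then inspect its state afterwards. A standalone sketch, where the 2-second limit and the sleep stand in for real fitting work:

import time
import stopit

with stopit.ThreadingTimeout(2.0) as timer:
    time.sleep(1.0)  # stand-in for the actual fitting computation

if timer.state == timer.EXECUTED:
    print("finished within the time limit")
else:
    raise TimeoutError("work exceeded the time limit")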
Code Example #4
    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        """
        precond: run fit() before

        to complete the data, based on the learned parameters, support:
        -> greedy search

        also support the untrainable methods:
        -> iteratively regression
        -> other

        Parameters:
        ----------
        data: pandas dataframe
        label: pandas series, used for the evaluation of imputation

        TODO:
        ----------
        1. add evaluation part for __simpleImpute()

        """

        if (not self._is_fitted):
            # todo: specify a NotFittedError, like in sklearn
            raise ValueError("Calling produce before fitting.")
        if (pd.isnull(inputs).sum().sum() == 0):    # no missing value exists
            if self._verbose: print ("Warning: no missing value in test dataset")
            self._has_finished = True
            return CallResult(inputs, self._has_finished, self._iterations_done)

        if (timeout is None):
            timeout = math.inf
        if (iterations is None):
            self._iterations_done = True
            iterations = 30  # default; only used by the iterative regression method

        data = inputs.copy()
        # record keys:
        keys = data.keys()
        index = data.index

        # setup the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

            # start completing data...
            if self._verbose: print("=========> iteratively regress method:")
            data_clean = self.__regressImpute(data, self._best_imputation, iterations)

        value = None
        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._is_fitted = True
            self._has_finished = True
            value = pd.DataFrame(data_clean, index, keys)
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            print ("Timed Out...")
            self._is_fitted = False
            self._has_finished = False
            self._iterations_done = False
        return CallResult(value, self._has_finished, self._iterations_done)
Code Example #5
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        if (timeout is None):
            big_table = self._core(inputs)
            self._has_finished = True
            self._iterations_done = True
            return CallResult(big_table, self._has_finished,
                              self._iterations_done)
        else:
            # setup the timeout
            with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
                assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

                # core computations
                big_table = self._core(inputs)

            if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
                self._has_finished = True
                self._iterations_done = True
                return CallResult(big_table, self._has_finished,
                                  self._iterations_done)
            elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
                self._has_finished = False
                self._iterations_done = False
                return CallResult(None, self._has_finished,
                                  self._iterations_done)
Code Example #6
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Learns the kernel regression coefficients alpha given training pairs (X,y)
        """
        if self._fitted:
            return CallResult(None)

        if self._Xtrain is None or self._ytrain is None:
            raise ValueError("Missing training data.")

        self._U = generateGaussianPreconditioner(self._Xtrain,
                                                 self.hyperparams['sigma'],
                                                 self.hyperparams['lparam'])

        def mykernel(X, Y):
            return GaussianKernel(X, Y, self.hyperparams['sigma'])

        self._coeffs = PCGfit(self._Xtrain, self._ytrain, mykernel, self._U,
                              self.hyperparams['lparam'],
                              self.hyperparams['eps'],
                              self.hyperparams['maxIters'])
        self._fitted = True

        return CallResult(None)
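
For reference, a sketch of what a Gaussian (RBF) kernel like the mykernel closure computes; the exact scaling convention of this project's GaussianKernel may differ:

import numpy as np

def gaussian_kernel(X: np.ndarray, Y: np.ndarray, sigma: float) -> np.ndarray:
    # K[i, j] = exp(-||X[i] - Y[j]||^2 / (2 * sigma^2))
    sq_dists = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-sq_dists / (2.0 * sigma ** 2))

X = np.random.rand(5, 3)
print(gaussian_kernel(X, X, sigma=1.0).shape)  # (5, 5)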
Code Example #7
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, True, 1)
        temp = pd.DataFrame(self._model.transform(inputs.iloc[:, self._s_cols]))
        outputs = self._training_data.copy()
        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]

        new_dtype = temp.dtypes
        lookup = {"float": ('http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'),
                  "int": ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')}

        for d, index in zip(new_dtype, self._s_cols):
            print("old metadata : ", outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            if np.issubdtype(d, np.floating):  # covers float16/32/64/128
                old_metadata["semantic_types"] = lookup["float"]
                old_metadata["structural_type"] = type(10.0)
            else:
                old_metadata["semantic_types"] = lookup["int"]
                old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index),old_metadata)
            print("updated dict : ",old_metadata)
            print("check again : ", outputs.metadata.query((mbase.ALL_ELEMENTS, index)))

        if outputs.shape == inputs.shape:
            return CallResult(d3m_DataFrame(outputs), True, 1)
        else:
            return CallResult(inputs, True, 1)
Code Example #8
File: SKTruncatedSVD.py Project: zwbjtu123/tods
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit model with training data.
        Args:
            *: Container DataFrame. Time series data up to fit.

        Returns:
            None
        """
        if self._fitted:
            return CallResult(None)

        # Get the columns to fit.
        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)

        # If there are no columns to fit, return early.
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns

        # Call SVD in sklearn and set _fitted to true
        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        return CallResult(None)
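
For context, a minimal sklearn TruncatedSVD round trip of the kind self._clf presumably wraps here; the toy matrix and component count are illustrative:

import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.rand(10, 5)
svd = TruncatedSVD(n_components=2)
svd.fit(X)                     # what the fit() above delegates to
print(svd.transform(X).shape)  # (10, 2)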
Code Example #9
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError('Missing training data.')

        ## impute missing values
        self._Imputer.fit(self._training_inputs)
        self._training_inputs = self._Imputer.transform(self._training_inputs)

        ## discretize non-categorical values
        disc_training_inputs = self._training_inputs.copy()  # copy so the original inputs are not mutated in place
        if not len(np.where(self._cate_flag == 0)[0]) == 0:
            self._Kbins.fit(self._training_inputs[:, np.where(self._cate_flag == 0)[0]]) #find non-categorical values
            temp = self._Kbins.transform(self._training_inputs[:, np.where(self._cate_flag == 0)[0]])
            disc_training_inputs[:, np.where(self._cate_flag == 0)[0]] = temp
        # starting from zero

        ## get number of states for each feature and remove features with only one state
        discTrainset = RelationSet(disc_training_inputs, self._training_outputs.reshape(-1,1))
        discTrainset.getStateNo(self._cate_flag, self._nbins)
        discTrainset.remove()
        X_train = discTrainset.data
        Y_train = discTrainset.labels
        
        self._discTrainset = discTrainset
        stateNo = np.append(discTrainset.NUM_STATES, len(np.unique(Y_train)))

        ## fit the classifier
        self._clf.fit(X_train, Y_train, stateNo)
        self._fitted = True

        return CallResult(None)
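
A standalone sketch of the discretization step: self._Kbins appears to be an sklearn KBinsDiscretizer, which maps continuous columns to integer bin labels starting from zero. The data, bin count, and strategy below are illustrative:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.random.rand(20, 2)
kbins = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
binned = kbins.fit_transform(X)  # bin indices, starting from zero
print(binned[:3])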
Code Example #10
File: sdne.py Project: vivian0111/dsbox_graphs
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:

        if self.fitted:
            return CallResult(None, True, 1)

        args = {}
        args['nu1'] = 1e-6
        args['nu2'] = 1e-6
        args['K'] = self.hyperparams['depth']
        args['n_units'] = [
            500,
            300,
        ]
        args['rho'] = 0.3
        args['n_iter'] = self.hyperparams['epochs']
        args['xeta'] = self.hyperparams['lr']  #0.0005
        args['n_batch'] = 100  #500
        self._args = args

        dim = self.hyperparams['dimension']
        alpha = self.hyperparams['alpha']
        beta = self.hyperparams['beta']
        #self._model = sdne.SDNE(d = dim,
        self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)
        #self._model.learn_embedding(graph = self.training_data)
        self._sdne.learn_embedding(graph=self.training_data)
        self._model = self._sdne._model

        make_keras_pickleable()
        self.fitted = True
        return CallResult(None, True, 1)
Code Example #11
File: ccfsReg.py Project: plai-group/ubc_primitives
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        """
        Inputs: ndarray of features
        Returns: None
        """
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise exceptions.InvalidStateError("Missing training data.")
        self._new_training_data = False

        XTrain, _ = self._select_inputs_columns(self._training_inputs)
        YTrain, _ = self._select_outputs_columns(self._training_outputs)

        # Fit data
        CCF = genCCF(XTrain,
                     YTrain,
                     nTrees=self.optionsClassCCF['nTrees'],
                     bReg=True,
                     optionsFor=self.optionsClassCCF,
                     do_parallel=self.optionsClassCCF['parallelprocessing'])

        self._CCF = CCF
        self._fitted = True

        return CallResult(None)
Code Example #12
File: FastLADSolver.py Project: gittea-rpi/realML
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Learns the LAD regression coefficients alpha given training pairs (X,y)
        """
        if self._fitted:
            return CallResult(None)

        if self._Xtrain is None or self._ytrain is None:
            raise ValueError("Missing training data.")

        stoppingTol = self.hyperparams['eps'] * norm(
            self._ytrain, 1) / (np.sqrt(self._n) * norm(self._Xtrain))
        r = self.hyperparams['coresetmultiplier'] * self._d

        if r < self._n:
            self._U = generateWellConditionedBasis(
                np.concatenate((self._Xtrain, self._ytrain), axis=1), r)
            self._coeffs = coresetLAD(self._Xtrain, self._ytrain, self._U, r,
                                      stoppingTol,
                                      self.hyperparams['maxIters'])
        else:
            #print("coreset size is larger than number of examples, so solving the full LAD problem --- you may want to lower the coresetmultiplier parameter")
            self._coeffs = LAD(self._Xtrain, self._ytrain, stoppingTol,
                               self.hyperparams['maxIters'])

        self._fitted = True

        return CallResult(None)
Code Example #13
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None:
            raise d3m_exceptions.MissingValueError(
                'set_training_data must be called before fit')

        # operate on columns by index, not name
        for i, (col_name, col) in enumerate(self._training_inputs.items()):
            drop_col = False
            if self.hyperparams['drop_missing_values']:
                if self.hyperparams['how'] == 'all' and col.isnull().all():
                    drop_col = True
                elif self.hyperparams['how'] == 'any' and col.isnull().any():
                    drop_col = True
            self._drop_cols.append(drop_col)
            if drop_col:
                self._drop_col_indices.append(i)

            col_known_values = None
            if not drop_col:
                col_known_values = col.dropna(axis=0, how='any').tolist()
            self._known_values.append(col_known_values)

        self._fitted = True
        self._training_inputs = None  # free memory

        return CallResult(None)
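
The drop logic above mirrors pandas' own dropna semantics for columns; a toy illustration of the 'all' versus 'any' hyperparameter:

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, 3.0],
                   "b": [None, None, None],
                   "c": [4, 5, 6]})
print(df.dropna(axis=1, how="all").columns.tolist())  # ['a', 'c']
print(df.dropna(axis=1, how="any").columns.tolist())  # ['c']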
Code Example #14
    def produce(self,
                *,
                inputs1: Inputs1,
                inputs2: Inputs2,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        status = self._import_module()
        if status == 0:
            print("not a valid  url")
            return CallResult(DataFrame())
        if status == 1:  # run isi-datamart
            # sort the inputs list by best score
            inputs1.sort(key=lambda x: x.score, reverse=True)
            # choose the best one? maybe more, determined by hyperparams
            res_df = ISI_datamart.augment(
                original_data=inputs2,
                augment_data=inputs1[
                    self.hyperparams["n_index"]])  # a pd.dataframe

            # join with inputs2

            # updating "attribute columns", "datatype" from datamart.Dataset
        else:  # run nyu-datamart
            inputs1.sort(key=lambda x: x.score, reverse=True)
            res_df = NYU_datamart.augment(
                data=inputs2,
                augment_data=inputs1[self.hyperparams["n_index"]])

        self._has_finished = True
        self._iterations_done = True
        return CallResult(res_df)
Code Example #15
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : Input pandas frame

        Returns
        -------
        Outputs : A frame structurally identical to the input frame, with each feature
            cleaned according to its type (e.g. all date objects will be modified to be
            of a common structure)
        """
        
        string_cleaner = CleanStrings()
        number_cleaner = CleanNumbers()
        date_cleaner = CleanDates()

        def dtype_apply(series):
            if series.dtype in ['int64', 'float64']:
                return number_cleaner.clean_numbers(series)
            elif series.dtype in ['object']:
                return string_cleaner.clean_strings(series)
            elif 'datetime' in str(series.dtype):
                return date_cleaner.clean_dates(series)
            else: 
                return series

        try:
            return CallResult(inputs.apply(dtype_apply))
        except Exception:
            # fall back to the unmodified inputs if any cleaner fails
            return CallResult(inputs)
Code Example #16
File: SKStandardScaler.py Project: zwbjtu123/tods
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:

        """
        Fit model with training data.
        Args:
            *: Container DataFrame. Time series data up to fit.

        Returns:
            None
        """

        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)

        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            # fit_transform is called for its fitting side effect; the
            # transformed values are discarded here
            self._clf.fit_transform(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        # print(self._training_inputs.std())

        return CallResult(None)
Code Example #17
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._inputs is None or self._outputs is None:
            raise ValueError("Missing training data.")

        if not self._new_training_data:
            return CallResult(None)
        self._new_training_data = False

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns.astype(str)

        if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
            self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
            sk_training_output = self._training_outputs.values

            shape = sk_training_output.shape
            if len(shape) == 2 and shape[1] == 1:
                sk_training_output = numpy.ravel(sk_training_output)

            self._clf.fit(self._training_inputs, sk_training_output)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)
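
Why the ravel above: most sklearn estimators expect a 1-D target array, so an (n, 1) column vector is flattened before fitting. A toy illustration:

import numpy as np

y = np.array([[1], [0], [1]])  # shape (3, 1)
print(np.ravel(y))             # [1 0 1], shape (3,)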
Code Example #18
File: labler.py Project: byu-dml/dsbox-primitives
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            raise exceptions.PrimitiveNotFittedError('Labeler not fitted')

        if len(self._s_cols) == 0:
            # No categorical columns. Nothing to do.
            return CallResult(inputs, True)

        # Generate label encoding
        columns = []
        for col_index in self._s_cols:
            size = self._model[col_index].size
            mapping = {x: i for i, x in enumerate(self._model[col_index])}
            columns.append(inputs.iloc[:, col_index].apply(lambda x: mapping[x] if x in mapping else size))

        # insert encoded columns
        outputs = inputs.copy()
        for col, index in enumerate(self._s_cols):
            outputs.iloc[:, index] = columns[col]

        lookup = {
            "int": ('http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
        }

        for index in self._s_cols:
            old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

        self._has_finished = True
        return CallResult(outputs, self._has_finished)
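
The dictionary lookup above deliberately sends categories unseen at fit time to size, one past the largest trained code, so produce() never raises on new data. A toy version of the same idea:

mapping = {"red": 0, "blue": 1}  # codes learned at fit time
size = len(mapping)              # fallback code for unseen categories
values = ["red", "green", "blue"]
print([mapping.get(v, size) for v in values])  # [0, 2, 1]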
Code Example #19
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        # only collapse when d3mIndex exists and contains duplicates
        if 'd3mIndex' in list(inputs.columns.values) and len(
                set(inputs.loc[:, 'd3mIndex'].tolist())) < inputs.shape[0]:
            data_dict = dict()
            indices = list()
            for row in range(inputs.shape[0]):
                idx = inputs.iloc[row, :]['d3mIndex']
                if idx not in data_dict:
                    data_dict[idx] = list()
                    indices.append(idx)
                data_dict[idx].append(
                    inputs.iloc[row, :].drop('d3mIndex').tolist())
            for key in data_dict:
                data_dict[key] = self._get_target(data_dict[key])
            new_df = inputs[0:0].drop('d3mIndex', axis=1)
            for idx in indices:
                new_df.loc[idx] = data_dict[idx]
            old_metadata = dict(new_df.metadata.query(()))
            old_metadata["dimension"] = dict(old_metadata["dimension"])
            old_metadata["dimension"]["length"] = new_df.shape[0]
            new_df.metadata = new_df.metadata.update((), old_metadata)
            return CallResult(new_df, True, 1)
        return CallResult(inputs, True, 1)
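
The deduplication loop above collapses all rows sharing a d3mIndex into one target row via _get_target. A plain-pandas groupby expresses the same shape of computation; here mean() is a hypothetical stand-in for the real aggregation:

import pandas as pd

df = pd.DataFrame({"d3mIndex": [0, 0, 1], "x": [1.0, 3.0, 5.0]})
collapsed = df.groupby("d3mIndex", sort=False).mean()
print(collapsed)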
Code Example #20
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:

        if self._fitted:
            return CallResult(None)

        if self._input_data is None:
            raise ValueError('Missing training(fitting) data.')

        # Look at attribute columns only
        # print('fit in', self._input_data.columns)
        data = self._input_data.copy()
        all_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = DataMetadata.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]

        self._empty_columns = []
        _logger.debug(f'Numeric columns: {numeric}')
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
                if pd.isnull(pd.to_numeric(data.iloc[:, element])).sum() == data.shape[0]:
                    _logger.debug(f'Empty numeric str column: {element}')
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i] and i not in self._empty_columns:
                _logger.debug(f'Empty numeric column: {i}')
                self._empty_columns.append(i)

        _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))

        data = container.DataFrame.remove_columns(data, self._empty_columns)

        categorical_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata,
                                                                        semantic_types=[
                                                                            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                                                                            "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
        self._cat_columns = data.columns[self._cat_col_index].tolist()

        _logger.debug('Encoding columns: {}'.format(self._cat_columns))

        mapping = {}
        for column_name in self._cat_columns:
            col = data[column_name]
            temp = self._trim_features(col, self.hyperparams['n_limit'])
            if temp:
                mapping[temp[0]] = temp[1]
        self._mapping = mapping
        self._fitted = True
        return CallResult(None, has_finished=True)
Code Example #21
File: sdne.py Project: vivian0111/dsbox_graphs
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        #make_keras_pickleable()
        produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(
            inputs, return_all=True)
        if self.fitted:
            result = self._sdne._Y  # embedding learned during fit
        else:
            dim = self.hyperparams['dimension']
            alpha = self.hyperparams['alpha']
            beta = self.hyperparams['beta']
            # self._args holds the SDNE keyword arguments prepared in fit()
            self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **self._args)

            produce_data = networkx.from_scipy_sparse_matrix(produce_data)
            self._sdne.learn_embedding(graph=produce_data)
            self._model = self._sdne._model
            result = self._sdne._Y

        target_types = [
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        ]
        if self.hyperparams['return_list']:
            result_np = container.ndarray(result, generate_metadata=True)
            return_list = d3m_List([result_np, inputs[1], inputs[2]],
                                   generate_metadata=True)
            return CallResult(return_list, True, 1)
        else:
            learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
            learn_df = get_columns_not_of_type(learn_df, target_types)

            learn_df = learn_df.remove_columns(
                [learn_df.columns.get_loc('nodeID')])
            #learn_df = learn_df.drop('nodeID', axis = 'columns')

            result_df = d3m_DataFrame(result, generate_metadata=True)
            result_df = result_df.loc[result_df.index.isin(
                learning_df['d3mIndex'].values)]

            for column_index in range(result_df.shape[1]):
                col_dict = dict(
                    result_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = str(learn_df.shape[1] + column_index)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                result_df.metadata = result_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)
            result_df.index = learn_df.index.copy()

            output = utils.append_columns(learn_df, result_df)
            #output.set_index('d3mIndex', inplace=True)
            return CallResult(output, True, 1)
Code Example #22
File: unfold.py Project: byu-dml/dsbox-primitives
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        primary_key_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"]
        )

        unfold_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=self.hyperparams["unfold_semantic_types"]
        )

        if not primary_key_cols:
            warnings.warn("Did not find primary key column for grouping. Will not unfold")
            return CallResult(inputs)

        if not unfold_cols:
            warnings.warn("Did not find any column to unfold. Will not unfold")
            return CallResult(inputs)

        primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
        unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

        if self.hyperparams["use_pipeline_id_semantic_type"]:
            pipeline_id_cols = common_utils.list_columns_with_semantic_types(
                metadata=inputs.metadata,
                semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"]
            )

            if len(pipeline_id_cols) >= 2:
                warnings.warn("Multiple pipeline id columns found. Will use first.")

            if pipeline_id_cols:
                inputs = inputs.sort_values(primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
                self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
            else:
                warnings.warn(
                    "No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'")

        new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

        groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
            lambda x: container.List(x)).reset_index(drop=False)

        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata
        ret_df = self._update_metadata_dimension(df=ret_df)

        split_col_names = [inputs.columns[pos] for pos in unfold_cols]

        ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
        ret_df = common_utils.remove_columns(
            inputs=ret_df,
            column_indices=[ret_df.columns.get_loc(name) for name in split_col_names]
        )

        return CallResult(ret_df)
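
The core unfold step is the groupby that gathers each group's values into a list before _split_aggregated expands them into separate columns. A plain-pandas sketch with toy data:

import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "val": ["a", "b", "c"]})
grouped = df.groupby("key")["val"].aggregate(lambda x: list(x)).reset_index()
print(grouped)  # each 'val' cell now holds that group's values as a list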
Code Example #23
File: vertical_concat.py Project: RqS/dsbox-cleaning
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        new_df = pd.concat([x for x in inputs], ignore_index=self.hyperparams["ignore_index"])
        if self.hyperparams["sort_on_primary_key"]:
            primary_key_col = common_utils.list_columns_with_semantic_types(metadata=new_df.metadata, semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
            if not primary_key_col:
                warnings.warn("No PrimaryKey column found. Will not sort on PrimaryKey")
                return CallResult(self._update_metadata(new_df))
            new_df = new_df.sort_values([new_df.columns[pos] for pos in primary_key_col])
        return CallResult(self._update_metadata(new_df))
Code Example #24
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        # if self._training_data is None or self._y_dim==0:
        inputs_timeseries = inputs[1]
        inputs_d3mIndex = inputs[0]
        if not self._fitted:
            return CallResult(None, True, 0)
        if isinstance(inputs_timeseries, np.ndarray):
            X = np.zeros((inputs_timeseries.shape[0], self._y_dim))
        else:
            X = np.zeros((len(inputs_timeseries), self._y_dim))

        for i, series in enumerate(inputs_timeseries):
            if series.shape[1] > 1 and not self._value_found:
                series_output = pd.DataFrame()
                for j in range(series.shape[1]):
                    series_output = pd.concat(
                        [series_output, series.iloc[:, j]])
            else:
                series_output = series
            if series_output.shape[0] < self._y_dim:
                # shorter than the target length: left-align, leaving zero padding
                X[i, :series_output.shape[0]] = series_output.iloc[
                    :series_output.shape[0], self._value_dimension]
            else:
                # long enough: truncate to fit
                X[i, :] = series_output.iloc[:self._y_dim,
                                             self._value_dimension]

        # save the result to DataFrame format
        output_ndarray = self._model.transform(X)
        output_dataFrame = container.DataFrame(
            container.ndarray(output_ndarray))

        if self.hyperparams["generate_metadata"]:
            # add metadata if required
            for each_column in range(output_ndarray.shape[1]):
                metadata_selector = (mbase.ALL_ELEMENTS, each_column)
                metadata_each_column = {
                    'semantic_types':
                    ('https://metadata.datadrivendiscovery.org/types/TabularColumn',
                     'https://metadata.datadrivendiscovery.org/types/Attribute'
                     )
                }
                output_dataFrame.metadata = output_dataFrame.metadata.update(
                    metadata=metadata_each_column, selector=metadata_selector)

        # update the original index to be d3mIndex
        output_dataFrame = output_dataFrame.set_index(inputs_d3mIndex)
        return CallResult(output_dataFrame, True, 1)
Code Example #25
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:

        if self._fitted:
            return CallResult(None)

        self._fit_and_return_result(timeout=timeout, iterations=iterations)

        return CallResult(None)
Code Example #26
File: ensemble_voting.py Project: RqS/dsbox-cleaning
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        index_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
            ])
        if not index_col:
            warnings.warn(
                "Did not find primary key column. Cannot vote; returning input unchanged")
            return CallResult(inputs)

        predict_target_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget"
            ])
        if not predict_target_col:
            warnings.warn(
                "Did not find PredictedTarget column. Cannot vote; returning input unchanged"
            )
            return CallResult(inputs)

        df = inputs.copy()
        new_df = self._get_index_and_target_df(inputs=df,
                                               use_cols=index_col +
                                               predict_target_col)

        if self.hyperparams["ensemble_method"] == 'majority':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'max':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).max().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'min':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).min().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        return CallResult(self._update_metadata(df=ret_df))
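
A toy illustration of the 'majority' branch: value_counts() sorts by frequency, so its first index is each group's most common prediction (ties are resolved arbitrarily):

import pandas as pd

df = pd.DataFrame({"d3mIndex": [0, 0, 0, 1, 1],
                   "pred": ["a", "a", "b", "c", "c"]})
majority = df.groupby("d3mIndex").agg(lambda x: x.value_counts().index[0])
print(majority)  # index 0 -> 'a', index 1 -> 'c'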
Code Example #27
File: greedy.py Project: liangmuxin/dsbox-cleaning
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        train imputation parameters. Now support:
        -> greedySearch

        for the method that not trainable, do nothing:
        -> interatively regression
        -> other

        Parameters:
        ----------
        data: pandas dataframe
        label: pandas series, used for the trainable methods
        """
        # if already fitted on current dataset, do nothing
        if self._is_fitted:
            return CallResult(None, self._has_finished, self._iterations_done)

        if (timeout is None):
            timeout = 2**31 - 1

        # setup the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

            if isinstance(self._train_x, pd.DataFrame):
                data = self._train_x.copy()
                label = self._train_y.copy()
            else:
                data = self._train_x[0].copy()
                label = self._train_y[0].copy()

            # start fitting...
            # 1. figure out what kind of problem it is and assign model and scorer
            # now only supports "classification" or "regression" problems
            self._set_model_scorer()
            # 2. using the model and scorer to do greedy search
            if self._verbose:
                print("=========> Greedy searched imputation:")
            self._best_imputation = self.__imputationGreedySearch(data, label)

        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._is_fitted = True
            self._has_finished = True
            self._iterations_done = True
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            print("Timed Out...")
            self._is_fitted = False
            self._has_finished = False
            self._iterations_done = False
        return CallResult(None, self._has_finished, self._iterations_done)
Code Example #28
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, self._has_finished,
                              self._iterations_done)

        assert isinstance(
            self._model,
            dict), "self._model type must be dict not defaultdict!"

        temp = pd.DataFrame(
            inputs.iloc[:, self._s_cols].apply(lambda x: self._model[
                x.name].transform(x) if x.name in self._model else None))

        outputs = inputs.copy()
        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]
        lookup = {
            "int": ('http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
        }

        for index in self._s_cols:
            old_metadata = dict(
                outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update(
                (mbase.ALL_ELEMENTS, index), old_metadata)

        # remove columns that appear at produce time but were not in the fitted data
        drop_names = set(outputs.columns[self._s_cols]).difference(
            set(self._model.keys()))
        drop_indices = map(lambda a: outputs.columns.get_loc(a), drop_names)
        drop_indices = sorted(drop_indices)
        outputs = common_utils.remove_columns(outputs,
                                              drop_indices,
                                              source='ISI DSBox Data Labler')

        # sanity check and report the results
        if outputs.shape[0] == inputs.shape[0] and \
           outputs.shape[1] == inputs.shape[1] - len(drop_names):
            self._has_finished = True
            self._iterations_done = True
            # print("output:",outputs.head(5))
            return CallResult(d3m_DataFrame(outputs), self._has_finished,
                              self._iterations_done)
        else:
            return CallResult(inputs, self._has_finished,
                              self._iterations_done)
Code Example #29
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError('Missing training data.')

        ## impute missing values
        self._Imputer.fit(self._training_inputs)
        self._training_inputs = self._Imputer.transform(self._training_inputs)

        #        [m,n] = self._training_inputs.shape
        #        for column_index in range(n):
        #            if len(np.unique(self._training_inputs[:,column_index])) == 1:
        #                self._cate_flag[column_index] = 1

        ## discretize non-categorical values
        disc_training_inputs = self._training_inputs.copy()  # copy so the undiscretized inputs used below stay intact
        if not len(np.where(self._cate_flag == 0)[0]) == 0:
            self._Kbins.fit(
                self._training_inputs[:, np.where(
                    self._cate_flag == 0)[0]])  #find non-categorical values
            temp = self._Kbins.transform(
                self._training_inputs[:, np.where(self._cate_flag == 0)[0]])
            disc_training_inputs[:, np.where(self._cate_flag == 0)[0]] = temp
        #start from zero

        Trainset = RelationSet(self._training_inputs,
                               self._training_outputs.reshape(-1, 1))
        discTrainset = RelationSet(disc_training_inputs,
                                   self._training_outputs.reshape(-1, 1))
        validSet, smallTrainSet = Trainset.split(
            self._training_inputs.shape[0] // 4)
        smallDiscTrainSet = discTrainset.split(
            self._training_inputs.shape[0] // 4)[1]
        model = STMB(Trainset,
                     discTrainset,
                     self._problem_type,
                     test_set=Trainset)
        index = model.select_features()
        self._index = []
        [m] = index.shape
        for ii in np.arange(m):
            if not len(np.unique(
                    self._training_inputs[:, index[ii].item()])) == 1:
                self._index.append(index[ii].item())
        self._fitted = True

        return CallResult(None)
Code Example #30
File: splitter.py Project: byu-dml/dsbox-primitives
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        check the shape of the main resource dataset. I
        f the size is larger than threshold, the primitive will record and generate
        a list of column/ row that need to be remained.
        """
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None:
            raise ValueError('Missing training(fitting) data.')

        data = self._training_inputs.copy()
        main_res_shape = data[self._main_resource_id].shape

        if main_res_shape[0] > self._threshold_row_length:
            self._need_reduce_row = True
            if main_res_shape[1] > self._further_reduce_threshold_column_length:
                self._threshold_row_length = self._threshold_row_length * self._further_reduce_ratio
                self._logger.info(
                    "This dataset's column number and row number are both oversized, will further reduce the row threshold to "
                    + str(self._threshold_row_length))

        if main_res_shape[1] > self._threshold_column_length:
            self._need_reduce_column = True

        if self._need_reduce_column and self._need_reduce_row:
            self._logger.info(
                "This dataset's column number and row number are both oversized, will sample both of them."
            )
        elif self._need_reduce_column:
            self._logger.info(
                "The column number of the input dataset is very large, will split part of them."
            )
        elif self._need_reduce_row:
            self._logger.info(
                "The row number of the input dataset is very large, will split part of them."
            )
        else:
            self._logger.info(
                "This dataset's size is OK, no split on dataset needed.")

        # copy from d3m here, what is this used for?
        # Graph is the adjacency representation for the relations graph. Make it not be a "defaultdict".
        # self._graph = dict(utils.build_relation_graph(self._training_inputs))

        self._status = Status.TRAIN
        self._fitted = True
        return CallResult(None, has_finished=True, iterations_done=1)