def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, True, 1)

    temp = pd.DataFrame(self._training_data.iloc[:, self._s_cols].apply(
        lambda x: self._d[x.name].transform(x)))
    outputs = self._training_data.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    lookup = {
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for index in self._s_cols:
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["semantic_types"] = lookup["int"]
        old_metadata["structural_type"] = type(10)
        outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    if outputs.shape == inputs.shape:
        return CallResult(d3m_DataFrame(outputs), True, 1)
    else:
        return CallResult(inputs, True, 1)
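# The query-mutate-update metadata idiom above recurs throughout these primitives.
# Below is a hedged, standalone sketch of it using the d3m core package; the toy
# column and the chosen semantic types are illustrative, not taken from this
# primitive.
from d3m import container as d3m_container
from d3m.metadata import base as metadata_base

example_df = d3m_container.DataFrame({"a": ["1", "2"]}, generate_metadata=True)
# query the existing column metadata, mutate a plain-dict copy, write it back
column_metadata = dict(example_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
column_metadata["semantic_types"] = (
    'http://schema.org/Integer',
    'https://metadata.datadrivendiscovery.org/types/Attribute')
column_metadata["structural_type"] = int
example_df.metadata = example_df.metadata.update(
    (metadata_base.ALL_ELEMENTS, 0), column_metadata)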
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None or self._training_outputs is None:
        raise ValueError("Missing training data.")

    if len(self._training_outputs.shape) == 1:
        self._training_outputs = np.expand_dims(self._training_outputs, axis=1)
    binaryoutputs = self.__map_labels_to_binary(self._training_outputs)

    (self._weights, _) = tm_fit(self._training_inputs, binaryoutputs, 'bc',
                                self.hyperparams['r'], self.hyperparams['q'],
                                self.hyperparams['gamma'], self.hyperparams['solver'],
                                self.hyperparams['epochs'], self.hyperparams['alpha'],
                                seed=self._seed)
    self._fitted = True
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Arguments:
        inputs: List(  # data
            List(  # segments, each of shape [deg, num_feats]
                [deg, num_feats], ...
            )
        )
    """
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None:
        raise ValueError('Missing training data')

    with stopit.ThreadingTimeout(timeout) as timer:
        inputs_curve_fitting = self._training_inputs
        # number of segments; each segment is formed by multiple data samples
        num_data = sum(len(x) for x in inputs_curve_fitting)
        deg, num_feats = inputs_curve_fitting[0][0].shape
        betas = np.vstack([
            np.array([segment.flatten() for segment in cinput])
            for cinput in inputs_curve_fitting if len(cinput) > 0
        ])
        self._model.fit(betas)
        self._fitted = True

    if timer.state == timer.EXECUTED:
        return CallResult(None)
    else:
        raise TimeoutError('ClusterCurveFittingKMeans exceeded time limit')
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Precondition: run fit() first; completes the data based on the learned parameters.

    Supported trainable method:
        -> greedy search
    Also supports the untrainable methods:
        -> iterative regression
        -> others

    Parameters:
    ----------
    data: pandas DataFrame
    label: pandas Series, used for the evaluation of imputation

    TODO:
    ----------
    1. add evaluation part for __simpleImpute()
    """
    if not self._is_fitted:
        # TODO: specify a NotFittedError, like in sklearn
        raise ValueError("Calling produce before fitting.")

    if pd.isnull(inputs).sum().sum() == 0:  # no missing value exists
        if self._verbose:
            print("Warning: no missing value in test dataset")
        self._has_finished = True
        return CallResult(inputs, self._has_finished, self._iterations_done)

    if timeout is None:
        timeout = math.inf
    if iterations is None:
        self._iterations_done = True
        iterations = 30  # only works for the iterative regression method

    data = inputs.copy()
    # record keys:
    keys = data.keys()
    index = data.index

    # set up the timeout
    with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
        assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING
        # start completing data...
        if self._verbose:
            print("=========> iteratively regress method:")
        data_clean = self.__regressImpute(data, self._best_imputation, iterations)

    value = None
    if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
        self._is_fitted = True
        self._has_finished = True
        value = pd.DataFrame(data_clean, index, keys)
    elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
        print("Timed Out...")
        self._is_fitted = False
        self._has_finished = False
        self._iterations_done = False
    return CallResult(value, self._has_finished, self._iterations_done)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if timeout is None:
        big_table = self._core(inputs)
        self._has_finished = True
        self._iterations_done = True
        return CallResult(big_table, self._has_finished, self._iterations_done)
    else:
        # set up the timeout
        with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
            assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING
            # core computations
            big_table = self._core(inputs)

        if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
            self._has_finished = True
            self._iterations_done = True
            return CallResult(big_table, self._has_finished, self._iterations_done)
        elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
            self._has_finished = False
            self._iterations_done = False
            return CallResult(None, self._has_finished, self._iterations_done)
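# Several primitives here wrap their core computation in stopit's ThreadingTimeout,
# as produce() does above. A minimal standalone sketch of the pattern, assuming only
# the third-party stopit package; the summation is a stand-in for the real work.
import stopit

def run_with_timeout(seconds: float):
    with stopit.ThreadingTimeout(seconds) as ctx:
        assert ctx.state == ctx.EXECUTING
        result = sum(i * i for i in range(10 ** 6))  # placeholder computation
    if ctx.state == ctx.EXECUTED:
        return result  # finished within the time budget
    if ctx.state == ctx.TIMED_OUT:
        return None    # caller should treat this as "not finished"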
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Learns the kernel regression coefficients alpha given training pairs (X, y).
    """
    if self._fitted:
        return CallResult(None)
    if self._Xtrain is None or self._ytrain is None:
        raise ValueError("Missing training data.")

    self._U = generateGaussianPreconditioner(self._Xtrain,
                                             self.hyperparams['sigma'],
                                             self.hyperparams['lparam'])

    def mykernel(X, Y):
        return GaussianKernel(X, Y, self.hyperparams['sigma'])

    self._coeffs = PCGfit(self._Xtrain, self._ytrain, mykernel, self._U,
                          self.hyperparams['lparam'], self.hyperparams['eps'],
                          self.hyperparams['maxIters'])
    self._fitted = True
    return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, True, 1)

    temp = pd.DataFrame(self._model.transform(inputs.iloc[:, self._s_cols]))
    outputs = self._training_data.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    new_dtype = temp.dtypes
    lookup = {
        "float": ('http://schema.org/Float',
                  'https://metadata.datadrivendiscovery.org/types/Attribute'),
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute'),
    }
    for d, index in zip(new_dtype, self._s_cols):
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        if np.issubdtype(d, np.floating):  # covers float16/32/64/128
            old_metadata["semantic_types"] = lookup["float"]
            old_metadata["structural_type"] = type(10.0)
        else:
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
        outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    if outputs.shape == inputs.shape:
        return CallResult(d3m_DataFrame(outputs), True, 1)
    else:
        return CallResult(inputs, True, 1)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Fit the model with training data.

    Args:
        None; uses the container DataFrame of time series data set via set_training_data.

    Returns:
        CallResult[None]
    """
    if self._fitted:
        return CallResult(None)

    # Get columns to fit.
    self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
    # If there are no columns to fit, return None.
    if self._training_inputs is None:
        return CallResult(None)
    self._input_column_names = self._training_inputs.columns

    # Call SVD in sklearn and set _fitted to True.
    if len(self._training_indices) > 0:
        self._clf.fit(self._training_inputs)
        self._fitted = True
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warning("No input columns were selected")
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None or self._training_outputs is None:
        raise ValueError('Missing training data, or missing values exist.')

    # impute missing values
    self._Imputer.fit(self._training_inputs)
    self._training_inputs = self._Imputer.transform(self._training_inputs)

    # discretize non-categorical values
    disc_training_inputs = self._training_inputs
    non_cat = np.where(self._cate_flag == 0)[0]  # indices of non-categorical columns
    if len(non_cat) > 0:
        self._Kbins.fit(self._training_inputs[:, non_cat])
        temp = self._Kbins.transform(self._training_inputs[:, non_cat])
        disc_training_inputs[:, non_cat] = temp  # bin indices start from zero

    # get the number of states for each feature and remove features with only one state
    discTrainset = RelationSet(disc_training_inputs, self._training_outputs.reshape(-1, 1))
    discTrainset.getStateNo(self._cate_flag, self._nbins)
    discTrainset.remove()
    X_train = discTrainset.data
    Y_train = discTrainset.labels
    self._discTrainset = discTrainset
    stateNo = np.append(discTrainset.NUM_STATES, len(np.unique(Y_train)))

    # fit the classifier
    self._clf.fit(X_train, Y_train, stateNo)
    self._fitted = True
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self.fitted:
        return CallResult(None, True, 1)

    args = {
        'nu1': 1e-6,
        'nu2': 1e-6,
        'K': self.hyperparams['depth'],
        'n_units': [500, 300],
        'rho': 0.3,
        'n_iter': self.hyperparams['epochs'],
        'xeta': self.hyperparams['lr'],
        'n_batch': 100,
    }
    self._args = args

    dim = self.hyperparams['dimension']
    alpha = self.hyperparams['alpha']
    beta = self.hyperparams['beta']
    self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)
    self._sdne.learn_embedding(graph=self.training_data)
    self._model = self._sdne._model

    make_keras_pickleable()
    self.fitted = True
    return CallResult(None, True, 1)
def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
    """
    Inputs: ndarray of features
    Returns: None
    """
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None or self._training_outputs is None:
        raise exceptions.InvalidStateError("Missing training data.")
    self._new_training_data = False

    XTrain, _ = self._select_inputs_columns(self._training_inputs)
    YTrain, _ = self._select_outputs_columns(self._training_outputs)

    # Fit data
    CCF = genCCF(XTrain, YTrain,
                 nTrees=self.optionsClassCCF['nTrees'],
                 bReg=True,
                 optionsFor=self.optionsClassCCF,
                 do_parallel=self.optionsClassCCF['parallelprocessing'])

    self._CCF = CCF
    self._fitted = True
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Learns the LAD regression coefficients alpha given training pairs (X, y).
    """
    if self._fitted:
        return CallResult(None)
    if self._Xtrain is None or self._ytrain is None:
        raise ValueError("Missing training data.")

    stoppingTol = self.hyperparams['eps'] * norm(self._ytrain, 1) / (np.sqrt(self._n) * norm(self._Xtrain))
    r = self.hyperparams['coresetmultiplier'] * self._d

    if r < self._n:
        self._U = generateWellConditionedBasis(
            np.concatenate((self._Xtrain, self._ytrain), axis=1), r)
        self._coeffs = coresetLAD(self._Xtrain, self._ytrain, self._U, r,
                                  stoppingTol, self.hyperparams['maxIters'])
    else:
        # The coreset size is larger than the number of examples, so solve the
        # full LAD problem; consider lowering the coresetmultiplier hyperparameter.
        self._coeffs = LAD(self._Xtrain, self._ytrain, stoppingTol,
                           self.hyperparams['maxIters'])

    self._fitted = True
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None:
        raise d3m_exceptions.MissingValueError('set_training_data must be called before fit')

    # operate on columns by index, not name
    for i, (col_name, col) in enumerate(self._training_inputs.items()):
        drop_col = False
        if self.hyperparams['drop_missing_values']:
            if self.hyperparams['how'] == 'all' and col.isnull().all():
                drop_col = True
            elif self.hyperparams['how'] == 'any' and col.isnull().any():
                drop_col = True
        self._drop_cols.append(drop_col)
        if drop_col:
            self._drop_col_indices.append(i)

        col_known_values = None
        if not drop_col:
            col_known_values = col.dropna().tolist()
        self._known_values.append(col_known_values)

    self._fitted = True
    self._training_inputs = None  # free memory
    return CallResult(None)
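# A small sketch of the 'how' semantics used in fit() above: under 'all' a column
# is dropped only when every value is missing, under 'any' a single missing value
# suffices. The toy series below is illustrative.
import pandas as pd

col = pd.Series([1.0, None, 3.0])
drop_under_all = col.isnull().all()  # False: column kept when how == 'all'
drop_under_any = col.isnull().any()  # True: column dropped when how == 'any'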
def produce(self, *, inputs1: Inputs1, inputs2: Inputs2, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    status = self._import_module()
    if status == 0:
        print("not a valid url")
        return CallResult(DataFrame())

    if status == 1:
        # run ISI datamart
        # sort the input list by best score
        inputs1.sort(key=lambda x: x.score, reverse=True)
        # choose the best one (or more, determined by hyperparams)
        res_df = ISI_datamart.augment(
            original_data=inputs2,
            augment_data=inputs1[self.hyperparams["n_index"]])  # a pd.DataFrame
        # join with inputs2, updating "attribute columns" and "datatype"
        # from the datamart.Dataset
    else:
        # run NYU datamart
        inputs1.sort(key=lambda x: x.score, reverse=True)
        res_df = NYU_datamart.augment(
            data=inputs2,
            augment_data=inputs1[self.hyperparams["n_index"]])

    self._has_finished = True
    self._iterations_done = True
    return CallResult(res_df)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Parameters
    ----------
    inputs : input pandas frame

    Returns
    -------
    Outputs : a frame structurally identical to the input frame, with each
        feature cleaned according to its type (e.g. all date objects will be
        modified to a common structure)
    """
    string_cleaner = CleanStrings()
    number_cleaner = CleanNumbers()
    date_cleaner = CleanDates()

    def dtype_apply(series):
        if series.dtype in ['int64', 'float64']:
            return number_cleaner.clean_numbers(series)
        elif series.dtype in ['object']:
            return string_cleaner.clean_strings(series)
        elif 'datetime' in str(series.dtype):  # dtype must be stringified for the substring test
            return date_cleaner.clean_dates(series)
        else:
            return series

    try:
        return CallResult(inputs.apply(dtype_apply))
    except Exception:
        return CallResult(inputs)
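# Hedged standalone sketch of the per-column dtype dispatch in produce() above.
# The cleaner objects are replaced with inline stand-ins since CleanStrings,
# CleanNumbers, and CleanDates live elsewhere in this package.
import pandas as pd

frame = pd.DataFrame({
    "n": [1.0, 2.5],
    "s": ["a ", " b"],
    "t": pd.to_datetime(["2020-01-01", "2020-06-01"]),
})

def dispatch(series: pd.Series) -> pd.Series:
    if series.dtype in ['int64', 'float64']:
        return series                       # stand-in for number_cleaner.clean_numbers
    elif series.dtype in ['object']:
        return series.str.strip()           # stand-in for string_cleaner.clean_strings
    elif 'datetime' in str(series.dtype):   # note the str() guard from the fix above
        return series                       # stand-in for date_cleaner.clean_dates
    return series

cleaned = frame.apply(dispatch)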
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Fit the model with training data.

    Args:
        None; uses the container DataFrame of time series data set via set_training_data.

    Returns:
        CallResult[None]
    """
    if self._fitted:
        return CallResult(None)

    self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
    if self._training_inputs is None:
        return CallResult(None)
    self._input_column_names = self._training_inputs.columns

    if len(self._training_indices) > 0:
        self._clf.fit_transform(self._training_inputs)
        self._fitted = True
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warning("No input columns were selected")
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._inputs is None or self._outputs is None:
        raise ValueError("Missing training data.")
    if not self._new_training_data:
        return CallResult(None)
    self._new_training_data = False

    self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
    self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
    self._input_column_names = self._training_inputs.columns.astype(str)

    if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
        self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
        sk_training_output = self._training_outputs.values

        shape = sk_training_output.shape
        if len(shape) == 2 and shape[1] == 1:
            sk_training_output = numpy.ravel(sk_training_output)

        self._clf.fit(self._training_inputs, sk_training_output)
        self._fitted = True
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warning("No input columns were selected")
    return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        raise exceptions.PrimitiveNotFittedError('Labeler not fitted')

    if len(self._s_cols) == 0:
        # No categorical columns. Nothing to do.
        return CallResult(inputs, True)

    # Generate label-encoded columns; unseen values map to `size`.
    columns = []
    for col_index in self._s_cols:
        size = self._model[col_index].size
        mapping = {x: i for i, x in enumerate(self._model[col_index])}
        columns.append(inputs.iloc[:, col_index].apply(
            lambda x: mapping[x] if x in mapping else size))

    # insert encoded columns
    outputs = inputs.copy()
    for col, index in enumerate(self._s_cols):
        outputs.iloc[:, index] = columns[col]

    lookup = {
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for index in self._s_cols:
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["semantic_types"] = lookup["int"]
        old_metadata["structural_type"] = type(10)
        outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    self._has_finished = True
    return CallResult(outputs, self._has_finished)
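# Minimal sketch of the encoding scheme above: each category seen at fit time maps
# to its position, and anything unseen at produce time falls through to `size`, a
# single shared out-of-vocabulary code. Names below are illustrative.
import pandas as pd

known_labels = pd.Index(["red", "green", "blue"])  # learned during fit
size = known_labels.size
mapping = {x: i for i, x in enumerate(known_labels)}

col = pd.Series(["green", "blue", "purple"])       # "purple" was never seen
encoded = col.apply(lambda x: mapping[x] if x in mapping else size)
# -> 1, 2, 3 (all unseen values share code 3)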
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if 'd3mIndex' in list(inputs.columns.values) and \
            len(set(inputs.loc[:, 'd3mIndex'].tolist())) < inputs.shape[0]:
        # duplicate d3mIndex values: collapse each group of rows into one target row
        data_dict = dict()
        indices = list()
        for row in range(inputs.shape[0]):
            idx = inputs.iloc[row, :]['d3mIndex']
            if idx not in data_dict:
                data_dict[idx] = list()
                indices.append(idx)
            data_dict[idx].append(inputs.iloc[row, :].drop('d3mIndex').tolist())
        for key in data_dict:
            data_dict[key] = self._get_target(data_dict[key])

        new_df = inputs[0:0].drop('d3mIndex', axis=1)
        for idx in indices:
            new_df.loc[idx] = data_dict[idx]

        old_metadata = dict(new_df.metadata.query(()))
        old_metadata["dimension"] = dict(old_metadata["dimension"])
        old_metadata["dimension"]["length"] = new_df.shape[0]
        new_df.metadata = new_df.metadata.update((), old_metadata)
        return CallResult(new_df, True, 1)
    return CallResult(inputs, True, 1)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._input_data is None:
        raise ValueError('Missing training(fitting) data.')

    # Look at attribute columns only
    data = self._input_data.copy()
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Find columns whose values are all empty, structural type str
    numeric = DataMetadata.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]

    self._empty_columns = []
    _logger.debug(f'Numeric columns: {numeric}')
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element])).sum() == data.shape[0]:
                _logger.debug(f'Empty numeric str column: {element}')
                self._empty_columns.append(element)

    # Find columns whose values are all empty, structural type numeric
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i] and i not in self._empty_columns:
            _logger.debug(f'Empty numeric column: {i}')
            self._empty_columns.append(i)

    _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))
    data = container.DataFrame.remove_columns(data, self._empty_columns)

    categorical_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/OrdinalData",
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
    self._cat_columns = data.columns[self._cat_col_index].tolist()
    _logger.debug('Encoding columns: {}'.format(self._cat_columns))

    mapping = {}
    for column_name in self._cat_columns:
        col = data[column_name]
        temp = self._trim_features(col, self.hyperparams['n_limit'])
        if temp:
            mapping[temp[0]] = temp[1]
    self._mapping = mapping
    self._fitted = True
    return CallResult(None, has_finished=True)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(inputs, return_all=True)

    if self.fitted:
        result = self._sdne._Y
    else:
        dim = self.hyperparams['dimension']
        alpha = self.hyperparams['alpha']
        beta = self.hyperparams['beta']
        # rebuild the same architecture arguments assembled in fit()
        args = {
            'nu1': 1e-6,
            'nu2': 1e-6,
            'K': self.hyperparams['depth'],
            'n_units': [500, 300],
            'rho': 0.3,
            'n_iter': self.hyperparams['epochs'],
            'xeta': self.hyperparams['lr'],
            'n_batch': 100,
        }
        self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)
        produce_data = networkx.from_scipy_sparse_matrix(produce_data)
        self._sdne.learn_embedding(graph=produce_data)
        self._model = self._sdne._model
        result = self._sdne._Y

    target_types = [
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
    ]
    if self.hyperparams['return_list']:
        result_np = container.ndarray(result, generate_metadata=True)
        return_list = d3m_List([result_np, inputs[1], inputs[2]], generate_metadata=True)
        return CallResult(return_list, True, 1)
    else:
        learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
        learn_df = get_columns_not_of_type(learn_df, target_types)
        learn_df = learn_df.remove_columns([learn_df.columns.get_loc('nodeID')])

        result_df = d3m_DataFrame(result, generate_metadata=True)
        result_df = result_df.loc[result_df.index.isin(learning_df['d3mIndex'].values)]

        for column_index in range(result_df.shape[1]):
            col_dict = dict(result_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = str(learn_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            result_df.metadata = result_df.metadata.update((ALL_ELEMENTS, column_index), col_dict)

        result_df.index = learn_df.index.copy()
        output = utils.append_columns(learn_df, result_df)
        return CallResult(output, True, 1)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    primary_key_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"]
    )
    unfold_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=self.hyperparams["unfold_semantic_types"]
    )

    if not primary_key_cols:
        warnings.warn("Did not find primary key column for grouping. Will not unfold")
        return CallResult(inputs)
    if not unfold_cols:
        warnings.warn("Did not find any column to unfold. Will not unfold")
        return CallResult(inputs)

    primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
    unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

    if self.hyperparams["use_pipeline_id_semantic_type"]:
        pipeline_id_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"]
        )
        if len(pipeline_id_cols) >= 2:
            warnings.warn("Multiple pipeline id columns found. Will use first.")
        if pipeline_id_cols:
            inputs = inputs.sort_values(
                primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
            self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
        else:
            warnings.warn(
                "No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'")

    new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

    groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
        lambda x: container.List(x)).reset_index(drop=False)

    ret_df = container.DataFrame(groupby_df)
    ret_df.metadata = new_df.metadata
    ret_df = self._update_metadata_dimension(df=ret_df)

    split_col_names = [inputs.columns[pos] for pos in unfold_cols]
    ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
    ret_df = common_utils.remove_columns(
        inputs=ret_df,
        column_indices=[ret_df.columns.get_loc(name) for name in split_col_names]
    )
    return CallResult(ret_df)
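# The heart of the unfold step above is a groupby that aggregates each unfolded
# column into a list per primary key. A plain-pandas sketch (Python lists stand
# in for container.List; column names are illustrative):
import pandas as pd

folded = pd.DataFrame({
    "d3mIndex": [0, 0, 1, 1],
    "prediction": ["a", "b", "c", "d"],
})
unfolded = folded.groupby("d3mIndex")["prediction"].aggregate(
    lambda x: list(x)).reset_index(drop=False)
# d3mIndex 0 -> ["a", "b"]; d3mIndex 1 -> ["c", "d"]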
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    new_df = pd.concat([x for x in inputs], ignore_index=self.hyperparams["ignore_index"])

    if self.hyperparams["sort_on_primary_key"]:
        primary_key_col = common_utils.list_columns_with_semantic_types(
            metadata=new_df.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
        if not primary_key_col:
            warnings.warn("No PrimaryKey column found. Will not sort on PrimaryKey")
            return CallResult(self._update_metadata(new_df))
        new_df = new_df.sort_values([new_df.columns[pos] for pos in primary_key_col])

    return CallResult(self._update_metadata(new_df))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    inputs_timeseries = inputs[1]
    inputs_d3mIndex = inputs[0]
    if not self._fitted:
        return CallResult(None, True, 0)

    if isinstance(inputs_timeseries, np.ndarray):
        X = np.zeros((inputs_timeseries.shape[0], self._y_dim))
    else:
        X = np.zeros((len(inputs_timeseries), self._y_dim))

    for i, series in enumerate(inputs_timeseries):
        if series.shape[1] > 1 and not self._value_found:
            series_output = pd.DataFrame()
            for j in range(series.shape[1]):
                series_output = pd.concat([series_output, series.iloc[:, j]])
        else:
            series_output = series
        if series_output.shape[0] < self._y_dim:
            # pad with zeros
            X[i, :series_output.shape[0]] = series_output.iloc[:series_output.shape[0], self._value_dimension]
        else:
            # truncate to fit
            X[i, :] = series_output.iloc[:self._y_dim, self._value_dimension]

    # save the result in DataFrame format
    output_ndarray = self._model.transform(X)
    output_dataFrame = container.DataFrame(container.ndarray(output_ndarray))

    if self.hyperparams["generate_metadata"]:
        # add metadata, if required, for each column
        for each_column in range(output_ndarray.shape[1]):
            metadata_selector = (mbase.ALL_ELEMENTS, each_column)
            metadata_each_column = {
                'semantic_types': (
                    'https://metadata.datadrivendiscovery.org/types/TabularColumn',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
            }
            output_dataFrame.metadata = output_dataFrame.metadata.update(
                metadata=metadata_each_column, selector=metadata_selector)

    # restore the original d3mIndex as the index
    output_dataFrame = output_dataFrame.set_index(inputs_d3mIndex)
    return CallResult(output_dataFrame, True, 1)
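# Standalone sketch of the fixed-length framing used above: every series is
# zero-padded or truncated to y_dim values before the model transform. The toy
# series and dimensions are illustrative.
import numpy as np
import pandas as pd

y_dim = 5
series_list = [pd.DataFrame({"value": range(3)}),   # shorter: zero-padded
               pd.DataFrame({"value": range(8)})]   # longer: truncated

X = np.zeros((len(series_list), y_dim))
for i, series in enumerate(series_list):
    n = min(series.shape[0], y_dim)
    X[i, :n] = series.iloc[:n, 0]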
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    self._fit_and_return_result(timeout=timeout, iterations=iterations)
    return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    index_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
    if not index_col:
        warnings.warn("Did not find primary key column. Cannot vote; outputting input unchanged")
        return CallResult(inputs)

    predict_target_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
    if not predict_target_col:
        warnings.warn("Did not find PredictedTarget column. Cannot vote; outputting input unchanged")
        return CallResult(inputs)

    df = inputs.copy()
    new_df = self._get_index_and_target_df(inputs=df, use_cols=index_col + predict_target_col)

    if self.hyperparams["ensemble_method"] == 'majority':
        groupby_df = new_df.groupby(
            [new_df.columns[pos] for pos in index_col]).agg(
            lambda x: x.value_counts().index[0]).reset_index(drop=False)
    elif self.hyperparams["ensemble_method"] == 'max':
        groupby_df = new_df.groupby(
            [new_df.columns[pos] for pos in index_col]).max().reset_index(drop=False)
    elif self.hyperparams["ensemble_method"] == 'min':
        groupby_df = new_df.groupby(
            [new_df.columns[pos] for pos in index_col]).min().reset_index(drop=False)

    ret_df = container.DataFrame(groupby_df)
    ret_df.metadata = new_df.metadata
    return CallResult(self._update_metadata(df=ret_df))
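# Minimal reproduction of the 'majority' branch above: group predictions by the
# index column and keep the most frequent value via value_counts(); ties resolve
# to value_counts() ordering, exactly as in the primitive. Data is illustrative.
import pandas as pd

preds = pd.DataFrame({
    "d3mIndex": [0, 0, 0, 1, 1],
    "prediction": ["cat", "cat", "dog", "dog", "dog"],
})
voted = preds.groupby("d3mIndex")["prediction"].agg(
    lambda x: x.value_counts().index[0]).reset_index(drop=False)
# d3mIndex 0 -> "cat"; d3mIndex 1 -> "dog"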
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Train the imputation parameters. Now supports:
        -> greedy search
    For the methods that are not trainable, do nothing:
        -> iterative regression
        -> others

    Parameters:
    ----------
    data: pandas DataFrame
    label: pandas Series, used for the trainable methods
    """
    # if already fitted on the current dataset, do nothing
    if self._is_fitted:
        return CallResult(None, self._has_finished, self._iterations_done)

    if timeout is None:
        timeout = 2**31 - 1

    # set up the timeout
    with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
        assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

        if isinstance(self._train_x, pd.DataFrame):
            data = self._train_x.copy()
            label = self._train_y.copy()
        else:
            data = self._train_x[0].copy()
            label = self._train_y[0].copy()

        # start fitting...
        # 1. figure out what kind of problem it is and assign model and scorer;
        #    currently only "classification" and "regression" problems are supported
        self._set_model_scorer()
        # 2. use the model and scorer to do a greedy search
        if self._verbose:
            print("=========> Greedy searched imputation:")
        self._best_imputation = self.__imputationGreedySearch(data, label)

    if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
        self._is_fitted = True
        self._has_finished = True
        self._iterations_done = True
    elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
        print("Timed Out...")
        self._is_fitted = False
        self._has_finished = False
        self._iterations_done = False
    return CallResult(None, self._has_finished, self._iterations_done)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, self._has_finished, self._iterations_done)
    assert isinstance(self._model, dict), "self._model type must be dict not defaultdict!"

    temp = pd.DataFrame(inputs.iloc[:, self._s_cols].apply(
        lambda x: self._model[x.name].transform(x) if x.name in self._model else None))
    outputs = inputs.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    lookup = {
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for index in self._s_cols:
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["semantic_types"] = lookup["int"]
        old_metadata["structural_type"] = type(10)
        outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    # remove columns that appear in produce but were not present in the fitted data
    drop_names = set(outputs.columns[self._s_cols]).difference(set(self._model.keys()))
    drop_indices = sorted(map(lambda a: outputs.columns.get_loc(a), drop_names))
    outputs = common_utils.remove_columns(outputs, drop_indices, source='ISI DSBox Data Labler')

    # sanity check and report the results
    if outputs.shape[0] == inputs.shape[0] and \
            outputs.shape[1] == inputs.shape[1] - len(drop_names):
        self._has_finished = True
        self._iterations_done = True
        return CallResult(d3m_DataFrame(outputs), self._has_finished, self._iterations_done)
    else:
        return CallResult(inputs, self._has_finished, self._iterations_done)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None or self._training_outputs is None:
        raise ValueError('Missing training data, or missing values exist.')

    # impute missing values
    self._Imputer.fit(self._training_inputs)
    self._training_inputs = self._Imputer.transform(self._training_inputs)

    # discretize non-categorical values
    disc_training_inputs = self._training_inputs
    non_cat = np.where(self._cate_flag == 0)[0]  # indices of non-categorical columns
    if len(non_cat) > 0:
        self._Kbins.fit(self._training_inputs[:, non_cat])
        temp = self._Kbins.transform(self._training_inputs[:, non_cat])
        disc_training_inputs[:, non_cat] = temp  # bin indices start from zero

    Trainset = RelationSet(self._training_inputs, self._training_outputs.reshape(-1, 1))
    discTrainset = RelationSet(disc_training_inputs, self._training_outputs.reshape(-1, 1))
    validSet, smallTrainSet = Trainset.split(self._training_inputs.shape[0] // 4)
    smallDiscTrainSet = discTrainset.split(self._training_inputs.shape[0] // 4)[1]

    model = STMB(Trainset, discTrainset, self._problem_type, test_set=Trainset)
    index = model.select_features()

    # keep only the selected features that have more than one unique value
    self._index = []
    [m] = index.shape
    for ii in np.arange(m):
        if not len(np.unique(self._training_inputs[:, index[ii].item()])) == 1:
            self._index.append(index[ii].item())

    self._fitted = True
    return CallResult(None)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Check the shape of the dataset's main resource. If its size exceeds the
    thresholds, the primitive records this and generates the list of columns/rows
    that need to be retained.
    """
    if self._fitted:
        return CallResult(None)
    if self._training_inputs is None:
        raise ValueError('Missing training(fitting) data.')

    data = self._training_inputs.copy()
    main_res_shape = data[self._main_resource_id].shape

    if main_res_shape[0] > self._threshold_row_length:
        self._need_reduce_row = True
        if main_res_shape[1] > self._further_reduce_threshold_column_length:
            self._threshold_row_length = self._threshold_row_length * self._further_reduce_ratio
            self._logger.info(
                "This dataset's column number and row number are both oversized; "
                "the row threshold will be further reduced to " + str(self._threshold_row_length))

    if main_res_shape[1] > self._threshold_column_length:
        self._need_reduce_column = True

    if self._need_reduce_column and self._need_reduce_row:
        self._logger.info(
            "This dataset's column number and row number are both oversized; will sample both.")
    elif self._need_reduce_column:
        self._logger.info(
            "The column number of the input dataset is very large; will split part of the columns.")
    elif self._need_reduce_row:
        self._logger.info(
            "The row number of the input dataset is very large; will split part of the rows.")
    else:
        self._logger.info("This dataset's size is OK, no split on the dataset needed.")

    self._status = Status.TRAIN
    self._fitted = True
    return CallResult(None, has_finished=True, iterations_done=1)