def time_new(self, columns):
    base_utils.combine_columns(
        self.large_dataframe_with_many_columns,
        list(range(int(columns / 4), int(columns / 2))),  # Just 1/4 of columns.
        self.list_of_many_dataframe_columns,
        return_result='new',
        add_index_columns=True,
    )
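This benchmark assumes `self.large_dataframe_with_many_columns` and `self.list_of_many_dataframe_columns` are prepared beforehand, presumably in the suite's `setup` method. A minimal sketch of such a setup, mirroring the local construction used in `time_replace2` below (the sizes and the `setup` name are illustrative assumptions, not taken from the source):

# Hypothetical setup sketch; mirrors the construction in time_replace2 below.
def setup(self, columns):
    self.large_dataframe_with_many_columns = container.DataFrame(
        {str(i): list(range(5)) for i in range(columns)},
        columns=[str(i) for i in range(columns)],
        generate_metadata=True,
    )
    # One single-column replacement frame per column in the benchmarked slice.
    self.list_of_many_dataframe_columns = [
        container.DataFrame({str(i): list(range(5))}, columns=[str(i)], generate_metadata=True)
        for i in range(int(columns / 4), int(columns / 2))
    ]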
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    if not self._fitted:
        raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")

    assert self._add_semantic_types is not None
    assert self._remove_semantic_types is not None

    columns_to_use, output_columns = self._produce_columns(inputs, self._add_semantic_types, self._remove_semantic_types)

    if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append':
        assert len(columns_to_use) == len(output_columns)

        index_columns = inputs.metadata.get_index_columns()

        index_columns_to_use = []
        other_columns_to_use = []
        index_output_columns = []
        other_output_columns = []
        for column_to_use, output_column in zip(columns_to_use, output_columns):
            if column_to_use in index_columns:
                index_columns_to_use.append(column_to_use)
                index_output_columns.append(output_column)
            else:
                other_columns_to_use.append(column_to_use)
                other_output_columns.append(output_column)

        outputs = base_utils.combine_columns(inputs, index_columns_to_use, index_output_columns,
                                             return_result='replace',
                                             add_index_columns=self.hyperparams['add_index_columns'])
        outputs = base_utils.combine_columns(outputs, other_columns_to_use, other_output_columns,
                                             return_result='append',
                                             add_index_columns=self.hyperparams['add_index_columns'])
    else:
        outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns,
                                             return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'])

    return base.CallResult(outputs)
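The two-pass replace/append split above exists so that refreshed index columns stay in place while all other produced columns are appended. For orientation, here is a minimal sketch of how `combine_columns` treats the three `return_result` modes, consistent with the tests later in this section (the toy column names and values are invented for illustration):

# Hypothetical toy data; the behavior shown in the comments matches the tests below.
from d3m import container
from d3m.base import utils as base_utils

main = container.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}, generate_metadata=True)
new_col = container.DataFrame({'x': [7, 8]}, generate_metadata=True)

# 'append': keep every input column and add the outputs at the end.
print(base_utils.combine_columns(main, [1], [new_col], return_result='append',
                                 add_index_columns=False).columns.tolist())  # ['a', 'b', 'c', 'x']

# 'replace': splice the outputs in place of the selected input columns.
print(base_utils.combine_columns(main, [1], [new_col], return_result='replace',
                                 add_index_columns=False).columns.tolist())  # ['a', 'x', 'c']

# 'new': return only the outputs (index columns are prepended when
# add_index_columns=True and the outputs do not already include them).
print(base_utils.combine_columns(main, [1], [new_col], return_result='new',
                                 add_index_columns=False).columns.tolist())  # ['x']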
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    sk_inputs, columns_to_use, _ = self._get_columns_to_fit(inputs, self.hyperparams)
    output = []
    if len(sk_inputs.columns):
        try:
            sk_output = self._clf.transform(sk_inputs)
        except sklearn.exceptions.NotFittedError as error:
            raise PrimitiveNotFittedError("Primitive not fitted.") from error
        if sparse.issparse(sk_output):
            sk_output = sk_output.toarray()
        target_columns_metadata = self._copy_columns_metadata(inputs.metadata, self._training_indices, self.hyperparams)
        output = self._wrap_predictions(inputs, sk_output, target_columns_metadata)
        output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self._training_indices]
        output = [output]
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warn("No input columns were selected")

    _, _, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams)
    outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'],
                                         inputs=inputs, column_indices=self._training_indices + dropped_cols,
                                         columns_list=output)
    return CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
    output = []
    if len(sk_inputs.columns):
        try:
            sk_output = self._clf.predict(sk_inputs)
        except sklearn.exceptions.NotFittedError as error:
            raise PrimitiveNotFittedError("Primitive not fitted.") from error
        # For primitives that allow predicting without fitting like GaussianProcessRegressor
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        if sparse.issparse(sk_output):
            sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output)
        output = self._wrap_predictions(inputs, sk_output)
        output.columns = self._target_names
        output = [output]
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warn("No input columns were selected")

    outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'],
                                         inputs=inputs, column_indices=self._target_column_indices,
                                         columns_list=output)
    return CallResult(outputs)
def produce(self, *, inputs: Inputs, iterations: int = None, timeout: float = None) -> base.CallResult[Outputs]:
    """
    Inputs: ndarray of features
    Returns: Pandas DataFrame containing predictions
    """
    # Inference
    if not self._fitted:
        raise PrimitiveNotFittedError("Primitive not fitted.")

    XTest, columns_to_use = self._select_inputs_columns(inputs)

    # Initialize so combine_columns still receives a list when no columns were selected.
    output_columns = []
    if len(XTest.columns):
        # Prediction
        YpredCCF, _, _ = predictFromCCF(self._CCF, XTest)
        output_columns = [self._wrap_predictions(YpredCCF)]

    outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns,
                                         return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'])
    return base.CallResult(outputs)
def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Process the testing data.

    Args:
        inputs: Container DataFrame. Time series data up to outlier detection.

    Returns:
        Container DataFrame where 1 marks outliers and 0 marks normal points.
    """
    if not self._fitted:
        raise PrimitiveNotFittedError("Primitive not fitted.")

    sk_inputs = inputs
    if self.hyperparams['use_semantic_types']:
        sk_inputs = inputs.iloc[:, self._training_indices]

    output_columns = []
    if len(self._training_indices) > 0:
        if self.hyperparams['return_subseq_inds']:
            if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None:
                # Point outlier detection: derive subsequence bounds from the window parameters.
                pred_score = self._clf.decision_function(sk_inputs.values).ravel()
                left_inds_ = numpy.arange(0, len(pred_score), self.step_size)
                right_inds_ = left_inds_ + self.window_size
                right_inds_[right_inds_ > len(pred_score)] = len(pred_score)
            else:
                pred_score, left_inds_, right_inds_ = self._clf.decision_function(sk_inputs.values)

            sk_output = numpy.concatenate((numpy.expand_dims(pred_score, axis=1),
                                           numpy.expand_dims(left_inds_, axis=1),
                                           numpy.expand_dims(right_inds_, axis=1)), axis=1)
        else:
            if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None:
                # Point outlier detection.
                sk_output = self._clf.decision_function(sk_inputs.values)
            else:
                sk_output, _, _ = self._clf.decision_function(sk_inputs.values)

        if sparse.issparse(sk_output):
            sk_output = sk_output.toarray()
        outputs = self._wrap_predictions(inputs, sk_output)
        if len(outputs.columns) == len(self._input_column_names):
            outputs.columns = self._input_column_names
        output_columns = [outputs]
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warn("No input columns were selected")

    outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'],
                                         inputs=inputs, column_indices=self._training_indices,
                                         columns_list=output_columns)
    return CallResult(outputs)
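For clarity, when `return_subseq_inds` is enabled the concatenation above yields one row per scored window with columns [score, left_index, right_index]. A standalone sketch with invented values:

# Illustrative layout of sk_output when return_subseq_inds is enabled;
# all values are made up for demonstration.
import numpy

pred_score = numpy.array([0.1, 0.9, 0.2])
left_inds_ = numpy.array([0, 2, 4])
right_inds_ = numpy.array([2, 4, 5])
sk_output = numpy.concatenate((numpy.expand_dims(pred_score, axis=1),
                               numpy.expand_dims(left_inds_, axis=1),
                               numpy.expand_dims(right_inds_, axis=1)), axis=1)
# array([[0.1, 0. , 2. ],
#        [0.9, 2. , 4. ],
#        [0.2, 4. , 5. ]])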
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Returns: Container DataFrame after HPFilter. """ # Get cols to fit. self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit( inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._hpfilter(sk_inputs, lamb=self.hyperparams['lamb']) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # self._write(outputs) # self.logger.warning('produce was called3') return CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Args: inputs: Container DataFrame Returns: Container DataFrame added with absolute and phase value in a columns named 'column_name_fft_abs' and 'column_name_fft_phse'. These values correspnd to the absolute and angle values for a complex number we get as FFT coefficients """ assert isinstance(inputs, container.DataFrame), type(dataframe) self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: cols = [inputs.columns[x] for x in self._training_indices] sk_inputs = container.DataFrame(data = inputs.iloc[:, self._training_indices].values,columns = cols, generate_metadata=True) output_columns = [] if len(self._training_indices) > 0: sk_output = self._clf.produce(sk_inputs) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) # if len(outputs.columns) == len(self._input_column_names): # outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) return base.CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Args: inputs: Container DataFrame timeout: Default iterations: Default Returns: Container DataFrame containing hmean of time series """ self.logger.info('Statistical Hmean Primitive called') # Get cols to fit. self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") statistical_hmean_input = inputs if self.hyperparams['use_semantic_types']: statistical_hmean_input = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: statistical_hmean_output = self._hmean(statistical_hmean_input,self.hyperparams["window_size"]) if sparse.issparse(statistical_hmean_output): statistical_hmean_output = statistical_hmean_output.toarray() outputs = self._wrap_predictions(inputs, statistical_hmean_output) #if len(outputs.columns) == len(self._input_column_names): # outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) self.logger.info('Statistical Hmean Primitive returned') return base.CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Time series data up to Wavelet transform. Returns: [cA_n, cD_n, cD_n-1, …, cD2, cD1]: Container DataFrame after Wavelet Transformation. Ordered frame of coefficients arrays where n denotes the level of decomposition. The first element (cA_n) of the result is approximation coefficients array and the following elements (cD_n - cD_1) are details coefficients arrays. """ assert isinstance(inputs, container.DataFrame), type(container.DataFrame) _, self._columns_to_produce = self._get_columns_to_fit( inputs, self.hyperparams) self._input_column_names = inputs.columns # print('columns_to_produce=', self._columns_to_produce) sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._columns_to_produce] output_columns = [] if len(self._columns_to_produce) > 0: sk_output = self._clf.produce(sk_inputs, self.hyperparams['inverse']) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._columns_to_produce, columns_list=output_columns) # print(inputs) # print(outputs) # if self.hyperparams['inverse'] == 1: # print(outputs) # print(outputs.metadata.to_internal_simple_structure()) # outputs = inputs return base.CallResult(outputs)
def produce(self, *, inputs: FileReaderInputs, timeout: float = None, iterations: int = None) -> base.CallResult[FileReaderOutputs]:
    columns_to_use = self._get_columns(inputs.metadata)

    output_columns = [self._produce_column(inputs, column_index) for column_index in columns_to_use]

    outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns,
                                         return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'])

    if self.hyperparams['return_result'] == 'append':
        outputs.metadata = self._reassign_boundaries(outputs.metadata, columns_to_use)

    return base.CallResult(outputs)
def time_replace2(self, columns):
    cols = 5000
    large_dataframe_with_many_columns = container.DataFrame(
        {str(i): [j for j in range(5)] for i in range(cols)},
        columns=[str(i) for i in range(cols)],
        generate_metadata=True,
    )
    list_of_many_dataframe_columns = [
        container.DataFrame({str(i): [j for j in range(5, 1000)]}, columns=[str(i)], generate_metadata=True)
        for i in range(int(cols / 2))
    ]
    base_utils.combine_columns(
        large_dataframe_with_many_columns,
        list(range(int(cols))),  # All of the columns.
        list_of_many_dataframe_columns,
        return_result='replace',
        add_index_columns=True,
    )
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Time series data up to scale. Returns: Container DataFrame after scaling. """ assert isinstance(inputs, container.DataFrame), type(dataframe) _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams) self._input_column_names = inputs.columns # print(self._columns_to_produce) sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._columns_to_produce] output_columns = [] if len(self._columns_to_produce) > 0: sk_output = self._clf.produce(sk_inputs) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") # print(outputs.metadata.to_internal_simple_structure()) outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._columns_to_produce, columns_list=output_columns) # print(inputs) # print(outputs) return base.CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Time series data up to standardlize. Returns: Container DataFrame after standardlization. """ if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._clf.transform(sk_inputs) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") # print(outputs.metadata.to_internal_simple_structure()) outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # print(inputs) # print(outputs) # print(inputs.metadata.to_internal_simple_structure()) # print(outputs.metadata.to_internal_simple_structure()) return CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
    output = []
    if len(sk_inputs.columns):
        try:
            af_inputs = af.from_ndarray(sk_inputs.values)
            weight_by_dist = self._weights == 'distance'
            dist_type = self._get_dist_type(self.hyperparams['dist_type'])
            af_output = self._predict(af_inputs, self._data, self._labels,
                                      self.hyperparams['n_neighbors'], dist_type,
                                      weight_by_dist)
            af_ndarray_output = af_output.to_ndarray().astype('int32')
        except sklearn.exceptions.NotFittedError as error:
            raise PrimitiveNotFittedError("Primitive not fitted.") from error
        # For primitives that allow predicting without fitting like GaussianProcessRegressor
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        if sparse.issparse(af_ndarray_output):
            af_ndarray_output = af_ndarray_output.toarray()
        output = self._wrap_predictions(inputs, af_ndarray_output)
        output.columns = self._target_names
        output = [output]
    else:
        if self.hyperparams['error_on_no_input']:
            raise RuntimeError("No input columns were selected")
        self.logger.warn("No input columns were selected")

    outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                         add_index_columns=self.hyperparams['add_index_columns'],
                                         inputs=inputs, column_indices=self._target_column_indices,
                                         columns_list=output)
    return CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs, columns_to_use = self._get_columns_to_fit( inputs, self.hyperparams) output = [] if len(sk_inputs.columns): af_inputs = af.from_ndarray(sk_inputs.values.astype('float32')) # Normalize feature values if not self._max_feature_value_defined: self._max_feature_value = af.max(train_feats) af_inputs = af_inputs / self._max_feature_value af_output = self._predict(af_inputs, self._weights) ndarray_output = af_output.to_ndarray() output = self._wrap_predictions(inputs, ndarray_output) output.columns = self._target_names output = [output] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._target_column_indices, columns_list=output) return CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Returns: Container DataFrame after Truncated SVD. """ self._clf = trmf( lags=self.hyperparams['lags'], K=self.hyperparams['K'], lambda_f=self.hyperparams['lambda_f'], lambda_x=self.hyperparams['lambda_x'], lambda_w=self.hyperparams['lambda_w'], alpha=self.hyperparams['alpha'], eta=self.hyperparams['eta'], max_iter=self.hyperparams['max_iter'], F_step=self.hyperparams['F_step'], X_step=self.hyperparams['X_step'], W_step=self.hyperparams['W_step'], ) tmp = inputs.copy() for col in inputs.columns: tmp[col] = inputs[col] / inputs[col].max() self._inputs = tmp self._fitted = False # Get cols to fit. self._training_inputs, self._training_indices = self._get_columns_to_fit( self._inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._clf.get_X() if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # self._write(outputs) return CallResult(outputs)
def test_combine_columns_compact_metadata(self):
    main = container.DataFrame(
        {'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9], 'd1': [10, 11, 12], 'e1': [13, 14, 15]},
        {'top_level': 'main'},
        generate_metadata=False)
    main.metadata = main.metadata.generate(main, compact=True)
    main.metadata = main.metadata.update_column(0, {'name': 'aaa111'})
    main.metadata = main.metadata.update_column(1, {'name': 'bbb111', 'extra': 'b_column'})
    main.metadata = main.metadata.update_column(2, {'name': 'ccc111'})

    columns2 = container.DataFrame(
        {'a2': [21, 22, 23], 'b2': [24, 25, 26]},
        {'top_level': 'columns2'},
        generate_metadata=False)
    columns2.metadata = columns2.metadata.generate(columns2, compact=True)
    columns2.metadata = columns2.metadata.update_column(0, {'name': 'aaa222'})
    columns2.metadata = columns2.metadata.update_column(1, {'name': 'bbb222'})

    columns3 = container.DataFrame(
        {'a3': [31, 32, 33], 'b3': [34, 35, 36]},
        {'top_level': 'columns3'},
        generate_metadata=False)
    columns3.metadata = columns3.metadata.generate(columns3, compact=True)
    columns3.metadata = columns3.metadata.update_column(0, {'name': 'aaa333'})
    columns3.metadata = columns3.metadata.update_column(1, {'name': 'bbb333'})

    result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='append', add_index_columns=False)

    self.assertEqual(result.values.tolist(), [
        [1, 4, 7, 10, 13, 21, 24, 31, 34],
        [2, 5, 8, 11, 14, 22, 25, 32, 35],
        [3, 6, 9, 12, 15, 23, 26, 33, 36],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'main',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 9,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'aaa111',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'bbb111',
            'extra': 'b_column',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2],
        'metadata': {
            'name': 'ccc111',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 3],
        'metadata': {
            'name': 'd1',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 4],
        'metadata': {
            'name': 'e1',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 5],
        'metadata': {
            'name': 'aaa222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 6],
        'metadata': {
            'name': 'bbb222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 7],
        'metadata': {
            'name': 'aaa333',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 8],
        'metadata': {
            'name': 'bbb333',
            'structural_type': 'numpy.int64',
        },
    }])

    result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='new', add_index_columns=False)

    self.assertEqual(result.values.tolist(), [
        [21, 24, 31, 34],
        [22, 25, 32, 35],
        [23, 26, 33, 36],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'columns2',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 4,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'aaa222',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'bbb222',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2],
        'metadata': {
            'name': 'aaa333',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 3],
        'metadata': {
            'name': 'bbb333',
            'structural_type': 'numpy.int64',
        },
    }])

    result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='replace', add_index_columns=False)

    self.assertEqual(result.values.tolist(), [
        [1, 21, 24, 31, 34, 10, 13],
        [2, 22, 25, 32, 35, 11, 14],
        [3, 23, 26, 33, 36, 12, 15],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'main',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 7,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'aaa111',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'aaa222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2],
        'metadata': {
            'name': 'bbb222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 3],
        'metadata': {
            'name': 'aaa333',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 4],
        'metadata': {
            'name': 'bbb333',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 5],
        'metadata': {
            'name': 'd1',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 6],
        'metadata': {
            'name': 'e1',
            'structural_type': 'numpy.int64',
        },
    }])

    result = utils.combine_columns(main, [0, 1, 2, 3, 4], [columns2, columns3], return_result='replace', add_index_columns=False)

    self.assertEqual(result.values.tolist(), [
        [21, 24, 31, 34],
        [22, 25, 32, 35],
        [23, 26, 33, 36],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'main',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 4,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'aaa222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'bbb222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2],
        'metadata': {
            'name': 'aaa333',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 3],
        'metadata': {
            'name': 'bbb333',
            'structural_type': 'numpy.int64',
        },
    }])

    result = utils.combine_columns(main, [4], [columns2, columns3], return_result='replace', add_index_columns=False)

    self.assertEqual(result.values.tolist(), [
        [1, 4, 7, 10, 21, 24, 31, 34],
        [2, 5, 8, 11, 22, 25, 32, 35],
        [3, 6, 9, 12, 23, 26, 33, 36],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'main',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 8,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'aaa111',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'bbb111',
            'extra': 'b_column',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2],
        'metadata': {
            'name': 'ccc111',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 3],
        'metadata': {
            'name': 'd1',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 4],
        'metadata': {
            'structural_type': 'numpy.int64',
            'name': 'aaa222',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 5],
        'metadata': {
            'structural_type': 'numpy.int64',
            'name': 'bbb222',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 6],
        'metadata': {
            'structural_type': 'numpy.int64',
            'name': 'aaa333',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 7],
        'metadata': {
            'structural_type': 'numpy.int64',
            'name': 'bbb333',
        },
    }])

    result = utils.combine_columns(main, [0, 2, 4], [columns2, columns3], return_result='replace', add_index_columns=False)

    self.assertEqual(result.values.tolist(), [
        [21, 4, 24, 10, 31, 34],
        [22, 5, 25, 11, 32, 35],
        [23, 6, 26, 12, 33, 36],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'main',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 6,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'aaa222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'bbb111',
            'extra': 'b_column',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2],
        'metadata': {
            'name': 'bbb222',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 3],
        'metadata': {
            'name': 'd1',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 4],
        'metadata': {
            'name': 'aaa333',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 5],
        'metadata': {
            'name': 'bbb333',
            'structural_type': 'numpy.int64',
        },
    }])
def test_combine_columns_new_with_index_noncompact_metadata(self):
    main = container.DataFrame(
        {'d3mIndex': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]},
        columns=['d3mIndex', 'b1', 'c1'],
        generate_metadata=False)
    main.metadata = main.metadata.generate(main, compact=False)
    main.metadata = main.metadata.update_column(0, {
        'name': 'd3mIndex',
        'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'],
    })
    main.metadata = main.metadata.update_column(1, {
        'name': 'b1',
        'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'],
    })
    main.metadata = main.metadata.update_column(2, {
        'name': 'c1',
        'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'],
    })

    columns = container.DataFrame(
        {'d3mIndex': [1, 2, 3], 'b2': [4, 5, 6]},
        columns=['d3mIndex', 'b2'],
        generate_metadata=False)
    columns.metadata = columns.metadata.generate(columns, compact=False)
    columns.metadata = columns.metadata.update_column(0, {
        'name': 'd3mIndex',
        'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'],
    })
    columns.metadata = columns.metadata.update_column(1, {
        'name': 'b2',
        'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'],
    })

    result = utils.combine_columns(main, [], [columns], return_result='new', add_index_columns=True)

    self.assertEqual(result.values.tolist(), [
        [1, 4],
        [2, 5],
        [3, 6],
    ])

    self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'd3mIndex',
            'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'],
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'b2',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'],
            'structural_type': 'numpy.int64',
        },
    }])
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Inputs]: """ Add SIMON annotations Arguments: inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered (default: {None}) Raises: PrimitiveNotFittedError: if primitive not fit Returns: CallResult[Outputs] -- Input pd frame with metadata augmented """ if not self._is_fit: raise PrimitiveNotFittedError("Primitive not fitted.") ## BEGIN originally from from d3m.primitives.schema_discovery.profiler.Common """ assert self._add_semantic_types is not None assert self._remove_semantic_types is not None columns_to_use, output_columns = self._produce_columns( inputs, self._add_semantic_types, self._remove_semantic_types) if self.hyperparams['replace_index_columns'] and self.hyperparams[ 'return_result'] == 'append': assert len(columns_to_use) == len(output_columns) index_columns = inputs.metadata.get_index_columns() index_columns_to_use = [] other_columns_to_use = [] index_output_columns = [] other_output_columns = [] for column_to_use, output_column in zip(columns_to_use, output_columns): if column_to_use in index_columns: index_columns_to_use.append(column_to_use) index_output_columns.append(output_column) else: other_columns_to_use.append(column_to_use) other_output_columns.append(output_column) outputs = base_utils.combine_columns( inputs, index_columns_to_use, index_output_columns, return_result='replace', add_index_columns=self.hyperparams['add_index_columns']) outputs = base_utils.combine_columns( outputs, other_columns_to_use, other_output_columns, return_result='append', add_index_columns=self.hyperparams['add_index_columns']) else: outputs = base_utils.combine_columns( inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) ## EMD originally from from d3m.primitives.schema_discovery.profiler.Common """ return CallResult(outputs, has_finished=self._is_fit)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Args: inputs: Container DataFrame timeout: Default iterations: Default Returns: Container DataFrame containing Matrix Profile of selected columns """ # Get cols to fit. self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._clf.produce(sk_inputs) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) #print(outputs) #CallResult(outputs) #print("___") print(outputs.columns) #outputs.columns = [str(x) for x in outputs.columns] return CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: """ Args: inputs: Container DataFrame Returns: Container DataFrame added with binary version of a column a sort of one hot encoding of values under different columns named as "column name_category value" for all the columns passed in list while building the pipeline """ assert isinstance(inputs, container.DataFrame), type(dataframe) self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit( inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns if len(self._training_indices) > 0: # self._clf.fit(self._training_inputs) self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") if not self._fitted: raise PrimitiveNotFittedError("Primitive not fitted.") sk_inputs = inputs if self.hyperparams['use_semantic_types']: cols = [inputs.columns[x] for x in self._training_indices] sk_inputs = container.DataFrame( data=inputs.iloc[:, self._training_indices].values, columns=cols, generate_metadata=True) output_columns = [] if len(self._training_indices) > 0: sk_output = self._clf.produce(sk_inputs) # print("sk_ouput",sk_output) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) # if len(outputs.columns) == len(self._input_column_names): # outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # self._update_metadata(outputs) return base.CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Process the testing data. Args: inputs: Container DataFrame. Returns: Container DataFrame after BKFilter. """ # Get cols to fit. self._fitted = False self._training_inputs, self._training_indices = self._get_columns_to_fit( inputs, self.hyperparams) self._input_column_names = self._training_inputs.columns operated_col = [ int(x.strip('#')) for x in re.findall(r'#\d*#', self.hyperparams['rule']) ] if set(operated_col) != set(self._training_indices): # print(operated_col, self._training_indices) raise RuntimeError( "Column numbers in 'rule' and 'use_columns' are not matched.") if len(self._training_indices) > 0: self._fitted = True else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") # if not self._fitted: # raise PrimitiveNotFittedError("Primitive not fitted.") # sk_inputs = inputs # if self.hyperparams['use_semantic_types']: # sk_inputs = inputs.iloc[:, self._training_indices] output_columns = [] if len(self._training_indices) > 0: sk_output = self._rule_based_filter(inputs, self.hyperparams['rule']) if sparse.issparse(sk_output): sk_output = sk_output.toarray() outputs = self._wrap_predictions(inputs, sk_output) if len(outputs.columns) == len(self._input_column_names): outputs.columns = self._input_column_names output_columns = [outputs] else: if self.hyperparams['error_on_no_input']: raise RuntimeError("No input columns were selected") self.logger.warn("No input columns were selected") outputs = base_utils.combine_columns( return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], inputs=inputs, column_indices=self._training_indices, columns_list=output_columns) # self._write(outputs) # self.logger.warning('produce was called3') return CallResult(outputs)
def produce(
    self,
    *,
    inputs: base.FileReaderInputs,
    timeout: float = None,
    iterations: int = None,
) -> base_prim.CallResult[base.FileReaderOutputs]:
    logger.debug(f"Producing {__name__}")
    columns_to_use = self._get_columns(inputs.metadata)
    inputs_clone = inputs.copy()
    if len(columns_to_use) == 0:
        return base_prim.CallResult(inputs_clone)
    column_index = columns_to_use[0]

    band_column_indices = self._get_band_column(inputs.metadata)
    if len(band_column_indices) == 0:
        band_column_name = self.hyperparams["band_column"]
    else:
        band_column_name = inputs.columns[band_column_indices[0]]

    # need to flatten the dataframe, creating a list of files per tile
    grouping_column = self._get_grouping_key_column(inputs_clone)
    if grouping_column < 0:
        self.logger.warning("no columns to use for grouping key so returning loaded images as output")
        return base_prim.CallResult(inputs_clone)

    base_uri = inputs_clone.metadata.query((metadata_base.ALL_ELEMENTS, column_index))["location_base_uris"][0]

    grouping_name = inputs_clone.columns[grouping_column]
    file_column_name = inputs_clone.columns[column_index]

    start = time.time()
    logger.debug("Loading images")

    # group by grouping key to get all the images loaded in one row
    groups = inputs_clone.groupby([grouping_name], sort=False)

    # use the max dimension for the first group as the max dimension for all groups
    group_key = groups[grouping_name].first()[0]
    max_dimension = self._get_group_image_size(groups.get_group(group_key), file_column_name, band_column_name, base_uri)

    # load images for each group and store them in a matrix of [band, x, y]
    jobs = [
        delayed(self._load_image_group)(
            group[1][file_column_name],
            group[1][band_column_name],
            base_uri,
            max_dimension,
        )
        for group in tqdm(groups, total=len(groups))
    ]
    groups = Parallel(n_jobs=self.hyperparams["n_jobs"], backend="loky", verbose=10)(jobs)

    end = time.time()
    logger.debug(f"Loaded images in {end - start}s")

    logger.debug("Updating metadata")
    start = time.time()

    # auto-generate metadata for one row's worth of data - necessary to avoid having
    # the generation step traverse all of the data, which is extremely slow
    first_df = container.DataFrame({file_column_name: [groups[0]]}, generate_metadata=True).reset_index(drop=True)
    rest_df = container.DataFrame({file_column_name: groups[1:]})
    grouped_df = first_df.append(rest_df, ignore_index=True)
    grouped_df.metadata = grouped_df.metadata.update((), {"dimension": {"length": grouped_df.shape[0]}})
    grouped_df.metadata = grouped_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0), "http://schema.org/ImageObject")

    end = time.time()
    logger.debug(f"Updated metadata in {end - start}s")

    # only keep one row / group from the input - use the first band value to select against
    first_band = inputs_clone[band_column_name][0]
    first_groups = inputs_clone.loc[inputs_clone[band_column_name] == first_band].reset_index(drop=True)

    outputs = base_utils.combine_columns(
        first_groups,
        [column_index],
        [grouped_df],
        return_result=self.hyperparams["return_result"],
        add_index_columns=self.hyperparams["add_index_columns"],
    )

    if self.hyperparams["return_result"] == "append":
        outputs.metadata = self._reassign_boundaries(outputs.metadata, columns_to_use)

    outputs.metadata = outputs.metadata.update((), {"dimension": {"length": outputs.shape[0]}})

    polygon_columns = outputs.metadata.list_columns_with_semantic_types(
        ("https://metadata.datadrivendiscovery.org/types/LocationPolygon",))
    vector_columns = outputs.metadata.list_columns_with_semantic_types(
        ("https://metadata.datadrivendiscovery.org/types/FloatVector",))
    if len(vector_columns) > 0 and len(polygon_columns) == 0:
        outputs.metadata = outputs.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, vector_columns[0]),
            "https://metadata.datadrivendiscovery.org/types/LocationPolygon",
        )
    return base_prim.CallResult(outputs)