def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    self._fitted = True
    categorical_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
        ])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._s_cols = container.List(set(all_attributes).intersection(categorical_attributes))
    _logger.debug("%d categorical attributes found.", len(self._s_cols))

    if len(self._s_cols) > 0:
        # Record the unique non-null values of every categorical column.
        self._model = {}
        for col_index in self._s_cols:
            self._model[col_index] = self._training_data.iloc[:, col_index].dropna().unique()

    return CallResult(None, has_finished=True)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._input_data is None:
        raise ValueError('Missing training (fitting) data.')

    # Look at attribute columns only.
    data = self._input_data.copy()
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str.
    numeric = DataMetadata.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]
    self._empty_columns = []
    _logger.debug(f'Numeric columns: {numeric}')
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                _logger.debug(f'Empty numeric str column: {element}')
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural type numeric.
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i] and i not in self._empty_columns:
            _logger.debug(f'Empty numeric column: {i}')
            self._empty_columns.append(i)

    _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))
    data = container.DataFrame.remove_columns(data, self._empty_columns)

    categorical_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
        ])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
    self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
    self._cat_columns = data.columns[self._cat_col_index].tolist()
    _logger.debug('Encoding columns: {}'.format(self._cat_columns))

    # Build the value mapping for each categorical column, capped at n_limit values.
    mapping = {}
    for column_name in self._cat_columns:
        col = data[column_name]
        temp = self._trim_features(col, self.hyperparams['n_limit'])
        if temp:
            mapping[temp[0]] = temp[1]
    self._mapping = mapping
    self._fitted = True
    return CallResult(None, has_finished=True)
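# `_trim_features` is defined elsewhere in the source. Below is a minimal
# sketch of a plausible implementation, assuming it keeps at most `n_limit`
# of a column's most frequent category values and returns
# (column_name, kept_values), or None when the column has nothing usable;
# the name and behavior here are assumptions, not the source's actual code:
import pandas as pd

def _trim_features_sketch(col: pd.Series, n_limit: int):
    counts = col.dropna().value_counts()  # most frequent values first
    if counts.empty:
        return None
    return (col.name, counts.index[:n_limit].tolist())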
def __get_fitted(self):
    attribute = DataMetadata.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])

    # Mean for numerical columns.
    self._numeric_columns = DataMetadata.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['http://schema.org/Integer', 'http://schema.org/Float'])
    self._numeric_columns = [x for x in self._numeric_columns if x in attribute]
    _logger.debug('numeric columns %s', str(self._numeric_columns))

    # Convert the selected columns to numeric, compute each column mean,
    # and store the means as a dict keyed by column name.
    self.mean_values = self._train_x.iloc[:, self._numeric_columns].apply(
        lambda col: pd.to_numeric(col, errors='coerce')).mean(axis=0).to_dict()
    for name in self.mean_values.keys():
        if pd.isnull(self.mean_values[name]):
            self.mean_values[name] = 0.0

    # Mode for categorical columns.
    self._categoric_columns = DataMetadata.list_columns_with_semantic_types(
        self._train_x.metadata,
        [
            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
            'http://schema.org/Boolean',
        ])
    self._categoric_columns = [x for x in self._categoric_columns if x in attribute]
    _logger.debug('categorical columns %s', str(self._categoric_columns))

    mode_values = self._train_x.iloc[:, self._categoric_columns].mode(axis=0).iloc[0].to_dict()
    for name in mode_values.keys():
        if pd.isnull(mode_values[name]):  # the mode itself is NaN
            rest = self._train_x[name].dropna()
            if rest.shape[0] == 0:  # every value is NaN
                mode = 0
            else:
                mode = rest.mode().iloc[0]
            mode_values[name] = mode
    self.mean_values.update(mode_values)

    if self._verbose:
        import pprint
        print('mean imputation:')
        pprint.pprint(self.mean_values)

    _logger.debug('Mean values:')
    for name, value in self.mean_values.items():
        _logger.debug(' %s %s', name, str(value))
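# A minimal standalone sketch of the fill-value computation above, using plain
# pandas; the column names and data are hypothetical:
import pandas as pd

df = pd.DataFrame({
    'age': ['25', '30', None, 'bad'],       # numeric values stored as str
    'color': ['red', None, 'red', 'blue'],  # categorical
})

# Mean of the coerced numeric column; if the mean is NaN, fall back to 0.0.
mean_age = pd.to_numeric(df['age'], errors='coerce').mean()
mean_age = 0.0 if pd.isnull(mean_age) else mean_age

# Mode of the categorical column; if the column is all-NaN, fall back to 0.
non_null = df['color'].dropna()
mode_color = non_null.mode().iloc[0] if non_null.shape[0] > 0 else 0

fill_values = {'age': mean_age, 'color': mode_color}
print(fill_values)  # {'age': 27.5, 'color': 'red'}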
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    numerical_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=["http://schema.org/Float", "http://schema.org/Integer"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._s_cols = list(set(all_attributes).intersection(numerical_attributes))
    _logger.debug("%d columns to be scaled.", len(self._s_cols))

    if len(self._s_cols) > 0:
        self._model.fit(self._training_data.iloc[:, self._s_cols])
        self._fitted = True
    else:
        self._fitted = False
    return CallResult(None, has_finished=True, iterations_done=1)
def set_training_data(self, *, inputs: Input) -> None:
    """
    Sets the training data of this primitive.

    Parameters
    ----------
    inputs : Input
        The inputs.
    """
    attribute = DataMetadata.list_columns_with_semantic_types(
        inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])

    # Count missing values across all attribute columns. For non-object
    # columns isnull() is enough; object columns may also contain empty
    # strings, which count as missing here.
    nan_sum = 0
    for col in attribute:
        if str(inputs.dtypes[inputs.columns[col]]) != "object":
            nan_sum += inputs.iloc[:, col].isnull().sum()
        else:
            for i in range(inputs.shape[0]):
                if inputs.iloc[i, col] == "" or pd.isnull(inputs.iloc[i, col]):
                    nan_sum += 1

    if nan_sum == 0:  # no missing value exists
        if self._verbose:
            _logger.info('no missing value in train dataset')

    self._train_x = inputs
    self._is_fitted = False
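# A minimal sketch of the missing-value count above on a plain DataFrame,
# replacing the per-cell loop with a vectorized check; the data is
# hypothetical:
import pandas as pd

df = pd.DataFrame({'a': [1.0, None, 3.0], 'b': ['x', '', None]})

nan_sum = 0
for col in df.columns:
    if str(df[col].dtype) != "object":
        nan_sum += df[col].isnull().sum()
    else:
        # Object columns: empty strings count as missing too.
        nan_sum += (df[col].isnull() | (df[col] == "")).sum()

print(nan_sum)  # 3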
def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                          res_id: str) -> typing.Optional[int]:
    indices = inputs_metadata.list_columns_with_semantic_types(
        cls._semantic_types, at=(res_id,))
    for i in indices:
        if cls._is_csv_file_column(inputs_metadata, res_id, i):
            return i
    return None
def _get_date_cols(data):
    dates = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Time"])
    return dates
def _get_floatvector_column(self, inputs_metadata: metadata_base.DataMetadata):
    fv_column = self.hyperparams["column"]
    if fv_column:
        return fv_column
    fv_columns = inputs_metadata.list_columns_with_semantic_types(self._floatvector_semantic)
    if len(fv_columns) > 0:
        return fv_columns[0]
    logger.warning(
        "inputs contain no specified FloatVector column and no column with the FloatVector semantic type")
    return None
def _split_column(self, inputs):
    """
    Inner function to sample part of the columns of the input dataset.
    """
    input_dataset_shape = inputs[self._main_resource_id].shape

    # Find the target columns; we must not drop these when sampling.
    target_column = DataMetadata.list_columns_with_semantic_types(
        self._training_inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
        at=(self._main_resource_id,))
    if not target_column:
        self._logger.warning("No target column found in the input dataset.")

    index_column = DataMetadata.get_index_columns(
        self._training_inputs.metadata, at=(self._main_resource_id,))
    if not index_column:
        self._logger.warning("No index column found in the input dataset.")

    outputs = copy.copy(inputs)
    if self._status is Status.TRAIN:
        # Check the number of attribute columns only; we only need to sample
        # when it exceeds the threshold.
        attribute_column_length = (input_dataset_shape[1] - len(index_column) - len(target_column))
        if attribute_column_length > self._threshold_column_length:
            attribute_column = set(range(input_dataset_shape[1]))
            for each_target_column in target_column:
                attribute_column.remove(each_target_column)
            for each_index_column in index_column:
                attribute_column.remove(each_index_column)

            # Randomly sample the column indices to keep and sort them
            # (random.sample requires a sequence, not a set).
            self._column_remained = random.sample(
                sorted(attribute_column), self._threshold_column_length)
            self._column_remained.extend(target_column)
            self._column_remained.extend(index_column)
            self._column_remained.sort()

    if len(self._column_remained) > 0:
        # Just to make sure.
        outputs.metadata = copy.deepcopy(inputs.metadata)
        outputs[self._main_resource_id] = inputs[self._main_resource_id].iloc[:, self._column_remained]
        outputs.metadata = self._select_columns_metadata(
            outputs.metadata, self._main_resource_id, self._column_remained)

    return outputs
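# A minimal standalone sketch of the column-sampling step above; the column
# counts and threshold are hypothetical:
import random

n_columns = 10
index_column = [0]
target_column = [9]
threshold_column_length = 4

attribute_column = set(range(n_columns)) - set(index_column) - set(target_column)

# Sample the attribute columns to keep, re-attach the index and target
# columns, and sort so the original column order is preserved.
column_remained = random.sample(sorted(attribute_column), threshold_column_length)
column_remained.extend(target_column)
column_remained.extend(index_column)
column_remained.sort()
print(column_remained)  # e.g. [0, 2, 3, 5, 8, 9]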
def _process_files(self, inputs: Input):
    fn_attributes = DataMetadata.list_columns_with_semantic_types(
        self=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
    fn_columns = list(set(all_attributes).intersection(fn_attributes))

    # If no file name columns are detected, default to regular behavior.
    if len(fn_columns) == 0:
        return inputs

    # Create an empty DataFrame of the required size.
    processed_cols = pd.DataFrame(
        "",
        index=copy.deepcopy(inputs.index),
        columns=['text_files_' + str(i) for i in range(len(fn_columns))])

    for column_index in fn_columns:
        curr_column = copy.deepcopy(inputs.iloc[:, column_index])
        file_loc = inputs.metadata.query((ALL_ELEMENTS, column_index))['location_base_uris']
        file_loc = file_loc[0]   # take the first element of the tuple
        file_loc = file_loc[7:]  # strip the 'file://' prefix

        for row_index in range(curr_column.shape[0]):
            text_file = curr_column.iloc[row_index]
            file_path = file_loc + text_file

            with open(file_path, 'rb') as file:
                doc = file.read()
            doc = "".join(map(chr, doc))
            doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(doc)  # list of strings

            processed_cols.iloc[row_index, fn_columns.index(column_index)] = " ".join(doc_tokens)

    # Construct metadata for the newly generated columns.
    processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)
    for column_index in range(processed_cols.shape[1]):
        col_dict = dict(processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
        col_dict['structural_type'] = str
        # FIXME: assumes corex is applied only once per template, otherwise column names might duplicate
        col_dict['name'] = 'processed_file_' + str(inputs.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        processed_cols.metadata = processed_cols.metadata.update(
            (ALL_ELEMENTS, column_index), col_dict)

    # Concatenate the input with the newly created columns.
    updated_inputs = utils.append_columns(inputs, processed_cols)

    # Remove the initial FileName columns from the DataFrame. Doing this
    # before concatenating could raise an empty-dataset error.
    updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

    return updated_inputs
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    # If already fitted, do nothing.
    if self.fitted:
        return CallResult(None, True, 1)

    self.training_data = self._process_files(self.training_data)

    text_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self.training_data.metadata,
        semantic_types=["http://schema.org/Text"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self.training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
    categorical_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self.training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/CategoricalData"])

    # We want text columns that are attributes ...
    self.text_columns = set(all_attributes).intersection(text_attributes)
    # ... but not ones that are categorical ...
    self.text_columns = set(self.text_columns) - set(categorical_attributes)
    # ... and we want them as a list.
    self.text_columns = list(self.text_columns)

    # If no text columns are present, don't do anything.
    self.do_nothing = False
    if len(self.text_columns) == 0:
        self.fitted = True
        self.model = None
        self.bow = None
        self.do_nothing = True
        self.text_columns = None
        self.latent_factors = None
        self.max_iter = None
        return CallResult(None, True, 1)

    # Set the number of iterations (for the wrapper and the underlying CorEx model).
    if iterations is not None:
        self.max_iter = iterations
    else:
        self.max_iter = 250

    # Instantiate a CorEx model and a bag-of-words model.
    self.model = Corex(n_hidden=self.hyperparams['n_hidden'],
                       max_iter=self.max_iter,
                       seed=self.random_seed)
    self.bow = TfidfVectorizer(decode_error='ignore',
                               max_df=self.hyperparams['max_df'],
                               min_df=self.hyperparams['min_df'])

    # Concatenate the text columns row-wise.
    concat_cols = None
    for column_index in self.text_columns:
        if concat_cols is not None:
            concat_cols = concat_cols.str.cat(self.training_data.iloc[:, column_index], sep=" ")
        else:
            concat_cols = copy.deepcopy(self.training_data.iloc[:, column_index])

    try:
        bow = self.bow.fit_transform(map(self._get_ngrams, concat_cols.ravel()))
    except ValueError:
        # min_df may prune away every term; retry with min_df=0.
        print("[WARNING] Setting min_df to 0 to avoid ValueError")
        self.bow = TfidfVectorizer(decode_error='ignore',
                                   max_df=self.hyperparams['max_df'],
                                   min_df=0)
        bow = self.bow.fit_transform(map(self._get_ngrams, concat_cols.ravel()))

    # Choose between CorEx and the TF-IDF matrix.
    if bow.shape[1] > self.hyperparams['threshold']:
        # Use CorEx.
        self.latent_factors = self.model.fit_transform(bow)
    else:
        # Just use the bag-of-words representation.
        self.latent_factors = pd.DataFrame(bow.todense())

    self.fitted = True
    return CallResult(None, True, 1)
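# A minimal sketch of the TF-IDF-versus-CorEx decision above, using only
# scikit-learn and pandas; the documents and threshold are hypothetical, and
# the CorEx branch is elided since it needs the corextext package:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the quick brown fox", "lazy dogs sleep all day", "quick brown dogs"]
bow = TfidfVectorizer(decode_error='ignore').fit_transform(docs)

threshold = 100
if bow.shape[1] > threshold:
    pass  # vocabulary is large: reduce it to CorEx latent factors
else:
    # Vocabulary is small: keep the dense TF-IDF matrix as the features.
    latent_factors = pd.DataFrame(bow.todense())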
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Train the imputation parameters. Now supports:
        -> greedySearch

    For the methods that are not trainable, do nothing:
        -> iterative regression
        -> other

    Parameters:
    ----------
    data: pandas dataframe
    label: pandas series, used for the trainable methods
    """
    # If already fitted on the current dataset, do nothing.
    if self._is_fitted:
        return CallResult(None, self._has_finished, self._iterations_done)

    if timeout is None:
        timeout = 2**31 - 1
    if iterations is None:
        self._iterations_done = True
        iterations = 30

    # Set up the timeout.
    with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
        assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

        data = self._train_x.copy()

        # Start fitting.
        if self._verbose:
            _logger.info("=========> iteratively regress method:")

        attribute = DataMetadata.list_columns_with_semantic_types(
            data.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        numeric = DataMetadata.list_columns_with_semantic_types(
            data.metadata,
            ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in attribute]

        numeric_data = data.iloc[:, numeric].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))
        data_clean, self._best_imputation = self.__iterativeRegress(numeric_data, iterations)
        self._numeric_column_indices = numeric

    if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
        self._is_fitted = True
        self._iterations_done = True
        self._has_finished = True
    elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
        self._is_fitted = False
        self._iterations_done = False
        self._has_finished = False
    return CallResult(None, self._has_finished, self._iterations_done)
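# `__iterativeRegress` is defined elsewhere in the source. Below is a minimal
# sketch of the general idea, assuming missing cells are initialized with
# column means and each incomplete column is then repeatedly regressed on the
# others; it also assumes every column has at least one observed value. This
# is an illustration, not the source's implementation:
import pandas as pd
from sklearn.linear_model import LinearRegression

def iterative_regress_sketch(df: pd.DataFrame, iterations: int) -> pd.DataFrame:
    data = df.copy()
    missing = df.isnull()
    data = data.fillna(data.mean())  # initial guess: column means
    for _ in range(iterations):
        for col in df.columns[missing.any()]:
            rows = missing[col]  # rows whose value was originally missing
            others = data.drop(columns=[col])
            model = LinearRegression().fit(others[~rows], data.loc[~rows, col])
            data.loc[rows, col] = model.predict(others[rows])
    return data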
def _get_band_column(self, inputs_metadata: metadata_base.DataMetadata):
    return inputs_metadata.list_columns_with_semantic_types(self._band_semantic_types)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Precondition: run fit() before calling produce().

    Parameters:
    ----------
    data: pandas dataframe
    """
    if not self._is_fitted:
        # TODO: specify a NotFittedError, like in sklearn
        raise ValueError("Calling produce before fitting.")

    if timeout is None:
        timeout = 2**31 - 1

    if isinstance(inputs, pd.DataFrame):
        data = inputs.copy()
    else:
        data = inputs[0].copy()

    # Set up the timeout.
    with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
        assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

        # Start completing the data.
        if self._verbose:
            print("=========> impute by mean value of the attribute:")

        data.iloc[:, self._numeric_columns] = data.iloc[:, self._numeric_columns].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))

        # Assume the features of the testing data are the same as in the
        # training data; imputing with mean_values alone should therefore
        # yield a clean dataset.
        attribute = DataMetadata.list_columns_with_semantic_types(
            data.metadata,
            ['https://metadata.datadrivendiscovery.org/types/Attribute'])
        for col in attribute:
            if str(inputs.dtypes[inputs.columns[col]]) != "object":
                if data.iloc[:, col].isnull().sum() != 0:
                    data.iloc[:, col] = data.iloc[:, col].fillna(
                        self.mean_values[data.columns[col]])
            else:
                for i in range(data.shape[0]):
                    if data.iloc[i, col] == "" or pd.isnull(data.iloc[i, col]):
                        data.iloc[i, col] = self.mean_values[data.columns[col]]
        data_clean = data

        # Update metadata with the (possibly changed) structural types.
        for col in self._numeric_columns:
            old_metadata = dict(data_clean.metadata.query((mbase.ALL_ELEMENTS, col)))
            dtype = data_clean.iloc[:, col].dtype
            if str(dtype).lower().startswith("int"):
                if "http://schema.org/Integer" not in old_metadata['semantic_types']:
                    old_metadata['semantic_types'] += ("http://schema.org/Integer",)
                old_metadata["structural_type"] = int
            elif str(dtype).lower().startswith("float"):
                if "http://schema.org/Float" not in old_metadata['semantic_types']:
                    old_metadata['semantic_types'] += ("http://schema.org/Float",)
                old_metadata["structural_type"] = float
            data_clean.metadata = data_clean.metadata.update((mbase.ALL_ELEMENTS, col), old_metadata)

    value = None
    if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
        self._has_finished = True
        self._iterations_done = True
        value = data_clean
    elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
        _logger.warning('Produce timed out')
        self._has_finished = False
        self._iterations_done = False
    return CallResult(value, self._has_finished, self._iterations_done)
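# A minimal standalone sketch of the produce-time fill step above: impute each
# column from a previously computed dict of fill values; the data and the dict
# contents are hypothetical:
import numpy as np
import pandas as pd

mean_values = {'age': 27.5, 'color': 'red'}
df = pd.DataFrame({'age': [None, 40.0], 'color': ['', 'blue']})

for col in df.columns:
    if str(df[col].dtype) != "object":
        df[col] = df[col].fillna(mean_values[col])
    else:
        # Object columns: treat empty strings as missing too.
        df[col] = df[col].replace("", np.nan).fillna(mean_values[col])

print(df)  # age: [27.5, 40.0], color: ['red', 'blue']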