def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    primary_key_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"]
    )
    unfold_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=self.hyperparams["unfold_semantic_types"]
    )

    if not primary_key_cols:
        warnings.warn("Did not find primary key column for grouping. Will not unfold")
        return CallResult(inputs)

    if not unfold_cols:
        warnings.warn("Did not find any column to unfold. Will not unfold")
        return CallResult(inputs)

    primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
    unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

    if self.hyperparams["use_pipeline_id_semantic_type"]:
        pipeline_id_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"]
        )
        if len(pipeline_id_cols) >= 2:
            warnings.warn("Multiple pipeline id columns found. Will use the first.")
        if pipeline_id_cols:
            inputs = inputs.sort_values(
                primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
            self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
        else:
            warnings.warn(
                "No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'")

    new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

    # Group rows by primary key and collect each unfold column's values into a list per group
    groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
        lambda x: container.List(x)).reset_index(drop=False)

    ret_df = container.DataFrame(groupby_df)
    ret_df.metadata = new_df.metadata
    ret_df = self._update_metadata_dimension(df=ret_df)

    split_col_names = [inputs.columns[pos] for pos in unfold_cols]
    ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
    ret_df = common_utils.remove_columns(
        inputs=ret_df,
        column_indices=[ret_df.columns.get_loc(name) for name in split_col_names]
    )

    return CallResult(ret_df)
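# A minimal sketch of the unfold step above, using plain pandas on toy data. The column
# names ("d3mIndex", "prediction", "p1", "p2") are illustrative, not taken from the primitive.
import pandas as pd

toy = pd.DataFrame({
    "d3mIndex": [0, 0, 1, 1],            # primary key, repeated once per pipeline
    "pipeline_id": ["p1", "p2", "p1", "p2"],
    "prediction": [0.1, 0.2, 0.3, 0.4],
})
# Group by the primary key and collect each unfold column's values into a list,
# mirroring inputs.groupby(...)[...].aggregate(lambda x: container.List(x)) above.
grouped = toy.groupby("d3mIndex")["prediction"].agg(list).reset_index()
# _split_aggregated would then expand each list into one column per pipeline id:
split = pd.DataFrame(grouped["prediction"].tolist(),
                     columns=["prediction.p1", "prediction.p2"])
unfolded = pd.concat([grouped[["d3mIndex"]], split], axis=1)
print(unfolded)  # one row per d3mIndex, one column per pipeline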
def fit(self, *, timeout: float = None, iterations: int = None) -> None:
    if self._fitted:
        return

    if self._input_data is None:
        raise ValueError('Missing training(fitting) data.')

    # Look at attribute columns only
    data = self._input_data.copy()
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element])).sum() == data.shape[0]:
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural type numeric
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i]:
            self._empty_columns.append(i)

    _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))
    data = utils.remove_columns(data, self._empty_columns, source='ISI DSBox Data Encoder')

    categorical_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/OrdinalData",
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
    self._cat_columns = data.columns[self._cat_col_index].tolist()
    _logger.debug('Encoding columns: {}'.format(self._cat_columns))

    mapping = {}
    for column_name in self._cat_columns:
        col = data[column_name]
        temp = self._trim_features(col, self.hyperparams['n_limit'])
        if temp:
            mapping[temp[0]] = temp[1]
    self._mapping = mapping
    self._fitted = True
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, self._has_finished, self._iterations_done)

    assert isinstance(self._model, dict), "self._model type must be dict not defaultdict!"

    # Transform each fitted column with its per-column encoder; columns without an
    # encoder yield None and are dropped below
    temp = pd.DataFrame(inputs.iloc[:, self._s_cols].apply(
        lambda x: self._model[x.name].transform(x) if x.name in self._model else None))

    outputs = inputs.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    lookup = {
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for index in self._s_cols:
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["semantic_types"] = lookup["int"]
        old_metadata["structural_type"] = type(10)
        outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    # Remove the columns that appear in the produce data but were not in the fitted data
    drop_names = set(outputs.columns[self._s_cols]).difference(set(self._model.keys()))
    drop_indices = map(lambda a: outputs.columns.get_loc(a), drop_names)
    drop_indices = sorted(drop_indices)
    outputs = common_utils.remove_columns(outputs, drop_indices, source='ISI DSBox Data Labler')

    # Sanity check and report the results
    if outputs.shape[0] == inputs.shape[0] and \
            outputs.shape[1] == inputs.shape[1] - len(drop_names):
        self._has_finished = True
        self._iterations_done = True
        return CallResult(d3m_DataFrame(outputs), self._has_finished, self._iterations_done)
    else:
        return CallResult(inputs, self._has_finished, self._iterations_done)
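# A minimal sketch of the per-column labelling scheme assumed above: self._model is a
# plain dict mapping column name -> fitted encoder whose .transform maps category values
# to integers. sklearn's LabelEncoder is a stand-in here; the primitive's own encoders
# are not shown in this section.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "L"]})
model = {name: LabelEncoder().fit(train[name]) for name in train.columns}

test = pd.DataFrame({"color": ["blue", "red"], "size": ["M", "S"]})
encoded = test.apply(lambda col: model[col.name].transform(col) if col.name in model else col)
print(encoded)  # integer codes; columns absent from `model` would be dropped, as above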
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    columns_list_to_fold = self._mapping.get('foldable_columns', [])
    if len(columns_list_to_fold) == 0:
        return CallResult(inputs, True, 1)
    if inputs.shape[0] > 20000:
        return CallResult(inputs, True, 1)

    self._column_names = list(inputs) if inputs is not None else []
    df = None
    for columns_to_fold in columns_list_to_fold:
        df = self._fold_columns(inputs, columns_to_fold)

    cols_to_drop = list()
    for col_idx, col_name in enumerate(inputs.columns):
        if col_name not in df.columns:
            cols_to_drop.append(col_idx)

    inputs = utils.remove_columns(inputs, cols_to_drop)
    new_df = inputs[0:0]
    for col_name in new_df.columns:
        new_df.loc[:, col_name] = df.loc[:, col_name]

    extends = {}
    for col_name in df.columns:
        if col_name not in new_df.columns:
            extends[col_name] = df.loc[:, col_name].tolist()

    if extends:
        extends_df = d3m_DataFrame.from_dict(extends)
        extends_df.index = new_df.index.copy()
        new_df = utils.append_columns(new_df, extends_df)
        new_df = self._update_type(new_df, list(extends.keys()))

    old_metadata = dict(new_df.metadata.query(()))
    old_metadata["dimension"] = dict(old_metadata["dimension"])
    old_metadata["dimension"]["length"] = new_df.shape[0]
    new_df.metadata = new_df.metadata.update((), old_metadata)

    return CallResult(new_df, True, 1) if new_df is not None else CallResult(inputs, True, 1)
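# A minimal sketch of the column "folding" assumed above: wide per-year columns are
# folded into (variable, value) rows. pandas' melt stands in for _fold_columns, whose
# implementation is not shown here; all names are illustrative.
import pandas as pd

wide = pd.DataFrame({"country": ["A", "B"], "2010": [1, 3], "2011": [2, 4]})
long = wide.melt(id_vars=["country"], value_vars=["2010", "2011"],
                 var_name="year", value_name="value")
print(long)  # one row per (country, year) pair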
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Convert and output the input data in unary-encoded format, using the trained (fitted) encoder.
    Values unseen in training_inputs are rounded to the nearest value seen in training_inputs.
    Missing (NaN) cells in an encoded column produce a row of all-zero indicator columns
    for that column.
    """
    if not self._fitted:
        raise ValueError('Encoder model not fitted. Use fit()')

    # Return if there is nothing to encode
    if len(self._cat_columns) == 0:
        return CallResult(inputs, True, 1)

    if isinstance(inputs, pd.DataFrame):
        data = inputs.copy()
    else:
        data = inputs[0].copy()

    data = utils.remove_columns(data, self._empty_columns)
    set_columns = set(data.columns)
    if set_columns != self._all_columns:
        raise ValueError('Columns(features) fed at produce() differ from fitted data.')

    # Core part: encode the unary columns
    data_enc = data.iloc[:, self._cat_col_index].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))
    data_else = data.drop(self._mapping.keys(), axis=1)

    res = []
    for column_name in data_enc:
        col = data_enc[column_name]
        col.is_copy = False
        # Only apply the unary encoder when the column had fewer than 13 distinct
        # numerical values at fit time (see self._requirement)
        if self._requirement[column_name]:
            chg_v = lambda x: min(self._mapping[col.name], key=lambda a: abs(a - x)) if x is not None else x
            # Only encode the values which are not null
            col[col.notnull()] = col[col.notnull()].apply(chg_v)
            encoded = self.__encode_column(col)
            res.append(encoded)
        else:
            res.append(col)

    if self._text2int:
        texts = data_else.select_dtypes([object])
        le = Label_encoder()
        le.set_params(self._textmapping)
        data_else[texts.columns] = le.transform_pd(texts)

    # Transfer the encoded results to a DataFrame
    encoded = d3m_DataFrame(pd.concat(res, axis=1))

    # Update metadata for existing columns
    for index in range(len(encoded.columns)):
        old_metadata = dict(encoded.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["structural_type"] = int
        old_metadata["semantic_types"] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        encoded.metadata = encoded.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    # After extracting the target columns, remove them from the DataFrame
    data_else = utils.remove_columns(data, self._cat_col_index)
    result = utils.horizontal_concat(data_else, encoded)
    return CallResult(result, True, 1)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Requires training data from set_training_data first. The encoder records the
    specified columns to encode and the column values to unary-encode later in the
    produce step.
    """
    if self._fitted:
        return

    if self._training_inputs is None:
        raise ValueError('Missing training(fitting) data.')

    data = self._training_inputs.copy()
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural type numeric
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i]:
            self._empty_columns.append(i)

    self._empty_columns = list(set(self._empty_columns))
    self._empty_columns.reverse()
    self._empty_columns = container.List(self._empty_columns)
    data = utils.remove_columns(data, self._empty_columns)

    categorical_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        ]
    )
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
    )

    self._cat_col_index = container.List(set(all_attributes).intersection(numeric))
    self._cat_columns = container.List(data.columns[self._cat_col_index].tolist())

    numerical_values = data.iloc[:, self._cat_col_index].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))
    self._all_columns = set(data.columns)

    # Mapping: record the sorted unique values seen per column
    idict = {}
    for name in self._cat_columns:
        col = numerical_values[name]
        idict[name] = sorted(col.unique())
    self._mapping = idict

    if self._text2int:
        texts = data.drop(self._mapping.keys(), axis=1)
        texts = texts.select_dtypes(include=[object])
        le = Label_encoder()
        le.fit_pd(texts)
        self._textmapping = le.get_params()

    # Determine whether to run the unary encoder on each column
    data_enc = data.iloc[:, self._cat_col_index].apply(
        lambda col: pd.to_numeric(col, errors='coerce'))
    for column_name in data_enc:
        col = data_enc[column_name]
        col.is_copy = False
        # Only apply the unary encoder when the column has fewer than 13 distinct
        # numerical values
        self._requirement[column_name] = col.unique().shape[0] < 13

    self._fitted = True
    return CallResult(None, has_finished=True, iterations_done=1)
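# A minimal sketch of the unary ("thermometer") encoding the fit/produce pair above
# implements: produce snaps an unseen value to the nearest value recorded during fit,
# then emits one indicator per training value, set to 1 up to the snapped position.
# The thermometer layout is an assumption (the private __encode_column is not shown
# here), and the training values are illustrative.
import numpy as np

training_values = sorted([1, 3, 7])  # what fit() stores in self._mapping[col]

def unary_encode(x, values=training_values):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return [0] * len(values)                       # missing -> all-zero row
    nearest = min(values, key=lambda a: abs(a - x))    # snap to nearest seen value
    return [1 if v <= nearest else 0 for v in values]

print(unary_encode(6))     # -> [1, 1, 1]  (6 snaps to 7)
print(unary_encode(2.9))   # -> [1, 1, 0]  (2.9 snaps to 3)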
def _process_files(self, inputs: Input):
    fn_attributes = DataMetadata.list_columns_with_semantic_types(
        self=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
    fn_columns = list(set(all_attributes).intersection(fn_attributes))

    # If no file name columns are detected, default to regular behavior
    if len(fn_columns) == 0:
        return inputs

    # Create an empty DataFrame of the required size
    processed_cols = pd.DataFrame(
        "", index=copy.deepcopy(inputs.index),
        columns=['text_files_' + str(i) for i in range(len(fn_columns))])

    for column_index in fn_columns:
        curr_column = copy.deepcopy(inputs.iloc[:, column_index])
        file_loc = inputs.metadata.query((ALL_ELEMENTS, column_index))['location_base_uris']
        file_loc = file_loc[0]   # take the first element of the tuple
        file_loc = file_loc[7:]  # strip the 'file://' prefix

        for row_index in range(curr_column.shape[0]):
            text_file = curr_column.iloc[row_index]
            file_path = file_loc + text_file

            with open(file_path, 'rb') as file:
                doc = file.read()
            doc = "".join(map(chr, doc))
            doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(doc)  # list of strings

            processed_cols.iloc[row_index, fn_columns.index(column_index)] = " ".join(doc_tokens)

    # Construct metadata for the newly generated columns
    processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)
    for column_index in range(processed_cols.shape[1]):
        col_dict = dict(processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
        col_dict['structural_type'] = str
        # FIXME: assumes we apply corex only once per template, otherwise column names might duplicate
        col_dict['name'] = 'processed_file_' + str(inputs.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        processed_cols.metadata = processed_cols.metadata.update(
            (ALL_ELEMENTS, column_index), col_dict)

    # Concatenate the input with the newly created columns
    updated_inputs = utils.append_columns(inputs, processed_cols)

    # Remove the initial FileName columns from the df; doing this before concatenating
    # could trigger an empty-dataset error
    updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

    return updated_inputs
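# A minimal sketch of the per-file text extraction above: read raw bytes, map each byte
# to a character, and keep word tokens of two or more characters. The path is hypothetical.
import re

TOKEN_RE = re.compile(r"(?u)\b\w\w+\b")

def read_tokens(file_path: str) -> str:
    with open(file_path, "rb") as f:
        raw = f.read()
    text = "".join(map(chr, raw))  # byte-wise chr(), as in _process_files
    return " ".join(TOKEN_RE.findall(text))

# e.g. read_tokens("/path/to/media/doc_0.txt")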
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    # If corex didn't run for any reason, just return the given dataset
    if self.do_nothing:
        return CallResult(inputs, True, 1)

    inputs = self._process_files(inputs)

    if iterations is not None:
        self.max_iter = iterations
    else:
        self.max_iter = 250
    self.model.max_iter = self.max_iter

    # Concatenate the text columns row-wise
    concat_cols = None
    for column_index in self.text_columns:
        if concat_cols is not None:
            concat_cols = concat_cols.str.cat(inputs.iloc[:, column_index], sep=" ")
        else:
            concat_cols = copy.deepcopy(inputs.iloc[:, column_index])

    bow = self.bow.transform(map(self._get_ngrams, concat_cols.ravel()))

    # Choose between CorEx and the TfIdf matrix
    if bow.shape[1] > self.hyperparams['threshold']:
        # Use CorEx (wrap the returned array so the column relabeling below works)
        self.latent_factors = pd.DataFrame(self.model.transform(bow).astype(float))
    else:
        # Just use the bag-of-words representation
        self.latent_factors = pd.DataFrame(bow.todense())

    # Make the columns corex adds distinguishable from the other columns: remove the
    # selected columns from the input and add the latent factors given by corex
    out_df = d3m_DataFrame(inputs, generate_metadata=True)
    self.latent_factors.columns = [
        str(out_df.shape[-1] + i) for i in range(self.latent_factors.shape[-1])
    ]

    # Create metadata for the corex columns
    corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
    for column_index in range(corex_df.shape[1]):
        col_dict = dict(corex_df.metadata.query((ALL_ELEMENTS, column_index)))
        col_dict['structural_type'] = float
        # FIXME: assumes we apply corex only once per template, otherwise column names might duplicate
        col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        corex_df.metadata = corex_df.metadata.update((ALL_ELEMENTS, column_index), col_dict)

    # Concatenation is --VERY-- slow without this next line
    corex_df.index = out_df.index.copy()
    out_df = utils.append_columns(out_df, corex_df)

    # Remove the initial text columns from the df; doing this before CorEx can trigger
    # an empty-dataset error
    out_df = utils.remove_columns(out_df, self.text_columns)

    # TODO: incorporate timeout, max_iter
    return CallResult(out_df, True, 1)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Convert and output the input data in encoded format, using the trained (fitted) encoder.
    Note that the [colname]_other_ and [colname]_nan columns are always kept for
    one-hot encoded columns.
    """
    self._input_data_copy = inputs.copy()

    # Remove columns with all empty values
    _logger.debug('Removing entirely empty columns: {}'.format(
        self._input_data_copy.columns[self._empty_columns]))
    self._input_data_copy = utils.remove_columns(
        self._input_data_copy, self._empty_columns, source='ISI DSBox Data Encoder')

    # Return if there is nothing to encode
    if len(self._cat_columns) == 0:
        return CallResult(self._input_data_copy, True, 1)
    _logger.debug('Encoding columns: {}'.format(self._cat_columns))

    data_encode = self._input_data_copy[list(self._mapping.keys())]
    # Get rid of the false SettingWithCopyWarning
    data_encode.is_copy = None

    res = []
    for column_name in self._cat_columns:
        feature = data_encode[column_name].copy()
        other_ = lambda x: 'Other' if (x and x not in self._mapping[column_name]) else x
        nan_ = lambda x: x if x else np.nan

        feature.loc[feature.notnull()] = feature[feature.notnull()].apply(other_)
        feature = feature.apply(nan_)

        new_column_names = ['{}_{}'.format(column_name, i)
                            for i in self._mapping[column_name] + ['nan']]
        encoded = pd.get_dummies(feature, dummy_na=True, prefix=column_name)

        # Pad indicator columns that are missing from this batch with zeros
        missed = [name for name in new_column_names if name not in list(encoded.columns)]
        for m in missed:
            encoded[m] = 0
        encoded = encoded[new_column_names]
        res.append(encoded)

    # Drop the columns that will be encoded
    columns_names = self._input_data_copy.columns.tolist()
    drop_indices = [columns_names.index(col) for col in self._mapping.keys()]
    drop_indices = sorted(drop_indices)

    all_categorical = False
    try:
        self._input_data_copy = utils.remove_columns(
            self._input_data_copy, drop_indices, source='ISI DSBox Data Encoder')
    except ValueError:
        _logger.warning("All the attributes are categorical!")
        all_categorical = True

    encoded = d3m_DataFrame(pd.concat(res, axis=1))

    # Update metadata for existing columns
    for index in range(len(encoded.columns)):
        old_metadata = dict(encoded.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["structural_type"] = int
        old_metadata["semantic_types"] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        encoded.metadata = encoded.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)

    # Merge/concat both DataFrames
    if not all_categorical:
        output = utils.horizontal_concat(self._input_data_copy, encoded)
    else:
        output = encoded
    return CallResult(output, True, 1)
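# A minimal sketch of the one-hot scheme above, in plain pandas: values outside the
# fitted vocabulary collapse to 'Other', pd.get_dummies(dummy_na=True) adds the NaN
# indicator, and indicator columns absent from this batch are padded with zeros, so
# every fitted column always yields the same set of output columns. The vocabulary is
# illustrative (in the primitive it comes from self._mapping, here assumed to include
# the 'Other' bucket).
import pandas as pd

vocab = ["red", "blue", "Other"]
feature = pd.Series(["red", "green", None], name="color")
feature = feature.map(lambda x: x if pd.isnull(x) or x in vocab else "Other")
encoded = pd.get_dummies(feature, dummy_na=True, prefix="color")

expected = ["color_{}".format(v) for v in vocab + ["nan"]]
for name in expected:                  # pad indicators missing from this batch
    if name not in encoded.columns:
        encoded[name] = 0
print(encoded[expected])               # color_red, color_blue, color_Other, color_nan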
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    self._input_data_copy = inputs.copy()
    cols_to_drop = list()

    date_cols = self._mapping.get("date_columns")
    if date_cols:
        cols_to_drop += date_cols
        original_cols = self._get_cols(self._input_data_copy)
        dfo = DateFeaturizerOrg(dataframe=self._input_data_copy)
        df = dfo.featurize_date_columns(date_cols)
        current_cols = self._get_cols(df["df"])
        _logger.info(
            "Date Featurizer. 'created_columns': '%(created_columns)s'.",
            {
                'created_columns': str(list(set(current_cols).difference(original_cols))),
            },
        )
        self._input_data_copy = df["df"]

    phone_cols = self._mapping.get("phone_columns")
    if phone_cols:
        cols_to_drop += phone_cols.get("columns_to_perform", [])
        original_cols = self._get_cols(self._input_data_copy)
        df = PhoneParser.perform(df=self._input_data_copy, columns_perform=phone_cols)
        current_cols = self._get_cols(df)
        _logger.info(
            "Phone Featurizer. 'created_columns': '%(created_columns)s'.",
            {
                'created_columns': str(list(set(current_cols).difference(original_cols))),
            },
        )
        self._input_data_copy = df

    an_cols = self._mapping.get("alpha_numeric_columns")
    if an_cols:
        cols_to_drop += an_cols.get("columns_to_perform", [])
        original_cols = self._get_cols(self._input_data_copy)
        df = NumAlphaParser.perform(df=self._input_data_copy, columns_perform=an_cols)
        current_cols = self._get_cols(df)
        _logger.info(
            "NumAlpha Featurizer. 'created_columns': '%(created_columns)s'.",
            {
                'created_columns': str(list(set(current_cols).difference(original_cols))),
            },
        )
        self._input_data_copy = df

    punc_cols = self._mapping.get("punctuation_columns")
    if punc_cols:
        cols_to_drop += punc_cols.get("columns_to_perform", [])
        original_cols = self._get_cols(self._input_data_copy)
        df = PunctuationParser.perform(df=self._input_data_copy, columns_perform=punc_cols)
        current_cols = self._get_cols(df)
        _logger.info(
            "Punctuation Featurizer. 'created_columns': '%(created_columns)s'.",
            {
                'created_columns': str(list(set(current_cols).difference(original_cols))),
            },
        )
        self._input_data_copy = df

    if cols_to_drop:
        self._input_data_copy = common_utils.remove_columns(
            self._input_data_copy, list(set(cols_to_drop)))
    self._update_structural_type()

    return CallResult(self._input_data_copy, True, 1)