def _split_aggregated(self, df: container.DataFrame,
                      split_col_names: list) -> container.DataFrame:
    # each aggregated column holds a list per row; use the first row to
    # determine how many columns each list expands into
    lengths = [len(df.loc[0, col_name]) for col_name in split_col_names]
    for idx, col_name in enumerate(split_col_names):
        if self._sorted_pipe_ids:
            if len(self._sorted_pipe_ids) == lengths[idx]:
                extend_col_names = [
                    "{}_{}".format(col_name, i) for i in self._sorted_pipe_ids
                ]
            else:
                raise ValueError(
                    "Unique number of pipeline ids not equal to the number of aggregated values"
                )
        else:
            extend_col_names = [
                "{}_{}".format(col_name, i) for i in range(lengths[idx])
            ]

        # expand the list-valued column into one column per element and
        # concatenate the new columns onto the frame
        extends = container.DataFrame(df.loc[:, col_name].values.tolist(),
                                      columns=extend_col_names)
        df = common_utils.horizontal_concat(left=df, right=extends)

        # copy the source column's metadata onto each new column, updating
        # only the column name
        origin_metadata = dict(
            df.metadata.query(
                (mbase.ALL_ELEMENTS, df.columns.get_loc(col_name))))
        for name in extend_col_names:
            col_idx = df.columns.get_loc(name)
            origin_metadata["name"] = name
            df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, col_idx),
                                             origin_metadata)
    return df
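# Illustrative sketch, not part of the primitive: a pandas-only version of the
# list-column expansion above, with plain pandas standing in for
# container.DataFrame and common_utils.horizontal_concat. The helper name and
# toy data below are hypothetical.
def _demo_split_aggregated() -> "pd.DataFrame":
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "scores": [[0.1, 0.9], [0.4, 0.6]]})
    # expand each row's list into one column per element
    expanded = pd.DataFrame(df["scores"].tolist(),
                            columns=["scores_0", "scores_1"],
                            index=df.index)
    # horizontally concatenate, mirroring common_utils.horizontal_concat
    return pd.concat([df, expanded], axis=1)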
@staticmethod  # takes no self parameter, so expose it as a staticmethod
def _update_metadata_dimension(
        df: container.DataFrame) -> container.DataFrame:
    # refresh the row count recorded in the top-level metadata so it matches
    # the actual shape of the (possibly filtered) frame
    old_metadata = dict(df.metadata.query(()))
    old_metadata["dimension"] = dict(old_metadata["dimension"])
    old_metadata["dimension"]["length"] = df.shape[0]
    df.metadata = df.metadata.update((), old_metadata)
    return df
def _update_type_info(self, semantic_types: Sequence[str],
                      outputs: container.DataFrame,
                      i: int) -> container.DataFrame:
    # update the structural / df type from the semantic type
    if "http://schema.org/Integer" in semantic_types:
        outputs.metadata = outputs.metadata.update_column(
            i, {"structural_type": int})
        outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
    elif "http://schema.org/Float" in semantic_types:
        outputs.metadata = outputs.metadata.update_column(
            i, {"structural_type": float})
        outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
    elif "http://schema.org/Boolean" in semantic_types:
        outputs.metadata = outputs.metadata.update_column(
            i, {"structural_type": bool})
        outputs.iloc[:, i] = outputs.iloc[:, i].astype("bool")
    return outputs
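# Illustrative sketch, not part of the primitive: the same semantic-type to
# structural-type mapping as _update_type_info, applied to a plain pandas
# frame without D3M metadata. The standalone helper name is hypothetical.
def _demo_coerce_by_semantic_type(df: "pd.DataFrame", col: str,
                                  semantic_types: "Sequence[str]") -> "pd.DataFrame":
    import pandas as pd

    if ("http://schema.org/Integer" in semantic_types
            or "http://schema.org/Float" in semantic_types):
        df[col] = pd.to_numeric(df[col])
    elif "http://schema.org/Boolean" in semantic_types:
        df[col] = df[col].astype("bool")
    return df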
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    start = time.time()
    logger.debug(f"Producing {__name__}")

    cols = self._get_columns(inputs.metadata)

    # outputs = container.DataFrame(generate_metadata=False)
    outputs = [None] * inputs.shape[1]

    parsing_semantics = self.hyperparams["parsing_semantics"]

    def fromstring(x: str) -> np.ndarray:
        # if the value isn't a string, pass it through assuming it doesn't need to be parsed
        if type(x) is not str:
            return x
        return np.fromstring(x, dtype=float, sep=",")

    for col_index in range(len(inputs.columns)):
        if col_index in cols:
            column_metadata = inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, col_index)
            )
            semantic_types = column_metadata.get("semantic_types", [])
            desired_semantics = set(semantic_types).intersection(parsing_semantics)
            if desired_semantics:
                if (
                    "https://metadata.datadrivendiscovery.org/types/FloatVector"
                    in desired_semantics
                ):
                    outputs[col_index] = inputs.iloc[:, col_index].apply(
                        fromstring, convert_dtype=False
                    )
                    if outputs[col_index].shape[0] > 0:
                        inputs.metadata = inputs.metadata.update_column(
                            col_index,
                            {"structural_type": type(outputs[col_index][0])},
                        )
                elif "http://schema.org/DateTime" in desired_semantics:
                    outputs[col_index] = inputs.iloc[:, col_index].apply(
                        utils.parse_datetime_to_float,
                        fuzzy=self.hyperparams["fuzzy_time_parsing"],
                        convert_dtype=False,
                    )
                    inputs.metadata = inputs.metadata.update_column(
                        col_index, {"structural_type": float}
                    )
                elif (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                    in desired_semantics
                ):
                    # if a categorical column holds numeric strings, convert it to numbers
                    if inputs[inputs.columns[col_index]][0].isnumeric():
                        outputs[col_index] = pd.to_numeric(
                            inputs.iloc[:, col_index],
                            errors=self.hyperparams["error_handling"],
                        )
                        if outputs[col_index].shape[0] > 0:
                            updated_type = type(outputs[col_index][0].item())
                            inputs.metadata = inputs.metadata.update_column(
                                col_index, {"structural_type": updated_type}
                            )
                    else:
                        # if it's categorical but not numeric, keep the strings as-is
                        outputs[col_index] = inputs.iloc[:, col_index]
                else:
                    outputs[col_index] = pd.to_numeric(
                        inputs.iloc[:, col_index],
                        errors=self.hyperparams["error_handling"],
                    )
                    # Update the structural type to reflect the result of the to_numeric
                    # call. We can't rely on the semantic type because error coercion
                    # may turn a column into floats due to the presence of NaN.
                    if outputs[col_index].shape[0] > 0:
                        updated_type = type(outputs[col_index][0].item())
                        inputs.metadata = inputs.metadata.update_column(
                            col_index, {"structural_type": updated_type}
                        )
            else:
                # columns without the desired semantics still need to be concatenated
                outputs[col_index] = inputs.iloc[:, col_index]
        else:
            # columns not specified still need to be concatenated
            outputs[col_index] = inputs.iloc[:, col_index]

    outputs = container.DataFrame(pd.concat(outputs, axis=1))
    outputs.metadata = inputs.metadata

    end = time.time()
    logger.debug(f"Produce {__name__} completed in {end - start} seconds")

    return base.CallResult(outputs)
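# Illustrative sketch, not part of the primitive: how the FloatVector branch
# above turns a comma-separated string into a numpy vector via np.fromstring
# in text mode, with non-strings passed through untouched. The helper name and
# toy data are hypothetical.
def _demo_parse_float_vector() -> "pd.Series":
    import numpy as np
    import pandas as pd

    def fromstring(x):
        if type(x) is not str:
            return x
        return np.fromstring(x, dtype=float, sep=",")

    col = pd.Series(["1.0,2.0,3.0", "4.5,5.5,6.5"])
    return col.apply(fromstring)  # each element becomes an np.ndarray of floats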
def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None) -> base.CallResult[container.DataFrame]:
    cols = ["idx", "name", "rank"]

    # Make sure the target column is of a valid type and return no ranked features if it isn't.
    target_idx = self.hyperparams["target_col_index"]
    if not self._can_use_column(inputs.metadata, target_idx):
        return base.CallResult(container.DataFrame(data={}, columns=cols))

    # check if the target is discrete or continuous
    semantic_types = inputs.metadata.query_column(target_idx)["semantic_types"]
    discrete = len(set(semantic_types).intersection(self._discrete_types)) > 0

    # make a copy of the inputs and clean out any missing data
    feature_df = inputs.copy()
    if self.hyperparams["sub_sample"]:
        sub_sample_size = (self.hyperparams["sub_sample_size"]
                           if self.hyperparams["sub_sample_size"] < inputs.shape[0]
                           else inputs.shape[0])
        rows = random.sample_without_replacement(inputs.shape[0], sub_sample_size)
        feature_df = feature_df.iloc[rows, :]

    # if an entire column is NA, remove the column rather than dropping ALL rows
    cols_to_drop = feature_df.columns[feature_df.isna().sum() == feature_df.shape[0]]
    feature_df.drop(columns=cols_to_drop, inplace=True)
    feature_df.dropna(inplace=True)

    # split out the target feature
    target_df = feature_df.iloc[:, feature_df.columns.get_loc(
        inputs.columns[target_idx])]

    # drop features that are not compatible with ranking
    feature_indices = set(
        inputs.metadata.list_columns_with_semantic_types(self._semantic_types))
    role_indices = set(
        inputs.metadata.list_columns_with_semantic_types(self._roles))
    feature_indices = feature_indices.intersection(role_indices)
    # discard() rather than remove() so a target that isn't itself a rankable
    # feature doesn't raise a KeyError
    feature_indices.discard(target_idx)

    for categ_ind in inputs.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/CategoricalData",)):
        if categ_ind in feature_indices:
            if np.unique(inputs[inputs.columns[categ_ind]]).shape[0] == inputs.shape[0]:
                # drop categorical columns where every value is unique - they carry no signal
                feature_indices.remove(categ_ind)
            elif (inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, categ_ind))["structural_type"] == str):
                feature_df[inputs.columns[categ_ind]] = pd.to_numeric(
                    feature_df[inputs.columns[categ_ind]])

    # vectorize text columns with TF-IDF so they can participate in ranking
    text_indices = inputs.metadata.list_columns_with_semantic_types(
        self._text_semantic)
    tfv = TfidfVectorizer(max_features=20)
    column_to_text_features = {}
    text_feature_indices = []
    for text_index in text_indices:
        if (text_index not in feature_indices
                and text_index in role_indices
                and text_index != target_idx):
            word_features = tfv.fit_transform(
                feature_df[inputs.columns[text_index]])
            if issparse(word_features):
                column_to_text_features[inputs.columns[text_index]] = \
                    pd.DataFrame.sparse.from_spmatrix(word_features)
            else:
                column_to_text_features[inputs.columns[text_index]] = word_features
            text_feature_indices.append(text_index)
    text_feature_indices = set(text_feature_indices)

    # return an empty result if all features were incompatible
    numeric_features = len(feature_indices) > 0
    if not numeric_features and len(column_to_text_features) == 0:
        return base.CallResult(container.DataFrame(data={}, columns=cols))

    all_indices = set(range(0, inputs.shape[1]))
    skipped_indices = all_indices.difference(
        feature_indices.union(text_feature_indices))

    # remove columns that were dropped
    feature_indices = feature_indices - set(
        [inputs.columns.get_loc(c) for c in cols_to_drop])
    for v in skipped_indices:
        feature_df.drop(inputs.columns[v], axis=1, inplace=True)

    # figure out the discrete and continuous feature indices and create an array
    # that flags them
    feature_columns = inputs.columns[list(feature_indices)]
    numeric_data = feature_df[feature_columns]
    discrete_indices = inputs.metadata.list_columns_with_semantic_types(
        self._discrete_types)
    discrete_flags = [False] * numeric_data.shape[1]
    for v in discrete_indices:
        col_name = inputs.columns[v]
        if col_name in numeric_data:
            # only mark columns with at least 1 duplicate value as discrete when
            # predicting a continuous target - there's a check in the bowels of the
            # MI code that will throw an exception otherwise
            if numeric_data[col_name].duplicated().any() and not discrete:
                col_idx = numeric_data.columns.get_loc(col_name)
                discrete_flags[col_idx] = True

    target_np = target_df.values

    # compute mutual information against a discrete or continuous target
    ranked_features_np = np.empty([0])
    text_ranked_features_np = np.empty((len(column_to_text_features),))
    if discrete:
        if numeric_features:
            ranked_features_np = mutual_info_classif(
                numeric_data.values,
                target_np,
                discrete_features=discrete_flags,
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
        for i, column in enumerate(column_to_text_features):
            text_rankings = mutual_info_classif(
                column_to_text_features[column],
                target_np,
                discrete_features=[False] * column_to_text_features[column].shape[1],
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
            # collapse the per-token rankings into a single score for the text column
            text_ranked_features_np[i] = np.sum(text_rankings)
    else:
        if numeric_features:
            ranked_features_np = mutual_info_regression(
                numeric_data.values,
                target_np,
                discrete_features=discrete_flags,
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
        for i, column in enumerate(column_to_text_features):
            text_rankings = mutual_info_regression(
                column_to_text_features[column],
                target_np,
                discrete_features=[False] * column_to_text_features[column].shape[1],
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
            text_ranked_features_np[i] = np.sum(text_rankings)

    ranked_features_np, target_entropy = self._normalize(
        ranked_features_np,
        feature_df[feature_columns],
        target_np,
        discrete,
        discrete_flags,
    )
    text_ranked_features_np = self._normalize_text(
        text_ranked_features_np, column_to_text_features, target_entropy)

    if self.hyperparams["return_as_metadata"]:
        # write the rank onto each column's metadata instead of returning a frame
        ranked_features_np = np.append(ranked_features_np, text_ranked_features_np)
        for i, f in enumerate(feature_indices.union(text_feature_indices)):
            column_metadata = inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, f))
            rank_dict = dict(column_metadata)
            rank_dict["rank"] = ranked_features_np[i]
            inputs.metadata = inputs.metadata.update(
                (metadata_base.ALL_ELEMENTS, f),
                FrozenOrderedDict(rank_dict.items()),
            )
        return base.CallResult(inputs)

    # merge back into a single list of col idx / rank value tuples
    data: typing.List[typing.Tuple[int, str, float]] = []
    data = self._append_rank_info(inputs, data, ranked_features_np,
                                  feature_df[feature_columns])
    data = self._append_rank_info(
        inputs,
        data,
        text_ranked_features_np,
        feature_df[inputs.columns[list(text_feature_indices)]],
    )

    # wrap as a D3M container - metadata should be auto generated
    results = container.DataFrame(data=data, columns=cols, generate_metadata=True)
    results = results.sort_values(by=["rank"],
                                  ascending=False).reset_index(drop=True)
    return base.CallResult(results)
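# Illustrative sketch, not part of the primitive: mutual-information ranking on
# a toy frame, mirroring the discrete-target branch above. The helper name and
# toy data are hypothetical.
def _demo_mi_ranking() -> "pd.DataFrame":
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import mutual_info_classif

    rng = np.random.RandomState(0)
    X = pd.DataFrame({
        "informative": rng.rand(100),
        "noise": rng.rand(100),
        "discrete": rng.randint(0, 3, 100),
    })
    y = (X["informative"] > 0.5).astype(int)  # target depends only on one feature

    scores = mutual_info_classif(
        X.values,
        y,
        discrete_features=[False, False, True],  # per-column discreteness flags
        n_neighbors=3,
        random_state=0,
    )
    ranks = pd.DataFrame({"name": X.columns, "rank": scores})
    # "informative" should rank first by a wide margin
    return ranks.sort_values(by="rank", ascending=False).reset_index(drop=True)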