def _produce_threaded(
    self,
    *,
    index: int,
    left_df_full: container.DataFrame,  # type: ignore
    left_dfs: typing.Sequence[container.DataFrame],  # type: ignore
    right_df: container.DataFrame,  # type: ignore
    join_types: typing.Sequence[str],
    left_col: typing.Sequence[int],
    right_col: typing.Sequence[int],
    accuracy: typing.Sequence[float],
    absolute_accuracy: typing.Sequence[bool],
) -> typing.Tuple[int, typing.Optional[base.CallResult[Outputs]]]:
    # empty partitions produce no result - signal this with None so the
    # caller can skip them when reassembling
    if left_dfs[index].empty:
        return (index, None)
    output = self._produce(
        left_df_full=left_df_full,
        left_df=left_dfs[index].reset_index(drop=True),
        right_df=right_df.copy(),
        join_types=join_types,
        left_col=left_col,
        right_col=right_col,
        accuracy=accuracy,
        absolute_accuracy=absolute_accuracy,
    )
    return (index, output)
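# Hedged sketch of how a caller might fan _produce_threaded out over row
# partitions. The primitive's real driver isn't shown above, so the pool
# setup and reassembly below are assumptions for illustration only.
from concurrent.futures import ThreadPoolExecutor

def _join_partitions(self, left_dfs, **produce_kwargs):
    with ThreadPoolExecutor() as pool:
        futures = [
            pool.submit(
                self._produce_threaded,
                index=i,
                left_dfs=left_dfs,
                **produce_kwargs,
            )
            for i in range(len(left_dfs))
        ]
        # results come back as (index, CallResult-or-None); empty partitions
        # return None and can be skipped when merging outputs
        return [f.result() for f in futures]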
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__}")

    # set values that only occur once to a special token
    outputs = inputs.copy()

    # determine columns to operate on
    cols = distil_utils.get_operating_columns(
        inputs, self.hyperparams["use_columns"], CATEGORICALS
    )

    for c in cols:
        vcs = pd.value_counts(list(inputs.iloc[:, c]))
        singletons = set(vcs[vcs == 1].index)
        if singletons:
            mask = outputs.iloc[:, c].isin(singletons)
            outputs.loc[mask, outputs.columns[c]] = SINGLETON_INDICATOR

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
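# A minimal standalone sketch (plain pandas, no d3m types) of the singleton
# replacement step above: values that appear exactly once in a column are
# swapped for a sentinel token. SINGLETON_INDICATOR here is a stand-in for
# the module-level token the primitive actually uses.
import pandas as pd

SINGLETON_INDICATOR = "__singleton__"  # assumed value, for illustration only

df = pd.DataFrame({"color": ["red", "red", "blue", "green"]})
vcs = df["color"].value_counts()
singletons = set(vcs[vcs == 1].index)  # {"blue", "green"}
df.loc[df["color"].isin(singletons), "color"] = SINGLETON_INDICATOR
print(df)  # the one-off values "blue" and "green" become "__singleton__"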
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__}")

    # determine columns to operate on
    cols = distil_utils.get_operating_columns(
        inputs, self.hyperparams["use_columns"], CATEGORICALS
    )
    logger.debug(f"Found {len(cols)} categorical columns to evaluate")

    if len(cols) == 0:
        return base.CallResult(inputs)

    imputer = CategoricalImputer(
        strategy=self.hyperparams["strategy"],
        fill_value=self.hyperparams["fill_value"],
        missing_values="",
        tie_breaking="first",
    )

    outputs = inputs.copy()
    failures: List[int] = []
    for c in cols:
        input_col = inputs.iloc[:, c]
        try:
            imputer.fit(input_col)
            result = imputer.transform(input_col)
            outputs.iloc[:, c] = result
        except ValueError as e:
            # value error gets thrown when all data is missing
            if not self.hyperparams["error_on_empty"]:
                failures.append(c)
            else:
                raise e

    # for columns that failed using 'most_frequent', try again using 'constant'
    if not self.hyperparams["error_on_empty"]:
        imputer = CategoricalImputer(
            strategy="constant",
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        for f in failures:
            outputs_col = outputs.iloc[:, f]
            imputer.fit(outputs_col)
            result = imputer.transform(outputs_col)
            outputs.iloc[:, f] = result

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
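# Hedged sketch of the imputation policy above in plain pandas, assuming
# CategoricalImputer treats "" as missing: fill with the column mode
# ("most_frequent", first value on ties), falling back to a constant when
# the entire column is missing. Not the primitive's actual implementation.
import pandas as pd

def impute_categorical(col: pd.Series, fill_value: str = "__missing__") -> pd.Series:
    non_missing = col[col != ""]
    if non_missing.empty:
        # all values missing - the 'constant' fallback path
        return col.mask(col == "", fill_value)
    # mode() returns values sorted, so ties break on the first value
    return col.mask(col == "", non_missing.mode().iloc[0])

print(impute_categorical(pd.Series(["a", "", "a", "b"])).tolist())  # ['a', 'a', 'a', 'b']
print(impute_categorical(pd.Series(["", ""])).tolist())  # ['__missing__', '__missing__']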
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    if len(self._cols) == 0:
        return base.CallResult(inputs)

    # add the binary encoded columns and remove the source columns
    outputs = inputs.copy()
    encoded_cols = container.DataFrame()
    encoded_cols_source = []
    bin_idx = 0
    for i, c in enumerate(self._cols):
        categorical_inputs = outputs.iloc[:, c]
        result = self._encoders[i].transform(categorical_inputs)
        for j in range(result.shape[1]):
            encoded_cols[f"__binary_{bin_idx}"] = result[:, j]
            encoded_cols_source.append(c)
            bin_idx += 1

    encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

    for c in range(encoded_cols.shape[1]):
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Integer"
        )
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
        )
        col_dict = dict(encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c)))
        col_dict["source_column"] = outputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, encoded_cols_source[c])
        )["name"]
        encoded_cols.metadata = encoded_cols.metadata.update(
            (metadata_base.ALL_ELEMENTS, c), col_dict
        )

    outputs = outputs.append_columns(encoded_cols)
    outputs = outputs.remove_columns(self._cols)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
def _remap_graphs(
    cls, data: container.DataFrame
) -> Tuple[container.DataFrame, int, int]:
    assert data.shape[1] == 2

    data = data.copy()
    data.columns = ("user", "item")

    # remap users and items to dense 0..n-1 ids
    uusers = np.unique(data.user)
    user_lookup = dict(zip(uusers, range(len(uusers))))
    data.user = data.user.apply(user_lookup.get)

    uitems = np.unique(data.item)
    item_lookup = dict(zip(uitems, range(len(uitems))))
    data.item = data.item.apply(item_lookup.get)

    n_users = len(uusers)
    n_items = len(uitems)

    return data, n_users, n_items
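# Standalone sketch of the id-compaction idiom used by _remap_graphs: the
# same dense 0..n-1 ids can be produced in one call with
# np.unique(..., return_inverse=True) instead of a dict lookup.
import numpy as np
import pandas as pd

data = pd.DataFrame({"user": [10, 10, 42], "item": ["a", "c", "a"]})
uusers, data["user"] = np.unique(data["user"], return_inverse=True)
uitems, data["item"] = np.unique(data["item"], return_inverse=True)
print(data)  # user -> [0, 0, 1], item -> [0, 1, 0]
print(len(uusers), len(uitems))  # n_users=2, n_items=2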
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    if len(self._cols) == 0:
        return base.CallResult(inputs)

    # encode using the previously identified categorical columns
    from itertools import zip_longest

    encoded_cols = container.DataFrame()
    for i in self._cols:
        col_name = inputs.columns[i]
        # spread each cell's sequence of values into fixed-width columns,
        # padding short rows with None
        col = container.DataFrame.from_records(
            zip_longest(*inputs[col_name].values)
        ).T
        col.columns = [f"{col_name}_{x}" for x in range(len(col.columns))]
        encoded_cols = pd.concat([encoded_cols, col], axis=1)

    # append the encoded columns and generate metadata
    outputs = inputs.copy()
    encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

    for c in range(encoded_cols.shape[1]):
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float"
        )

    outputs = outputs.append_columns(encoded_cols)

    # drop the source columns
    outputs = outputs.remove_columns(self._cols)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
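# Standalone sketch of the zip_longest expansion above: a column whose cells
# hold variable-length sequences is spread into fixed-width columns, with
# short rows padded by None.
from itertools import zip_longest
import pandas as pd

col = pd.Series([[1.0, 2.0, 3.0], [4.0, 5.0]], name="vec")
expanded = pd.DataFrame.from_records(zip_longest(*col.values)).T
expanded.columns = [f"vec_{x}" for x in range(len(expanded.columns))]
print(expanded)  # columns vec_0..vec_2; the second row is padded with None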
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    # fall through if there's nothing to do
    if len(self._cols) == 0 or self._encoder is None:
        return base.CallResult(inputs)

    # map encoded cols to source column names
    feature_names = self._encoder.get_feature_names()
    encoded_cols_source = []
    # feature names are xA_YY where A is the source column index and YY is the value
    for name in feature_names:
        # take the first part of the name (xA) and remove the x
        encoded_feature_index = int(name.split("_")[0][1:])
        feature_index = self._cols[encoded_feature_index]
        encoded_cols_source.append(
            inputs.metadata.query((metadata_base.ALL_ELEMENTS, feature_index))["name"]
        )

    # encode using the previously identified categorical columns
    input_cols = inputs.iloc[:, self._cols]
    result = self._encoder.transform(input_cols)

    # append the encoded columns and generate metadata
    outputs = inputs.copy()
    encoded_cols: container.DataFrame = container.DataFrame()
    for i in range(result.shape[1]):
        encoded_cols[f"__onehot_{i}"] = result[:, i]
    encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

    for c in range(encoded_cols.shape[1]):
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float"
        )
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
        )
        col_dict = dict(encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c)))
        col_dict["source_column"] = encoded_cols_source[c]
        encoded_cols.metadata = encoded_cols.metadata.update(
            (metadata_base.ALL_ELEMENTS, c), col_dict
        )

    outputs = outputs.append_columns(encoded_cols)

    # drop the source columns
    outputs = outputs.remove_columns(self._cols)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
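# Sketch of the feature-name convention the loop above relies on: when no
# input feature names are supplied, sklearn's OneHotEncoder.get_feature_names()
# emits "x<encoded_col_idx>_<value>", so the encoder-relative column index can
# be parsed back out of each name. (Newer scikit-learn releases renamed this
# method to get_feature_names_out().)
name = "x2_blue"
encoded_feature_index = int(name.split("_")[0][1:])
print(encoded_feature_index)  # 2 - an index into self._cols, not the input frame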
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    cols = ["idx", "name", "rank"]

    # Make sure the target column is of a valid type and return no ranked features if it isn't.
    target_idx = self.hyperparams["target_col_index"]
    if not self._can_use_column(inputs.metadata, target_idx):
        return base.CallResult(container.DataFrame(data={}, columns=cols))

    # check if target is discrete or continuous
    semantic_types = inputs.metadata.query_column(target_idx)["semantic_types"]
    discrete = len(set(semantic_types).intersection(self._discrete_types)) > 0

    # make a copy of the inputs and clean out any missing data
    feature_df = inputs.copy()
    if self.hyperparams["sub_sample"]:
        sub_sample_size = (
            self.hyperparams["sub_sample_size"]
            if self.hyperparams["sub_sample_size"] < inputs.shape[0]
            else inputs.shape[0]
        )
        rows = random.sample_without_replacement(inputs.shape[0], sub_sample_size)
        feature_df = feature_df.iloc[rows, :]

    # if an entire column is NA, drop the column rather than losing ALL rows to dropna()
    cols_to_drop = feature_df.columns[feature_df.isna().sum() == feature_df.shape[0]]
    feature_df.drop(columns=cols_to_drop, inplace=True)
    feature_df.dropna(inplace=True)

    # split out the target feature
    target_df = feature_df.iloc[
        :, feature_df.columns.get_loc(inputs.columns[target_idx])
    ]

    # drop features that are not compatible with ranking
    feature_indices = set(
        inputs.metadata.list_columns_with_semantic_types(self._semantic_types)
    )
    role_indices = set(inputs.metadata.list_columns_with_semantic_types(self._roles))
    feature_indices = feature_indices.intersection(role_indices)
    feature_indices.remove(target_idx)

    for categ_ind in inputs.metadata.list_columns_with_semantic_types(
        ("https://metadata.datadrivendiscovery.org/types/CategoricalData",)
    ):
        if categ_ind in feature_indices:
            if np.unique(inputs[inputs.columns[categ_ind]]).shape[0] == inputs.shape[0]:
                feature_indices.remove(categ_ind)
            elif (
                inputs.metadata.query((metadata_base.ALL_ELEMENTS, categ_ind))[
                    "structural_type"
                ]
                == str
            ):
                feature_df[inputs.columns[categ_ind]] = pd.to_numeric(
                    feature_df[inputs.columns[categ_ind]]
                )

    text_indices = inputs.metadata.list_columns_with_semantic_types(self._text_semantic)

    tfv = TfidfVectorizer(max_features=20)
    column_to_text_features = {}
    text_feature_indices = []
    for text_index in text_indices:
        if (
            text_index not in feature_indices
            and text_index in role_indices
            and text_index != target_idx
        ):
            word_features = tfv.fit_transform(feature_df[inputs.columns[text_index]])
            if issparse(word_features):
                column_to_text_features[
                    inputs.columns[text_index]
                ] = pd.DataFrame.sparse.from_spmatrix(word_features)
            else:
                column_to_text_features[inputs.columns[text_index]] = word_features
            text_feature_indices.append(text_index)
    text_feature_indices = set(text_feature_indices)

    # return an empty result if all features were incompatible
    numeric_features = len(feature_indices) > 0
    if not numeric_features and len(column_to_text_features) == 0:
        return base.CallResult(container.DataFrame(data={}, columns=cols))

    all_indices = set(range(0, inputs.shape[1]))
    skipped_indices = all_indices.difference(
        feature_indices.union(text_feature_indices)
    )

    # remove columns that were dropped
    feature_indices = feature_indices - set(
        [inputs.columns.get_loc(c) for c in cols_to_drop]
    )
    for i, v in enumerate(skipped_indices):
        feature_df.drop(inputs.columns[v], axis=1, inplace=True)

    # figure out the discrete and continuous feature indices and create an array
    # that flags them
    feature_columns = inputs.columns[list(feature_indices)]
    numeric_data = feature_df[feature_columns]
    discrete_indices = inputs.metadata.list_columns_with_semantic_types(
        self._discrete_types
    )
    discrete_flags = [False] * numeric_data.shape[1]
    for v in discrete_indices:
        col_name = inputs.columns[v]
        if col_name in numeric_data:
            # only mark columns with at least 1 duplicate value as discrete when predicting
            # a continuous target - there's a check in the bowels of the MI code that will
            # throw an exception otherwise
            if numeric_data[col_name].duplicated().any() and not discrete:
                col_idx = numeric_data.columns.get_loc(col_name)
                discrete_flags[col_idx] = True

    target_np = target_df.values

    # compute mutual information for discrete or continuous target
    ranked_features_np = np.empty([0])
    text_ranked_features_np = np.empty((len(column_to_text_features),))
    if discrete:
        if numeric_features:
            ranked_features_np = mutual_info_classif(
                numeric_data.values,
                target_np,
                discrete_features=discrete_flags,
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
        for i, column in enumerate(column_to_text_features):
            text_rankings = mutual_info_classif(
                column_to_text_features[column],
                target_np,
                discrete_features=[False] * column_to_text_features[column].shape[1],
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
            text_ranked_features_np[i] = np.sum(text_rankings)
    else:
        if numeric_features:
            ranked_features_np = mutual_info_regression(
                numeric_data.values,
                target_np,
                discrete_features=discrete_flags,
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
        for i, column in enumerate(column_to_text_features):
            text_rankings = mutual_info_regression(
                column_to_text_features[column],
                target_np,
                discrete_features=[False] * column_to_text_features[column].shape[1],
                n_neighbors=self.hyperparams["k"],
                random_state=self._random_seed,
            )
            text_ranked_features_np[i] = np.sum(text_rankings)

    ranked_features_np, target_entropy = self._normalize(
        ranked_features_np,
        feature_df[feature_columns],
        target_np,
        discrete,
        discrete_flags,
    )
    text_ranked_features_np = self._normalize_text(
        text_ranked_features_np, column_to_text_features, target_entropy
    )

    if self.hyperparams["return_as_metadata"]:
        ranked_features_np = np.append(ranked_features_np, text_ranked_features_np)
        for i, f in enumerate(feature_indices.union(text_feature_indices)):
            column_metadata = inputs.metadata.query((metadata_base.ALL_ELEMENTS, f))
            rank_dict = dict(column_metadata)
            rank_dict["rank"] = ranked_features_np[i]
            inputs.metadata = inputs.metadata.update(
                (metadata_base.ALL_ELEMENTS, f),
                FrozenOrderedDict(rank_dict.items()),
            )
        return base.CallResult(inputs)

    # merge back into a single list of col idx / rank value tuples
    data: typing.List[typing.Tuple[int, str, float]] = []
    data = self._append_rank_info(
        inputs, data, ranked_features_np, feature_df[feature_columns]
    )
    data = self._append_rank_info(
        inputs,
        data,
        text_ranked_features_np,
        feature_df[inputs.columns[list(text_feature_indices)]],
    )

    # wrap as a D3M container - metadata should be auto generated
    results = container.DataFrame(data=data, columns=cols, generate_metadata=True)
    results = results.sort_values(by=["rank"], ascending=False).reset_index(drop=True)

    return base.CallResult(results)
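# Standalone sketch of the text-column ranking used above: TF-IDF terms for
# a text column are scored with mutual information against the target and
# collapsed to a single rank by summing, mirroring the
# column_to_text_features loops. Toy data; values are illustrative only.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif

docs = [
    "cheap pills now", "meeting at noon", "cheap cheap pills",
    "lunch at noon", "buy cheap pills", "noon meeting agenda",
]
y = np.array([1, 0, 1, 0, 1, 0])  # toy discrete target

word_features = TfidfVectorizer(max_features=20).fit_transform(docs).toarray()
text_rankings = mutual_info_classif(
    word_features,
    y,
    discrete_features=[False] * word_features.shape[1],
    random_state=0,
)
print(np.sum(text_rankings))  # one summed rank for the whole text column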
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    cols = ['idx', 'name', 'rank']

    # Make sure the target column is of a valid type and return no ranked features if it isn't.
    target_idx = self.hyperparams['target_col_index']
    if not self._can_use_column(inputs.metadata, target_idx):
        return base.CallResult(container.DataFrame(data={}, columns=cols))

    # check if target is discrete or continuous
    semantic_types = inputs.metadata.query_column(target_idx)['semantic_types']
    discrete = len(set(semantic_types).intersection(self._discrete_types)) > 0

    # make a copy of the inputs and clean out any missing data
    feature_df = inputs.copy()
    feature_df.dropna(inplace=True)

    # split out the target feature
    target_df = feature_df.iloc[:, target_idx]

    # drop features that are not compatible with ranking
    feature_indices = set(
        inputs.metadata.list_columns_with_semantic_types(self._semantic_types))
    role_indices = set(
        inputs.metadata.list_columns_with_semantic_types(self._roles))
    feature_indices = feature_indices.intersection(role_indices)
    feature_indices.remove(target_idx)

    # return an empty result if all features were incompatible
    if len(feature_indices) == 0:
        return base.CallResult(container.DataFrame(data={}, columns=cols))

    all_indices = set(range(0, inputs.shape[1]))
    skipped_indices = all_indices.difference(feature_indices)
    for i, v in enumerate(skipped_indices):
        feature_df.drop(inputs.columns[v], axis=1, inplace=True)

    # figure out the discrete and continuous feature indices and create an array
    # that flags them
    discrete_indices = inputs.metadata.list_columns_with_semantic_types(
        self._discrete_types)
    discrete_flags = [False] * feature_df.shape[1]
    for v in discrete_indices:
        col_name = inputs.columns[v]
        if col_name in feature_df:
            # only mark columns with at least 1 duplicate value as discrete when predicting
            # a continuous target - there's a check in the bowels of the MI code that will
            # throw an exception otherwise
            if feature_df[col_name].duplicated().any() and not discrete:
                col_idx = feature_df.columns.get_loc(col_name)
                discrete_flags[col_idx] = True

    target_np = target_df.values
    feature_np = feature_df.values

    # compute mutual information for discrete or continuous target
    if discrete:
        ranked_features_np = mutual_info_classif(
            feature_np,
            target_np,
            discrete_features=discrete_flags,
            random_state=self._random_seed)
    else:
        ranked_features_np = mutual_info_regression(
            feature_np,
            target_np,
            discrete_features=discrete_flags,
            random_state=self._random_seed)

    # merge back into a single list of col idx / rank value tuples
    data: typing.List[typing.Tuple[int, str, float]] = []
    data = self._append_rank_info(inputs, data, ranked_features_np, feature_df)

    # wrap as a D3M container - metadata should be auto generated
    results = container.DataFrame(data=data, columns=cols, generate_metadata=True)
    results = results.sort_values(
        by=['rank'], ascending=False).reset_index(drop=True)

    return base.CallResult(results)
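# Minimal sketch of the discrete_features flagging above: columns marked
# discrete are handled by scikit-learn's count-based MI estimator, while
# continuous ones go through the k-NN estimator. Toy data; values are
# illustrative only.
import numpy as np
from sklearn.feature_selection import mutual_info_regression

rng = np.random.RandomState(0)
continuous = rng.rand(200)
discrete_col = rng.randint(0, 3, size=200)  # has duplicates, so safe to flag discrete
X = np.column_stack([continuous, discrete_col])
y = 2.0 * continuous + 0.1 * rng.rand(200)  # target driven by the continuous column

ranks = mutual_info_regression(X, y, discrete_features=[False, True], random_state=0)
print(ranks)  # column 0 should dominate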
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    # make sure the target column is of a valid type
    target_idx = self.hyperparams['target_col_index']
    if not self._can_use_column(inputs.metadata, target_idx):
        raise exceptions.InvalidArgumentValueError(
            'column idx=' + str(target_idx) + ' from ' + str(inputs.columns) +
            ' does not contain continuous or discrete type')

    # check if target is discrete or continuous
    semantic_types = inputs.metadata.query_column(target_idx)['semantic_types']
    discrete = len(set(semantic_types).intersection(self._discrete_types)) > 0

    # make a copy of the inputs and clean out any missing data
    feature_df = inputs.copy()
    feature_df.dropna(inplace=True)

    # split out the target feature
    target_df = feature_df.iloc[:, target_idx]

    # drop features that are not compatible with ranking
    feature_indices = set(
        utils.list_columns_with_semantic_types(inputs.metadata,
                                               self._semantic_types))
    role_indices = set(
        utils.list_columns_with_semantic_types(inputs.metadata, self._roles))
    feature_indices = feature_indices.intersection(role_indices)

    all_indices = set(range(0, inputs.shape[1]))
    skipped_indices = all_indices.difference(feature_indices)
    skipped_indices.add(target_idx)  # drop the target too
    for i, v in enumerate(skipped_indices):
        feature_df.drop(inputs.columns[v], axis=1, inplace=True)

    # figure out the discrete and continuous feature indices and create an array
    # that flags them
    discrete_indices = utils.list_columns_with_semantic_types(
        inputs.metadata, self._discrete_types)
    discrete_flags = [False] * feature_df.shape[1]
    for v in discrete_indices:
        col_name = inputs.columns[v]
        if col_name in feature_df:
            col_idx = feature_df.columns.get_loc(col_name)
            discrete_flags[col_idx] = True

    target_np = target_df.values
    feature_np = feature_df.values

    # compute mutual information for discrete or continuous target
    if discrete:
        ranked_features_np = mutual_info_classif(
            feature_np,
            target_np,
            discrete_features=discrete_flags,
            random_state=self._random_seed)
    else:
        ranked_features_np = mutual_info_regression(
            feature_np,
            target_np,
            discrete_features=discrete_flags,
            random_state=self._random_seed)

    # merge back into a single list of col idx / rank value tuples
    data: typing.List[typing.Tuple[int, str, float]] = []
    data = self._append_rank_info(inputs, data, ranked_features_np, feature_df)

    cols = ['idx', 'name', 'rank']
    results = container.DataFrame(data=data, columns=cols)
    results = results.sort_values(
        by=['rank'], ascending=False).reset_index(drop=True)

    # wrap as a D3M container - metadata should be auto generated
    return base.CallResult(results)