def add_filters(self, filter_list):
    if filter_list is None:
        return
    for (feature, operator, threshold) in filter_list:
        fio = FeatureIO(self.data)
        self.data = fio.custom_feature_filter(feature, operator, threshold)
    self.set_up_data_from_features()
    return
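# Usage sketch (hedged): filter_list entries are (feature, operator, threshold)
# tuples passed through to FeatureIO.custom_feature_filter. The operator strings
# are assumed to be comparison operators such as '<', based on
# test_custom_feature_filter further below.
#
#     handler.add_filters([('temperature_C', '<', 320),
#                          ('log(fluence_n_cm2)', '>', 17)])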
def csv_add_features(self, csvsrc, csvdest):
    afm_dict = dict()
    param_dict = dict()
    # E900 column
    e900_dict = dict()
    for elem in ['P', 'Ni', 'Cu', 'Mn']:  # Si, C not used in E900
        e900_dict['wt%s' % elem] = 'wt_percent_%s' % elem
    e900_dict['fluencestr'] = 'fluence_n_cm2'
    e900_dict['tempC'] = 'temperature_C'
    e900_dict['prod_ID'] = 'product_id'
    afm_dict['DBTT.E900'] = dict(e900_dict)
    param_dict['DBTT.E900'] = dict()
    # Get dataframe
    csv_dataparser = DataParser()
    csv_dataframe = csv_dataparser.import_data(
        "%s.csv" % os.path.join(self.save_path, csvsrc))
    # Add features
    for afm in afm_dict.keys():
        (feature_name, feature_data) = cf_help.get_custom_feature_data(
            class_method_str=afm,
            starting_dataframe=csv_dataframe,
            param_dict=dict(param_dict[afm]),
            addl_feature_method_kwargs=dict(afm_dict[afm]))
        fio = FeatureIO(csv_dataframe)
        csv_dataframe = fio.add_custom_features([feature_name], feature_data)
    # Add log10 features
    log10_dict = dict()
    log10_dict['fluence_n_cm2'] = dict()
    log10_dict['flux_n_cm2_sec'] = dict()
    for lkey in log10_dict.keys():
        orig_data = csv_dataframe[lkey]
        log10_data = np.log10(orig_data)
        fio = FeatureIO(csv_dataframe)
        csv_dataframe = fio.add_custom_features(["log(%s)" % lkey], log10_data)
    # Add normalizations
    norm_dict = dict()
    norm_dict['log(fluence_n_cm2)'] = {'smin': 17, 'smax': 25}
    norm_dict['log(flux_n_cm2_sec)'] = {'smin': 10, 'smax': 15}
    norm_dict['temperature_C'] = {'smin': 270, 'smax': 320}
    for elem in ["P", "C", "Cu", "Ni", "Mn", "Si"]:
        # smax of 1.717 is the maximum Mn atomic percent
        norm_dict["at_percent_%s" % elem] = {'smin': 0.0, 'smax': 1.717}
    for nkey in norm_dict.keys():
        fnorm = FeatureNormalization(csv_dataframe)
        scaled_feature = fnorm.minmax_scale_single_feature(
            nkey,
            smin=norm_dict[nkey]['smin'],
            smax=norm_dict[nkey]['smax'])
        fio = FeatureIO(csv_dataframe)
        csv_dataframe = fio.add_custom_features(["N(%s)" % nkey], scaled_feature)
    csv_dataframe.to_csv("%s.csv" % os.path.join(self.save_path, csvdest))
    return
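# Worked example (sketch): the fixed-bound scaling assumed to be performed by
# FeatureNormalization.minmax_scale_single_feature, i.e.
# N(x) = (x - smin) / (smax - smin). With the log(fluence_n_cm2) bounds used
# above (smin=17, smax=25), a log10 fluence of 19 maps to 0.25.
import numpy as np

def minmax_scale_sketch(values, smin, smax):
    """Scale values linearly so that smin -> 0 and smax -> 1 (hypothetical helper)."""
    values = np.asarray(values, dtype=float)
    return (values - smin) / (smax - smin)

assert minmax_scale_sketch([19.0], smin=17, smax=25)[0] == 0.25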
def get_afm_updated_dataset(self, indiv_df, indiv_params):
    """Update dataframe with additional feature methods
    """
    for afm in indiv_params.keys():
        if afm == 'model':  # model dealt with separately
            continue
        afm_kwargs = dict(indiv_params[afm])
        (feature_name, feature_data) = cf_help.get_custom_feature_data(
            afm,
            starting_dataframe=indiv_df,
            addl_feature_method_kwargs=dict(afm_kwargs))
        fio = FeatureIO(indiv_df)
        indiv_df = fio.add_custom_features([afm], feature_data)
    return indiv_df
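# Usage sketch (hedged): indiv_params is assumed to map additional-feature-method
# names to their keyword arguments, with the reserved 'model' key skipped. The
# 'DBTT.E900' kwargs below reuse the column mapping from csv_add_features above.
#
#     indiv_params = {'model': model_params,  # handled separately
#                     'DBTT.E900': {'wtP': 'wt_percent_P',
#                                   'tempC': 'temperature_C'}}
#     indiv_df = self.get_afm_updated_dataset(indiv_df, indiv_params)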
def subtraction(self, col1="", col2="", num1="", num2="", **params):
    """Testing function.
        col1 <str>: first feature name
        col2 <str>: second feature name
        num1 <float>: number to multiply col1 by
        num2 <float>: number to add
    """
    col1_data = self.df[col1]
    col2_data = self.df[col2]
    new_data = (col1_data * num1) - col2_data + num2
    fio = FeatureIO(self.df)
    new_df = fio.add_custom_features(["Subtraction_test"], new_data)
    fnorm = FeatureNormalization(new_df)
    N_new_data = fnorm.minmax_scale_single_feature("Subtraction_test")
    return N_new_data
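# Usage sketch (hedged): with self.df containing features 'a' and 'b', the call
# below computes (a * 2.0) - b + 1.0, stores it as "Subtraction_test", and
# returns that column min-max scaled over its own range. 'obj', 'a', and 'b'
# are illustrative names, not from the original.
#
#     N_new = obj.subtraction(col1='a', col2='b', num1=2.0, num2=1.0)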
def test_custom_feature_filter(self):
    df = FeatureIO(dataframe=self.df1).custom_feature_filter(
        feature='AtomicNumber_composition_average',
        operator='<',
        threshold=21)
    self.assertTrue(df.shape[0] < self.df1.shape[0])
    return
def test_add_custom_features(self):
    df1columns = self.df1.shape[1]
    df = FeatureIO(dataframe=self.df1).add_custom_features(
        features_to_add=['test_feature'],
        data_to_add=np.zeros(shape=[self.df1.shape[0], ]))
    self.assertTrue(df1columns < df.shape[1])
    return
def add_normalization(self, cname, verbose=0):
    df = self.dfs[cname]
    norm_dict = dict()
    norm_dict['log(fluence_n_cm2)'] = {'smin': 17, 'smax': 25}
    norm_dict['log(flux_n_cm2_sec)'] = {'smin': 10, 'smax': 15}
    for pval in [8, 10, 20, 23, 26]:  # effective fluence p-values
        norm_dict['log(eff fl 100p=%i)' % pval] = {'smin': 17, 'smax': 25}
    norm_dict['temperature_C'] = {'smin': 270, 'smax': 320}
    for elem in ["P", "C", "Cu", "Ni", "Mn", "Si"]:
        # smax of 1.717 is the maximum Mn atomic percent
        norm_dict["at_percent_%s" % elem] = {'smin': 0.0, 'smax': 1.717}
    for nkey in norm_dict.keys():
        fnorm = FeatureNormalization(df)
        scaled_feature = fnorm.minmax_scale_single_feature(
            nkey,
            smin=norm_dict[nkey]['smin'],
            smax=norm_dict[nkey]['smax'])
        fio = FeatureIO(df)
        df = fio.add_custom_features(["N(%s)" % nkey], scaled_feature)
    self.dfs[cname] = df
    return
def test_remove_duplicate_columns(self):
    # Manually add a duplicate column to the dataframe
    self.df1['AtomicNumber_composition_average_copy'] = pd.Series(
        self.df1['AtomicNumber_composition_average'], index=self.df1.index)
    self.df1.rename(columns={
        'AtomicNumber_composition_average_copy':
        'AtomicNumber_composition_average'
    }, inplace=True)
    df = FeatureIO(dataframe=self.df1).remove_duplicate_columns()
    self.assertTrue(df.shape[1] < self.df1.shape[1])
    return
def calculate_EffectiveFluence(self, pvalue=0, ref_flux=3e10,
                               flux_feature="", fluence_feature="",
                               scale_min=1e17, scale_max=1e25, **params):
    """Calculate effective fluence
    """
    fluence = self.df[fluence_feature]
    flux = self.df[flux_feature]
    EFl = fluence * (ref_flux / flux) ** pvalue
    EFl = np.log10(EFl)
    fio = FeatureIO(self.df)
    new_df = fio.add_custom_features(["EFl"], EFl)
    fnorm = FeatureNormalization(new_df)
    N_EFl = fnorm.minmax_scale_single_feature(
        "EFl",
        smin=np.log10(scale_min),
        smax=np.log10(scale_max))
    return N_EFl
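# Worked example (sketch): the effective-fluence model used above,
# EFl = fluence * (ref_flux / flux)**p, followed by log10. Pure-numpy
# illustration with made-up values; ref_flux=3e10 matches the default above.
import numpy as np

fluence = np.array([1e19, 5e19])   # n/cm^2 (illustrative values)
flux = np.array([3e10, 3e11])      # n/cm^2/s (illustrative values)
pvalue = 0.26                      # illustrative exponent
efl = np.log10(fluence * (3e10 / flux) ** pvalue)
# At flux == ref_flux the correction factor is 1, so efl[0] == log10(1e19) == 19.
assert np.isclose(efl[0], 19.0)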
def make_data(self):
    n_samples, n_features = 100, 5
    y = self.random_state.randn(n_samples)
    X = self.random_state.randn(n_samples, n_features)
    nidx = np.arange(0, n_samples)
    self.dataframe = pd.DataFrame(index=nidx)
    num_cat = self.random_state.randint(0, 4, n_samples)
    cats = ['A', 'B', 'C', 'D']
    str_cat = [cats[nc] for nc in num_cat]
    time = nidx * np.pi / 8.0
    sine_feature = np.sin(time) + X[:, 0]  # add noise
    linear_feature = 100 * time + 30.0 + X[:, 1]  # add noise
    y_feature = np.sin(time) + y / 10.0
    y_feature_error = X[:, 3] / X[:, 4] / 100.0  # add random error
    d_cols = dict()
    d_cols["num_idx"] = nidx
    d_cols["num_cat"] = num_cat
    d_cols["str_cat"] = str_cat
    d_cols["time"] = time
    d_cols["sine_feature"] = sine_feature
    d_cols["linear_feature"] = linear_feature
    d_cols["y_feature"] = y_feature
    d_cols["y_feature_error"] = y_feature_error
    cols = list(d_cols.keys())
    cols.sort()
    for col in cols:
        fio = FeatureIO(self.dataframe)
        self.dataframe = fio.add_custom_features([col], d_cols[col])
    fnorm = FeatureNormalization(self.dataframe)
    N_sine_feature = fnorm.minmax_scale_single_feature("sine_feature")
    N_linear_feature = fnorm.minmax_scale_single_feature("linear_feature")
    fio = FeatureIO(self.dataframe)
    self.dataframe = fio.add_custom_features(["N_sine_feature"],
                                             N_sine_feature)
    fio = FeatureIO(self.dataframe)
    self.dataframe = fio.add_custom_features(["N_linear_feature"],
                                             N_linear_feature)
    return
def test_keep_custom_features(self):
    df = FeatureIO(dataframe=self.df1).keep_custom_features(
        features_to_keep=['AtomicNumber_composition_average'])
    self.assertTrue(
        ['AtomicNumber_composition_average'] == df.columns.values.tolist())
    return
def test_remove_custom_features(self):
    df = FeatureIO(dataframe=self.df1).remove_custom_features(
        features_to_remove=['AtomicNumber_composition_average'])
    self.assertTrue('AtomicNumber_composition_average' not in
                    df.columns.values.tolist())
    return
def add_feature(self, feature_name, feature_data):
    fio = FeatureIO(self.data)
    self.data = fio.add_custom_features([feature_name], feature_data)
    return
def add_prediction_sigma(self, prediction_data_sigma):
    fio = FeatureIO(self.data)
    self.data = fio.add_custom_features(["Prediction Sigma"],
                                        prediction_data_sigma)
    self.target_prediction_sigma = self.data["Prediction Sigma"]
    return
def add_prediction(self, prediction_data):
    fio = FeatureIO(self.data)
    self.data = fio.add_custom_features(["Prediction"], prediction_data)
    self.target_prediction = self.data["Prediction"]
    return
def _create_data_dict(self):
    data_dict = dict()
    for data_name in self.data_setup.keys():
        data_path = self.configdict['Data Setup'][data_name]['data_path']
        logging.info('Creating data dict for data path %s and data name %s' %
                     (data_path, data_name))
        data_weights = self.data_setup[data_name]['weights']
        if 'labeling_features' in self.general_setup.keys():
            labeling_features = self._string_or_list_input_to_list(
                self.general_setup['labeling_features'])
        else:
            labeling_features = None
        if 'target_error_feature' in self.general_setup.keys():
            target_error_feature = self.general_setup['target_error_feature']
        else:
            target_error_feature = None
        if 'grouping_feature' in self.general_setup.keys():
            grouping_feature = self.general_setup['grouping_feature']
        else:
            grouping_feature = None
        # Config flags may arrive as booleans or as the string "True"
        if 'Feature Generation' in self.configdict.keys():
            generate_features = self.configdict['Feature Generation'][
                'perform_feature_generation'] in (True, "True")
        else:
            generate_features = False
        if 'Feature Normalization' in self.configdict.keys():
            normalize_x_features = self.configdict['Feature Normalization'][
                'normalize_x_features'] in (True, "True")
            normalize_y_feature = self.configdict['Feature Normalization'][
                'normalize_y_feature'] in (True, "True")
        else:
            normalize_x_features = False
            normalize_y_feature = False
        if 'Feature Selection' in self.configdict.keys():
            select_features = self.configdict['Feature Selection'][
                'perform_feature_selection'] in (True, "True")
        else:
            select_features = False
        logging.info("Feature Generation: %s" % generate_features)
        logging.info("Feature Normalization (x_features): %s" %
                     normalize_x_features)
        logging.info("Feature Normalization (y_feature): %s" %
                     normalize_y_feature)
        logging.info("Feature Selection: %s" % select_features)

        # Parse input data file
        Xdata, ydata, x_features, y_feature, dataframe = \
            self._parse_input_data(data_path)

        # Plot initial histogram of input target data
        DataframeUtilities().plot_dataframe_histogram(
            configdict=self.configdict,
            dataframe=dataframe,
            y_feature=y_feature)

        original_x_features = list(x_features)
        original_columns = list(dataframe.columns)
        logging.debug("original columns: %s" % original_columns)

        # Remove any missing rows from dataframe
        #dataframe = dataframe.dropna()
        # Assumed definition: dataframe_orig_dropped_na is used below to
        # restore string columns but was never defined in this excerpt.
        dataframe_orig_dropped_na = dataframe.dropna()

        # Save off label and grouping data
        dataframe_labeled = pd.DataFrame()
        dataframe_grouped = pd.DataFrame()
        if labeling_features is not None:
            dataframe_labeled = FeatureIO(
                dataframe=dataframe).keep_custom_features(
                    features_to_keep=labeling_features, y_feature=y_feature)
            if normalize_x_features:
                dataframe_labeled, scaler = FeatureNormalization(
                    dataframe=dataframe_labeled,
                    configdict=self.configdict).normalize_features(
                        x_features=labeling_features, y_feature=y_feature)
        if grouping_feature is not None:
            dataframe_grouped = FeatureIO(
                dataframe=dataframe).keep_custom_features(
                    features_to_keep=[grouping_feature], y_feature=y_feature)

        # Generate additional descriptors, as specified in input file (optional)
        if generate_features:
            dataframe = self._perform_feature_generation(dataframe=dataframe)
            # Actually, the x_features_NOUSE is required if starting from no
            # features and doing feature generation. Not renaming for now. RJ 7/17
            Xdata, ydata, x_features_NOUSE, y_feature, dataframe = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe, target_feature=y_feature)
        else:
            Xdata, ydata, x_features, y_feature, dataframe = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe, target_feature=y_feature)

        # First remove features containing strings before doing feature
        # normalization or other operations, but don't remove grouping features
        if generate_features:
            nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                configdict=self.configdict).remove_features_containing_strings(
                    dataframe=dataframe, x_features=x_features_NOUSE)
            # Remove columns containing all entries of NaN
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='all')
            # Re-obtain x_feature list as some features may have been dropped
            Xdata, ydata, x_features_NOUSE, y_feature, dataframe_nostrings = \
                DataParser(configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_nostrings, target_feature=y_feature)
        else:
            nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                configdict=self.configdict).remove_features_containing_strings(
                    dataframe=dataframe, x_features=x_features)

        # Remove columns containing all entries of NaN
        dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='all')
        # Drop columns that still contain any NaN entries
        dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='any')
        # Re-obtain x_feature list as some features may have been dropped
        Xdata, ydata, x_features, y_feature, dataframe_nostrings = DataParser(
            configdict=self.configdict).parse_fromdataframe(
                dataframe=dataframe_nostrings, target_feature=y_feature)
        logging.debug("pre-changes:%s" % dataframe_nostrings.columns)

        # Normalize features (optional)
        if normalize_x_features or normalize_y_feature:
            fn = FeatureNormalization(dataframe=dataframe_nostrings,
                                      configdict=self.configdict)
            dataframe_nostrings, scaler = fn.normalize_features(
                x_features=x_features,
                y_feature=y_feature,
                normalize_x_features=normalize_x_features,
                normalize_y_feature=normalize_y_feature)
            x_features, y_feature = DataParser(
                configdict=self.configdict).get_features(
                    dataframe=dataframe_nostrings, target_feature=y_feature)

        # Perform feature selection and dimensional reduction, as specified
        # in the input file (optional)
        if select_features and (y_feature in dataframe_nostrings.columns):
            # Remove any additional columns that are not x_features to be fit
            features = dataframe_nostrings.columns.values.tolist()
            features_to_remove = []
            for feature in features:
                if feature not in x_features and feature not in y_feature:
                    features_to_remove.append(feature)
            dataframe_nostrings = FeatureIO(
                dataframe=dataframe_nostrings).remove_custom_features(
                    features_to_remove=features_to_remove)
            dataframe_nostrings = self._perform_feature_selection(
                dataframe=dataframe_nostrings,
                x_features=x_features,
                y_feature=y_feature)
            x_features, y_feature = DataParser(
                configdict=self.configdict).get_features(
                    dataframe=dataframe_nostrings, target_feature=y_feature)
        logging.debug("post-removal:%s" % dataframe_nostrings.columns)

        # Combine the input dataframe, which has undergone feature generation
        # and normalization, with the grouped and labeled features of the
        # original dataframe. First, generate a dataframe that only has the
        # grouped and labeled features.
        grouping_and_labeling_features = []
        duplicate_features = []
        if 'grouping_feature' in self.configdict['General Setup'].keys():
            grouping_and_labeling_features.append(grouping_feature)
        if 'labeling_features' in self.configdict['General Setup'].keys():
            for feature in labeling_features:
                grouping_and_labeling_features.append(feature)
                if feature in x_features:
                    if feature not in duplicate_features:
                        duplicate_features.append(feature)

        # Now merge dataframes
        dataframe_labeled_grouped = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe_labeled, dataframe2=dataframe_grouped)
        dataframe_merged = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe_nostrings,
            dataframe2=dataframe_labeled_grouped)

        # Add string columns back in
        string_x_features = list()
        for my_x_feature in x_features:
            if my_x_feature not in nonstring_x_features:
                string_x_features.append(my_x_feature)
        logging.debug("string features: %s" % string_x_features)
        for string_x_feature in string_x_features:
            dataframe_merged[string_x_feature] = dataframe_orig_dropped_na[
                string_x_feature]

        # Need to remove duplicate features after merging
        logging.debug("merged:%s" % dataframe_merged.columns)
        dataframe_rem = FeatureIO(
            dataframe=dataframe_merged).remove_duplicate_columns()

        myXdata, myydata, myx_features, myy_feature, dataframe_final = \
            DataParser(configdict=self.configdict).parse_fromdataframe(
                dataframe=dataframe_rem, target_feature=y_feature)

        combined_x_features = list()
        logging.debug("total features:%s" % myx_features)
        for feature in myx_features:
            # Keep features originally designated, or created from feature generation
            if (feature in original_x_features) or not (
                    feature in original_columns):
                combined_x_features.append(feature)
        logging.debug("combined x features:%s" % combined_x_features)

        data_dict[data_name] = DataHandler(
            data=dataframe_final,
            input_data=dataframe_final[combined_x_features],
            target_data=myydata,
            input_features=combined_x_features,
            target_feature=myy_feature,
            target_error_feature=target_error_feature,
            labeling_features=labeling_features,
            grouping_feature=grouping_feature)
        # logging.info('Parsed the input data located under %s' % data_path)

        # Get dataframe stats
        DataframeUtilities.save_all_dataframe_statistics(
            dataframe=dataframe_final, configdict=self.configdict)

    return data_dict, y_feature
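# Sketch (hedged): the config-flag checks in _create_data_dict accept either
# boolean True or the string "True" because configdict values may arrive as
# raw strings from the input file. A hypothetical helper capturing the pattern:

def _config_flag(configdict, section, key, default=False):
    """Return True when a config value is boolean True or the string "True"."""
    if section not in configdict:
        return default
    return configdict[section].get(key) in (True, "True")

# e.g. generate_features = _config_flag(configdict, 'Feature Generation',
#                                       'perform_feature_generation')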