def csv_add_features(self, csvsrc, csvdest):
    """Augment a saved CSV with derived feature columns and re-save it.

    Reads <self.save_path>/<csvsrc>.csv, then adds, in order:
      1. the DBTT.E900 custom feature (built from wt% P/Ni/Cu/Mn, fluence,
         temperature and product id columns),
      2. log10 columns for fluence and flux,
      3. min-max normalized "N(...)" versions of selected columns,
    and finally writes the result to <self.save_path>/<csvdest>.csv.

    Args:
        csvsrc: source CSV basename (without ".csv") under self.save_path.
        csvdest: destination CSV basename (without ".csv").

    Returns:
        None; the augmented CSV is written to disk as a side effect.
    """
    afm_dict=dict()    # additional-feature method name -> kwargs for that method
    param_dict=dict()  # additional-feature method name -> param dict
    #E900 column: map E900 model argument names to dataframe column names
    e900_dict = dict()
    for elem in ['P','Ni','Cu','Mn']: #Si, C not used in e900
        e900_dict['wt%s' % elem] = 'wt_percent_%s' % elem
    e900_dict['fluencestr'] = 'fluence_n_cm2'
    e900_dict['tempC'] = 'temperature_C'
    e900_dict['prod_ID'] = 'product_id'
    afm_dict['DBTT.E900'] = dict(e900_dict)
    param_dict['DBTT.E900'] = dict()
    #get_dataframe
    csv_dataparser = DataParser()
    csv_dataframe = csv_dataparser.import_data("%s.csv" % os.path.join(self.save_path, csvsrc))
    #add features: each custom feature is computed, then appended as a new column
    for afm in afm_dict.keys():
        (feature_name, feature_data) = cf_help.get_custom_feature_data(class_method_str = afm,
                starting_dataframe = csv_dataframe,
                param_dict = dict(param_dict[afm]),
                addl_feature_method_kwargs = dict(afm_dict[afm]))
        fio = FeatureIO(csv_dataframe)
        csv_dataframe = fio.add_custom_features([feature_name],feature_data)
    #add log10 features (fluence and flux span many orders of magnitude)
    log10_dict=dict()
    log10_dict['fluence_n_cm2'] = dict()
    log10_dict['flux_n_cm2_sec'] = dict()
    for lkey in log10_dict.keys():
        orig_data = csv_dataframe[lkey]
        log10_data = np.log10(orig_data)
        fio = FeatureIO(csv_dataframe)
        csv_dataframe = fio.add_custom_features(["log(%s)" % lkey], log10_data)
    #add normalizations: fixed smin/smax bounds per column for min-max scaling
    norm_dict = dict()
    norm_dict['log(fluence_n_cm2)']=dict()
    norm_dict['log(fluence_n_cm2)']['smin'] = 17
    norm_dict['log(fluence_n_cm2)']['smax'] = 25
    norm_dict['log(flux_n_cm2_sec)']=dict()
    norm_dict['log(flux_n_cm2_sec)']['smin'] = 10
    norm_dict['log(flux_n_cm2_sec)']['smax'] = 15
    norm_dict['temperature_C']=dict()
    norm_dict['temperature_C']['smin'] = 270
    norm_dict['temperature_C']['smax'] = 320
    # All atomic-percent columns share one scale so they stay comparable.
    for elem in ["P","C","Cu","Ni","Mn","Si"]:
        norm_dict["at_percent_%s" % elem] = dict()
        norm_dict["at_percent_%s" % elem]['smin'] = 0.0
        norm_dict["at_percent_%s" % elem]['smax'] = 1.717 #max Mn atomic percent
    for nkey in norm_dict.keys():
        fnorm = FeatureNormalization(csv_dataframe)
        scaled_feature = fnorm.minmax_scale_single_feature(nkey,
                                smin=norm_dict[nkey]['smin'],
                                smax=norm_dict[nkey]['smax'])
        fio = FeatureIO(csv_dataframe)
        csv_dataframe = fio.add_custom_features(["N(%s)" % nkey],scaled_feature)
    csv_dataframe.to_csv("%s.csv" % os.path.join(self.save_path, csvdest))
    return
def test_parse_fromfile(self):
    """parse_fromfile returns pandas objects by default and numpy arrays
    when as_array=True; the full dataframe is a pandas DataFrame either way."""
    # Table of as_array flag -> expected (Xdata type, ydata type).
    expected_types = {False: (pd.DataFrame, pd.Series),
                      True: (np.ndarray, np.ndarray)}
    for as_array, (x_type, y_type) in expected_types.items():
        parsed = DataParser(configdict=self.configdict).parse_fromfile(
            datapath=self.datapath, as_array=as_array)
        Xdata, ydata, x_features, y_feature, dataframe = parsed
        self.assertIsInstance(Xdata, x_type)
        self.assertIsInstance(ydata, y_type)
        self.assertIsInstance(dataframe, pd.DataFrame)
    return
def _parse_input_data(self, data_path=""):
    """Parse a CSV data file into feature arrays and a dataframe.

    Args:
        data_path: path to the CSV file to parse.

    Returns:
        (Xdata, ydata, x_features, y_feature, dataframe) as produced by
        DataParser.parse_fromfile with as_array=False.

    Raises:
        FileNotFoundError: if no file exists at data_path.  This is a
        subclass of OSError, so existing callers catching OSError are
        unaffected.
    """
    if not os.path.isfile(data_path):
        # FileNotFoundError is more precise than the bare OSError raised
        # previously, while remaining backward-compatible for callers.
        raise FileNotFoundError("No file found at %s" % data_path)
    Xdata, ydata, x_features, y_feature, dataframe = DataParser(
        configdict=self.configdict).parse_fromfile(datapath=data_path,
                                                   as_array=False)
    return Xdata, ydata, x_features, y_feature, dataframe
def test_get_features(self):
    """get_features should yield a plain list of x-feature names and echo
    back the requested target feature when reading from a dataframe."""
    parser = DataParser(configdict=self.configdict)
    x_features, y_feature = parser.get_features(
        dataframe=self.df1,
        target_feature=self.target_feature,
        from_input_file=False)
    # Exact-type check (not isinstance) mirrors the original contract.
    self.assertTrue(type(x_features) is list)
    self.assertTrue(y_feature == self.target_feature)
    return
def _split_csv_file(self):
    """Split the initial CSV into one CSV per target feature.

    Each generated file keeps every non-target column plus exactly one
    target column, and is written next to the original file with a
    "_<n>" suffix.  Data paths from the other 'Data Setup' sections are
    appended unchanged.

    Returns:
        list of data-file paths: the newly written split CSVs followed by
        the non-'Initial' data paths from the input file.

    Side effects:
        Writes the split CSV files to disk; exits the process via
        sys.exit() if a target feature is missing from the CSV.
    """
    # Need dataframe and x and y features so can split CSV accordingly.
    data_path_list = []
    initial_data_path = self.configdict['Data Setup']['Initial']['data_path']
    dataframe = DataParser(configdict=self.configdict).import_data(
        datapath=initial_data_path)
    y_feature = self.configdict['General Setup']['target_feature']
    # Every column that is not a target feature goes into each split file.
    other_features = [column for column in dataframe.columns.values
                      if column not in y_feature]
    dataframe_x = dataframe.loc[:, other_features]
    for count, feature in enumerate(y_feature, start=1):
        try:
            dataframe_y = dataframe.loc[:, feature]
        except KeyError:
            logging.info(
                'Error detected: The feature names in the csv and input files do not match'
            )
            print(
                'The feature names in the csv and input files do not match. Please fix feature names and re-run MASTML'
            )
            sys.exit()
        dataframe_new = DataframeUtilities.merge_dataframe_columns(
            dataframe1=dataframe_x, dataframe2=dataframe_y)
        # Write the new dataframe to new CSV, and update data_path_list.
        # os.path.join is portable and handles an empty directory part,
        # unlike the previous manual "/" concatenation.
        directory, basename = os.path.split(initial_data_path)
        stem = basename.split(".csv")[0]
        data_path = os.path.join(directory, "%s_%d.csv" % (stem, count))
        dataframe_new.to_csv(data_path, index=False)
        data_path_list.append(data_path)
    # Last, add file data paths that are not part of original CSV file to split
    for key in self.configdict['Data Setup'].keys():
        if key != 'Initial':
            data_path_list.append(
                self.configdict['Data Setup'][key]['data_path'])
    return data_path_list
def _perform_feature_selection(self, dataframe, x_features, y_feature):
    """Apply the feature-selection steps requested in the 'Feature
    Selection' section of the input file.

    Supported steps: removal of constant features, then one of
    sequential_forward_selection / recursive_feature_elimination /
    univariate_feature_selection, with an optional feature learning curve.

    Args:
        dataframe: input data (features + target column).
        x_features: list of candidate feature names.
        y_feature: target feature name.

    Returns:
        The dataframe reduced to the selected features.
    """
    for k, v in self.configdict['Feature Selection'].items():
        if k == 'remove_constant_features' and v == 'True':
            logging.info(
                'FEATURE SELECTION: Removing constant features from your feature list'
            )
            dr = DimensionalReduction(dataframe=dataframe,
                                      x_features=x_features,
                                      y_feature=y_feature)
            dataframe = dr.remove_constant_features()
            # Columns may have been dropped; re-derive the feature list.
            x_features, y_feature = DataParser(
                configdict=self.configdict).get_features(
                    dataframe=dataframe, target_feature=y_feature)
        if k == 'feature_selection_algorithm':
            logging.info(
                'FEATURE SELECTION: Selecting features using a %s algorithm'
                % v)
            model_to_use = str(self.configdict['Feature Selection']
                               ['model_to_use_for_learning_curve'])
            fs = FeatureSelection(configdict=self.configdict,
                                  dataframe=dataframe,
                                  x_features=x_features,
                                  y_feature=y_feature,
                                  model_type=model_to_use)
            # Hoisted: the configured count is used by every branch below.
            number_to_keep = int(self.configdict['Feature Selection']
                                 ['number_of_features_to_keep'])
            use_mutual_info = self.configdict['Feature Selection'][
                'use_mutual_information']
            if v == 'sequential_forward_selection':
                if number_to_keep <= len(x_features):
                    dataframe = fs.sequential_forward_selection(
                        number_features_to_keep=number_to_keep)
                else:
                    logging.info(
                        'Warning: you have specified to keep more features than the total number of features in your dataset. Defaulting to keep all features in feature selection'
                    )
                    dataframe = fs.sequential_forward_selection(
                        number_features_to_keep=int(len(x_features)))
            if v == 'recursive_feature_elimination':
                if number_to_keep <= len(x_features):
                    dataframe = fs.feature_selection(
                        feature_selection_type=
                        'recursive_feature_elimination',
                        number_features_to_keep=number_to_keep,
                        use_mutual_info=use_mutual_info)
                else:
                    logging.info(
                        'Warning: you have specified to keep more features than the total number of features in your dataset. Defaulting to keep all features in feature selection'
                    )
                    # BUG FIX: this branch previously still passed the
                    # (too large) configured count, contradicting the log
                    # message and the other two algorithms.  Now it truly
                    # defaults to keeping all features.
                    dataframe = fs.feature_selection(
                        feature_selection_type=
                        'recursive_feature_elimination',
                        number_features_to_keep=int(len(x_features)),
                        use_mutual_info=use_mutual_info)
                if self.configdict['Feature Selection'][
                        'generate_feature_learning_curve'] == 'True':
                    learningcurve = LearningCurve(
                        configdict=self.configdict,
                        dataframe=dataframe,
                        model_type=model_to_use)
                    logging.info(
                        'Generating a feature learning curve using a %s algorithm'
                        % v)
                    learningcurve.generate_feature_learning_curve(
                        feature_selection_algorithm=
                        'recursive_feature_elimination')
            if v == 'univariate_feature_selection':
                if number_to_keep <= len(x_features):
                    dataframe = fs.feature_selection(
                        feature_selection_type=
                        'univariate_feature_selection',
                        number_features_to_keep=number_to_keep,
                        use_mutual_info=use_mutual_info)
                else:
                    logging.info(
                        'Warning: you have specified to keep more features than the total number of features in your dataset. Defaulting to keep all features in feature selection'
                    )
                    dataframe = fs.feature_selection(
                        feature_selection_type=
                        'univariate_feature_selection',
                        number_features_to_keep=int(len(x_features)),
                        use_mutual_info=use_mutual_info)
                if self.configdict['Feature Selection'][
                        'generate_feature_learning_curve'] == 'True':
                    learningcurve = LearningCurve(
                        configdict=self.configdict,
                        dataframe=dataframe,
                        model_type=model_to_use)
                    logging.info(
                        'Generating a feature learning curve using a %s algorithm'
                        % v)
                    learningcurve.generate_feature_learning_curve(
                        feature_selection_algorithm=
                        'univariate_feature_selection')
    return dataframe
def _create_data_dict(self):
    """Build a DataHandler for every dataset listed in 'Data Setup'.

    For each dataset this parses the CSV, then (per the input-file flags)
    generates, normalizes and selects features, merges the labeling and
    grouping columns back in, strips duplicate columns, and wraps the
    result in a DataHandler.

    Returns:
        (data_dict, y_feature): dict mapping data name -> DataHandler, and
        the target feature name from the last dataset processed.
    """
    data_dict = dict()
    for data_name in self.data_setup.keys():
        data_path = self.configdict['Data Setup'][data_name]['data_path']
        logging.info(
            'Creating data dict for data path %s and data name %s' %
            (data_path, data_name))
        # NOTE(review): data_weights is read here but never used below in
        # this method — confirm whether it is consumed elsewhere.
        data_weights = self.data_setup[data_name]['weights']
        # Optional General Setup entries default to None when absent.
        if 'labeling_features' in self.general_setup.keys():
            labeling_features = self._string_or_list_input_to_list(
                self.general_setup['labeling_features'])
        else:
            labeling_features = None
        if 'target_error_feature' in self.general_setup.keys():
            target_error_feature = self.general_setup['target_error_feature']
        else:
            target_error_feature = None
        if 'grouping_feature' in self.general_setup.keys():
            grouping_feature = self.general_setup['grouping_feature']
        else:
            grouping_feature = None
        # Config flags may arrive as bool True or the string "True",
        # depending on how the input file was parsed; accept both.
        if 'Feature Generation' in self.configdict.keys():
            if self.configdict['Feature Generation']['perform_feature_generation'] == bool(True) or \
               self.configdict['Feature Generation']['perform_feature_generation'] == "True":
                generate_features = True
            else:
                generate_features = False
        else:
            generate_features = False
        if 'Feature Normalization' in self.configdict.keys():
            if self.configdict['Feature Normalization']['normalize_x_features'] == bool(True) or \
               self.configdict['Feature Normalization']['normalize_x_features'] == "True":
                normalize_x_features = True
            else:
                normalize_x_features = False
            if self.configdict['Feature Normalization']['normalize_y_feature'] == bool(True) or \
               self.configdict['Feature Normalization']['normalize_y_feature'] == "True":
                normalize_y_feature = True
            else:
                normalize_y_feature = False
        else:
            normalize_x_features = False
            normalize_y_feature = False
        if 'Feature Selection' in self.configdict.keys():
            if self.configdict['Feature Selection']['perform_feature_selection'] == bool(True) or \
               self.configdict['Feature Selection']['perform_feature_selection'] == "True":
                select_features = True
            else:
                select_features = False
        else:
            select_features = False
        logging.info("Feature Generation: %s" % generate_features)
        logging.info("Feature Normalization (x_features): %s" %
                     normalize_x_features)
        logging.info("Feature Normalization (y_feature): %s" %
                     normalize_y_feature)
        logging.info("Feature Selection: %s" % select_features)
        # Parse input data file
        Xdata, ydata, x_features, y_feature, dataframe = self._parse_input_data(
            data_path)
        # Plot initial histogram of input target data
        DataframeUtilities().plot_dataframe_histogram(
            configdict=self.configdict,
            dataframe=dataframe,
            y_feature=y_feature)
        # Keep the original feature/column lists so generated features can
        # be distinguished from originals at the end of the loop.
        original_x_features = list(x_features)
        original_columns = list(dataframe.columns)
        logging.debug("original columns: %s" % original_columns)
        # Remove any missing rows from dataframe
        #dataframe = dataframe.dropna()
        # Save off label and grouping data
        dataframe_labeled = pd.DataFrame()
        dataframe_grouped = pd.DataFrame()
        if not (labeling_features is None):
            dataframe_labeled = FeatureIO(
                dataframe=dataframe).keep_custom_features(
                    features_to_keep=labeling_features,
                    y_feature=y_feature)
            if normalize_x_features == bool(True):
                dataframe_labeled, scaler = FeatureNormalization(
                    dataframe=dataframe_labeled,
                    configdict=self.configdict).normalize_features(
                        x_features=labeling_features,
                        y_feature=y_feature)
        if not (grouping_feature is None):
            dataframe_grouped = FeatureIO(
                dataframe=dataframe).keep_custom_features(
                    features_to_keep=[grouping_feature],
                    y_feature=y_feature)
        # Generate additional descriptors, as specified in input file (optional)
        if generate_features:
            dataframe = self._perform_feature_generation(
                dataframe=dataframe)
            # Actually, the x_features_NOUSE is required if starting from no
            # features and doing feature generation. Not renaming for now. RJ 7/17
            Xdata, ydata, x_features_NOUSE, y_feature, dataframe = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe, target_feature=y_feature)
        else:
            Xdata, ydata, x_features, y_feature, dataframe = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe, target_feature=y_feature)
        # First remove features containing strings before doing feature
        # normalization or other operations, but don't remove grouping features
        if generate_features == bool(True):
            nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                configdict=self.configdict
            ).remove_features_containing_strings(
                dataframe=dataframe, x_features=x_features_NOUSE)
            #Remove columns containing all entries of NaN
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1,
                                                             how='all')
            # Re-obtain x_feature list as some features may have been dropped
            Xdata, ydata, x_features_NOUSE, y_feature, dataframe_nostrings = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_nostrings, target_feature=y_feature)
        else:
            nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                configdict=self.configdict
            ).remove_features_containing_strings(dataframe=dataframe,
                                                 x_features=x_features)
            # Remove columns containing all entries of NaN
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1,
                                                             how='all')
            # NOTE(review): despite the original "fill NaN" intent, this
            # drops any column that still contains a NaN value.
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1,
                                                             how='any')
            # Re-obtain x_feature list as some features may have been dropped
            Xdata, ydata, x_features, y_feature, dataframe_nostrings = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_nostrings, target_feature=y_feature)
        logging.debug("pre-changes:%s" % dataframe_nostrings.columns)
        # Normalize features (optional)
        if normalize_x_features == bool(
                True) or normalize_y_feature == bool(True):
            fn = FeatureNormalization(dataframe=dataframe_nostrings,
                                      configdict=self.configdict)
            dataframe_nostrings, scaler = fn.normalize_features(
                x_features=x_features,
                y_feature=y_feature,
                normalize_x_features=normalize_x_features,
                normalize_y_feature=normalize_y_feature)
            x_features, y_feature = DataParser(
                configdict=self.configdict).get_features(
                    dataframe=dataframe_nostrings,
                    target_feature=y_feature)
        # Perform feature selection and dimensional reduction, as specified
        # in the input file (optional)
        if (select_features == bool(True)) and (
                y_feature in dataframe_nostrings.columns):
            # Remove any additional columns that are not x_features using to
            # be fit to data
            features = dataframe_nostrings.columns.values.tolist()
            features_to_remove = []
            for feature in features:
                if feature not in x_features and feature not in y_feature:
                    features_to_remove.append(feature)
            dataframe_nostrings = FeatureIO(
                dataframe=dataframe_nostrings).remove_custom_features(
                    features_to_remove=features_to_remove)
            dataframe_nostrings = self._perform_feature_selection(
                dataframe=dataframe_nostrings,
                x_features=x_features,
                y_feature=y_feature)
            x_features, y_feature = DataParser(
                configdict=self.configdict).get_features(
                    dataframe=dataframe_nostrings,
                    target_feature=y_feature)
        logging.debug("post-removal:%s" % dataframe_nostrings.columns)
        # Combine the input dataframe, which has undergone feature generation
        # and normalization, with the grouped and labeled features of original
        # dataframe.  First, build the list of grouped/labeled feature names.
        grouping_and_labeling_features = []
        duplicate_features = []
        if 'grouping_feature' in self.configdict['General Setup'].keys():
            grouping_and_labeling_features.append(grouping_feature)
        if 'labeling_features' in self.configdict['General Setup'].keys():
            for feature in labeling_features:
                grouping_and_labeling_features.append(feature)
                if feature in x_features:
                    if feature not in duplicate_features:
                        duplicate_features.append(feature)
        # Now merge dataframes
        dataframe_labeled_grouped = DataframeUtilities(
        ).merge_dataframe_columns(dataframe1=dataframe_labeled,
                                  dataframe2=dataframe_grouped)
        dataframe_merged = DataframeUtilities().merge_dataframe_columns(
            dataframe1=dataframe_nostrings,
            dataframe2=dataframe_labeled_grouped)
        #Add string columns back in
        string_x_features = list()
        for my_x_feature in x_features:
            if my_x_feature in nonstring_x_features:
                pass
            else:
                string_x_features.append(my_x_feature)
        logging.debug("string features: %s" % string_x_features)
        for string_x_feature in string_x_features:
            # NOTE(review): dataframe_orig_dropped_na is never assigned in
            # this method (the dropna line above is commented out) — if any
            # string features exist, this raises NameError unless the name
            # is defined elsewhere in the class/module.  Confirm.
            dataframe_merged[string_x_feature] = dataframe_orig_dropped_na[
                string_x_feature]
        # Need to remove duplicate features after merging.
        logging.debug("merged:%s" % dataframe_merged.columns)
        dataframe_rem = FeatureIO(
            dataframe=dataframe_merged).remove_duplicate_columns()
        myXdata, myydata, myx_features, myy_feature, dataframe_final = DataParser(
            configdict=self.configdict).parse_fromdataframe(
                dataframe=dataframe_rem, target_feature=y_feature)
        combined_x_features = list()
        logging.debug("total features:%s" % myx_features)
        for feature in myx_features:
            if (feature in original_x_features) or not (
                    feature in original_columns
            ):  #originally designated, or created from feature generation
                combined_x_features.append(feature)
        logging.debug("combined x features:%s" % combined_x_features)
        data_dict[data_name] = DataHandler(
            data=dataframe_final,
            input_data=dataframe_final[combined_x_features],
            target_data=myydata,
            input_features=combined_x_features,
            target_feature=myy_feature,
            target_error_feature=target_error_feature,
            labeling_features=labeling_features,
            grouping_feature=grouping_feature)
        # logging.info('Parsed the input data located under %s' % data_path)
        # Get dataframe stats
        DataframeUtilities.save_all_dataframe_statistics(
            dataframe=dataframe_final, configdict=self.configdict)
    return data_dict, y_feature
def test_get_data(self):
    """Smoke test: get_data runs without raising on the fixture dataframe."""
    parser = DataParser(configdict=self.configdict)
    Xdata, ydata = parser.get_data(dataframe=self.df1,
                                   x_features=self.x_features,
                                   y_feature=self.target_feature)
    return
def test_import_data(self):
    """import_data should load the CSV at self.datapath as a DataFrame."""
    parser = DataParser(configdict=self.configdict)
    loaded = parser.import_data(datapath=self.datapath)
    self.assertIsInstance(loaded, pd.DataFrame)
    return