Code example #1
 def add_filters(self, filter_list):
     if filter_list is None:
         return
     for (feature, operator, threshold) in filter_list:
         fio = FeatureIO(self.data)
         self.data = fio.custom_feature_filter(feature, operator, threshold)
     self.set_up_data_from_features()
     return
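A minimal sketch of the (feature, operator, threshold) filter format that the loop above unpacks, applied here with plain pandas rather than FeatureIO.custom_feature_filter; the column names, thresholds, and the ops table are hypothetical:

    import pandas as pd

    df = pd.DataFrame({'temperature_C': [290, 330], 'wt_percent_Cu': [0.05, 0.2]})
    filters = [('temperature_C', '<', 320), ('wt_percent_Cu', '>=', 0.1)]
    ops = {'<': lambda s, t: s < t, '>=': lambda s, t: s >= t}
    for feature, operator, threshold in filters:
        df = df[ops[operator](df[feature], threshold)]
    # Each example row fails one of the filters, so df ends up empty here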
Code example #2
File: DBTTData.py  Project: robertmaxwilliams/MAST-ML
 def csv_add_features(self, csvsrc, csvdest):
     afm_dict = dict()
     param_dict = dict()
     # E900 feature columns
     e900_dict = dict()
     for elem in ['P', 'Ni', 'Cu', 'Mn']:  # Si and C are not used in E900
         e900_dict['wt%s' % elem] = 'wt_percent_%s' % elem
     e900_dict['fluencestr'] = 'fluence_n_cm2'
     e900_dict['tempC'] = 'temperature_C'
     e900_dict['prod_ID'] = 'product_id'
     afm_dict['DBTT.E900'] = dict(e900_dict)
     param_dict['DBTT.E900'] = dict()
     # Parse the source CSV into a dataframe
     csv_dataparser = DataParser()
     csv_dataframe = csv_dataparser.import_data("%s.csv" % os.path.join(self.save_path, csvsrc))
     # Add each custom feature listed in afm_dict
     for afm in afm_dict.keys():
         (feature_name, feature_data) = cf_help.get_custom_feature_data(
             class_method_str=afm,
             starting_dataframe=csv_dataframe,
             param_dict=dict(param_dict[afm]),
             addl_feature_method_kwargs=dict(afm_dict[afm]))
         fio = FeatureIO(csv_dataframe)
         csv_dataframe = fio.add_custom_features([feature_name], feature_data)
     # Add log10 versions of the fluence and flux features
     log10_dict = dict()
     log10_dict['fluence_n_cm2'] = dict()
     log10_dict['flux_n_cm2_sec'] = dict()
     for lkey in log10_dict.keys():
         orig_data = csv_dataframe[lkey]
         log10_data = np.log10(orig_data)
         fio = FeatureIO(csv_dataframe)
         csv_dataframe = fio.add_custom_features(["log(%s)" % lkey], log10_data)
     # Add min-max normalizations over fixed ranges
     norm_dict = dict()
     norm_dict['log(fluence_n_cm2)'] = {'smin': 17, 'smax': 25}
     norm_dict['log(flux_n_cm2_sec)'] = {'smin': 10, 'smax': 15}
     norm_dict['temperature_C'] = {'smin': 270, 'smax': 320}
     for elem in ["P", "C", "Cu", "Ni", "Mn", "Si"]:
         # 1.717 is the maximum Mn atomic percent
         norm_dict["at_percent_%s" % elem] = {'smin': 0.0, 'smax': 1.717}
     for nkey in norm_dict.keys():
         fnorm = FeatureNormalization(csv_dataframe)
         scaled_feature = fnorm.minmax_scale_single_feature(nkey,
                             smin=norm_dict[nkey]['smin'], 
                             smax=norm_dict[nkey]['smax'])
         fio = FeatureIO(csv_dataframe)
         csv_dataframe = fio.add_custom_features(["N(%s)" % nkey],scaled_feature)
     csv_dataframe.to_csv("%s.csv" % os.path.join(self.save_path, csvdest))
     return
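The normalization loop above presumably applies the standard min-max map (x - smin) / (smax - smin). A standalone sketch of that assumed behavior in plain pandas (FeatureNormalization's internals are not shown in this example):

    import numpy as np
    import pandas as pd

    def minmax_scale(series, smin, smax):
        # Map smin to 0 and smax to 1, matching the assumed scaling semantics
        return (series - smin) / (smax - smin)

    fluence = pd.Series([1e18, 1e19, 1e20], name='fluence_n_cm2')
    n_log_fluence = minmax_scale(np.log10(fluence), smin=17, smax=25)
    # log10(1e18) = 18, so the first entry scales to (18 - 17) / (25 - 17) = 0.125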
Code example #3
 def get_afm_updated_dataset(self, indiv_df, indiv_params):
     """Update dataframe with additional feature methods
     """
     for afm in indiv_params.keys():
         if afm == 'model':  #model dealt with separately
             continue
         afm_kwargs = dict(indiv_params[afm])
         (feature_name, feature_data) = cf_help.get_custom_feature_data(
             afm,
             starting_dataframe=indiv_df,
             addl_feature_method_kwargs=dict(afm_kwargs))
         fio = FeatureIO(indiv_df)
         indiv_df = fio.add_custom_features([afm], feature_data)
     return indiv_df
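The expected shape of indiv_params can be inferred from code example #2: one kwargs dict per additional-feature-method name, plus a 'model' entry that this method skips. A hypothetical sketch (the method name and kwargs mirror example #2 but are otherwise assumptions):

    indiv_params = {
        'model': None,  # placeholder; the model entry is handled separately
        'DBTT.E900': {'wtP': 'wt_percent_P', 'fluencestr': 'fluence_n_cm2',
                      'tempC': 'temperature_C', 'prod_ID': 'product_id'},
    }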
Code example #4
File: Testing.py  Project: robertmaxwilliams/MAST-ML
 def subtraction(self, col1="", col2="", num1="", num2="", **params):
     """Testing function.
         col1 <str>: first feature name
         col2 <str>: second feature name
         num1 <float>: number to multiply col1 by
         num2 <float>: number to add
     """
     col1_data = self.df[col1]
     col2_data = self.df[col2]
     new_data = (col1_data * num1) - col2_data + num2
     fio = FeatureIO(self.df)
     new_df = fio.add_custom_features(["Subtraction_test"],new_data)
     fnorm = FeatureNormalization(new_df)
     N_new_data = fnorm.minmax_scale_single_feature("Subtraction_test")
     return N_new_data
Code example #5
 def test_custom_feature_filter(self):
     df = FeatureIO(dataframe=self.df1).custom_feature_filter(
         feature='AtomicNumber_composition_average',
         operator='<',
         threshold=21)
     self.assertTrue(df.shape[0] < self.df1.shape[0])
     return
Code example #6
 def test_add_custom_features(self):
     df1columns = self.df1.shape[1]
     df = FeatureIO(dataframe=self.df1).add_custom_features(
         features_to_add=['test_feature'],
         data_to_add=np.zeros(shape=[self.df1.shape[0]]))
     self.assertTrue(df1columns < df.shape[1])
     return
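Judging from this test, add_custom_features appends one column per listed name and returns a new dataframe. A rough plain-pandas equivalent of what the test exercises (a sketch of the assumed behavior, not FeatureIO's actual implementation):

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})  # stand-in for self.df1
    new_df = df1.copy()
    new_df['test_feature'] = np.zeros(df1.shape[0])
    assert new_df.shape[1] == df1.shape[1] + 1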
Code example #7
 def add_normalization(self, cname, verbose=0):
     df = self.dfs[cname]
     norm_dict = dict()
     norm_dict['log(fluence_n_cm2)'] = {'smin': 17, 'smax': 25}
     norm_dict['log(flux_n_cm2_sec)'] = {'smin': 10, 'smax': 15}
     # All effective-fluence features share the same log10 scaling range
     for pval in [8, 10, 20, 23, 26]:
         norm_dict['log(eff fl 100p=%i)' % pval] = {'smin': 17, 'smax': 25}
     norm_dict['temperature_C'] = {'smin': 270, 'smax': 320}
     for elem in ["P", "C", "Cu", "Ni", "Mn", "Si"]:
         # 1.717 is the maximum Mn atomic percent
         norm_dict["at_percent_%s" % elem] = {'smin': 0.0, 'smax': 1.717}
     for nkey in norm_dict.keys():
         fnorm = FeatureNormalization(df)
         scaled_feature = fnorm.minmax_scale_single_feature(
             nkey,
             smin=norm_dict[nkey]['smin'],
             smax=norm_dict[nkey]['smax'])
         fio = FeatureIO(df)
         df = fio.add_custom_features(["N(%s)" % nkey], scaled_feature)
     self.dfs[cname] = df
     return
Code example #8
 def test_remove_duplicate_columns(self):
     # Manually add duplicate column to dataframe
     self.df1['AtomicNumber_composition_average_copy'] = pd.Series(
         self.df1['AtomicNumber_composition_average'], index=self.df1.index)
     self.df1.rename(
         columns={'AtomicNumber_composition_average_copy':
                  'AtomicNumber_composition_average'},
         inplace=True)
     df = FeatureIO(dataframe=self.df1).remove_duplicate_columns()
     self.assertTrue(df.shape[1] < self.df1.shape[1])
     return
Code example #9
    def calculate_EffectiveFluence(self,
                                   pvalue=0,
                                   ref_flux=3e10,
                                   flux_feature="",
                                   fluence_feature="",
                                   scale_min=1e17,
                                   scale_max=1e25,
                                   **params):
        """Calculate effective fluence
        """
        fluence = self.df[fluence_feature]
        flux = self.df[flux_feature]

        EFl = fluence * (ref_flux / flux)**pvalue
        EFl = np.log10(EFl)
        fio = FeatureIO(self.df)
        new_df = fio.add_custom_features(["EFl"], EFl)
        fnorm = FeatureNormalization(new_df)
        N_EFl = fnorm.minmax_scale_single_feature("EFl",
                                                  smin=np.log10(scale_min),
                                                  smax=np.log10(scale_max))

        return N_EFl
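Worked through with hypothetical inputs, the effective-fluence formula above behaves as follows:

    import numpy as np

    # Hypothetical values; ref_flux = 3e10 as in the default signature above
    fluence, flux, pvalue = 1e19, 3e11, 0.26
    efl = fluence * (3e10 / flux) ** pvalue  # 1e19 * 0.1**0.26, roughly 5.5e18
    log_efl = np.log10(efl)                  # roughly 18.74
    # With scale_min=1e17 and scale_max=1e25, the min-max scaled value is
    # (18.74 - 17) / (25 - 17), roughly 0.22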
Code example #10
    def make_data(self):
        n_samples, n_features = 100, 5
        y = self.random_state.randn(n_samples)
        X = self.random_state.randn(n_samples, n_features)

        nidx = np.arange(0, n_samples)
        self.dataframe = pd.DataFrame(index=nidx)
        num_cat = self.random_state.randint(0, 4, n_samples)
        cats = ['A', 'B', 'C', 'D']
        str_cat = [cats[nc] for nc in num_cat]
        time = nidx * np.pi / 8.0
        sine_feature = np.sin(time) + X[:, 0]  #add noise
        linear_feature = 100 * time + 30.0 + X[:, 1]  #add noise
        y_feature = np.sin(time) + y / 10.0
        y_feature_error = X[:, 3] / X[:, 4] / 100.0  #add random error
        d_cols = dict()
        d_cols["num_idx"] = nidx
        d_cols["num_cat"] = num_cat
        d_cols["str_cat"] = str_cat
        d_cols["time"] = time
        d_cols["sine_feature"] = sine_feature
        d_cols["linear_feature"] = linear_feature
        d_cols["y_feature"] = y_feature
        d_cols["y_feature_error"] = y_feature_error
        cols = list(d_cols.keys())
        cols.sort()
        for col in cols:
            fio = FeatureIO(self.dataframe)
            self.dataframe = fio.add_custom_features([col], d_cols[col])
        fnorm = FeatureNormalization(self.dataframe)
        N_sine_feature = fnorm.minmax_scale_single_feature("sine_feature")
        N_linear_feature = fnorm.minmax_scale_single_feature("linear_feature")
        fio = FeatureIO(self.dataframe)
        self.dataframe = fio.add_custom_features(["N_sine_feature"],
                                                 N_sine_feature)
        fio = FeatureIO(self.dataframe)
        self.dataframe = fio.add_custom_features(["N_linear_feature"],
                                                 N_sine_feature)
        return
Code example #11
 def test_keep_custom_features(self):
     df = FeatureIO(dataframe=self.df1).keep_custom_features(
         features_to_keep=['AtomicNumber_composition_average'])
     self.assertTrue(
         ['AtomicNumber_composition_average'] == df.columns.values.tolist())
     return
Code example #12
 def test_remove_custom_features(self):
     df = FeatureIO(dataframe=self.df1).remove_custom_features(
         features_to_remove=['AtomicNumber_composition_average'])
     self.assertTrue('AtomicNumber_composition_average' not in
                     df.columns.values.tolist())
     return
Code example #13
 def add_feature(self, feature_name, feature_data):
     fio = FeatureIO(self.data)
     self.data = fio.add_custom_features([feature_name], feature_data)
     return
Code example #14
 def add_prediction_sigma(self, prediction_data_sigma):
     fio = FeatureIO(self.data)
     self.data = fio.add_custom_features(["Prediction Sigma"],
                                         prediction_data_sigma)
     self.target_prediction_sigma = self.data["Prediction Sigma"]
     return
Code example #15
 def add_prediction(self, prediction_data):
     fio = FeatureIO(self.data)
     self.data = fio.add_custom_features(["Prediction"], prediction_data)
     self.target_prediction = self.data["Prediction"]
     return
Code example #16
File: MASTML.py  Project: robertmaxwilliams/MAST-ML
    def _create_data_dict(self):
        data_dict = dict()
        for data_name in self.data_setup.keys():
            data_path = self.configdict['Data Setup'][data_name]['data_path']

            logging.info(
                'Creating data dict for data path %s and data name %s' %
                (data_path, data_name))

            data_weights = self.data_setup[data_name]['weights']
            if 'labeling_features' in self.general_setup.keys():
                labeling_features = self._string_or_list_input_to_list(
                    self.general_setup['labeling_features'])
            else:
                labeling_features = None
            if 'target_error_feature' in self.general_setup.keys():
                target_error_feature = self.general_setup[
                    'target_error_feature']
            else:
                target_error_feature = None
            if 'grouping_feature' in self.general_setup.keys():
                grouping_feature = self.general_setup['grouping_feature']
            else:
                grouping_feature = None

            if 'Feature Generation' in self.configdict.keys():
                # The config value may be a real bool or the string "True"
                generate_features = self.configdict['Feature Generation'][
                    'perform_feature_generation'] in (True, "True")
            else:
                generate_features = False

            if 'Feature Normalization' in self.configdict.keys():
                normalize_x_features = self.configdict['Feature Normalization'][
                    'normalize_x_features'] in (True, "True")
                normalize_y_feature = self.configdict['Feature Normalization'][
                    'normalize_y_feature'] in (True, "True")
            else:
                normalize_x_features = False
                normalize_y_feature = False

            if 'Feature Selection' in self.configdict.keys():
                select_features = self.configdict['Feature Selection'][
                    'perform_feature_selection'] in (True, "True")
            else:
                select_features = False

            logging.info("Feature Generation: %s" % generate_features)
            logging.info("Feature Normalization (x_features): %s" %
                         normalize_x_features)
            logging.info("Feature Normalization (y_feature): %s" %
                         normalize_y_feature)
            logging.info("Feature Selection: %s" % select_features)
            # Parse input data file
            Xdata, ydata, x_features, y_feature, dataframe = self._parse_input_data(
                data_path)

            # Plot initial histogram of input target data
            DataframeUtilities().plot_dataframe_histogram(
                configdict=self.configdict,
                dataframe=dataframe,
                y_feature=y_feature)

            original_x_features = list(x_features)
            original_columns = list(dataframe.columns)
            logging.debug("original columns: %s" % original_columns)
            # Remove any missing rows, keeping a copy so that string columns
            # can be re-attached after the merge step below
            dataframe_orig_dropped_na = dataframe.dropna()

            # Save off label and grouping data
            dataframe_labeled = pd.DataFrame()
            dataframe_grouped = pd.DataFrame()
            if labeling_features is not None:
                dataframe_labeled = FeatureIO(
                    dataframe=dataframe).keep_custom_features(
                        features_to_keep=labeling_features,
                        y_feature=y_feature)
                if normalize_x_features:
                    dataframe_labeled, scaler = FeatureNormalization(
                        dataframe=dataframe_labeled,
                        configdict=self.configdict).normalize_features(
                            x_features=labeling_features, y_feature=y_feature)
            if grouping_feature is not None:
                dataframe_grouped = FeatureIO(
                    dataframe=dataframe).keep_custom_features(
                        features_to_keep=[grouping_feature],
                        y_feature=y_feature)

            # Generate additional descriptors, as specified in input file (optional)
            if generate_features:
                dataframe = self._perform_feature_generation(
                    dataframe=dataframe)
                # Actually, the x_features_NOUSE is required if starting from no features and doing feature generation. Not renaming for now. RJ 7/17
                Xdata, ydata, x_features_NOUSE, y_feature, dataframe = DataParser(
                    configdict=self.configdict).parse_fromdataframe(
                        dataframe=dataframe, target_feature=y_feature)

            else:
                Xdata, ydata, x_features, y_feature, dataframe = DataParser(
                    configdict=self.configdict).parse_fromdataframe(
                        dataframe=dataframe, target_feature=y_feature)

            # First remove features containing strings before doing feature normalization or other operations, but don't remove grouping features
            if generate_features:
                nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                    configdict=self.configdict
                ).remove_features_containing_strings(
                    dataframe=dataframe, x_features=x_features_NOUSE)
                #Remove columns containing all entries of NaN
                dataframe_nostrings = dataframe_nostrings.dropna(axis=1,
                                                                 how='all')
                # Re-obtain x_feature list as some features may have been dropped
                Xdata, ydata, x_features_NOUSE, y_feature, dataframe_nostrings = DataParser(
                    configdict=self.configdict).parse_fromdataframe(
                        dataframe=dataframe_nostrings,
                        target_feature=y_feature)
            else:
                nonstring_x_features, dataframe_nostrings = MiscFeatureOperations(
                    configdict=self.configdict
                ).remove_features_containing_strings(dataframe=dataframe,
                                                     x_features=x_features)

            # Remove columns containing all entries of NaN
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='all')

            # Drop columns that contain any NaN entries
            dataframe_nostrings = dataframe_nostrings.dropna(axis=1, how='any')

            # Re-obtain x_feature list as some features may have been dropped
            Xdata, ydata, x_features, y_feature, dataframe_nostrings = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_nostrings, target_feature=y_feature)

            logging.debug("pre-changes:%s" % dataframe_nostrings.columns)

            # Normalize features (optional)
            if normalize_x_features or normalize_y_feature:
                fn = FeatureNormalization(dataframe=dataframe_nostrings,
                                          configdict=self.configdict)
                dataframe_nostrings, scaler = fn.normalize_features(
                    x_features=x_features,
                    y_feature=y_feature,
                    normalize_x_features=normalize_x_features,
                    normalize_y_feature=normalize_y_feature)
                x_features, y_feature = DataParser(
                    configdict=self.configdict).get_features(
                        dataframe=dataframe_nostrings,
                        target_feature=y_feature)

            # Perform feature selection and dimensional reduction, as specified in the input file (optional)
            if select_features and (y_feature in dataframe_nostrings.columns):
                # Remove any additional columns that are not x_features using to be fit to data
                features = dataframe_nostrings.columns.values.tolist()
                features_to_remove = []
                for feature in features:
                    if feature not in x_features and feature != y_feature:
                        features_to_remove.append(feature)
                dataframe_nostrings = FeatureIO(
                    dataframe=dataframe_nostrings).remove_custom_features(
                        features_to_remove=features_to_remove)
                dataframe_nostrings = self._perform_feature_selection(
                    dataframe=dataframe_nostrings,
                    x_features=x_features,
                    y_feature=y_feature)
                x_features, y_feature = DataParser(
                    configdict=self.configdict).get_features(
                        dataframe=dataframe_nostrings,
                        target_feature=y_feature)

            logging.debug("post-removal:%s" % dataframe_nostrings.columns)
            # Combine the input dataframe, which has undergone feature generation and normalization, with the grouped and labeled features of original dataframe
            # First, need to generate dataframe that only has the grouped and labeled features
            grouping_and_labeling_features = []
            duplicate_features = []
            if 'grouping_feature' in self.configdict['General Setup'].keys():
                grouping_and_labeling_features.append(grouping_feature)
            if 'labeling_features' in self.configdict['General Setup'].keys():
                for feature in labeling_features:
                    grouping_and_labeling_features.append(feature)
                    if feature in x_features:
                        if feature not in duplicate_features:
                            duplicate_features.append(feature)

            # Now merge dataframes
            dataframe_labeled_grouped = DataframeUtilities(
            ).merge_dataframe_columns(dataframe1=dataframe_labeled,
                                      dataframe2=dataframe_grouped)
            dataframe_merged = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe_nostrings,
                dataframe2=dataframe_labeled_grouped)

            # Add string columns back in
            string_x_features = list()
            for my_x_feature in x_features:
                if my_x_feature not in nonstring_x_features:
                    string_x_features.append(my_x_feature)
            logging.debug("string features: %s" % string_x_features)
            for string_x_feature in string_x_features:
                dataframe_merged[string_x_feature] = dataframe_orig_dropped_na[
                    string_x_feature]

            # Need to remove duplicate features after merging.
            logging.debug("merged:%s" % dataframe_merged.columns)
            dataframe_rem = FeatureIO(
                dataframe=dataframe_merged).remove_duplicate_columns()

            myXdata, myydata, myx_features, myy_feature, dataframe_final = DataParser(
                configdict=self.configdict).parse_fromdataframe(
                    dataframe=dataframe_rem, target_feature=y_feature)
            combined_x_features = list()
            logging.debug("total features:%s" % myx_features)
            for feature in myx_features:
                # Keep features that were originally designated as x features
                # or that were created by feature generation
                if feature in original_x_features or feature not in original_columns:
                    combined_x_features.append(feature)
            logging.debug("combined x features:%s" % combined_x_features)
            data_dict[data_name] = DataHandler(
                data=dataframe_final,
                input_data=dataframe_final[combined_x_features],
                target_data=myydata,
                input_features=combined_x_features,
                target_feature=myy_feature,
                target_error_feature=target_error_feature,
                labeling_features=labeling_features,
                grouping_feature=grouping_feature)
            logging.info('Parsed the input data located under %s' % data_path)

            # Get dataframe stats
            DataframeUtilities.save_all_dataframe_statistics(
                dataframe=dataframe_final, configdict=self.configdict)

        return data_dict, y_feature
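The repeated bool-or-string flag checks in this method reduce to one small helper; a runnable sketch of that pattern (the helper name is made up):

    def flag_is_true(value):
        # Config values may arrive as a real bool or as the string "True"
        return value in (True, "True")

    assert flag_is_true(True) and flag_is_true("True")
    assert not flag_is_true(False) and not flag_is_true("False")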