def generate_citrine_features(self, save_to_csv=True):
        """Append Citrine-derived property features to the input dataframe.

        For every distinct value in the ``'Material compositions'`` column a
        query is built with ``self._get_pifquery``, its property names/values
        are extracted with ``self._get_pifquery_property_list`` and reduced by
        ``self._parse_pifquery_property_list`` into per-property minimum,
        maximum and average dictionaries. Each of those three feature sets is
        turned into a dataframe, aligned row-by-row with ``self.dataframe``
        and merged onto it.

        Args:
            save_to_csv: When equal to ``True``, the merged dataframe is also
                written to
                ``<save_path>/input_with_citrine_features_<tag>.csv`` where
                ``<tag>`` is the last dataframe column contained in the
                configured ``target_feature``.

        Returns:
            The merged dataframe produced by
            ``DataframeUtilities().merge_dataframe_columns``.

        Raises:
            KeyError: If ``self.dataframe`` has no ``'Material compositions'``
                column.
        """
        logging.info(
            'WARNING: You have specified generation of features from Citrine. Based on which materials you are '
            'interested in, there may be many records to parse through, thus this routine may take a long time to complete!'
        )
        compositions = self.dataframe['Material compositions'].tolist()

        citrine_dict_property_min = {}
        citrine_dict_property_max = {}
        citrine_dict_property_avg = {}
        # Results are keyed by composition, so a repeated composition would
        # only overwrite its own entry; query each distinct one a single time.
        for composition in dict.fromkeys(compositions):
            pifquery = self._get_pifquery(composition=composition)
            property_name_list, property_value_list = self._get_pifquery_property_list(
                pifquery=pifquery)
            # The first returned item (unique property names) is not needed here.
            _, parsed_property_min, parsed_property_max, parsed_property_avg = self._parse_pifquery_property_list(
                property_name_list=property_name_list,
                property_value_list=property_value_list)
            citrine_dict_property_min[composition] = parsed_property_min
            citrine_dict_property_max[composition] = parsed_property_max
            citrine_dict_property_avg[composition] = parsed_property_avg

        dataframe = self.dataframe
        for citrine_dict in (citrine_dict_property_min,
                             citrine_dict_property_max,
                             citrine_dict_property_avg):
            dataframe_citrine = pd.DataFrame.from_dict(data=citrine_dict,
                                                       orient='index')
            # Reorder (and repeat, for duplicated compositions) the feature
            # rows so they line up with the rows of the input dataframe.
            dataframe_citrine = dataframe_citrine.reindex(compositions)
            # The composition labels already exist in the input dataframe, so
            # drop them instead of turning them into a duplicate column.
            dataframe_citrine = dataframe_citrine.reset_index(drop=True)
            # Merge Citrine feature dataframe with the accumulated dataframe
            dataframe = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe, dataframe2=dataframe_citrine)

        # Equality (not truthiness) is intentional: a non-boolean value such
        # as the string 'False' must not trigger a save.
        if save_to_csv == True:  # noqa: E712
            general_setup = self.configdict['General Setup']
            target_feature = general_setup['target_feature']
            # Tag the file with the last dataframe column found in target_feature
            matching_columns = [
                column for column in dataframe.columns.values
                if column in target_feature
            ]
            if matching_columns:
                filetag = matching_columns[-1]
            else:
                # Previously this case crashed with an unbound 'filetag'.
                filetag = target_feature
                logging.warning(
                    'No dataframe column matches target_feature %s; using it '
                    'directly as the file tag.', target_feature)
            dataframe.to_csv('{}/input_with_citrine_features_{}.csv'.format(
                general_setup['save_path'], filetag),
                             index=False)

        return dataframe
    def generate_materialsproject_features(self, save_to_csv=True):
        """Append Materials Project data for each composition to the input dataframe.

        Every distinct value in the ``'Material compositions'`` column is
        passed to ``self._get_data_from_materials_project``; the returned
        per-composition data is turned into a dataframe, aligned row-by-row
        with ``self.dataframe`` and merged onto it.

        Args:
            save_to_csv: When equal to ``True``, the merged dataframe is also
                written to
                ``<save_path>/input_with_matproj_features_<tag>.csv`` where
                ``<tag>`` is the last dataframe column contained in the
                configured ``target_feature``.

        Returns:
            The merged dataframe produced by
            ``DataframeUtilities().merge_dataframe_columns``.

        Raises:
            SystemExit: If ``self.dataframe`` has no
                ``'Material compositions'`` column.
        """
        try:
            compositions = self.dataframe['Material compositions']
        except KeyError:
            print(
                'No column called "Material compositions" exists in the supplied dataframe.'
            )
            sys.exit()

        # Results are keyed by composition, so a repeated composition would
        # only overwrite its own entry; look each distinct one up a single time.
        mpdata_dict_composition = {
            composition:
            self._get_data_from_materials_project(composition=composition)
            for composition in dict.fromkeys(compositions)
        }

        dataframe_mp = pd.DataFrame.from_dict(data=mpdata_dict_composition,
                                              orient='index')
        # Reorder (and repeat, for duplicated compositions) the feature rows
        # so they line up with the rows of the input dataframe.
        dataframe_mp = dataframe_mp.reindex(compositions.tolist())
        # The composition labels already exist in the input dataframe, so drop
        # them instead of turning them into a duplicate column.
        dataframe_mp = dataframe_mp.reset_index(drop=True)
        # Merge Materials Project dataframe with originally supplied dataframe
        dataframe = DataframeUtilities().merge_dataframe_columns(
            dataframe1=self.dataframe, dataframe2=dataframe_mp)

        # Equality (not truthiness) is intentional: a non-boolean value such
        # as the string 'False' must not trigger a save.
        if save_to_csv == True:  # noqa: E712
            general_setup = self.configdict['General Setup']
            target_feature = general_setup['target_feature']
            # Tag the file with the last dataframe column found in target_feature
            matching_columns = [
                column for column in dataframe.columns.values
                if column in target_feature
            ]
            if matching_columns:
                filetag = matching_columns[-1]
            else:
                # Previously this case crashed with an unbound 'filetag'.
                filetag = target_feature
                logging.warning(
                    'No dataframe column matches target_feature %s; using it '
                    'directly as the file tag.', target_feature)
            dataframe.to_csv('{}/input_with_matproj_features_{}.csv'.format(
                general_setup['save_path'], filetag),
                             index=False)
        return dataframe
    def generate_magpie_features(self, save_to_csv=True):
        """Build composition strings and append Magpie features to the dataframe.

        The values of every column whose name contains
        ``'Material composition'`` are concatenated row-wise into a single
        composition string, which is stored in a ``'Material compositions'``
        column of ``self.dataframe``. For each distinct composition, the five
        feature sets returned by ``self._get_computed_magpie_features`` and
        the per-site features derived from
        ``self._get_atomic_magpie_features`` are collected, aligned
        row-by-row with the input data and merged onto it.

        Side effects:
            ``self.dataframe`` is replaced by a copy in which *all* NaN values
            (not only those in composition columns) are filled with ``''``,
            and it gains the ``'Material compositions'`` column.

        Args:
            save_to_csv: When equal to ``True``, the merged dataframe is also
                written to
                ``<save_path>/input_with_magpie_features_<tag>.csv`` where
                ``<tag>`` is the last dataframe column contained in the
                configured ``target_feature``.

        Returns:
            The merged dataframe produced by
            ``DataframeUtilities().merge_dataframe_columns``.

        Raises:
            SystemExit: If no column name contains ``'Material composition'``.
        """
        # Replace empty composition fields with empty string instead of NaN
        self.dataframe = self.dataframe.fillna('')

        # Non-string column labels cannot be composition columns; skipping
        # them avoids a TypeError from the substring test.
        composition_components = [
            self.dataframe[column].tolist()
            for column in self.dataframe.columns
            if isinstance(column, str) and 'Material composition' in column
        ]

        if not composition_components:
            logging.error(
                'ERROR: No column with "Material composition xx" was found in the supplied dataframe'
            )
            sys.exit()

        # Join the component columns row by row into one composition string
        compositions = [
            ''.join(str(component) for component in row_components)
            for row_components in zip(*composition_components)
        ]

        # Add the column of combined material compositions into the dataframe
        self.dataframe['Material compositions'] = compositions

        # Assign each magpiedata feature set to appropriate composition name
        magpiedata_dict_composition_average = {}
        magpiedata_dict_arithmetic_average = {}
        magpiedata_dict_max = {}
        magpiedata_dict_min = {}
        magpiedata_dict_difference = {}
        magpiedata_dict_atomic_bysite = {}

        # Results are keyed by composition, so a repeated composition would
        # only overwrite its own entry; compute each distinct one a single time.
        for composition in dict.fromkeys(compositions):
            (magpiedata_composition_average, magpiedata_arithmetic_average,
             magpiedata_max, magpiedata_min,
             magpiedata_difference) = self._get_computed_magpie_features(
                 composition=composition)
            magpiedata_atomic_notparsed = self._get_atomic_magpie_features(
                composition=composition)

            magpiedata_dict_composition_average[
                composition] = magpiedata_composition_average
            magpiedata_dict_arithmetic_average[
                composition] = magpiedata_arithmetic_average
            magpiedata_dict_max[composition] = magpiedata_max
            magpiedata_dict_min[composition] = magpiedata_min
            magpiedata_dict_difference[composition] = magpiedata_difference

            # Add site-specific elemental features, prefixing each feature
            # name with the 1-based position of its entry ("Site1_", ...).
            magpiedata_atomic_bysite = {}
            for site_number, site_features in enumerate(
                    magpiedata_atomic_notparsed.values(), start=1):
                for magpiefeature, featurevalue in site_features.items():
                    magpiedata_atomic_bysite["Site" + str(site_number) + "_" +
                                             str(magpiefeature)] = featurevalue

            magpiedata_dict_atomic_bysite[
                composition] = magpiedata_atomic_bysite

        magpiedata_dict_list = [
            magpiedata_dict_composition_average,
            magpiedata_dict_arithmetic_average, magpiedata_dict_max,
            magpiedata_dict_min, magpiedata_dict_difference,
            magpiedata_dict_atomic_bysite
        ]

        dataframe = self.dataframe
        for magpiedata_dict in magpiedata_dict_list:
            dataframe_magpie = pd.DataFrame.from_dict(data=magpiedata_dict,
                                                      orient='index')
            # Reorder (and repeat, for duplicated compositions) the feature
            # rows so they line up with the rows of the input dataframe.
            dataframe_magpie = dataframe_magpie.reindex(compositions)
            # The composition labels already exist in the input dataframe, so
            # drop them instead of turning them into a duplicate column.
            dataframe_magpie = dataframe_magpie.reset_index(drop=True)
            # Merge magpie feature dataframe with the accumulated dataframe
            dataframe = DataframeUtilities().merge_dataframe_columns(
                dataframe1=dataframe, dataframe2=dataframe_magpie)

        # Equality (not truthiness) is intentional: a non-boolean value such
        # as the string 'False' must not trigger a save.
        if save_to_csv == True:  # noqa: E712
            general_setup = self.configdict['General Setup']
            target_feature = general_setup['target_feature']
            # Tag the file with the last dataframe column found in target_feature
            matching_columns = [
                column for column in dataframe.columns.values
                if column in target_feature
            ]
            if matching_columns:
                filetag = matching_columns[-1]
            else:
                # Previously this case crashed with an unbound 'filetag'.
                filetag = target_feature
                logging.warning(
                    'No dataframe column matches target_feature %s; using it '
                    'directly as the file tag.', target_feature)
            fname = '{}/input_with_magpie_features_{}.csv'.format(
                general_setup['save_path'], filetag)
            dataframe.to_csv(fname, index=False)

        return dataframe
Example #4
0
    def normalize_features(self,
                           x_features,
                           y_feature,
                           normalize_x_features,
                           normalize_y_feature,
                           to_csv=True):
        """Standardize the x features, the y feature, or both.

        The selected columns of ``self.dataframe`` are scaled with
        ``StandardScaler``; the (scaled or untouched) x and y arrays are then
        concatenated and converted back to a dataframe whose columns are
        assigned from ``x_features`` and ``y_feature``.

        Args:
            x_features: Column label(s) of ``self.dataframe`` used as inputs.
            y_feature: Column label of ``self.dataframe`` used as target.
            normalize_x_features: ``True`` to scale the x features,
                ``False`` to leave them unchanged.
            normalize_y_feature: ``True`` to scale the y feature,
                ``False`` to leave it unchanged.
            to_csv: When equal to ``True``, the normalized dataframe is also
                written to ``<save_path>/input_data_normalized_<tag>.csv``
                where ``<tag>`` is the last dataframe column contained in the
                configured ``target_feature``.

        Returns:
            ``(dataframe_normalized, scaler)`` when only one of x / y was
            scaled, or ``(dataframe_normalized, (scaler_x, scaler_y))`` when
            both were scaled.

        Raises:
            SystemExit: If the two flags do not request scaling of x, of y,
                or of both (for example when both are ``False``).
        """
        # Equality (not truthiness) is intentional: non-boolean flag values
        # such as the string 'True' must keep falling through to the error.
        x_only = normalize_x_features == True and normalize_y_feature == False  # noqa: E712
        y_only = normalize_x_features == False and normalize_y_feature == True  # noqa: E712
        both = normalize_x_features == True and normalize_y_feature == True  # noqa: E712

        if not (x_only or y_only or both):
            # This message used to be a bare string expression and was never
            # shown to the user before exiting.
            logging.error(
                "You must specify to normalize either x_features, y_feature, or both, or set perform_feature_normalization=False in the input file"
            )
            sys.exit()

        y_column = np.asarray(self.dataframe[y_feature]).reshape([-1, 1])

        # fit_transform fits the scaler itself, so no separate fit() is needed.
        if x_only or both:
            scaler_x = StandardScaler()
            x_array = scaler_x.fit_transform(X=self.dataframe[x_features])
        else:
            x_array = np.asarray(self.dataframe[x_features])

        if y_only or both:
            scaler_y = StandardScaler()
            y_array = scaler_y.fit_transform(X=y_column)
        else:
            y_array = y_column

        array_normalized = DataframeUtilities().concatenate_arrays(
            X_array=x_array, y_array=y_array)

        dataframe_normalized = DataframeUtilities().array_to_dataframe(
            array=array_normalized)
        dataframe_normalized = DataframeUtilities().assign_columns_as_features(
            dataframe=dataframe_normalized,
            x_features=x_features,
            y_feature=y_feature,
            remove_first_row=False)

        if to_csv == True:  # noqa: E712
            general_setup = self.configdict['General Setup']
            target_feature = general_setup['target_feature']
            # Tag the file with the last dataframe column found in target_feature
            matching_columns = [
                column for column in dataframe_normalized.columns.values
                if column in target_feature
            ]
            if matching_columns:
                filetag = matching_columns[-1]
            else:
                # Previously this case crashed with an unbound 'filetag'.
                filetag = target_feature
                logging.warning(
                    'No dataframe column matches target_feature %s; using it '
                    'directly as the file tag.', target_feature)
            dataframe_normalized.to_csv(
                '{}/input_data_normalized_{}.csv'.format(
                    general_setup['save_path'], filetag),
                index=False)

        if both:
            return dataframe_normalized, (scaler_x, scaler_y)
        return dataframe_normalized, (scaler_x if x_only else scaler_y)