Example 1
class BandgapPhase():
    def __init__(self):
        self.edft = pd.read_csv(path + '/ML/data/pero_dft.csv', sep='\t')
        self.comps_wdup = pd.read_csv(
            path + '/ICSD_data/ICSD_all_data_with_all_phases.csv',
            sep='\t')  # compositions with duplicates for phase prediction
        self.ae = AutoEncoder()
        self.VAE = self.ae.build_AE(vae=True)
        self.VAE.load_weights(path + '/saved_models/best_model_VAE.h5')
        self.scaler = StandardScaler()

    def arrange_comp(self):
        # arrange each chemical formula in (AA')BO3 or A(BB')O3 order
        def arrange_formula(row):
            formula_lst = re.findall('[A-Z][^A-Z]*',
                                     row.SimulatedComposition.strip())
            # print (formula_lst)
            B_doped = 0
            if len(formula_lst) == 4:
                r = re.compile("([a-zA-Z]+)([0-9]+)")
                m = r.match(formula_lst[0])
                elem1 = m.group(1)
                frac1 = m.group(2)
                if int(frac1) == 8:
                    B_doped = 1
                if B_doped == 1:  # A(BB')O3 type
                    A1 = elem1
                    A1_frac = ''
                    B1 = r.match(formula_lst[1]).group(1)
                    B1_frac = str(int(r.match(formula_lst[1]).group(2)) / 8.0)
                    B2 = r.match(formula_lst[2]).group(1)
                    B2_frac = str(int(r.match(formula_lst[2]).group(2)) / 8.0)
                    StructuredForm = "{}({}{}){}".format(
                        A1, B1 + B1_frac, B2 + B2_frac, 'O3')
                else:  # (AA')BO3 type
                    A1 = elem1
                    A1_frac = str(int(frac1) / 8.0)
                    A2 = r.match(formula_lst[1]).group(1)
                    A2_frac = str(int(r.match(formula_lst[1]).group(2)) / 8.0)
                    B1 = r.match(formula_lst[2]).group(1)
                    B1_frac = ''
                    StructuredForm = "({}{}){}{}".format(
                        A1 + A1_frac, A2 + A2_frac, B1, 'O3')
                return StructuredForm
            else:
                return 0

        self.edft['StructuredFormula'] = self.edft.apply(arrange_formula,
                                                         axis=1)
        df = self.edft[self.edft.StructuredFormula != 0].copy()  # copy to avoid SettingWithCopyWarning when adding columns below

        def get_fprint(row):
            # get the material fingerprint for each composition
            print(row.StructuredFormula)
            try:
                fprint = self.ae.get_fingerprint(self.VAE,
                                                 row.StructuredFormula)
                print(fprint)
                return fprint[0], fprint[1]
            except Exception:  # fingerprint could not be computed for this formula
                return None, None

        df['Fingerprint_x'], df['Fingerprint_y'] = zip(
            *df.apply(get_fprint, axis=1))
        df = df.dropna()
        df.to_pickle(path + '/ML/data/processed_dft_data.pkl')
        print(df.info())

    def get_all_features(self):
        df = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        pv = Perovskites(df)
        df = pv.parse_formula(df)
        df = pv.add_features(df)
        df = pv.SISSO_features(df)
        df = df.drop([
            'SimulatedComposition', 'O p-band center (eV)',
            'Predicted Log k* (cm/s)', 'Charge transfer gap (eV)',
            'Formation energy', 'Fingerprint_x', 'Fingerprint_y'
        ],
                     axis=1)
        df.to_csv(path + '/ML/data/dft_data_with_features.csv',
                  sep='\t',
                  index=False)

    def bgap_pred_all_features(self):
        df = pd.read_csv(path + '/ML/data/dft_data_with_features.csv',
                         sep='\t')
        df = df.drop([
            'StructuredFormula', 'A1', 'A1_frac', 'A2', 'A2_frac', 'B1',
            'B1_frac', 'B2', 'B2_frac', 'O', 'O_frac', 'atom_numO',
            'mend_numO', 'atomic_rO', 'O_X', 'M_O', 'V_O', 'therm_con_O',
            'polarizability_O', 'lattice_const_O', 'Row_O', 'Group_O', 'nO',
            'rO'
        ],
                     axis=1)
        df_x = df.drop(['Ehull', 'Bandgap'], axis=1)
        df_y = df[['Bandgap']]
        algo_dict_mse = {
            'DT': [],
            'SVR': [],
            'PLS': [],
            'KNN': [],
            'RAND': [],
            'GBR': []
        }
        algo_dict_mae = {
            'DT': [],
            'SVR': [],
            'PLS': [],
            'KNN': [],
            'RAND': [],
            'GBR': []
        }
        for i in range(20):
            X_train, X_test, y_train, y_test = train_test_split(
                df_x, df_y.values.ravel(), test_size=0.2, random_state=i)
            pipelines = []
            pipelines.append(('DT',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('DT', DecisionTreeRegressor())])))
            pipelines.append(
                ('SVR', Pipeline([('Scaler', StandardScaler()),
                                  ('SVR', SVR())])))  #
            pipelines.append(('PLS',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('PLS', PLSRegression())])))
            pipelines.append(('KNN',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('KNN', KNeighborsRegressor())])))
            pipelines.append(('RAND',
                              Pipeline([('Scaler', StandardScaler()),
                                        ('RAND', RandomForestRegressor())])))
            pipelines.append(
                ('GBR',
                 Pipeline([('Scaler', StandardScaler()),
                           ('GBR', GradientBoostingRegressor())])))

            results = []
            names = []
            for name, model in pipelines:
                # cv = KFold(n_splits=10, random_state=10)
                cv = LeaveOneOut()
                cv_results_mse = cross_val_score(
                    model,
                    X_train,
                    y_train,
                    cv=cv,
                    scoring='neg_mean_squared_error')
                cv_results_mae = cross_val_score(
                    model,
                    X_train,
                    y_train,
                    cv=cv,
                    scoring='neg_mean_absolute_error')
                msg_mse = "%s: MSE %f (%f)" % (name, cv_results_mse.mean(),
                                               cv_results_mse.std())
                msg_mae = "%s: MAE %f (%f)" % (name, cv_results_mae.mean(),
                                               cv_results_mae.std())
                print(msg_mse)
                print(msg_mae)
                algo_dict_mse[name].append(np.sqrt(-1 * cv_results_mse.mean()))
                algo_dict_mae[name].append(-1 * cv_results_mae.mean())
            print('\n')
        # the loop above uses leave-one-out CV, so report it as such
        for name in ['DT', 'SVR', 'PLS', 'KNN', 'RAND', 'GBR']:
            print('%s LOOCV RMSE: %.3f  MAE: %.3f (%.3f)' %
                  (name, np.array(algo_dict_mse[name]).mean(),
                   np.array(algo_dict_mae[name]).mean(),
                   np.array(algo_dict_mae[name]).std()))

    def plot_bandgap_fprints(self):
        df = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        cm = plt.cm.get_cmap('RdYlBu_r')
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sc = ax.scatter(np.array(df['Fingerprint_x']),
                        np.array(df['Fingerprint_y']),
                        c=np.array(df['Bandgap']),
                        marker='o',
                        s=10,
                        cmap=cm)
        plt.tight_layout()
        plt.colorbar(sc)
        plt.show()

    def most_similar_dft(self, compound, n=6):
        df = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        df['Fingerprint'] = df[['Fingerprint_x',
                                'Fingerprint_y']].values.tolist()
        dft_fingerprints = np.stack(df['Fingerprint'])
        fingerprint = self.ae.get_fingerprint(self.VAE, compound, vae=True)

        euc_dis_list = np.array([])
        for i in dft_fingerprints:
            euc_dis = self.ae.get_euclidean_distance(fingerprint, i)
            euc_dis_list = np.append(euc_dis_list, euc_dis)
        ind = np.argsort(euc_dis_list)[:n]
        euclidean_distance = euc_dis_list[ind]
        result_df = pd.concat([
            df.iloc[ind][[
                'StructuredFormula', 'Bandgap', 'Ehull', 'Formation energy'
            ]].reset_index(drop=True),
            pd.DataFrame(list(euclidean_distance),
                         columns=['Euclidean Distance'])
        ],
                              axis=1)  # pretty print
        return result_df

    def predict_bandgap(self, n=5):
        df = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        df = df.drop_duplicates(keep='first')  # there were 2 duplicates

        def pred_prop(row):
            most_similar = self.most_similar_dft(row.StructuredFormula,
                                                 n=n + 1)
            # discard the first neighbour: it is the query material itself
            predicted_bandgap = np.array(
                most_similar['Bandgap'].to_list()[1:]).mean()

            return predicted_bandgap

        df['Predicted_Bandgap'] = df.apply(pred_prop, axis=1)
        df[['StructuredFormula', 'Bandgap', 'Predicted_Bandgap'
            ]].to_csv(path + '/ML/data/Bgap_predictions.csv',
                      sep='\t',
                      index=False)

        bandgap_mse = mean_squared_error(np.array(df['Bandgap']),
                                         np.array(df['Predicted_Bandgap']))
        bandgap_mae = mean_absolute_error(np.array(df['Bandgap']),
                                          np.array(df['Predicted_Bandgap']))

        print('Bandgap RMSE: ', np.sqrt(bandgap_mse))
        print('Bandgap MAE: ', bandgap_mae)

    def find_phases(self):
        df = self.comps_wdup[[
            'StructuredFormula', 'A1', 'A1_frac', 'A2', 'A2_frac', 'B1',
            'B1_frac', 'B2', 'B2_frac', 'CrystalClass'
        ]]
        df_noformula = df.drop(['StructuredFormula'],
                               axis=1).drop_duplicates(keep='first')
        print(df_noformula.info())

        def get_phases(row):
            phases = ((df_noformula['A1'] == row.A1) &
                      (df_noformula['A1_frac'] == row.A1_frac) &
                      (df_noformula['A2'] == row.A2) &
                      (df_noformula['A2_frac'] == row.A2_frac) &
                      (df_noformula['B1'] == row.B1) &
                      (df_noformula['B1_frac'] == row.B1_frac) &
                      (df_noformula['B2'] == row.B2) &
                      (df_noformula['B2_frac'] == row.B2_frac))
            competing_phase_idx = phases[phases].index.tolist()
            other_phases = []
            for idx in competing_phase_idx:
                other_phases.append(df.loc[idx, 'CrystalClass'])
            multiple_phases = 1 if len(other_phases) > 1 else 0
            return other_phases, multiple_phases, len(other_phases)

        df_noformula['All_phases'], df_noformula[
            'Multiple_phases'], df_noformula['How_many_phases'] = zip(
                *df_noformula.apply(get_phases, axis=1))
        df_noformula.drop_duplicates(subset=[
            'A1', 'A1_frac', 'A2', 'A2_frac', 'B1', 'B1_frac', 'B2', 'B2_frac',
            'Multiple_phases', 'How_many_phases'
        ],
                                     keep='first',
                                     inplace=True)
        df_noformula['StructuredFormula'] = df.loc[
            df_noformula.index, 'StructuredFormula']
        df_noformula.to_csv(path + '/ML/data/all_phases_new.csv', sep='\t')
        print(df_noformula['Multiple_phases'].sum())

    def predict_phases(self):
        df = pd.read_csv(path + '/ML/data/all_phases_new.csv', sep='\t')
        df = df[df.Multiple_phases == 1]

        def get_nearest_crystal_systems(row):
            most_similar_df = self.ae.most_similar(
                self.VAE, compound=row.StructuredFormula, experimental=1, n=6
            )  # consider the composition being considered and 5 other neighbours
            similar_crystal_systems = most_similar_df['CrystalSystem'].to_list(
            )[1:]
            all_phase_lst = yaml.safe_load(row.All_phases)
            predicted_phases = list(
                set(all_phase_lst) & set(similar_crystal_systems))
            return similar_crystal_systems, predicted_phases, len(
                predicted_phases)

        df['6_most_similar_crystal_systems'], df[
            'overlapping_predictions'], df[
                'Num_overlapping_predictions'] = zip(
                    *df.apply(get_nearest_crystal_systems, axis=1))
        df_pr = df[df.Num_overlapping_predictions == df.How_many_phases]
        df_3_more = df[df.How_many_phases >= 3]
        return df_pr, df_3_more  # return the subsets instead of discarding them

    def plot_phase_prediction(self, ax=None):
        df = pd.read_csv(path + '/ML/data/phase_pred_plotdata_new.csv',
                         sep='\t')
        crystal_systems = {
            1: 'Triclinic',
            2: 'Monoclinic',
            3: 'Orthorhombic',
            4: 'Tetragonal',
            5: 'Cubic',
            6: 'Trigonal',
            7: 'Hexagonal'
        }
        if ax is None:
            ax = plt.gca()

        width = 200
        height = 500
        verts = list(
            zip([-width, width, width, -width],
                [-height, -height, height, height]))

        def plot_individual_comp(row):
            for i in yaml.safe_load(row.All_phases):
                c = 1 if i in yaml.safe_load(row.overlapping_predictions) else 0
                sc = ax.scatter(row.StructuredFormula,
                                i.title(),
                                c=[c],  # wrap the scalar so it maps through the colormap
                                marker=verts,
                                vmin=0,
                                vmax=1,
                                s=500,
                                cmap=matplotlib.colors.ListedColormap(
                                    ['blue', 'red']))

        SC = df.apply(plot_individual_comp, axis=1)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', rotation=60)
        red_patch = mpatches.Patch(color='red', label='Predicted')
        blue_patch = mpatches.Patch(color='blue', label='Not-predicted')
        ax.legend(handles=[red_patch, blue_patch],
                  prop={
                      'weight': 'bold',
                      'size': 11
                  })
        ax.set_xlabel('Composition', fontsize=14)
        plt.setp(ax.get_xticklabels(),
                 rotation=60,
                 ha="right",
                 fontsize=10,
                 rotation_mode="anchor")
        ax.set_ylabel('Displayed phases', fontsize=14)
        return SC
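
A minimal driver sketch for Example 1, assuming the module-level imports (pandas, numpy, scikit-learn, matplotlib, yaml, re), the `path` variable, and the `AutoEncoder`/`Perovskites` helpers used above are available; the method names come from the class itself, while the composition string and the run order are only illustrative:

# Hypothetical driver: runs the DFT-data preparation and the
# fingerprint-based bandgap / phase workflow end to end.
if __name__ == '__main__':
    bp = BandgapPhase()
    bp.arrange_comp()            # build (AA')BO3 / A(BB')O3 formulas and fingerprints
    bp.get_all_features()        # add elemental and SISSO features
    bp.bgap_pred_all_features()  # benchmark classical regressors with leave-one-out CV
    bp.predict_bandgap(n=5)      # nearest-neighbour bandgap estimate in fingerprint space
    # illustrative composition string in the (AA')BO3 form produced by arrange_comp
    print(bp.most_similar_dft('(Ba0.5Sr0.5)TiO3', n=6))
    bp.find_phases()
    bp.predict_phases()
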
Example 2
class CrystalSystem():
    def __init__(self):
        self.exp_df = pd.read_csv(exp_data_file, sep='\t')
        self.ae = AutoEncoder()
        self.VAE = self.ae.build_AE(vae=True)
        self.VAE.load_weights(path + '/saved_models/best_model_VAE.h5')

    def validate(self):
        def get_similar_compounds(row):
            most_similar_df = self.ae.most_similar(
                self.VAE, row.StructuredFormula, n=6, experimental=1, vae=True
            )  # get 6 most similar crystal systems, 1st would refer to the material itself. Discard it and get the rest
            real_crystal_class = row.CrystalClass  # crystal system
            real_space_group = row.HMS  # space group
            similar_crystal_systems = most_similar_df['CrystalSystem'].to_list(
            )[1:]
            similar_space_groups = most_similar_df['HMS'].to_list()[1:]
            most_voted_class = max(
                similar_crystal_systems, key=similar_crystal_systems.count
            )  # if 2 crystal systems have 2 votes each, the one with the lowest euclidean distance is selected
            most_voted_space_group = max(similar_space_groups,
                                         key=similar_space_groups.count)
            if real_crystal_class == most_voted_class:
                correctly_identified_cc = 1
            else:
                correctly_identified_cc = 0
            if real_space_group == most_voted_space_group:
                correctly_identified_spg = 1
            else:
                correctly_identified_spg = 0

            print(row.StructuredFormula, correctly_identified_cc)

            return (most_similar_df['CollectionCode'].to_list()[1:],
                    most_similar_df['StructuredFormula'].to_list()[1:],
                    most_similar_df['CrystalSystem'].to_list()[1:],
                    most_voted_class, most_voted_space_group,
                    most_similar_df['Euclidean Distance'].to_list()[1:],
                    correctly_identified_cc, correctly_identified_spg)

        (self.exp_df['Most Similar ICSD IDs'],
         self.exp_df['Most Similar Compounds'],
         self.exp_df['Most Similar Crystal Systems'],
         self.exp_df['predicted_crystal_system'],
         self.exp_df['predicted_space_group'],
         self.exp_df['Euclidean Distances'],
         self.exp_df['crystal_system_correctly_identified'],
         self.exp_df['space_group_correctly_identified']) = zip(
             *self.exp_df.apply(get_similar_compounds, axis=1))

        corrects_cc = self.exp_df['crystal_system_correctly_identified'].sum()
        correct_percentage_cc = corrects_cc * 100.0 / self.exp_df[
            self.exp_df['CrystalClass'].notna()].shape[0]
        corrects_spg = self.exp_df['space_group_correctly_identified'].sum()
        correct_percentage_spg = corrects_spg * 100.0 / self.exp_df[
            self.exp_df['HMS'].notna()].shape[0]
        print(
            '%.2f%% of crystal systems have been correctly identified by the VAE fingerprinting model'
            % (correct_percentage_cc))
        print(
            '%.2f%% of space groups have been correctly identified by the VAE fingerprinting model'
            % (correct_percentage_spg))
        self.exp_df[[
            'StructuredFormula', 'HMS', 'CrystalClass',
            'Most Similar ICSD IDs', 'Most Similar Compounds',
            'Most Similar Crystal Systems', 'predicted_space_group',
            'predicted_crystal_system', 'crystal_system_correctly_identified',
            'space_group_correctly_identified'
        ]].to_csv(path + '/fingerprint_validation.csv', sep='\t', index=False)
        return self.exp_df[[
            'StructuredFormula', 'HMS', 'CrystalClass',
            'Most Similar ICSD IDs', 'Most Similar Compounds',
            'Most Similar Crystal Systems', 'predicted_space_group',
            'predicted_crystal_system', 'crystal_system_correctly_identified',
            'space_group_correctly_identified'
        ]]

    def get_confusion_matrix(self):
        # confusion matrix for crystal class classification
        system_val = {
            'triclinic': 1,
            'monoclinic': 2,
            'orthorhombic': 3,
            'tetragonal': 4,
            'cubic': 5,
            'trigonal': 6,
            'hexagonal': 7
        }
        validate_df = pd.read_csv(path + '/fingerprint_validation.csv',
                                  sep='\t')
        true_lbl_str = validate_df['CrystalClass'].tolist()
        predicted_lbl_str = validate_df['predicted_crystal_system']
        cm = np.zeros((7, 7))  # empty confusion matrix
        for t, p in zip(true_lbl_str, predicted_lbl_str):
            if not pd.isnull(t):
                print(t, p)
                # rows index the true class, columns the predicted class,
                # matching the axis labels and the row-wise normalisation below
                cm[system_val[t] - 1][system_val[p] - 1] += 1
        print(cm)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        title = 'Confusion Matrix'
        classes = np.array([
            'Triclinic', 'Monoclinic', 'Orthorhombic', 'Tetragonal', 'Cubic',
            'Trigonal', 'Hexagonal'
        ])

        # classification report
        y_true = []
        y_pred = []
        for t, p in zip(true_lbl_str, predicted_lbl_str):
            if not pd.isnull(t):
                y_true.append(system_val[t] - 1)
                y_pred.append(system_val[p] - 1)

        print(
            classification_report(y_true,
                                  y_pred,
                                  labels=np.arange(len(classes)),
                                  target_names=classes,
                                  digits=4))

        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.YlOrRd)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(
            xticks=np.arange(cm.shape[1]),
            yticks=np.arange(cm.shape[0]),
            # ... and label them with the respective list entries
            xticklabels=classes,
            yticklabels=classes,
            #title=title,
            ylabel='True label',
            xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(),
                 rotation=45,
                 ha="right",
                 fontsize=12,
                 rotation_mode="anchor")
        plt.setp(ax.get_yticklabels(), fontsize=12)
        ax.set_xlabel('Predicted label', fontsize=14)
        ax.set_ylabel('True label', fontsize=14)
        normalize = True
        # Loop over data dimensions and create text annotations.
        fmt = '.2f' if normalize else 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j,
                        i,
                        format(cm[i, j], fmt),
                        ha="center",
                        va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        fig.savefig(path + '/figures/fingerprint_conf_mat.png',
                    dpi=800,
                    bbox_inches='tight')
        plt.show()

    def get_spg_conf_mat(self):
        # confusion matrix for space group classification
        validate_df = pd.read_csv(path + '/fingerprint_validation.csv',
                                  sep='\t')
        true_lbl_str = validate_df['HMS'].tolist()
        predicted_lbl_str = validate_df['predicted_space_group'].tolist()
        unique_spgs = sorted(
            {s for s in true_lbl_str + predicted_lbl_str if not pd.isnull(s)})
        dim = len(unique_spgs)
        spg_dict = {k: v for (v, k) in enumerate(unique_spgs)}  # 0-based label indices

        cm = np.zeros((dim, dim))  # empty confusion matrix
        for t, p in zip(true_lbl_str, predicted_lbl_str):
            if not pd.isnull(t):
                # spg_dict is already 0-based, so no offset is needed;
                # rows index the true class, columns the predicted class
                cm[spg_dict[t]][spg_dict[p]] += 1
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        title = 'Confusion Matrix'
        classes = np.array(unique_spgs)

        # classification report
        y_true = []
        y_pred = []
        for t, p in zip(true_lbl_str, predicted_lbl_str):
            if not pd.isnull(t):
                y_true.append(spg_dict[t])
                y_pred.append(spg_dict[p])

        print(
            classification_report(y_true,
                                  y_pred,
                                  labels=np.arange(dim),
                                  target_names=classes,
                                  digits=4))

        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.YlOrRd)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(
            xticks=np.arange(cm.shape[1]),
            yticks=np.arange(cm.shape[0]),
            # ... and label them with the respective list entries
            xticklabels=classes,
            yticklabels=classes,
            #title=title,
            ylabel='True label',
            xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(),
                 rotation=90,
                 ha="right",
                 fontsize=4,
                 rotation_mode="anchor")
        plt.setp(ax.get_yticklabels(), fontsize=4)
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')

        fig.tight_layout()
        fig.savefig(path + '/figures/fingerprint_spg_conf_mat.png',
                    dpi=800,
                    bbox_inches='tight')
        plt.show()
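
A minimal driver sketch for Example 2 as well, assuming `exp_data_file`, `path`, and the `AutoEncoder` class are defined at module level; only the method names are taken from the class, the driver itself is hypothetical:

# Hypothetical driver: validates the VAE fingerprint model on the
# experimental dataset and draws both confusion matrices.
if __name__ == '__main__':
    cs = CrystalSystem()
    results = cs.validate()      # writes fingerprint_validation.csv and returns the summary table
    print(results.head())
    cs.get_confusion_matrix()    # crystal-system confusion matrix
    cs.get_spg_conf_mat()        # space-group confusion matrix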