class BandgapPhase():
    """Band-gap and competing-phase analyses driven by VAE material fingerprints.

    NOTE(review): the source this class was reviewed from had several
    expressions truncated (assignment targets, call names, ``print``
    arguments).  Each reconstructed spot is tagged ``NOTE(review)`` below and
    must be confirmed against the original file.  The two project-specific
    calls whose names were lost are isolated in ``_get_fingerprint`` and
    ``_most_similar`` so there is exactly one place to correct each of them.
    """

    def __init__(self):
        """Load the DFT table, the ICSD phase table and the trained VAE."""
        self.edft = pd.read_csv(path + '/ML/data/pero_dft.csv', sep='\t')
        # compositions with duplicates for phase prediction
        self.comps_wdup = pd.read_csv(
            path + '/ICSD_data/ICSD_all_data_with_all_phases.csv', sep='\t')
        # NOTE(review): the assignment target of ``AutoEncoder()`` was lost in
        # SOURCE.  ``self.VAE`` is used because the next statement calls
        # ``self.VAE.load_weights`` -- confirm.
        self.VAE = AutoEncoder()
        # The statement below rebinds ``self.VAE`` to whatever
        # ``load_weights`` returns, so keep a handle on the AutoEncoder
        # object for the fingerprint helpers.
        # NOTE(review): ``_fingerprinter`` is a reconstruction -- confirm
        # which object really provides the fingerprint API.
        self._fingerprinter = self.VAE
        self.VAE = self.VAE.load_weights(path +
                                         '/saved_models/best_model_VAE.h5')
        self.scaler = StandardScaler()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    def _get_fingerprint(self, compound, **kwargs):
        """Return the fingerprint of ``compound``.

        NOTE(review): the callee and its first argument were lost in SOURCE
        (only ``, <formula>)`` / ``, compound, vae=True)`` survived).  The
        name ``get_fingerprint`` and the leading ``self.VAE`` argument are
        guesses modelled on the surviving nearest-neighbour call -- confirm.
        """
        return self._fingerprinter.get_fingerprint(self.VAE, compound,
                                                   **kwargs)

    def _most_similar(self, compound, **kwargs):
        """Return a frame of the stored compounds most similar to ``compound``.

        NOTE(review): the callee name was lost in SOURCE; the surviving
        argument list was ``(self.VAE, compound=..., experimental=1, n=6)``.
        ``most_similar`` is a guess -- confirm.
        """
        return self._fingerprinter.most_similar(self.VAE,
                                                compound=compound,
                                                **kwargs)

    @staticmethod
    def _structured_formula(composition):
        """Rewrite a raw formula as ``(AA')BO3`` or ``A(BB')O3``.

        The formula is split at capital letters.  Only four-part formulas are
        handled; anything else yields the sentinel ``0`` so the caller can
        filter the row out.  A first-element count of 8 marks the formula as
        B-site doped.  Counts are divided by 8.0 to obtain fractions.

        Parts without a trailing integer count also yield ``0`` (the original
        raised ``AttributeError`` on them).
        """
        parts = re.findall('[A-Z][^A-Z]*', composition.strip())
        if len(parts) != 4:
            return 0

        pattern = re.compile("([a-zA-Z]+)([0-9]+)")
        matches = [pattern.match(part) for part in parts[:3]]
        if any(m is None for m in matches):
            return 0

        # NOTE(review): the right-hand sides of ``elem1``/``frac1`` were lost
        # in SOURCE; groups 1 and 2 of the match are the only values that fit
        # the later ``int(frac1)`` use.
        (elem1, frac1), (elem2, frac2), (elem3, frac3) = (m.groups()
                                                          for m in matches)

        if int(frac1) == 8:
            # A(BB')O3 type: the A site is fully occupied by one element.
            b1 = elem2 + str(int(frac2) / 8.0)
            b2 = elem3 + str(int(frac3) / 8.0)
            return "{}({}{}){}".format(elem1, b1, b2, 'O3')

        # (AA')BO3 type: two elements share the A site.
        a1 = elem1 + str(int(frac1) / 8.0)
        a2 = elem2 + str(int(frac2) / 8.0)
        return "({}{}){}{}".format(a1, a2, elem3, 'O3')

    @staticmethod
    def _regression_pipelines():
        """Return the ``(name, pipeline)`` pairs benchmarked for band gap.

        Every regressor is preceded by a ``StandardScaler``.  PLS regression
        was commented out in the original and stays disabled.
        """
        regressors = [
            ('DT', DecisionTreeRegressor),
            ('SVR', SVR),
            ('KNN', KNeighborsRegressor),
            ('RAND', RandomForestRegressor),
            ('GBR', GradientBoostingRegressor),
        ]
        return [(name,
                 Pipeline([('Scaler', StandardScaler()), (name, factory())]))
                for name, factory in regressors]

    # ------------------------------------------------------------------
    # data preparation
    # ------------------------------------------------------------------
    def arrange_comp(self):
        """Add structured formulas and fingerprints to the DFT table.

        Writes the rows that could be parsed and fingerprinted to
        ``/ML/data/processed_dft_data.pkl`` under ``path``.
        """
        # arrange chemical formula in (AA')BO3 and A(BB')O3 order
        self.edft['StructuredFormula'] = self.edft[
            'SimulatedComposition'].apply(self._structured_formula)
        # ``.copy()`` so the column assignments below do not write into a
        # slice of ``self.edft``.
        df = self.edft[self.edft.StructuredFormula != 0].copy()

        def get_fprint(row):
            # getting material fingerprint for each composition
            print(row.StructuredFormula)
            try:
                fprint = self._get_fingerprint(row.StructuredFormula)
                print(fprint)
                return fprint[0], fprint[1]
            except Exception as err:
                # Best effort: rows that cannot be fingerprinted are dropped
                # by ``dropna`` below, but say why instead of hiding it.
                print('fingerprint failed for %s: %r' %
                      (row.StructuredFormula, err))
                return None, None

        df['Fingerprint_x'], df['Fingerprint_y'] = zip(
            *df.apply(get_fprint, axis=1))
        df = df.dropna()
        df.to_pickle(path + '/ML/data/processed_dft_data.pkl')
        # NOTE(review): the argument of this ``print`` was lost in SOURCE;
        # the frame's shape is a stand-in -- confirm.
        print(df.shape)

    def get_all_features(self):
        """Compute descriptor features for the processed DFT data.

        Reads ``processed_dft_data.pkl``, runs the ``Perovskites`` feature
        steps and writes ``dft_data_with_features.csv``.
        """
        df = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        pv = Perovskites(df)
        df = pv.parse_formula(df)
        df = pv.add_features(df)
        df = pv.SISSO_features(df)
        df = df.drop([
            'SimulatedComposition', 'O p-band center (eV)',
            'Predicted Log k* (cm/s)', 'Charge transfer gap (eV)',
            'Formation energy', 'Fingerprint_x', 'Fingerprint_y'
        ],
                     axis=1)
        df.to_csv(path + '/ML/data/dft_data_with_features.csv',
                  sep='\t',
                  index=False)

    # ------------------------------------------------------------------
    # band gap
    # ------------------------------------------------------------------
    def bgap_pred_all_features(self):
        """Benchmark regressors for band-gap prediction from all features.

        For 20 different train/test splits, each pipeline is scored on the
        training part with leave-one-out cross-validation.  The per-split
        RMSE and MAE are collected and a summary per model is printed.
        """
        df = pd.read_csv(path + '/ML/data/dft_data_with_features.csv',
                         sep='\t')
        df = df.drop([
            'StructuredFormula', 'A1', 'A1_frac', 'A2', 'A2_frac', 'B1',
            'B1_frac', 'B2', 'B2_frac', 'O', 'O_frac', 'atom_numO',
            'mend_numO', 'atomic_rO', 'O_X', 'M_O', 'V_O', 'therm_con_O',
            'polarizability_O', 'lattice_const_O', 'Row_O', 'Group_O', 'nO',
            'rO'
        ],
                     axis=1)
        df_x = df.drop(['Ehull', 'Bandgap'], axis=1)
        df_y = df[['Bandgap']]

        # cross_val_score clones the estimator, so one set of pipelines can
        # be reused for every split.
        pipelines = self._regression_pipelines()
        rmse_by_model = {name: [] for name, _ in pipelines}
        mae_by_model = {name: [] for name, _ in pipelines}

        for seed in range(20):
            X_train, X_test, y_train, y_test = train_test_split(
                df_x, df_y.values.ravel(), test_size=0.2, random_state=seed)

            for name, model in pipelines:
                cv = LeaveOneOut()
                cv_results_mse = cross_val_score(
                    model,
                    X_train,
                    y_train,
                    cv=cv,
                    scoring='neg_mean_squared_error')
                cv_results_mae = cross_val_score(
                    model,
                    X_train,
                    y_train,
                    cv=cv,
                    scoring='neg_mean_absolute_error')
                print("%s: MSE %f (%f)" %
                      (name, cv_results_mse.mean(), cv_results_mse.std()))
                print("%s: MAE %f (%f)" %
                      (name, cv_results_mae.mean(), cv_results_mae.std()))
                # scikit-learn reports negated errors; flip the sign back.
                rmse_by_model[name].append(
                    np.sqrt(-1 * cv_results_mse.mean()))
                mae_by_model[name].append(-1 * cv_results_mae.mean())
            print('\n')

        # Only models that were actually evaluated are summarised.  The
        # original also printed a PLS row, which was NaN because that
        # pipeline is disabled, and labelled the scores "10-fold" although
        # leave-one-out is used.
        for name, _ in pipelines:
            rmse = np.array(rmse_by_model[name])
            mae = np.array(mae_by_model[name])
            print('%s LOO CV RMSE: %.3f MAE: %.3f (%.3f)' %
                  (name, rmse.mean(), mae.mean(), mae.std()))

    def plot_bandgap_fprints(self):
        """Scatter the 2-D fingerprints coloured by band gap."""
        df = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        # NOTE(review): the colormap lookup call was lost in SOURCE (only the
        # name 'RdYlBu_r' survived).  Passing the name straight to scatter
        # selects the same colormap.
        cm = 'RdYlBu_r'
        fig = plt.figure()
        ax = fig.add_subplot(111)
        sc = ax.scatter(np.array(df['Fingerprint_x']),
                        np.array(df['Fingerprint_y']),
                        c=np.array(df['Bandgap']),
                        marker='o',
                        s=10,
                        cmap=cm)
        plt.tight_layout()
        plt.colorbar(sc)

    def most_similar_dft(self, compound, n=6, dft_df=None):
        """Return the ``n`` DFT entries closest to ``compound``.

        Parameters
        ----------
        compound : str
            Structured formula to fingerprint.
        n : int
            Number of nearest entries to return.
        dft_df : pandas.DataFrame, optional
            Already-loaded processed DFT table.  When omitted the pickle is
            read from disk, as before.

        Returns
        -------
        pandas.DataFrame
            Columns ``StructuredFormula``, ``Bandgap``, ``Ehull``,
            ``Formation energy`` and ``Euclidean Distance``, ordered by
            increasing distance.
        """
        if dft_df is None:
            dft_df = pd.read_pickle(path +
                                    '/ML/data/processed_dft_data.pkl')
        dft_fingerprints = dft_df[['Fingerprint_x',
                                   'Fingerprint_y']].to_numpy(dtype=float)
        fingerprint = self._get_fingerprint(compound, vae=True)
        # NOTE(review): the distance call was lost in SOURCE (``euc_dis =,
        # i)``).  Variable and column names say Euclidean distance, so it is
        # computed with numpy, for all rows at once.
        # Assumes the fingerprint holds two values (x, y) -- TODO confirm.
        target = np.asarray(fingerprint, dtype=float).ravel()
        euc_dis_list = np.linalg.norm(dft_fingerprints - target, axis=1)

        ind = np.argsort(euc_dis_list)[:n]
        nearest = dft_df.iloc[ind][[
            'StructuredFormula', 'Bandgap', 'Ehull', 'Formation energy'
        ]].reset_index(drop=True)
        distances = pd.DataFrame(list(euc_dis_list[ind]),
                                 columns=['Euclidean Distance'])
        return pd.concat([nearest, distances], axis=1)

    def predict_bandgap(self, n=5):
        """Predict each band gap as the mean over its ``n`` nearest entries.

        Writes ``Bgap_predictions.csv`` and prints RMSE and MAE against the
        DFT values.
        """
        reference = pd.read_pickle(path + '/ML/data/processed_dft_data.pkl')
        df = reference.drop_duplicates(keep='first')  # there were 2 duplicates

        def pred_prop(row):
            # The full (non-deduplicated) table is searched, exactly as when
            # most_similar_dft re-read the pickle on every call.
            most_similar = self.most_similar_dft(row.StructuredFormula,
                                                 n=n + 1,
                                                 dft_df=reference)
            # discard 1st element because it's the material being considered
            return np.array(most_similar['Bandgap'].to_list()[1:]).mean()

        df['Predicted_Bandgap'] = df.apply(pred_prop, axis=1)
        df[['StructuredFormula', 'Bandgap',
            'Predicted_Bandgap']].to_csv(path +
                                         '/ML/data/Bgap_predictions.csv',
                                         sep='\t',
                                         index=False)
        bandgap_mse = mean_squared_error(np.array(df['Bandgap']),
                                         np.array(df['Predicted_Bandgap']))
        bandgap_mae = mean_absolute_error(np.array(df['Bandgap']),
                                          np.array(df['Predicted_Bandgap']))
        print('Bandgap RMSE: ', np.sqrt(bandgap_mse))
        print('Bandgap MAE: ', bandgap_mae)

    # ------------------------------------------------------------------
    # phases
    # ------------------------------------------------------------------
    def find_phases(self):
        """List every crystal class recorded for each composition.

        Writes ``all_phases_new.csv`` and prints how many compositions have
        more than one recorded phase.
        """
        site_columns = [
            'A1', 'A1_frac', 'A2', 'A2_frac', 'B1', 'B1_frac', 'B2', 'B2_frac'
        ]
        df = self.comps_wdup[['StructuredFormula'] + site_columns +
                             ['CrystalClass']]
        df_noformula = df.drop(['StructuredFormula'],
                               axis=1).drop_duplicates(keep='first')
        # NOTE(review): the argument of this ``print`` was lost in SOURCE;
        # the frame's shape is a stand-in -- confirm.
        print(df_noformula.shape)

        def get_phases(row):
            # Rows sharing all eight site columns describe the same
            # composition.  Plain ``==`` is kept, so NaN never matches NaN.
            same_comp = np.ones(len(df_noformula), dtype=bool)
            for col in site_columns:
                same_comp &= (df_noformula[col] == row[col]).to_numpy()
            # Label-based lookup; the original indexed ``df`` positionally
            # with index labels, which only agrees for a default RangeIndex.
            other_phases = df_noformula.loc[same_comp,
                                            'CrystalClass'].tolist()
            multiple_phases = 1 if len(other_phases) > 1 else 0
            return other_phases, multiple_phases, len(other_phases)

        df_noformula['All_phases'], df_noformula[
            'Multiple_phases'], df_noformula['How_many_phases'] = zip(
                *df_noformula.apply(get_phases, axis=1))
        df_noformula.drop_duplicates(
            subset=site_columns + ['Multiple_phases', 'How_many_phases'],
            keep='first',
            inplace=True)
        df_noformula['StructuredFormula'] = df.loc[df_noformula.index,
                                                   'StructuredFormula']
        df_noformula.to_csv(path + '/ML/data/all_phases_new.csv', sep='\t')
        print(df_noformula['Multiple_phases'].sum())

    def predict_phases(self):
        """Compare known competing phases with those of nearest neighbours.

        Returns
        -------
        pandas.DataFrame
            The multi-phase compositions with three added columns:
            ``6_most_similar_crystal_systems``, ``overlapping_predictions``
            and ``Num_overlapping_predictions``.  The original built this
            frame and discarded it; returning it is backward compatible.
        """
        df = pd.read_csv(path + '/ML/data/all_phases_new.csv', sep='\t')
        df = df[df.Multiple_phases == 1].copy()

        def get_nearest_crystal_systems(row):
            # consider the composition being considered and 5 other neighbours
            most_similar_df = self._most_similar(row.StructuredFormula,
                                                 experimental=1,
                                                 n=6)
            similar_crystal_systems = most_similar_df[
                'CrystalSystem'].to_list()[1:]
            # ``All_phases`` is a list that was stringified by to_csv.
            # safe_load parses it without executing arbitrary YAML tags, and
            # unlike bare ``yaml.load`` it needs no ``Loader`` argument.
            all_phase_lst = yaml.safe_load(row.All_phases)
            predicted_phases = list(
                set(all_phase_lst) & set(similar_crystal_systems))
            return similar_crystal_systems, predicted_phases, len(
                predicted_phases)

        df['6_most_similar_crystal_systems'], df[
            'overlapping_predictions'], df[
                'Num_overlapping_predictions'] = zip(
                    *df.apply(get_nearest_crystal_systems, axis=1))
        return df

    def plot_phase_prediction(self, ax=None):
        """Plot, per composition, which recorded phases were predicted.

        Parameters
        ----------
        ax : matplotlib axes, optional
            Axes to draw on; defaults to the current axes.

        Returns
        -------
        pandas.Series
            Result of applying the per-row plotting function (all ``None``).
        """
        df = pd.read_csv(path + '/ML/data/phase_pred_plotdata_new.csv',
                         sep='\t')
        if ax is None:
            ax = plt.gca()

        # Tall rectangular marker.
        width = 200
        height = 500
        verts = list(
            zip([-width, width, width, -width],
                [-height, -height, height, height]))
        two_colours = matplotlib.colors.ListedColormap(['blue', 'red'])

        def plot_individual_comp(row):
            # Both columns hold stringified lists; see predict_phases for
            # why safe_load is used.
            predicted = yaml.safe_load(row.overlapping_predictions)
            for phase in yaml.safe_load(row.All_phases):
                c = 1 if phase in predicted else 0
                ax.scatter(row.StructuredFormula,
                           phase.title(),
                           c=c,
                           marker=verts,
                           vmin=0,
                           vmax=1,
                           s=500,
                           cmap=two_colours)

        SC = df.apply(plot_individual_comp, axis=1)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', rotation=60)
        red_patch = mpatches.Patch(color='red', label='Predicted')
        blue_patch = mpatches.Patch(color='blue', label='Not-predicted')
        ax.legend(handles=[red_patch, blue_patch],
                  prop={
                      'weight': 'bold',
                      'size': 11
                  })
        ax.set_xlabel('Composition', fontsize=14)
        plt.setp(ax.get_xticklabels(),
                 rotation=60,
                 ha="right",
                 fontsize=10,
                 rotation_mode="anchor")
        ax.set_ylabel('Displayed phases', fontsize=14)
        return SC
class CrystalSystem():
    """Validate crystal-system / space-group prediction from VAE fingerprints.

    NOTE(review): the source this class was reviewed from had several
    expressions truncated (an assignment target, a call name, an ``imshow``
    argument).  Each reconstructed spot is tagged ``NOTE(review)`` below and
    must be confirmed against the original file.
    """

    # Columns written to, read from and returned for the validation report.
    _REPORT_COLUMNS = [
        'StructuredFormula', 'HMS', 'CrystalClass', 'Most Similar ICSD IDs',
        'Most Similar Compounds', 'Most Similar Crystal Systems',
        'predicted_space_group', 'predicted_crystal_system',
        'crystal_system_correctly_identified',
        'space_group_correctly_identified'
    ]

    def __init__(self):
        """Load the experimental table and the trained VAE."""
        self.exp_df = pd.read_csv(exp_data_file, sep='\t')
        # NOTE(review): the assignment target of ``AutoEncoder()`` was lost in
        # SOURCE.  ``self.VAE`` is used because the next statement calls
        # ``self.VAE.load_weights`` -- confirm.
        self.VAE = AutoEncoder()
        # The statement below rebinds ``self.VAE`` to whatever
        # ``load_weights`` returns, so keep a handle on the AutoEncoder
        # object for the nearest-neighbour helper.
        # NOTE(review): ``_fingerprinter`` is a reconstruction -- confirm.
        self._fingerprinter = self.VAE
        self.VAE = self.VAE.load_weights(path +
                                         '/saved_models/best_model_VAE.h5')

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    def _most_similar(self, compound, **kwargs):
        """Return a frame of the stored compounds most similar to ``compound``.

        NOTE(review): the callee name was lost in SOURCE; the surviving
        argument list was ``(self.VAE, <formula>, n=6, experimental=1,
        vae=True)``.  ``most_similar`` is a guess -- confirm.
        """
        return self._fingerprinter.most_similar(self.VAE, compound, **kwargs)

    @staticmethod
    def _confusion_matrix(y_true, y_pred, n_classes):
        """Return a row-normalised confusion matrix.

        Rows are true classes and columns are predicted classes, matching the
        axis labels used when the matrix is drawn.  Rows without any sample
        stay all-zero instead of becoming NaN.
        """
        counts = np.zeros((n_classes, n_classes))
        for t, p in zip(y_true, y_pred):
            counts[t, p] += 1
        row_totals = counts.sum(axis=1, keepdims=True)
        return np.divide(counts,
                         row_totals,
                         out=np.zeros_like(counts),
                         where=row_totals != 0)

    @staticmethod
    def _draw_confusion_matrix(cm,
                               classes,
                               out_file,
                               rotation,
                               tick_fontsize,
                               label_fontsize=None,
                               annotate=False):
        """Draw ``cm`` as a heat map and save it to ``out_file``."""
        fig, ax = plt.subplots()
        # NOTE(review): the last ``imshow`` argument was lost in SOURCE.  A
        # 'Blues' colormap is a stand-in that suits the white/black cell text
        # chosen below -- confirm.
        im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks and label them with the class names.
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               xticklabels=classes,
               yticklabels=classes,
               ylabel='True label',
               xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(),
                 rotation=rotation,
                 ha="right",
                 fontsize=tick_fontsize,
                 rotation_mode="anchor")
        plt.setp(ax.get_yticklabels(), fontsize=tick_fontsize)
        label_kwargs = {} if label_fontsize is None else {
            'fontsize': label_fontsize
        }
        ax.set_xlabel('Predicted label', **label_kwargs)
        ax.set_ylabel('True label', **label_kwargs)

        if annotate:
            # White text on dark cells, black text on light cells.
            thresh = cm.max() / 2.
            for i in range(cm.shape[0]):
                for j in range(cm.shape[1]):
                    ax.text(j,
                            i,
                            format(cm[i, j], '.2f'),
                            ha="center",
                            va="center",
                            color="white" if cm[i, j] > thresh else "black")

        fig.tight_layout()
        fig.savefig(out_file, dpi=800, bbox_inches='tight')

    # ------------------------------------------------------------------
    # validation
    # ------------------------------------------------------------------
    def validate(self):
        """Predict crystal system and space group by nearest-neighbour vote.

        For every experimental composition the five nearest stored compounds
        vote.  Results are written to ``fingerprint_validation.csv`` and the
        accuracy of both predictions is printed.

        Returns
        -------
        pandas.DataFrame
            The report columns of the experimental table.
        """

        def get_similar_compounds(row):
            # get 6 most similar entries; the 1st refers to the material
            # itself, so discard it and keep the rest
            most_similar_df = self._most_similar(row.StructuredFormula,
                                                 n=6,
                                                 experimental=1,
                                                 vae=True)
            neighbour_systems = most_similar_df['CrystalSystem'].to_list()[1:]
            neighbour_groups = most_similar_df['HMS'].to_list()[1:]

            # ``max`` returns the first maximal element, so on a tie the
            # value appearing earliest in the neighbour list wins.
            # NOTE(review): the original comment says this is the one with
            # the lowest Euclidean distance, i.e. it presumes the list is
            # ordered by distance -- confirm.
            most_voted_class = max(neighbour_systems,
                                   key=neighbour_systems.count)
            most_voted_space_group = max(neighbour_groups,
                                         key=neighbour_groups.count)

            correctly_identified_cc = int(
                row.CrystalClass == most_voted_class)  # crystal system
            correctly_identified_spg = int(
                row.HMS == most_voted_space_group)  # space group
            print(row.StructuredFormula, correctly_identified_cc)

            return (most_similar_df['CollectionCode'].to_list()[1:],
                    most_similar_df['StructuredFormula'].to_list()[1:],
                    neighbour_systems, most_voted_class,
                    most_voted_space_group,
                    most_similar_df['Euclidean Distance'].to_list()[1:],
                    correctly_identified_cc, correctly_identified_spg)

        result_columns = [
            'Most Similar ICSD IDs', 'Most Similar Compounds',
            'Most Similar Crystal Systems', 'predicted_crystal_system',
            'predicted_space_group', 'Euclidean Distances',
            'crystal_system_correctly_identified',
            'space_group_correctly_identified'
        ]
        per_column = zip(*self.exp_df.apply(get_similar_compounds, axis=1))
        for column, values in zip(result_columns, per_column):
            self.exp_df[column] = values

        # Accuracy is taken over rows that actually carry a reference label.
        corrects_cc = self.exp_df['crystal_system_correctly_identified'].sum()
        correct_percentage_cc = corrects_cc * 100.0 / self.exp_df[
            self.exp_df['CrystalClass'].notna()].shape[0]
        corrects_spg = self.exp_df['space_group_correctly_identified'].sum()
        correct_percentage_spg = corrects_spg * 100.0 / self.exp_df[
            self.exp_df['HMS'].notna()].shape[0]
        print(
            '%.2f%% of crystal systems have been correctly identified by the VAE fingerprinting model'
            % (correct_percentage_cc))
        print(
            '%.2f%% of space groups have been correctly identified by the VAE fingerprinting model'
            % (correct_percentage_spg))

        report = self.exp_df[self._REPORT_COLUMNS]
        report.to_csv(path + '/fingerprint_validation.csv',
                      sep='\t',
                      index=False)
        return report

    def get_confusion_matrix(self):
        """Print a report and plot the crystal-system confusion matrix.

        Reads ``fingerprint_validation.csv`` and saves
        ``figures/fingerprint_conf_mat.png``.  Rows without a reference
        ``CrystalClass`` are skipped.
        """
        # confusion matrix for crystal class classification
        system_val = {
            'triclinic': 1,
            'monoclinic': 2,
            'orthorhombic': 3,
            'tetragonal': 4,
            'cubic': 5,
            'trigonal': 6,
            'hexagonal': 7
        }
        classes = np.array([
            'Triclinic', 'Monoclinic', 'Orthorhombic', 'Tetragonal', 'Cubic',
            'Trigonal', 'Hexagonal'
        ])
        validate_df = pd.read_csv(path + '/fingerprint_validation.csv',
                                  sep='\t')

        y_true = []
        y_pred = []
        for t, p in zip(validate_df['CrystalClass'],
                        validate_df['predicted_crystal_system']):
            if not pd.isnull(t):
                y_true.append(system_val[t] - 1)
                y_pred.append(system_val[p] - 1)

        # The original filled cm[pred][true] while the figure labels rows as
        # 'True label'; rows are now true classes.
        cm = self._confusion_matrix(y_true, y_pred, len(classes))
        print(cm)

        # classification report; explicit ``labels`` keep target_names
        # aligned even when a crystal system never occurs in the data.
        print(
            classification_report(y_true,
                                  y_pred,
                                  labels=np.arange(len(classes)),
                                  target_names=classes,
                                  digits=4))

        self._draw_confusion_matrix(cm,
                                    classes,
                                    path +
                                    '/figures/fingerprint_conf_mat.png',
                                    rotation=45,
                                    tick_fontsize=12,
                                    label_fontsize=14,
                                    annotate=True)

    def get_spg_conf_mat(self):
        """Print a report and plot the space-group confusion matrix.

        Reads ``fingerprint_validation.csv`` and saves
        ``figures/fingerprint_spg_conf_mat.png``.  Rows without a reference
        ``HMS`` are skipped.
        """
        # confusion matrix for space group classification
        validate_df = pd.read_csv(path + '/fingerprint_validation.csv',
                                  sep='\t')
        pairs = [(t, p) for t, p in zip(validate_df['HMS'],
                                        validate_df['predicted_space_group'])
                 if not pd.isnull(t)]

        # Build the label set from usable rows only: sorting the raw column
        # raises TypeError as soon as it contains NaN next to strings.
        unique_spgs = sorted({label for pair in pairs for label in pair})
        spg_dict = {spg: idx for idx, spg in enumerate(unique_spgs)}

        # Indices are already 0-based.  The original subtracted 1, which sent
        # the first space group to the last row and shifted every class away
        # from its tick label.
        y_true = [spg_dict[t] for t, _ in pairs]
        y_pred = [spg_dict[p] for _, p in pairs]

        cm = self._confusion_matrix(y_true, y_pred, len(unique_spgs))
        classes = np.array(unique_spgs)

        # classification report
        print(
            classification_report(y_true,
                                  y_pred,
                                  labels=np.arange(len(unique_spgs)),
                                  target_names=classes,
                                  digits=4))

        self._draw_confusion_matrix(
            cm,
            classes,
            path + '/figures/fingerprint_spg_conf_mat.png',
            rotation=90,
            tick_fontsize=4)