def getGraphX(AID, data_dir='/home/gabriel/Dropbox/UCL/Thesis/Data/',
              save_df=True):
    '''Build graph representations for an assay and merge them into its table.

    Loads ``<data_dir>/<AID>/<AID>mol_processed.pkl``, converts every entry of
    its ``MOL`` column with ``mol2graph.mol2vec`` (for the PyTorch
    implementation), and inner-joins the result onto the table stored in
    ``<data_dir>/<AID>/<AID>_processed.pkl`` using ``PUBCHEM_CID`` as key.

    Parameters
    ----------
    AID : str
        Assay identifier; used both as the sub-directory name and as the
        file-name prefix.
    data_dir : str, optional
        Root directory that contains one sub-directory per assay.
    save_df : bool, optional
        When true (the default) the merged table is also pickled to
        ``<data_dir>/<AID>/<AID>graph_processed.pkl``.

    Returns
    -------
    pandas.DataFrame
        The main activity table with an extra ``Graph Rep`` column.

    Raises
    ------
    FileNotFoundError
        If either input pickle is missing.
    '''
    AID_path = os.path.join(data_dir, AID)

    # NOTE: unpickling executes arbitrary code; only point this at files
    # produced by this project's own preprocessing step.
    with open(f'{AID_path}/{AID}mol_processed.pkl', 'rb') as mol_file:
        activity_table = pickle.load(mol_file)

    graph_rep_list = [mol2graph.mol2vec(m) for m in activity_table['MOL']]

    # The graph list is positional, so the CIDs must be attached positionally
    # as well. Assigning the Series itself would align on index labels and
    # silently mis-pair (or NaN-fill) rows when activity_table does not carry
    # a default 0..n-1 index.
    AID_and_graph_rep = pd.DataFrame({
        'Graph Rep': graph_rep_list,
        'PUBCHEM_CID': activity_table['PUBCHEM_CID'].to_numpy(),
    })

    with open(f'{AID_path}/{AID}_processed.pkl', 'rb') as main_file:
        main_activity_table = pickle.load(main_file)

    main_activity_table = main_activity_table.merge(AID_and_graph_rep,
                                                    on='PUBCHEM_CID')

    if save_df:
        main_activity_table.to_pickle(f'{AID_path}/{AID}graph_processed.pkl')
    return main_activity_table


#%%
'''Classifier Section'''
'''SVM'''
test_size=0.2, random_state=2562) labels = np.array([ 1 if x == 'Active' else 0 for x in activity_table['PUBCHEM_ACTIVITY_OUTCOME'] ]) for big_train_ind, big_test_ind in big_splitter.split( activity_table, activity_table['PUBCHEM_ACTIVITY_OUTCOME']): train_X = np.atleast_2d(activity_table['MOL'].iloc[big_train_ind]).T train_y = labels[big_train_ind] test_X = np.atleast_2d(activity_table['MOL'].iloc[big_test_ind]).T test_y = labels[big_test_ind] train_X_oversampled, train_y_oversampled = ros.fit_resample( train_X, train_y) train_X = [ mol2graph.mol2vec(m) for m in np.squeeze(train_X_oversampled) ] test_X = [mol2graph.mol2vec(m) for m in np.squeeze(test_X)] #attach train labels to data for data, label in zip(train_X, train_y_oversampled): data.y = torch.tensor([[label]], dtype=torch.float) for data, label in zip(test_X, test_y): data.y = torch.tensor([[label]], dtype=torch.float) train_loader = DataLoader(train_X, batch_size=128, shuffle=True, drop_last=False, num_workers=8) test_loader = DataLoader(test_X, batch_size=128, shuffle=True,
f'Epoch: {epoch}, Loss: {epoch_loss:.3f}, Train acc: {train_acc:.3f}, Val acc: {val_acc:.3f}',
        )
    return hist


# Script entry point: load the solubility SDF files, turn every molecule into
# a labelled graph sample, and print a short summary of the training set.
if __name__ == '__main__':
    # get_data() is defined elsewhere in this file; presumably it makes the
    # two SDF files below available locally -- TODO confirm.
    get_data()

    # Materialise the suppliers so the molecules can be indexed later on.
    # NOTE(review): RDKit suppliers can yield None for records they fail to
    # parse; mol2vec / GetProp below would then raise -- confirm inputs.
    train_mols = [m for m in Chem.SDMolSupplier('solubility.train.sdf')]
    test_mols = [m for m in Chem.SDMolSupplier('solubility.test.sdf')]

    # Map the textual 'SOL_classification' property to an integer class id.
    sol_cls_dict = {'(A) low': 0, '(B) medium': 1, '(C) high': 2}
    print(sol_cls_dict)

    # Convert each training molecule to its graph object and attach the class
    # id as a one-element long tensor in the sample's ``y`` attribute.
    train_x = [mol2graph.mol2vec(m) for m in train_mols]
    for i, data in enumerate(train_x):
        # train_x[i] was built from train_mols[i], so the index pairs them up.
        y = sol_cls_dict[train_mols[i].GetProp('SOL_classification')]
        data.y = torch.tensor([y], dtype=torch.long)

    # Same conversion and labelling for the test split.
    test_x = [mol2graph.mol2vec(m) for m in test_mols]
    for i, data in enumerate(test_x):
        y = sol_cls_dict[test_mols[i].GetProp('SOL_classification')]
        data.y = torch.tensor([y], dtype=torch.long)

    # Quick sanity report on the training set and its first sample.
    print(f'Number of graphs: {len(train_x)}')
    first_sample = train_x[0]
    print(f'Looking at first example ..')
    print(f'\t{train_x[0]}')
    # The first sample's ``x`` attribute is reported as (nodes, features per
    # node), going by the labels printed here.
    print(f'\t # of nodes: {first_sample.x.shape[0]}')
    print(f'\t # of features per node: {first_sample.x.shape[1]}')