def __init__(self, filename, cols_to_read, delimiter=',', tokens=None, pad=True, tokenize=True, augment=False, flip=True):
    """Load a SMILES dataset with optional numeric property targets.

    Reads columns ``cols_to_read`` from ``filename``, sanitizes the SMILES
    strings, and optionally augments, pads, and tokenizes them.

    Args:
        filename (str): path to the delimited property file.
        cols_to_read (list): column indices; column 0 must be SMILES,
            any further columns are treated as float targets.
        delimiter (str): field delimiter of the input file.
        tokens (list or None): predefined token vocabulary; if ``None``
            the vocabulary is inferred from the data by ``get_tokens``.
        pad (bool): pad sequences to equal length (also sets ``self.length``).
        tokenize (bool): convert strings to tensors via ``seq2tensor``.
        augment (bool): apply ``augment_smiles`` to SMILES and targets.
        flip (bool): passed through to ``seq2tensor``.
    """
    super(SmilesDataset, self).__init__()
    self.tokenize = tokenize
    data = read_smiles_property_file(filename, cols_to_read, delimiter)
    smiles = data[0]
    # Drop invalid SMILES; clean_idx maps surviving rows back into `data`.
    clean_smiles, clean_idx = sanitize_smiles(smiles)
    if len(data) > 1:
        # Remaining columns are targets. Transpose to (n_samples, n_tasks)
        # and keep only rows whose SMILES survived sanitization.
        # (Fix: removed a redundant second np.array() wrapping of `target`.)
        target = np.array(data[1:], dtype='float').T
        self.target = target[clean_idx]
    else:
        self.target = None
    if augment:
        clean_smiles, self.target = augment_smiles(clean_smiles, self.target)
    if pad:
        clean_smiles, self.length = pad_sequences(clean_smiles)
    tokens, self.token2idx, self.num_tokens = get_tokens(
        clean_smiles, tokens)
    if tokenize:
        # NOTE(review): self.tokens is only defined when tokenize=True;
        # callers relying on it with tokenize=False will hit AttributeError.
        clean_smiles, self.tokens = seq2tensor(clean_smiles, tokens, flip)
    self.data = clean_smiles
def __init__(self, filename, tokenized=False, cols_to_read=None, delimiter=',', mol_tokens=None, prot_tokens=None, pad=True):
    """Load a paired SMILES/protein dataset with binding labels.

    Args:
        filename (str): raw property file (``tokenized=False``) or a
            pickle produced earlier (``tokenized=True``).
        tokenized (bool): when True, load pre-tokenized arrays from pickle
            instead of parsing and tokenizing the raw file.
        cols_to_read (list or None): column indices (SMILES, protein,
            label) for the raw-file path.
        delimiter (str): field delimiter of the raw file.
        mol_tokens (list or None): predefined SMILES vocabulary or None
            to infer it from the data.
        prot_tokens (list or None): predefined protein vocabulary or None
            to infer it from the data.
        pad (bool): pad both sequence sets to equal length.
    """
    super(SmilesProteinDataset, self).__init__()
    if not tokenized:
        data = read_smiles_property_file(filename, cols_to_read, delimiter)
        smiles = data[0]
        proteins = np.array(data[1])
        target = np.array(data[2], dtype='float')
        # Drop invalid SMILES and keep targets/proteins aligned with them.
        clean_smiles, clean_idx = sanitize_smiles(smiles)
        self.target = target[clean_idx]
        proteins = list(proteins[clean_idx])
        if pad:
            clean_smiles, self.mol_lengths = pad_sequences(clean_smiles)
            proteins, self.prot_lengths = pad_sequences(proteins)
        self.mol_tokens, self.mol_token2idx, self.mol_num_tokens = \
            get_tokens(clean_smiles, mol_tokens)
        self.prot_tokens, self.prot_token2idx, self.prot_num_tokens = \
            get_tokens(proteins, prot_tokens)
        # NOTE(review): elsewhere seq2tensor's result is unpacked into
        # (tensor, tokens); here the whole return value is stored -- confirm
        # seq2tensor's return signature matches this usage.
        clean_smiles = seq2tensor(clean_smiles, self.mol_tokens)
        proteins = seq2tensor(proteins, self.prot_tokens)
        self.molecules = clean_smiles
        self.proteins = proteins
    else:
        # Fix: context manager guarantees the file handle is closed even if
        # pickle.load raises (the original left the handle open).
        # NOTE(review): pickle.load can execute arbitrary code -- only load
        # trusted, locally produced files here.
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        self.mol_tokens = data['smiles_tokens']
        self.prot_tokens = data['proteins_tokens']
        self.mol_num_tokens = len(data['smiles_tokens'])
        self.prot_num_tokens = len(data['proteins_tokens'])
        self.molecules = data['smiles']
        self.proteins = data['proteins']
        self.target = data['labels']
    # Every molecule must have a matching protein and label.
    assert len(self.molecules) == len(self.proteins)
    assert len(self.molecules) == len(self.target)
# Preprocessing script: build train/test splits for a reaction-yield
# regression task from a two-reactant CSV.
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, f1_score
from openchem.data.utils import read_smiles_property_file
# Columns 11 and 12 hold the two reactant SMILES, column 14 the label.
data = read_smiles_property_file(
    './benchmark_datasets/reactions/4_11_with_y2.csv',
    cols_to_read=[11, 12, 14],
    keep_header=False)
reactant1 = data[0]
reactant2 = data[1]
labels = np.array(data[2], dtype="float").reshape(-1, 1)
# Concatenate the two reactants into a single space-separated sequence.
reactants = [reactant1[i] + " " + reactant2[i] for i in range(len(reactant2))]
from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(reactants)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reactants, labels, test_size=0.2, random_state=42)
# Standardize labels using training-set statistics only (no test leakage).
y_mean = np.mean(y_train)
y_std = np.std(y_train)
y_train = (y_train - y_mean) / y_std
y_test = (y_test - y_mean) / y_std
from openchem.data.utils import save_smiles_property_file
save_smiles_property_file('./benchmark_datasets/reactions/train.smi', X_train, y_train,
# Preprocessing script: build train/test splits for a melting-temperature
# regression task from a tab-separated SMILES/Tmelt file.
from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from openchem.data.utils import read_smiles_property_file
data = read_smiles_property_file(
    'benchmark_datasets/melt_temp/melting_data.txt',
    cols_to_read=[0, 1],
    delimiter='\t',
    keep_header=False)
# NOTE(review): the [1:] slice drops the first row as if it were a header,
# even though keep_header=False was passed -- confirm keep_header semantics.
smiles = data[0][1:]
labels = np.array(data[1][1:], dtype='float').reshape(-1)
from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(smiles)
# Append space as an extra (padding) token to the vocabulary string.
tokens = tokens + ' '
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles, labels, test_size=0.2, random_state=42)
# Standardize targets using training-set statistics only (no test leakage).
train_mean = np.mean(y_train)
train_std = np.std(y_train)
print("Mean Tmelt in training data: ", train_mean)
print("Standard deviation of Tmelt in training data: ", train_std)
print("Min value of Tmelt in training data: ", np.min(y_train))
print("Max value of Tmelt in training data: ", np.max(y_train))
y_train = (y_train - train_mean) / train_std