def __init__(self,
             filename,
             cols_to_read,
             delimiter=',',
             tokens=None,
             pad=True,
             tokenize=True,
             augment=False,
             flip=True):
    """Dataset of SMILES strings with optional numeric targets.

    Reads the requested columns from ``filename``, sanitizes the SMILES,
    then optionally augments, pads, and tokenizes them.

    Args:
        filename: path passed to ``read_smiles_property_file``.
        cols_to_read: column indices; the first column read is the SMILES
            string, any remaining columns are numeric targets.
        delimiter: field delimiter of the input file.
        tokens: optional predefined token alphabet forwarded to
            ``get_tokens``.
        pad: if True, pad all sequences to a common length
            (``self.length`` records the original lengths).
        tokenize: if True, convert strings to tensors via ``seq2tensor``.
        augment: if True, augment SMILES (and targets) via
            ``augment_smiles``.
        flip: forwarded to ``seq2tensor`` when tokenizing.
    """
    super(SmilesDataset, self).__init__()
    self.tokenize = tokenize
    data = read_smiles_property_file(filename, cols_to_read, delimiter)
    smiles = data[0]
    # Drop invalid SMILES; clean_idx indexes the surviving rows.
    clean_smiles, clean_idx = sanitize_smiles(smiles)
    if len(data) > 1:
        # Remaining columns are targets. Transpose to (n_samples,
        # n_targets) and keep only rows whose SMILES survived
        # sanitization. (Fixed: the original converted the same array
        # to an ndarray twice — the second np.array() call was a no-op.)
        target = np.array(data[1:], dtype='float').T
        self.target = target[clean_idx]
    else:
        self.target = None
    if augment:
        clean_smiles, self.target = augment_smiles(clean_smiles,
                                                   self.target)
    if pad:
        clean_smiles, self.length = pad_sequences(clean_smiles)
    # Note: this rebinds the `tokens` parameter to the final alphabet.
    tokens, self.token2idx, self.num_tokens = get_tokens(
        clean_smiles, tokens)
    if tokenize:
        # NOTE(review): self.tokens is only assigned on this branch;
        # with tokenize=False the attribute never exists — confirm
        # downstream code tolerates that.
        clean_smiles, self.tokens = seq2tensor(clean_smiles, tokens, flip)
    self.data = clean_smiles
 def __init__(self,
              filename,
              tokenized=False,
              cols_to_read=None,
              delimiter=',',
              mol_tokens=None,
              prot_tokens=None,
              pad=True):
     super(SmilesProteinDataset, self).__init__()
     if not tokenized:
         data = read_smiles_property_file(filename, cols_to_read, delimiter)
         smiles = data[0]
         proteins = np.array(data[1])
         target = np.array(data[2], dtype='float')
         clean_smiles, clean_idx = sanitize_smiles(smiles)
         self.target = target[clean_idx]
         proteins = list(proteins[clean_idx])
         if pad:
             clean_smiles, self.mol_lengths = pad_sequences(clean_smiles)
             proteins, self.prot_lengths = pad_sequences(proteins)
         self.mol_tokens, self.mol_token2idx, self.mol_num_tokens = \
             get_tokens(clean_smiles, mol_tokens)
         self.prot_tokens, self.prot_token2idx, self.prot_num_tokens = \
             get_tokens(proteins, prot_tokens)
         clean_smiles = seq2tensor(clean_smiles, self.mol_tokens)
         proteins = seq2tensor(proteins, self.prot_tokens)
         self.molecules = clean_smiles
         self.proteins = proteins
     else:
         f = open(filename, 'rb')
         data = pickle.load(f)
         self.mol_tokens = data['smiles_tokens']
         self.prot_tokens = data['proteins_tokens']
         self.mol_num_tokens = len(data['smiles_tokens'])
         self.prot_num_tokens = len(data['proteins_tokens'])
         self.molecules = data['smiles']
         self.proteins = data['proteins']
         self.target = data['labels']
     assert len(self.molecules) == len(self.proteins)
     assert len(self.molecules) == len(self.target)
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, f1_score

from openchem.data.utils import read_smiles_property_file
# Read reactant SMILES (cols 11 and 12) and the regression target
# (col 14) from the reactions benchmark CSV.
data = read_smiles_property_file(
    './benchmark_datasets/reactions/4_11_with_y2.csv',
    cols_to_read=[11, 12, 14],
    keep_header=False)
reactant1 = data[0]
reactant2 = data[1]
labels = np.array(data[2], dtype="float").reshape(-1, 1)

# Join each reactant pair into one space-separated string so each
# reaction becomes a single input sequence.
reactants = [reactant1[i] + " " + reactant2[i] for i in range(len(reactant2))]

from openchem.data.utils import get_tokens
# Build the token alphabet over the full dataset before splitting so
# train and test share the same vocabulary.
tokens, _, _ = get_tokens(reactants)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reactants,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)
# Standardize targets with statistics from the training split only,
# avoiding test-set leakage.
y_mean = np.mean(y_train)
y_std = np.std(y_train)
y_train = (y_train - y_mean) / y_std
y_test = (y_test - y_mean) / y_std

from openchem.data.utils import save_smiles_property_file
# NOTE(review): this call is truncated at the chunk boundary — its
# remaining arguments and closing paren continue beyond this view.
save_smiles_property_file('./benchmark_datasets/reactions/train.smi',
                          X_train,
                          y_train,
from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F

from openchem.data.utils import read_smiles_property_file
# Read SMILES (col 0) and melting temperatures (col 1) from the
# tab-separated melting-point dataset.
data = read_smiles_property_file(
    'benchmark_datasets/melt_temp/melting_data.txt',
    cols_to_read=[0, 1],
    delimiter='\t',
    keep_header=False)
# NOTE(review): keep_header=False is already passed, yet the first row
# is dropped again with [1:] — confirm this does not discard a real
# data row.
smiles = data[0][1:]
labels = np.array(data[1][1:], dtype='float').reshape(-1)

from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(smiles)
# Presumably adds the space/padding character to the vocabulary so
# padded sequences can be encoded — TODO confirm.
tokens = tokens + ' '

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)

# Standardize targets using training-set statistics only; the test-set
# transform (and anything after) lies beyond this chunk.
train_mean = np.mean(y_train)
train_std = np.std(y_train)
print("Mean Tmelt in training data: ", train_mean)
print("Standard deviation of Tmelt in training data: ", train_std)
print("Min value of Tmelt in training data: ", np.min(y_train))
print("Max value of Tmelt in training data: ", np.max(y_train))
y_train = (y_train - train_mean) / train_std