Example #1
 def __init__(self,
              filename,
              cols_to_read,
              delimiter=',',
              tokens=None,
              pad=True,
              tokenize=True,
              augment=False,
              flip=True):
     super(SmilesDataset, self).__init__()
     self.tokenize = tokenize
     data = read_smiles_property_file(filename, cols_to_read, delimiter)
     smiles = data[0]
     clean_smiles, clean_idx = sanitize_smiles(smiles)
     if len(data) > 1:
         target = np.array(data[1:], dtype='float').T
         self.target = target[clean_idx]
     else:
         self.target = None
     if augment:
         clean_smiles, self.target = augment_smiles(clean_smiles,
                                                    self.target)
     if pad:
         clean_smiles, self.length = pad_sequences(clean_smiles)
     tokens, self.token2idx, self.num_tokens = get_tokens(
         clean_smiles, tokens)
     if tokenize:
         clean_smiles, self.tokens = seq2tensor(clean_smiles, tokens, flip)
     self.data = clean_smiles
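
A minimal usage sketch for the dataset above (the file name and column indices are assumptions, not part of the original example):

# Hypothetical usage: 'smiles_data.csv' and the column indices are assumptions.
dataset = SmilesDataset('smiles_data.csv',
                        cols_to_read=[0, 1],
                        delimiter=',',
                        pad=True,
                        tokenize=True)
print(dataset.num_tokens, len(dataset.data))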
Example #2
    def infer(self, batch_size, prime_str, tokens, max_len=120, end_token='>'):
        self.eval()
        hidden = self.Encoder.init_hidden(batch_size)
        if self.has_stack:
            stack = self.Stack.init_stack(batch_size)
        else:
            stack = None
        prime_input, _ = seq2tensor([prime_str] * batch_size,
                                    tokens=tokens,
                                    flip=False)
        prime_input = torch.tensor(prime_input).long()
        if self.use_cuda:
            prime_input = prime_input.cuda()
        new_samples = [[prime_str] * batch_size]

        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden, stack = self.forward_step(prime_input[:, p],
                                                 hidden,
                                                 stack)
        inp = prime_input[:, -1]

        for p in range(max_len):
            output, hidden, stack = self.forward_step(inp, hidden, stack)
            # Sample from the network as a multinomial distribution
            probs = torch.softmax(output, dim=1).detach()
            top_i = torch.multinomial(probs, 1).cpu().numpy()

            # Add predicted character to string and use as next input
            predicted_char = np.array(tokens)[top_i].reshape(-1).tolist()
            new_samples.append(predicted_char)

            # Prepare next input token for the generator
            inp, _ = seq2tensor(predicted_char, tokens=tokens)
            inp = torch.tensor(inp.squeeze(1)).long()
            if self.use_cuda:
                inp = inp.cuda()

        # Remove characters after end tokens
        string_samples = []
        new_samples = np.array(new_samples)
        for i in range(batch_size):
            sample = list(new_samples[:, i])
            if end_token in sample:
                end_token_idx = sample.index(end_token)
                string_samples.append(''.join(sample[1:end_token_idx]))
        return string_samples
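
A hedged usage sketch for infer (the trained generator instance and the token list are assumptions; only the call signature comes from the example above):

# Hypothetical call: `generator` and the token list are assumptions.
tokens = ['<', '>', 'C', 'N', 'O', '(', ')', '1', '2', '=']
samples = generator.infer(batch_size=16,
                          prime_str='<',
                          tokens=tokens,
                          max_len=120,
                          end_token='>')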
Example #3
 def __init__(self,
              filename,
              tokenized=False,
              cols_to_read=None,
              delimiter=',',
              mol_tokens=None,
              prot_tokens=None,
              pad=True):
     super(SmilesProteinDataset, self).__init__()
     if not tokenized:
         data = read_smiles_property_file(filename, cols_to_read, delimiter)
         smiles = data[0]
         proteins = np.array(data[1])
         target = np.array(data[2], dtype='float')
         clean_smiles, clean_idx = sanitize_smiles(smiles)
         self.target = target[clean_idx]
         proteins = list(proteins[clean_idx])
         if pad:
             clean_smiles, self.mol_lengths = pad_sequences(clean_smiles)
             proteins, self.prot_lengths = pad_sequences(proteins)
         self.mol_tokens, self.mol_token2idx, self.mol_num_tokens = \
             get_tokens(clean_smiles, mol_tokens)
         self.prot_tokens, self.prot_token2idx, self.prot_num_tokens = \
             get_tokens(proteins, prot_tokens)
         clean_smiles, _ = seq2tensor(clean_smiles, self.mol_tokens)
         proteins, _ = seq2tensor(proteins, self.prot_tokens)
         self.molecules = clean_smiles
         self.proteins = proteins
     else:
         with open(filename, 'rb') as f:
             data = pickle.load(f)
         self.mol_tokens = data['smiles_tokens']
         self.prot_tokens = data['proteins_tokens']
         self.mol_num_tokens = len(data['smiles_tokens'])
         self.prot_num_tokens = len(data['proteins_tokens'])
         self.molecules = data['smiles']
         self.proteins = data['proteins']
         self.target = data['labels']
     assert len(self.molecules) == len(self.proteins)
     assert len(self.molecules) == len(self.target)
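
A minimal usage sketch (the file name and column indices are assumptions; the file is expected to hold SMILES, protein sequence, and label columns):

# Hypothetical usage: 'binding_data.csv' and the column indices are assumptions.
dataset = SmilesProteinDataset('binding_data.csv',
                               cols_to_read=[0, 1, 2],
                               delimiter=',')
print(dataset.mol_num_tokens, dataset.prot_num_tokens)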
Example #4
    def infer(self, prime_str, n_to_generate, max_len, tokens, temperature=0.8):
        self.eval()
        tokens = np.array(tokens).reshape(-1)
        prime_str = [prime_str] * n_to_generate
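        # `tokens` apparently arrives wrapped in a one-element list/array
        # holding a single string of all characters; list(tokens[0]) splits
        # it into one-character tokens (an inference, not documented)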
        tokens = list(tokens[0])
        num_tokens = len(tokens)
        prime_input = seq2tensor(prime_str, tokens)
        tokens = np.array(tokens)
        batch_size = prime_input.shape[0]
        seq_len = prime_input.shape[1] - 1
        hidden = self.Encoder.init_hidden(batch_size)
        prime_input = torch.tensor(prime_input).long()
        if self.use_cuda:
            prime_input = prime_input.cuda()
        if self.has_stack:
            stack = self.Stack.init_stack(batch_size)
        for c in range(seq_len):
            inp_token = self.Embedding(prime_input[:, c].view(batch_size, -1))
            if self.has_stack:
                stack = self.Stack(hidden, stack)
                stack_top = stack[:, 0, :].unsqueeze(1)
                inp_token = torch.cat((inp_token, stack_top), dim=2)
            output, hidden = self.Encoder(inp_token, hidden)
        inp = prime_input[:, -1]
        predicted = [' '] * (batch_size * (max_len - seq_len))
        predicted = np.reshape(predicted, (batch_size, max_len - seq_len))
        for c in range(max_len - seq_len):
            inp_token = self.Embedding(inp.view(batch_size, -1))
            if self.has_stack:
                stack = self.Stack(hidden, stack)
                stack_top = stack[:, 0, :].unsqueeze(1)
                inp_token = torch.cat((inp_token, stack_top), dim=2)
            output, hidden = self.Encoder(inp_token, hidden)
            output = self.MLP(output)
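            # Temperature sampling: scale logits by 1/temperature and
            # exponentiate; torch.multinomial accepts unnormalized weights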
            output_dist = output.data.view(-1).div(temperature).exp()
            output_dist = output_dist.view(batch_size, num_tokens)
            top_i = torch.multinomial(output_dist, 1)
            # Add predicted character to string and use as next input
            predicted_char = tokens[top_i.cpu().numpy()]
            predicted[:, c] = predicted_char[:, 0]
            inp = top_i

        return predicted
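
A hedged usage sketch (the generator instance is an assumption; note that this variant expects `tokens` as a one-element list holding a single string of all characters):

# Hypothetical call: `generator` and the token string are assumptions.
samples = generator.infer(prime_str='<',
                          n_to_generate=32,
                          max_len=120,
                          tokens=['<>CNO()12='],
                          temperature=0.8)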
Example #5
 def predict(self, smiles, use_tqdm=False):
     double = False
     canonical_smiles = []
     invalid_smiles = []
     if use_tqdm:
         pbar = tqdm(range(len(smiles)))
     else:
         pbar = range(len(smiles))
     for i in pbar:
         sm = smiles[i]
         if use_tqdm:
             pbar.set_description("Calculating predictions...")
         try:
             canonical = Chem.MolToSmiles(Chem.MolFromSmiles(sm, sanitize=False))
             if len(canonical) == 0:
                 invalid_smiles.append(sm)
             else:
                 canonical_smiles.append(canonical)
         except Exception:
             invalid_smiles.append(sm)
     if len(canonical_smiles) == 0:
         return canonical_smiles, [], invalid_smiles
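     # Some models require a batch of at least two inputs: duplicate a
     # lone SMILES string here and undo the duplication before returning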
     if len(canonical_smiles) == 1:
         double = True
         canonical_smiles = [canonical_smiles[0], canonical_smiles[0]]
     padded_smiles, length = pad_sequences(canonical_smiles)
     smiles_tensor, _ = seq2tensor(padded_smiles, self.tokens, flip=False)
     prediction = []
     for i in range(len(self.model)):
         prediction.append(self.model[i](
             [torch.LongTensor(smiles_tensor).cuda(),
              torch.LongTensor(length).cuda()],
             eval=True).detach().cpu().numpy())
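     # Stack the per-model outputs and keep the element-wise minimum
     # across the ensemble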
     prediction = np.array(prediction).reshape(len(self.model), -1)
     prediction = np.min(prediction, axis=0)
     if double:
         canonical_smiles = [canonical_smiles[0]]
         prediction = [prediction[0]]
     return canonical_smiles, prediction, invalid_smiles
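
A hedged usage sketch for predict (the predictor object wrapping the model ensemble is an assumption; only the method signature and return values come from the example above):

# Hypothetical usage: `predictor` holds an ensemble in self.model.
valid, scores, invalid = predictor.predict(['CCO', 'c1ccccc1', 'xyz'],
                                           use_tqdm=False)
for sm, score in zip(valid, scores):
    print(sm, score)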