Esempio n. 1
0
File: ODO.py Progetto: adw62/ODO
 def train_generator(self, moment_files):
     """Pretrain the vector-based RNN and store its normalisation moments.

     If ``vector_based/vecs.csv`` is absent from ``self.gen_data_dir`` it is
     first produced by passing the first column of ``mols.csv`` to
     ``get_latent_vecs``. ``pretrain`` is then run and the two values it
     returns (``mew`` and ``std``) are written as single-row CSV files with
     a heading row taken from ``get_headings()``.

     Args:
         moment_files: Indexable holding two output paths; element 0
             receives ``mew`` and element 1 receives ``std``.

     Returns:
         Path of the checkpoint file
         (``self.gen_data_dir + 'vector_based/Prior.ckpt'``).
     """
     print('Training RNN')
     subdir = 'vector_based/'
     vec_name = subdir + 'vecs.csv'

     # Only build the vector file when it is not already on disk.
     if not os.path.exists(self.gen_data_dir + vec_name):
         print('Gen Vectors....')
         frame = pd.read_csv(self.gen_data_dir + 'mols.csv', header=0)
         first_column = [row[0] for row in frame.values]
         get_latent_vecs(first_column, self.gen_data_dir, vec_name)

     mew, std = pretrain(self.gen_data_dir, 'Voc', vec_name,
                         'input_mols_filtered.csv', subdir + 'Prior.ckpt')
     checkpoint_path = self.gen_data_dir + subdir + 'Prior.ckpt'

     # Both moment files share the same layout: heading row + one data row.
     heading_row = ','.join(get_headings())
     for slot, moment in enumerate((mew, std)):
         np.savetxt(moment_files[slot],
                    np.array([moment]),
                    header=heading_row,
                    delimiter=',',
                    comments='',
                    newline='\n')
     return checkpoint_path
Esempio n. 2
0
    def __init__(self, voc, smi_file, vec_file):
        """Load SMILES strings and their generation vectors, then normalise.

        Args:
            voc: Vocabulary object; stored unchanged on ``self.voc``.
            smi_file (string): Path to a CSV file with a header row. The
                first column of every row is read as a string and kept in
                ``self.smiles``.
            vec_file (string): Path to a CSV file with a header row holding
                the generation vectors. Its columns are reordered to match
                ``get_headings()``.

        Raises:
            ValueError: If ``vec_file`` contains missing values, or if any
                column named by ``get_headings()`` is absent from it.

        After construction ``self.vectors`` holds the vectors normalised as
        ``(x - self.mew) / self.std``, with ``self.mew`` / ``self.std``
        obtained from ``get_moments`` and zero entries of ``self.std``
        replaced by 1.0.
        """
        self.voc = voc
        self.smiles = pd.read_csv(smi_file, header=0, dtype=str).values
        self.smiles = [x[0] for x in self.smiles]

        # reading descriptors
        data = pd.read_csv(vec_file, header=0)
        # look for missing data entries
        if data.isnull().values.any():
            raise ValueError(
                'Found nan in data, possible data missing in generation vectors.'
            )

        # correct heading order; a heading absent from the file becomes an
        # all-NaN column, which the second check below turns into an error
        headings = get_headings()
        data = data.reindex(columns=headings)
        # look for missing columns
        if data.isnull().values.any():
            raise ValueError(
                'Found nan in data, possible columns missing in generation vectors.'
            )

        # calculate mew and std for normalization of generation vectors
        self.vectors = data.values
        self.mew, self.std = get_moments(self.vectors)
        # catch any zeros which will give nan when normalizing
        self.std = np.array([x if x != 0 else 1.0 for x in self.std])
        self.vectors = (self.vectors - self.mew) / self.std
Esempio n. 3
0
File: ODO.py Progetto: adw62/ODO
 def discrim_to_gen(self):
     """Write distinct rounded variants of ``self.x_solution`` to ``rounded.csv``.

     ``get_float_bool`` supplies one flag per component of the solution.
     Components flagged ``1`` are copied unchanged; every other component
     is passed through ``prob_round``. Fifty variants are drawn, duplicates
     are discarded, and the survivors are saved to
     ``self.discrim_data_dir + 'rounded.csv'`` under a heading row taken
     from ``get_headings()``.
     """
     float_flags = get_float_bool(self.discrim_data_dir, 'float_bool.csv')

     distinct_rows = []
     for _ in range(50):
         candidate = [
             value if flag == 1 else prob_round(value)
             for value, flag in zip(self.x_solution, float_flags)
         ]
         # keep only the first occurrence of each rounded vector
         if candidate in distinct_rows:
             continue
         distinct_rows.append(candidate)

     table = np.array(distinct_rows)
     heading_row = ','.join(get_headings())
     np.savetxt(self.discrim_data_dir + 'rounded.csv',
                table,
                header=heading_row,
                delimiter=',',
                comments='',
                newline='\n')
Esempio n. 4
0
File: ODO.py Progetto: adw62/ODO
    def k_near_search(self, vector, lib_vec_file, num_neighbours=1):
        """Find the nearest library vectors to ``vector``.

        The library CSV is streamed in chunks; a KD-tree is built per chunk
        and queried, and the per-chunk candidates are then merged so that
        the result is the global ``num_neighbours`` nearest rows.

        Args:
            vector: Vector of the compound to search neighbours for.
            lib_vec_file: Path to a CSV file (with header row) holding the
                library of compounds in vector form; columns are reordered
                to ``get_headings()``.
            num_neighbours: Number of neighbours to return (int, default 1).

        Returns:
            ``[vectors, indices]``. For ``num_neighbours == 1`` these are
            the single nearest row and its row number in the file (header
            excluded); otherwise both carry an extra axis of length
            ``num_neighbours``, ordered by increasing distance.

        Raises:
            ValueError: If the library holds fewer than ``num_neighbours``
                rows (including the case of an empty library).
        """
        chunksize = 100000
        headings = get_headings()
        query = np.array(vector)
        single = num_neighbours == 1

        chunk_dists = []
        chunk_index = []
        chunk_vecs = []
        total_rows = 0
        print('Looking for nearest neighbour:')
        for i, chunk in enumerate(
                pd.read_csv(lib_vec_file, chunksize=chunksize, header=0)):
            print('Evaluating chunk {} of length {}'.format(i, len(chunk)))
            # correct heading order
            training_data = chunk.reindex(columns=headings).values
            n_rows = len(training_data)
            if n_rows == 0:
                continue
            total_rows += n_rows

            tree = spatial.KDTree(training_data)
            dist, index = tree.query(query, k=num_neighbours)
            dist = np.asarray(dist)
            index = np.asarray(index)
            if single:
                # k=1 results have no neighbour axis; add one so every
                # chunk can be merged along the same (last) axis.
                dist = dist[..., np.newaxis]
                index = index[..., np.newaxis]

            # When a chunk has fewer than k rows, KDTree.query pads with
            # distance inf and index n_rows. Clamp the index so the gather
            # stays in range; the inf distance keeps such padding last.
            vecs = training_data[np.minimum(index, n_rows - 1)]

            chunk_dists.append(dist)
            chunk_index.append(index + (i * chunksize))
            chunk_vecs.append(vecs)

        if total_rows < num_neighbours or total_rows == 0:
            raise ValueError(
                'Library {} holds {} vectors, cannot return {} neighbours'.
                format(lib_vec_file, total_rows, num_neighbours))

        all_dists = np.concatenate(chunk_dists, axis=-1)
        all_index = np.concatenate(chunk_index, axis=-1)
        all_vecs = np.concatenate(chunk_vecs, axis=-2)

        # Stable sort: on equal distance the earliest chunk wins.
        order = np.argsort(all_dists, axis=-1,
                           kind='stable')[..., :num_neighbours]
        best_index = np.take_along_axis(all_index, order, axis=-1)
        best_vecs = np.take_along_axis(all_vecs,
                                       order[..., np.newaxis],
                                       axis=-2)
        if single:
            # drop the helper neighbour axis again for the k=1 result
            return [best_vecs[..., 0, :], best_index[..., 0]]
        return [best_vecs, best_index]
Esempio n. 5
0
File: ODO.py Progetto: adw62/ODO
    def generate(self,
                 data_dir,
                 ckpt_file,
                 mode='vectors',
                 batch_size=1,
                 samples=50,
                 moments=None):
        """Sample SMILES strings from the RNN and return the valid, unique ones.

        Args:
            data_dir: Not used by this method; kept for interface
                compatibility.
            ckpt_file: Path of the RNN state dict loaded into the model.
            mode: ``'reinvent'`` conditions on one all-zero vector;
                ``'vectors'`` conditions on every row of
                ``self.discrim_data_dir + 'rounded.csv'`` after normalising
                it with ``(x - mew) / std``.
            batch_size: Batch size handed to ``Prior.sample``. Only the
                first sequence of each batch is converted to SMILES.
            samples: Number of sampling calls per conditioning vector.
            moments: Optional pair ``[mew_file, std_file]`` of CSV files to
                read the normalisation moments from. When ``None`` they are
                computed from ``vector_based/vecs.csv``.

        Returns:
            List of unique SMILES strings that RDKit could parse. Each
            valid sample is also written to ``./output_smi.csv`` together
            with the index of the vector it was generated from.

        Raises:
            ValueError: If ``mode`` is not a supported generation mode.
        """
        modes = ['reinvent', 'vectors']
        network_size = 398
        if mode == 'reinvent':
            # A torch tensor (not a NumPy array) so that the sampling loop
            # below can call .float() on each row.
            data = torch.zeros(1, network_size)
        elif mode == 'vectors':
            vec_file = self.gen_data_dir + 'vector_based/vecs.csv'
            if moments is None:
                # Calculate the mew and std used to normalize generation data
                data = pd.read_csv(vec_file, header=0)
                # correct heading order
                data = data.reindex(columns=get_headings())
                data = data.values
                mew, std = get_moments(data)
            else:
                # read mew and std from file, this saves some time and memory
                mew = pd.read_csv(moments[0], header=0)
                std = pd.read_csv(moments[1], header=0)
                mew = mew.reindex(columns=get_headings()).values
                std = std.reindex(columns=get_headings()).values
            # catch any zeros which would give nan/inf when normalizing,
            # whichever way the moments were obtained
            std = np.asarray(std)
            std = np.where(std == 0, 1.0, std)

            vectors = pd.read_csv(self.discrim_data_dir + 'rounded.csv',
                                  header=0)
            vectors = vectors.reindex(columns=get_headings()).values
            vectors = (vectors - mew) / std

            # replace data with normalized vectors
            data = torch.FloatTensor(vectors)
        else:
            raise ValueError('Supported generation modes are {}'.format(modes))

        voc = Vocabulary(init_from_file=self.gen_data_dir + 'Voc')
        Prior = RNN(voc, network_size)

        if torch.cuda.is_available():
            Prior.rnn.load_state_dict(torch.load(ckpt_file))
        else:
            # no GPU: keep tensors on the storage they were deserialised to
            Prior.rnn.load_state_dict(
                torch.load(ckpt_file,
                           map_location=lambda storage, loc: storage))

        all_smi = set()
        valid = 0
        with open('./output_smi.csv', 'w') as out:
            for j, test_vec in enumerate(data):
                test_vec = test_vec.float()
                for _ in range(samples):
                    seqs, _likelihood, _entropy = Prior.sample(
                        batch_size, test_vec)
                    smiles = seq_to_smiles(seqs, voc)[0]
                    if Chem.MolFromSmiles(smiles):
                        valid += 1
                        all_smi.add(smiles)
                        out.write(smiles + str(',{}\n'.format(j)))
        all_smi = list(all_smi)

        # Avoid ZeroDivisionError when nothing was attempted.
        attempts = samples * len(data)
        valid_percent = 100 * (valid / attempts) if attempts else 0.0
        print("\n{:>4.1f}% valid SMILES".format(valid_percent))
        return all_smi
Esempio n. 6
0
File: ODO.py Progetto: adw62/ODO
    def __init__(self):
        """Run the full optimise -> generate -> re-score pipeline.

        Steps performed, in order:
          1. Solve the discriminative model for inputs predicted to give
             ``self.target`` (``ODO.discrim``).
          2. Convert that solution to generator input
             (``ODO.discrim_to_gen``).
          3. Optionally train the generator (disabled by the local
             ``train_RNN`` flag).
          4. Generate SMILES (``ODO.generate``), convert them back to
             vectors (``get_latent_vecs``) and score them
             (``ODO.predict_with_discrim``).
          5. Save the predictions to ``./a_<target>_m_<mixing>.dat`` and
             print summary statistics and the generated SMILES.

        Raises:
            ValueError: If ``output_vecs.csv`` cannot be read back as
                float64 data; the original error is chained as the cause.
        """
        # Define file locations
        self.discrim_data_dir = './discriminator/data/'
        self.gen_data_dir = './generator/data/'
        self.gen_ckpt_file = self.gen_data_dir + 'vector_based/Prior.ckpt'

        self.target = 6.1
        self.mixing = 0.0
        # Make a discriminative model and use finite differences to solve
        # this model for a set of inputs predicted to give a set target
        self.y_property, self.x_solution = ODO.discrim(
            self, target_property=self.target, train=False)

        print('Target activity {}, optimized solution achieved activity {}'.
              format(self.target, self.y_property[0]))

        # Convert the solution of the discriminative model to a form that
        # can be fed into a generative model
        ODO.discrim_to_gen(self)

        mew = self.gen_data_dir + 'mew.dat'
        std = self.gen_data_dir + 'std.dat'
        moments = [mew, std]
        # train a generative model if needed
        train_RNN = False
        if train_RNN:
            self.gen_ckpt_file = ODO.train_generator(self, moments)

        # Use a generative model to produce smiles using output of the
        # discriminative model as input
        print(
            'Generating SMILES from proposed vectors using RNN weights at {}'.
            format(self.gen_ckpt_file))
        # Both moment files are required; if either one is missing fall
        # back to recomputing the moments inside generate().
        if not (os.path.exists(mew) and os.path.exists(std)):
            print(
                'mew and std used to normalize generation input not found at {}, {}'
                .format(mew, std))
            moments = None
        generated_smis = ODO.generate(self,
                                      self.gen_data_dir,
                                      self.gen_ckpt_file,
                                      moments=moments)

        # convert smiles back into vectors to be tested by the
        # discriminative model
        get_latent_vecs(generated_smis, self.discrim_data_dir,
                        'output_vecs.csv')
        try:
            generated_vecs = pd.read_csv(self.discrim_data_dir +
                                         'output_vecs.csv',
                                         header=0,
                                         dtype=np.float64)
            generated_vecs = generated_vecs.reindex(
                columns=get_headings()).values
        except Exception as err:
            # Keep the underlying error as the cause so it is not lost.
            raise ValueError('Try deleting {}'.format(self.discrim_data_dir +
                                                      'output_vecs.csv')) from err

        # test if generating smiles close to generation vectors
        # ODO.test_vector_msd(self, generated_vecs)

        predict = ODO.predict_with_discrim(self, generated_vecs)
        np.savetxt('./a_{}_m_{}.dat'.format(self.target, self.mixing), predict)
        print('Average activity {} and std {}'.format(np.average(predict),
                                                      np.std(predict)))
        thresh_hold = 7.0
        thresh = [bool(x >= thresh_hold) for x in predict]
        print('Number of compunds created = {}'.format(len(thresh)))
        # Guard against division by zero when nothing was generated.
        percent_above = (100 * (thresh.count(True) / len(thresh))
                         if thresh else 0.0)
        print('Precent of compounds with activity above {} = {}'.format(
            thresh_hold, percent_above))
        for x in generated_smis:
            print(x)
Esempio n. 7
0
File: ODO.py Progetto: adw62/ODO
 def test_vector_msd(self, generated_vectors):
     """Print the mean squared deviation of each generated vector.

     The reference is the first row of
     ``self.discrim_data_dir + 'rounded.csv'`` with its columns ordered as
     ``get_headings()``. One value per entry of ``generated_vectors`` is
     collected into a list, which is printed.
     """
     frame = pd.read_csv(self.discrim_data_dir + 'rounded.csv', header=0)
     reference = frame.reindex(columns=get_headings()).values[0]

     deviations = []
     for candidate in generated_vectors:
         deviations.append(np.average((reference - candidate)**2))
     print(deviations)