Beispiel #1
0
def evaluate(bit_vector):
    """Return the score of the molecule encoded by ``bit_vector``.

    The bit vector is turned into a gene by ``BITtoGene``, the gene into
    grammar rules by ``opt.GenetoCFG``, and the rules into a SMILES string by
    ``cfg_util.decode``. The canonicalized SMILES is then passed to
    ``score_util.calc_score`` and that result is returned unchanged.
    """
    rules = opt.GenetoCFG(BITtoGene(bit_vector))
    canonical_smiles = opt.canonicalize(cfg_util.decode(rules))
    return score_util.calc_score(canonical_smiles)
    def Rollout(self):
        """Complete ``self.moves`` by beam search and score the results.

        The RNN state is reset and replayed over ``self.moves``. Up to 250
        expansion steps then grow a set of candidates, each step keeping the
        ``beam_width`` candidates with the highest accumulated log-likelihood.
        The surviving rule sequences are decoded with ``cfg_util.decode`` and
        scored in a process pool with ``rdock_util.score``.

        Side effects: sets ``self.moves_rollout``, ``self.rollout_scores`` and
        ``self.rollout_smiles``.

        Returns:
            A list of ``(score, smiles)`` pairs, one per surviving candidate.
        """
        # Replay the moves taken so far so the RNN state matches self.moves.
        self.rnn.reset_state()
        for m in self.moves:
            self.rnn.forward(np.array([m]).astype(np.int32))
        beam_width = 16  # must be bigger than 3!
        # Added inside the log below so masked-out rules give log(eps)
        # instead of log(0).
        eps = 1e-100
        lhs_list = zinc_grammar.lhs_list  # NOTE(review): not used below
        lhs_map = zinc_grammar.lhs_map
        initial_stack = self.stack
        initial_rule = self.moves
        # Each candidate is (model, stack, rules, log_likelihood).
        candidates = [(self.rnn, initial_stack, initial_rule, 0.0)]
        sequence_length = 250
        for t in range(sequence_length):
            next_candidates = []
            for previous_model, previous_stack, rules, log_likelihood in candidates:
                # An empty stack means nothing is left to expand: carry the
                # candidate forward unchanged (its model is no longer needed).
                if len(previous_stack) == 0:
                    next_candidates.append(
                        (None, previous_stack, rules, log_likelihood))
                    continue
                # Work on a copy so sibling candidates do not share RNN state.
                model = previous_model.copy()
                x = np.asarray([rules[-1]]).astype(np.int32)
                with chainer.using_config('train', False):
                    with chainer.no_backprop_mode():
                        unmasked_probability = model.forward(x).data[0]
                stack = copy.copy(previous_stack)
                # The popped stack entry selects the mask applied to the
                # RNN output.
                next_nonterminal = lhs_map[stack.pop()]
                mask = zinc_grammar.masks[next_nonterminal]
                masked_log_probability = np.log(unmasked_probability * mask +
                                                eps)
                # NOTE(review): this slice yields the beam_width - 1 highest
                # entries (best first), not beam_width — confirm intent.
                order = masked_log_probability.argsort()[:-beam_width:-1]
                for sampled_rule in order:
                    # Skip rules whose masked probability was zero
                    # (their log equals log(eps)).
                    if masked_log_probability[sampled_rule] > np.log(
                            eps) + eps:
                        # Keep only the nonterminals of the production's
                        # right-hand side, excluding the one named 'None'.
                        rhs = filter(
                            lambda a: (type(a) == nltk.grammar.Nonterminal) and
                            (str(a) != 'None'),
                            zinc_grammar.GCFG.productions()
                            [sampled_rule].rhs())
                        # Push them reversed so the leftmost one is popped
                        # first.
                        next_candidates.append(
                            (model, stack + list(map(str, rhs))[::-1],
                             rules + [sampled_rule], log_likelihood +
                             masked_log_probability[sampled_rule]))
            # Keep the beam_width most likely candidates.
            candidates = sorted(next_candidates,
                                key=lambda x: -x[3])[:beam_width]
            # Stop early once every candidate has an empty stack.
            if all([len(candidate[1]) == 0 for candidate in candidates]):
                break

        smiles = []
        self.moves_rollout = []
        for candidate in candidates:
            self.moves_rollout.append(candidate[2])
            smiles.append(cfg_util.decode(candidate[2]))
        # Score all decoded candidates in parallel, one worker per CPU.
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        scores = pool.map(rdock_util.score, smiles)
        pool.close()
        pool.terminate()
        self.rollout_scores = scores
        self.rollout_smiles = smiles
        return [(score, smiles) for score, smiles in zip(scores, smiles)]
Beispiel #3
0
 def GetResult(self):
     """Decode ``self.moves`` and score the resulting SMILES.

     Returns:
         A ``(score, smiles)`` tuple. The score is
         ``max(1.0, smiles_util.calc_score(smiles))``, or ``0.0`` when the
         scoring call raises.
     """
     smiles = cfg_util.decode(self.moves)
     try:
         score = max(1.0, smiles_util.calc_score(smiles))
     except Exception:
         # A scoring failure maps to the fallback score. Catching Exception
         # rather than using a bare except lets KeyboardInterrupt and
         # SystemExit propagate.
         score = 0.0
     return (score, smiles)
Beispiel #4
0
def save_log(population):
    """Interactively save the final population to disk.

    Asks on stdin whether to save; the answer ``'n'`` returns immediately and
    any other answer saves. The user is then asked for a directory, into
    which the following are written:

    * ``<name>.p``: pickle of the unique, valid canonical SMILES, where
      ``<name>`` is the last component of the directory path;
    * ``seed.txt``: the module-level ``time`` value followed by a
      tab-separated ``smile``/``score`` table;
    * ``<i>.png``: one image per molecule, plus ``final.png`` assembled by the
      external ``montage`` command.

    Args:
        population: iterable of bit vectors accepted by ``BITtoGene``.
    """
    import shlex

    save = input(
        "Save logs and image of final population ? (press 'y' or 'n') : ")
    if save == 'n':
        return

    directory = input("Please input log file name (or directory) : ")

    # Create the folder directly instead of building a shell command from
    # user input.
    os.makedirs(directory, exist_ok=True)
    # Use only the last path component so nested directories such as
    # "logs/run1" produce "logs/run1/run1.p".
    file_name = os.path.basename(os.path.normpath(directory))

    # Drop duplicates and SMILES that do not parse before storing them.
    ms = []
    smile_list = []
    seen = set()
    for bit_vector in population:
        gene = BITtoGene(bit_vector)
        smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
        if not smile or smile in seen:
            continue
        mol = MolFromSmiles(smile)
        if mol is not None:
            seen.add(smile)
            smile_list.append(smile)
            ms.append(mol)

    # Store the final population as a pickle.
    with open(directory + '/' + file_name + ".p", 'wb') as f:
        pickle.dump(smile_list, f)

    # Store the random seed of this experiment, then the final population
    # with its scores, in the same text file.
    with open(directory + '/' + 'seed.txt', 'w') as f:
        f.write(str(time) + '\n')
        f.write('smile' + '\t' + 'score' + '\n')
        for smile in smile_list:
            score = score_util.calc_score(smile)
            f.write(smile + '\t' + str(score) + '\n')

    # Save one image per molecule.
    for i, mol in enumerate(ms):
        Draw.MolToFile(mol,
                       directory + '/' + str(i) + '.png',
                       size=(120, 120))

    # Combine all molecule images into one. The shell is still needed for the
    # glob, so the directory is quoted to keep it a single safe token.
    quoted_directory = shlex.quote(directory)
    os.system('montage ' + quoted_directory + '/*.png ' + quoted_directory +
              '/final.png')
Beispiel #5
0
def main():
    """Sample rule sequences from the RNN and count valid SMILES.

    Loads ``model-9.npz`` into an ``RNN``, then runs 10000 trials. Each trial
    starts from rule 0 and samples 280 further rules from the RNN's output
    distribution. Decoded strings accepted by ``is_valid_smiles`` are echoed
    to stderr, and every 100th trial prints ``trial,valid,unique`` to stdout.
    """
    rule_size = len(zinc_grammar.GCFG.productions())
    rnn = RNN(rule_size=rule_size)
    serializers.load_npz("model-9.npz", rnn)

    valid_smiles = []
    for trial in range(10000):
        rnn.reset_state()
        rules_sampled = [0]
        with chainer.no_backprop_mode():
            for _ in range(280):
                # Feed the most recent rule back in as a length-1 int32 array.
                rule_prev = np.array(rules_sampled[-1:]).astype(np.int32)
                prob = rnn.get_probability(rule_prev).data[0]
                rules_sampled.append(np.random.choice(rule_size, p=prob))

        smiles = cfg_util.decode(rules_sampled)
        if is_valid_smiles(smiles):
            valid_smiles.append(smiles)
            print(smiles, file=sys.stderr)

        if trial % 100 == 0:
            n_valid = len(valid_smiles)
            n_unique = len(set(valid_smiles))
            print("{},{},{}".format(trial, n_valid, n_unique))
def main():
    """Sample grammar-masked rule sequences and count valid SMILES.

    Like plain sampling from the RNN, except that each step's distribution is
    multiplied by the mask selected by the symbol popped from a stack, then
    renormalized. The stack starts as ``['chain']``; once it is empty the
    placeholder ``'Nothing'`` is used for the mask lookup. 10000 trials of 280
    steps are run; valid SMILES go to stderr and every 100th trial prints
    ``trial,valid,unique`` to stdout.
    """
    productions = zinc_grammar.GCFG.productions()
    rnn = RNN(rule_size=len(productions))
    serializers.load_npz("model-9.npz", rnn)
    rule_size = len(productions)

    valid_smiles = []
    for trial in range(10000):
        print(trial, file=sys.stderr)
        rnn.reset_state()
        rules_sampled = [0]
        stack = ['chain']
        for _ in range(280):
            rule_prev = np.array(rules_sampled[-1:]).astype(np.int32)
            with chainer.no_backprop_mode():
                unmasked_prob = rnn.get_probability(rule_prev).data[0]

            symbol = stack.pop() if len(stack) > 0 else 'Nothing'
            mask = zinc_grammar.masks[zinc_grammar.lhs_map[symbol]]

            # Zero out rules the mask excludes and renormalize.
            weighted = unmasked_prob * mask
            masked_prob = weighted / np.sum(weighted)
            rule_sampled = np.random.choice(rule_size, p=masked_prob)

            # Collect the nonterminals of the chosen production (except the
            # one named 'None') and push them reversed, so the leftmost one
            # is popped first.
            pending = [
                str(sym) for sym in productions[rule_sampled].rhs()
                if (type(sym) == nltk.grammar.Nonterminal) and
                (str(sym) != 'None')
            ]
            stack.extend(pending[::-1])
            rules_sampled.append(rule_sampled)

        smiles = cfg_util.decode(rules_sampled)
        if is_valid_smiles(smiles):
            valid_smiles.append(smiles)
            print(smiles, file=sys.stderr)
        if trial % 100 == 0:
            n_valid = len(valid_smiles)
            n_unique = len(set(valid_smiles))
            print("{},{},{}".format(trial, n_valid, n_unique))
 def __init__(self, rules):
     """Store ``rules``, decode them, and compute subtree sizes.

     ``self.smiles`` is the first element of ``cfg_util.decode(rules)``.
     ``self.subtreesize`` holds one entry per rule, initialised to -1 before
     ``__calc_subtreesize(0)`` is called.
     """
     self.rules = list(rules)
     decoded = cfg_util.decode(rules)
     self.smiles = decoded[0]
     self.subtreesize = [-1] * len(self.rules)
     self.__calc_subtreesize(0)
Beispiel #8
0
def main():
    """Run a (mu + lambda) evolutionary search scored by ``rdock_util``.

    Command-line options:
        --smifile     file with one seed SMILES per line
        --seed        seed for ``np.random``
        --mu          number of individuals kept after each selection
        --lam         number of mutation attempts per generation
        --generation  number of generations to run

    ``mu + lam`` seed SMILES are drawn, encoded to genes and scored with
    ``rdock_util.score_qsub``. Each generation mutates randomly chosen
    parents, scores the children not seen before, and keeps the ``mu``
    individuals with the lowest score. Progress lines start with ``%``; the
    full list of scored SMILES is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--smifile', default='250k_rndm_zinc_drugs_clean.smi')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--mu', type=int, default=32)
    parser.add_argument('--lam', type=int, default=64)
    parser.add_argument('--generation', type=int, default=1000)
    args = parser.parse_args()

    np.random.seed(args.seed)

    gene_length = 300

    N_mu = args.mu
    N_lambda = args.lam

    # initialize population
    with open(args.smifile) as f:
        seed_smiles = [line.rstrip() for line in f]

    start_time = time.time()

    initial_smiles = list(np.random.choice(seed_smiles, N_mu + N_lambda))
    initial_genes = [CFGtoGene(cfg_util.encode(s), max_len=gene_length)
                     for s in initial_smiles]
    initial_scores = rdock_util.score_qsub(initial_smiles)

    # Individuals are (score, smiles, gene) tuples.
    population = [(score, smiles, gene)
                  for score, gene, smiles in zip(initial_scores, initial_genes,
                                                 initial_smiles)]

    # Ascending sort: the N_mu lowest scores survive.
    population = sorted(population, key=lambda x: x[0])[:N_mu]

    all_smiles = [canonicalize(p[1]) for p in population]
    all_result = [(p[0], s) for p, s in zip(population, all_smiles)]
    # Membership is tested for every child, so keep a set instead of scanning
    # an ever-growing list.
    seen_smiles = set(all_smiles)

    # NOTE(review): selection keeps the lowest scores but the progress line
    # reports np.max of the survivors — confirm which is intended.
    scores = [p[0] for p in population]
    max_score = np.max(scores)
    elapsed_time = time.time() - start_time
    print("%{},{},{}".format(0, max_score, elapsed_time))
    for p in population:
        print("{},{}".format(p[0], p[1]))

    for generation in range(args.generation):
        new_population_smiles = []
        new_population_genes = []
        for _ in range(N_lambda):
            p = population[np.random.randint(len(population))]
            p_gene = p[2]
            c_gene = mutation(p_gene)

            c_smiles = canonicalize(cfg_util.decode(GenetoCFG(c_gene)))
            # Only keep children that decode to something not seen before.
            if c_smiles != '' and c_smiles not in seen_smiles:
                new_population_smiles.append(c_smiles)
                new_population_genes.append(c_gene)
                seen_smiles.add(c_smiles)

        new_population_scores = rdock_util.score_qsub(new_population_smiles)
        for score, gene, smiles in zip(new_population_scores,
                                       new_population_genes,
                                       new_population_smiles):
            population.append((score, smiles, gene))
            all_result.append((score, smiles))
        population = sorted(population, key=lambda x: x[0])[:N_mu]
        scores = [i[0] for i in population]
        max_score = np.max(scores)
        elapsed_time = time.time() - start_time
        print("%{},{},{}".format(generation + 1, max_score, elapsed_time))
        for p in population:
            print("{},{}".format(p[0], p[1]))

    print("list of generated smiles:")
    for r in all_result:
        print("{},{}".format(r[0], r[1]))
Beispiel #9
0
def main(Pipes, island_id, nb_of_island, mig_interval, logn=-1):
    """Run one island of an island-model (mu + lambda) search.

    Args:
        Pipes: passed through to ``migration``.
        island_id: identifier of this island; used for the random seed (when
            ``logn == -1``) and in the output file name.
        nb_of_island: number of islands; the population sizes are
            ``1000 / nb_of_island`` and ``2000 / nb_of_island``.
        mig_interval: a migration is triggered at every generation that is a
            positive multiple of this value. A value <= 0 disables migration.
        logn: ``-1`` seeds ``np.random`` with ``island_id``; any other value
            seeds from the clock and is appended to the output file name.

    The loop stops after 8 hours of wall-clock time and the final population
    is written to a CSV file in the current directory. The globals
    ``best_smiles``, ``best_score`` and ``all_smiles`` are updated as the
    search runs.
    """
    smifile = '250k_rndm_zinc_drugs_clean.smi'
    if logn == -1:
        np.random.seed(0 + island_id)
    else:
        np.random.seed(int(t.time()))
    global best_smiles
    global best_score
    global all_smiles

    gene_length = 300

    N_mu = int(1000 / nb_of_island)
    N_lambda = int(2000 / nb_of_island)

    # initialize population
    seed_smiles = []
    with open(smifile) as f:
        for line in f:
            smiles = line.rstrip()
            seed_smiles.append(smiles)

    initial_smiles = np.random.choice(seed_smiles, N_mu + N_lambda)
    initial_smiles = [canonicalize(s) for s in initial_smiles]
    initial_genes = [
        CFGtoGene(cfg_util.encode(s), max_len=gene_length)
        for s in initial_smiles
    ]
    initial_scores = [score_util.calc_score(s) for s in initial_smiles]
    # Individuals are (score, smiles, gene) tuples.
    population = []
    for score, gene, smiles in zip(initial_scores, initial_genes,
                                   initial_smiles):
        population.append((score, smiles, gene))

    # Descending sort: the N_mu highest scores survive.
    population = sorted(population, key=lambda x: x[0], reverse=True)[:N_mu]

    th = threading.Timer(60, current_best, [])
    th.start()
    print("Start!")
    all_smiles = [p[1] for p in population]
    k = 1  # First migration
    t0 = t.time()
    for generation in range(1000000000):
        scores = [p[0] for p in population]
        mean_score = np.mean(scores)
        min_score = np.min(scores)
        std_score = np.std(scores)
        best_score = np.max(scores)
        idx = np.argmax(scores)
        best_smiles = population[idx][1]
        print("%{},{},{},{},{}".format(generation, best_score, mean_score,
                                       min_score, std_score))

        new_population = []
        for _ in range(N_lambda):
            p = population[np.random.randint(len(population))]
            p_gene = p[2]
            c_gene = mutation(p_gene)

            c_smiles = canonicalize(cfg_util.decode(GenetoCFG(c_gene)))
            if c_smiles not in all_smiles:
                c_score = score_util.calc_score(c_smiles)
                c = (c_score, c_smiles, c_gene)
                new_population.append(c)
                all_smiles.append(c_smiles)

        population.extend(new_population)
        population = sorted(population, key=lambda x: x[0],
                            reverse=True)[:N_mu]

        # Migrate at every positive multiple of mig_interval. A modulo test
        # replaces a precomputed list of all such generations, which could
        # hold up to 1e9 integers and was scanned linearly each generation.
        if (mig_interval > 0 and generation >= mig_interval
                and generation % mig_interval == 0):
            print('Starting Migration')
            # k is passed to migration() and wraps back to 1 once it reaches
            # nb_of_island.
            if k >= nb_of_island:
                k = 1
            population = migration(Pipes, island_id, nb_of_island, population,
                                   k)
            k += 1
        # Stop after 8 hours of wall-clock time.
        if t.time() - t0 >= 3600 * 8:
            break

    file_name = (str(island_id) + '_final_pop' + '_' + str(nb_of_island) +
                 '_' + str(mig_interval))
    if logn != -1:
        file_name += '_' + str(logn)
    # The with-block closes the file even if the CSV export fails.
    with open(file_name + '.csv', 'w') as f:
        pd.DataFrame(population).to_csv(f)
Beispiel #10
0
def main():
    """Evolve a probability vector over bit strings and return the best one.

    Every generation samples ``population_size`` bit vectors from the
    probability vector ``P``, evaluates them with ``evaluate``, updates ``P``
    from the scored vectors and then randomly mutates entries of ``P``. The
    loop ends when ``convergence(P)`` returns ``True`` or after 8 hours.
    Afterwards ``save_log`` is called with the last population.

    Side effects: sets the module-level ``time`` to the start timestamp and
    seeds ``random`` with it; prints progress to stdout.

    Returns:
        The best bit vector of the last generation, or ``None`` if no vector
        in that generation scored above ``-1e10``.
    """
    global time
    time = t.time()
    print(time)
    random.seed(time)

    max_generation = 1000000  # alternative stopping criterion, currently unused
    max_time = 8 * 3600  # 8 hours
    population_size = 100
    bit_vector_size = 2400  # Maximum length of vectors in the population (should be a multiple of 8)

    P = [0.5 for _ in range(0, bit_vector_size)]  # Probability vector
    LR = 0.1  # Learning Rate (typically 0.1–0.4)
    MS = 0.05  # Degree of mutation (typical value is 0.05)
    Pr_mutation = 0.08  # Probability of mutation (typically 0.02)

    mu = 2  # Number of vector used to make P evolve

    k = 0
    duration = 0
    converge = False
    best_fitness = -1e11
    best_bit_vector = None
    while converge is not True and duration < max_time:  # k < max_generation or duration < max_time depending on what you want

        population = []
        score_smile = []
        best_bit_vector = None
        best_fitness = -1e10
        for i in range(0, population_size):
            # Create a new vector which represents an individual
            bit_vector = generate_bit_vector(P)
            population.append(bit_vector)
            fitness = evaluate(bit_vector)
            # Only vectors scoring above -1e10 take part in the update.
            if fitness > -1e10:
                score_smile.append([fitness, bit_vector])
            if fitness > best_fitness:  # /!\ '<' and '>'
                # Update the best individual (i.e. max fitness)
                best_fitness = fitness
                best_bit_vector = bit_vector

                gene = BITtoGene(best_bit_vector)
                smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
                print(best_fitness, '=', smile)
        try:
            # Evolution
            # The ranking does not depend on the bit index, so sort (and
            # print) once per generation rather than once per bit.
            score_smile = sorted(
                score_smile, key=lambda x: x[0],
                reverse=False)  # The best smile is at the end of the list
            print(score_smile)
            N = min(mu, len(score_smile))
            # NOTE(review): this uses the first N entries of the ascending
            # list, i.e. the lowest-scoring vectors — confirm intent.
            for j in range(0, len(P)):
                X = 0
                for i in range(N):
                    X += (i + 1) * (score_smile[i][1][j] - P[j])

                P[j] = P[j] + LR / (
                    P[j] *
                    (1 - P[j])) * X  # Information Geometric implementation

            # Mutation
            for j in range(0, len(P)):
                if random.random() < Pr_mutation:
                    P[j] = P[j] * (1 - MS) + random.randint(0, 1) * MS
        except Exception:
            # Best-effort update: on any error P keeps whatever was already
            # updated this generation. Exception (not a bare except) lets
            # Ctrl-C and SystemExit stop the run.
            print('No valid SMILE generated : pass')
        converge = convergence(P)
        k += 1
        duration = t.time() - time
        print(k, ' time : ', duration, ' s')

    # best_bit_vector stays None when no vector of the last generation scored
    # above -1e10; skip decoding in that case.
    if best_bit_vector is not None:
        gene = BITtoGene(best_bit_vector)
        smile = opt.canonicalize(cfg_util.decode(opt.GenetoCFG(gene)))
        print(smile)

    save_log(population)

    return best_bit_vector
Beispiel #11
0
def main():
    """Run a single-population (mu + lambda) search scored by ``score_util``.

    Command-line options:
        --smifile  file with one seed SMILES per line
        --seed     seed for ``np.random``

    300 seed SMILES are drawn, canonicalized, encoded and scored; the best
    100 form the population. Each generation makes 200 mutation attempts,
    scores the children whose SMILES has not been seen, and keeps the 100
    highest-scoring individuals. A ``threading.Timer`` calls ``current_best``
    after 60 seconds. The globals ``best_smiles``, ``best_score`` and
    ``all_smiles`` are updated as the search runs.
    """
    global best_smiles, best_score, all_smiles

    parser = argparse.ArgumentParser()
    parser.add_argument('--smifile', default='250k_rndm_zinc_drugs_clean.smi')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    np.random.seed(args.seed)

    gene_length = 300
    N_mu = 100
    N_lambda = 200

    # Read the seed molecules, one SMILES per line.
    with open(args.smifile) as f:
        seed_smiles = [line.rstrip() for line in f]

    drawn = np.random.choice(seed_smiles, N_mu + N_lambda)
    initial_smiles = [canonicalize(s) for s in drawn]
    initial_genes = [
        CFGtoGene(cfg_util.encode(s), max_len=gene_length)
        for s in initial_smiles
    ]
    initial_scores = [score_util.calc_score(s) for s in initial_smiles]

    # Individuals are (score, smiles, gene) tuples; keep the N_mu best.
    population = [
        (score, smiles, gene)
        for score, gene, smiles in zip(initial_scores, initial_genes,
                                       initial_smiles)
    ]
    population.sort(key=lambda ind: ind[0], reverse=True)
    population = population[:N_mu]

    timer = threading.Timer(60, current_best, [])
    timer.start()
    print("Start!")
    all_smiles = [ind[1] for ind in population]
    for generation in range(1000000000):
        scores = [ind[0] for ind in population]
        mean_score = np.mean(scores)
        min_score = np.min(scores)
        std_score = np.std(scores)
        best_score = np.max(scores)
        best_smiles = population[np.argmax(scores)][1]
        print("%{},{},{},{},{}".format(generation, best_score, mean_score,
                                       min_score, std_score))

        offspring = []
        for _ in range(N_lambda):
            parent = population[np.random.randint(len(population))]
            c_gene = mutation(parent[2])
            c_smiles = canonicalize(cfg_util.decode(GenetoCFG(c_gene)))
            # Skip molecules that were already generated once.
            if c_smiles in all_smiles:
                continue
            c_score = score_util.calc_score(c_smiles)
            offspring.append((c_score, c_smiles, c_gene))
            all_smiles.append(c_smiles)

        # Parents and children compete; the N_mu highest scores survive.
        population = sorted(population + offspring,
                            key=lambda ind: ind[0],
                            reverse=True)[:N_mu]
def main():
    # Evolutionary search with one-point crossover and point mutation, scored
    # by rdock_util.score. Individuals are (score, gene) tuples and the
    # ascending sort below keeps the lowest scores.
    population = []
    rules = np.load("rules.npz")['arr_0']
    initial_rules = np.random.choice(rules, 100)
    initial_genes = [CFGtoGene(rule, max_len=288) for rule in initial_rules]
    initial_scores = []
    # Score the initial genes in batches of cpu_count, using a fresh pool
    # for each batch.
    for i in range(0, len(initial_genes), multiprocessing.cpu_count()):
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        initial_scores.extend(
            pool.map(rdock_util.score, [
                cfg_util.decode(GenetoCFG(gene))[0]
                for gene in initial_genes[i:i + multiprocessing.cpu_count()]
            ]))
        pool.close()
        pool.terminate()
    for s, m in zip(initial_scores, initial_genes):
        population.append((s, m))

    trial = 0  # NOTE(review): not used in the visible part of this function
    valid_smiles = []  # NOTE(review): not used in the visible part either
    scores = []
    all_smiles = []
    # current_best is called with the shared `scores` list after 60 seconds.
    t = threading.Timer(60, current_best, [scores])
    t.start()
    for generation in range(100):
        print("generation", generation)
        # Keep the 100 individuals with the lowest score.
        population = sorted(population, key=lambda x: x[0])[:100]
        for s, g in population:
            print(s, cfg_util.decode(GenetoCFG(g))[0])
        cpu_count = multiprocessing.cpu_count()
        # crossover
        # Fill 80% of the cpu_count-sized batch with new, valid children.
        children_smiles = []
        children_genes = []
        while len(children_smiles) < cpu_count * 0.8:
            idx1, idx2 = np.random.choice(len(population), size=2)
            score1, gene1 = population[idx1]
            score2, gene2 = population[idx2]
            # One-point crossover: head of gene1 joined to tail of gene2.
            cut_point = np.random.choice(len(gene1))
            gene_child = gene1[:cut_point] + gene2[cut_point:]
            smiles_child = cfg_util.decode(GenetoCFG(gene_child))[0]
            if is_valid_smiles(
                    smiles_child) and smiles_child not in all_smiles:
                children_smiles.append(smiles_child)
                children_genes.append(gene_child)
                all_smiles.append(smiles_child)
        # mutation
        # Fill the rest of the batch with single-position mutants.
        while len(children_smiles) < cpu_count:
            idx = np.random.choice(len(population))
            score, gene = population[idx]
            mutation_idx = np.random.choice(len(gene))
            gene_mutant = copy.deepcopy(gene)
            # NOTE(review): 80 is presumably the number of allowed gene
            # values — confirm against the grammar.
            gene_mutant[mutation_idx] = np.random.choice(80)
            smiles_mutant = cfg_util.decode(GenetoCFG(gene_mutant))[0]
            if is_valid_smiles(
                    smiles_mutant) and smiles_mutant not in all_smiles:
                children_smiles.append(smiles_mutant)
                children_genes.append(gene_mutant)
                all_smiles.append(smiles_mutant)

        # Score the whole batch in parallel.
        pool = multiprocessing.Pool(cpu_count)
        scores_child = pool.map(rdock_util.score, children_smiles)
        pool.close()
        pool.terminate()
        scores.extend(scores_child)
        assert (len(scores_child) == len(children_genes))
        for s, g in zip(scores_child, children_genes):
            if (s, g) not in population:
                population.append((s, g))