import torch

from irelease.data import GeneratorData
from irelease.utils import get_default_tokens

# `use_cuda` is referenced as a module-level flag in the original script;
# defining it here makes the snippet self-contained.
use_cuda = torch.cuda.is_available()


def data_provider(k, flags):
    tokens = get_default_tokens()
    demo_data = GeneratorData(training_data_path=flags.demo_file,
                              delimiter='\t',
                              cols_to_read=[0],
                              keep_header=True,
                              pad_symbol=' ',
                              max_len=120,
                              tokens=tokens,
                              use_cuda=use_cuda)
    unbiased_data = GeneratorData(training_data_path=flags.unbiased_file,
                                  delimiter='\t',
                                  cols_to_read=[0],
                                  keep_header=True,
                                  pad_symbol=' ',
                                  max_len=120,
                                  tokens=tokens,
                                  use_cuda=use_cuda)
    prior_data = GeneratorData(training_data_path=flags.prior_data,
                               delimiter='\t',
                               cols_to_read=[0],
                               keep_header=True,
                               pad_symbol=' ',
                               max_len=120,
                               tokens=tokens,
                               use_cuda=use_cuda)
    return {
        'demo_data': demo_data,
        'unbiased_data': unbiased_data,
        'prior_data': prior_data
    }
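A minimal sketch of how this provider might be invoked. The flag names mirror the attributes read above, but the paths, the value of k (seemingly a fold index; the body does not use it), and the Namespace wrapper are placeholder assumptions:

from argparse import Namespace

flags = Namespace(demo_file='data/demo.smi',
                  unbiased_file='data/unbiased.smi',
                  prior_data='data/prior.smi')
providers = data_provider(k=0, flags=flags)
demo_data = providers['demo_data']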
Example #2
# Imports and the module-level `use_cuda` flag as in Example #1.
def data_provider(k, flags):
    tokens = get_default_tokens()
    gen_data = GeneratorData(training_data_path=flags.data_file,
                             delimiter='\t',
                             cols_to_read=[0],
                             keep_header=True,
                             pad_symbol=' ',
                             max_len=120,
                             tokens=tokens,
                             use_cuda=use_cuda)
    # The same GeneratorData instance backs all three splits.
    return {"train": gen_data, "val": gen_data, "test": gen_data}
Example #3

    # Snippet truncated above: the omitted lines import os and trange (from
    # tqdm), build `parser`, and add the --svc, --data and --threshold
    # arguments used below. The opening of the --save_dir argument is
    # reconstructed from the surviving help text and args.save_dir.
    parser.add_argument('--save_dir',
                        type=str,
                        help='The directory to save the created dataset')
    parser.add_argument('--filename',
                        type=str,
                        default='drd2_active.smi',
                        help='The filename for the created dataset')
    args = parser.parse_args()

    assert os.path.exists(args.svc)
    assert os.path.exists(args.data)
    assert 0 < args.threshold < 1

    # Load file containing SMILES
    gen_data = GeneratorData(training_data_path=args.data,
                             delimiter='\t',
                             cols_to_read=[0],
                             keep_header=True,
                             pad_symbol=' ',
                             max_len=120,
                             tokens=get_default_tokens(),
                             use_cuda=False)

    # Load classifier
    clf = DRD2Model(args.svc)

    # Screen SMILES in data file and write active compounds to file.
    os.makedirs(args.save_dir, exist_ok=True)
    num_active = 0
    with open(os.path.join(args.save_dir, args.filename), 'w') as f:
        for i in trange(gen_data.file_len, desc='Screening compounds...'):
            # Strip the '<'/'>' start and end tokens GeneratorData wraps
            # around each SMILES string.
            smiles = gen_data.file[i][1:-1]
            p = clf(smiles)
            if p >= args.threshold:
                # Reconstructed continuation (the snippet was cut off here):
                # keep the compound predicted to be active.
                f.write(smiles + '\n')
                num_active += 1
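For a quick standalone sanity check of the classifier, a sketch along these lines should work; the pickle path is a placeholder, and DRD2Model is assumed (as the threshold check above implies) to map a SMILES string to an activity probability in (0, 1):

clf = DRD2Model('models/drd2_svc.pkl')   # placeholder path to the saved SVC
p = clf('CC(=O)Oc1ccccc1C(=O)O')         # aspirin, an arbitrary test molecule
print(f'P(DRD2 active) = {p:.3f}')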
Example #4
import unittest

from irelease.data import GeneratorData
from irelease.env import MoleculeEnv
from irelease.model import Encoder, PositionalEncoding, StackDecoderLayer, LinearOut, StackRNN, RNNLinearOut, RewardNetRNN
from irelease.reward import RewardFunction
from irelease.rl import PolicyAgent, MolEnvProbabilityActionSelector, REINFORCE, GuidedRewardLearningIRL, \
    StateActionProbRegistry
from irelease.stackrnn import StackRNNCell
from irelease.utils import init_hidden, init_stack, get_default_tokens, init_hidden_2d, init_stack_2d, init_cell, seq2tensor

gen_data_path = '../data/chembl_xsmall.smi'
tokens = get_default_tokens()
# print(f'Number of tokens = {len(tokens)}')
gen_data = GeneratorData(training_data_path=gen_data_path,
                         delimiter='\t',
                         cols_to_read=[0],
                         keep_header=True,
                         tokens=tokens,
                         tokens_reload=True)

bz = 32


class MyTestCase(unittest.TestCase):
    def test_batch(self):
        batch = gen_data.random_training_set(batch_size=bz)
        assert len(batch[0]) == bz and len(batch[1]) == bz

    def test_embeddings(self):
        x, y = gen_data.random_training_set(batch_size=bz)
        encoder = Encoder(gen_data.n_characters, 128,
                          gen_data.char2idx[gen_data.pad_symbol])
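The snippet is cut off mid-test here; in the full test module the file would end with the standard unittest entry point:

if __name__ == '__main__':
    unittest.main()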