Example 1
def main(args):
    """
    main function
    """

    model_config = json.load(open(args.model_config, 'r'))
    if args.use_cuda:
        paddle.set_device("gpu")
    else:
        paddle.set_device("cpu")

    encoder_model = ProteinEncoderModel(model_config, name='protein')
    model = ProteinModel(encoder_model, model_config)
    model.load_dict(paddle.load(args.predict_model))

    tokenizer = ProteinTokenizer()
    examples = []
    with codecs.open(args.predict_data) as f_read:
        for line in f_read:
            if len(line.strip()) == 0:
                continue
            examples.append(line.strip())

    example_ids = [tokenizer.gen_token_ids(example) for example in examples]
    max_seq_len = max([len(example_id) for example_id in example_ids]) 
    pos = [list(range(1, len(example_id) + 1)) for example_id in example_ids]
    pad_to_max_seq_len(example_ids, max_seq_len)
    pad_to_max_seq_len(pos, max_seq_len)

    texts = paddle.to_tensor(example_ids)
    pos = paddle.to_tensor(pos)
    pred = model(texts, pos)
    pred = pred.numpy()

    show_results(examples, pred, model_config['task'])
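
Example 1 calls pad_to_max_seq_len without showing it. A minimal sketch of such a helper, assuming it pads each list in place on the right and that 0 is an acceptable padding id (the real helper may use the tokenizer's dedicated pad token instead), could look like this:

def pad_to_max_seq_len(sequences, max_seq_len, pad_id=0):
    """Pad every list in `sequences` in place to length `max_seq_len`."""
    for seq in sequences:
        # Assumption: 0 works as a padding id for both token ids and positions.
        seq.extend([pad_id] * (max_seq_len - len(seq)))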
Example 2
class Pfam(object):
    """Class for pfam dataset.
    For more details, please check paper "Evaluating Protein Transfer Learning with TAPE".
    """
    def __init__(self):
        self.tokenizer = ProteinTokenizer()
        self.clear()

    def gen_sequence_data(self, data):
        """Genearte sequence data.
        """
        amino_acids = data['amino_acids']
        token_ids = self.tokenizer.gen_token_ids(amino_acids)
        return token_ids

    def append(self, data):
        """Append data.
        """
        token_ids = self.gen_sequence_data(data)
        self.token_ids.extend(token_ids)
        self.lengths.append(len(token_ids))

    def clear(self):
        """Clear data.
        """
        self.token_ids = []
        self.lengths = []

    def save_npz(self, filename):
        """Save data to npz format file.
        """
        np.savez(filename,
                 token_ids=np.array(self.token_ids, dtype='int8'),
                 lengths=np.array(self.lengths, dtype='int64'))
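
A short, hypothetical usage sketch for the Pfam class above; the sequences and the output file name are illustrative:

pfam = Pfam()
for seq in ['MKTAYIAKQRQISFVK', 'GSHMSLYDFAVQ']:   # toy amino-acid sequences
    pfam.append({'amino_acids': seq})
# Writes token_ids plus per-sequence lengths, so individual sequences can be recovered later.
pfam.save_npz('pfam_train.npz')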
Example 3
def load_pdbbind_dataset(data_path, featurizer):
    """tbd"""
    tokenizer = ProteinTokenizer()
    file = os.path.join(data_path, 'raw.txt')
    data_list = []
    with open(file, 'r') as f:
        for line in f:
            protein, smiles, affinity = line.strip().split(',')
            smiles = smiles.split()[0]
            affinity = float(affinity)

            data = {}
            mol_graph = featurizer.gen_features({'smiles': smiles})
            if mol_graph is None:
                continue
            data.update(mol_graph)
            data['protein_token_ids'] = np.array(
                tokenizer.gen_token_ids(protein))
            data['affinity'] = np.array([affinity])
            data_list.append(data)
        dataset = InMemoryDataset(data_list=data_list)
    return dataset
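
How load_pdbbind_dataset might be called, assuming raw.txt holds one "protein,smiles,affinity" record per line and that featurizer exposes the gen_features({'smiles': ...}) method used above; the path is illustrative:

dataset = load_pdbbind_dataset('./pdbbind_data', featurizer)
print(len(dataset))   # records with a valid molecular graph; each one carries
                      # the graph features plus protein_token_ids and affinity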
Example 4
def main(args):
    """main"""
    paddle.enable_static()

    model_config = json.load(open(args.model_config, 'r'))

    exe_params = default_exe_params(False, args.use_cuda, args.thread_num)
    exe = exe_params['exe']
    gpu_id = exe_params['gpu_id']
    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
    else:
        place = fluid.CPUPlace()

    task = model_config['task']

    model = TAPEModel(model_config=model_config, name=task)

    test_program = fluid.Program()
    test_startup = fluid.Program()
    with fluid.program_guard(test_program, test_startup):
        with fluid.unique_name.guard():
            model.forward(True)
            exe.run(test_startup)

    if args.init_model is not None and args.init_model != "":
        load_partial_params(exe, args.init_model, test_program)
    else:
        raise RuntimeError('Please set init_model.')

    tokenizer = ProteinTokenizer()
    test_fetch_list = model.get_fetch_list(is_inference=True)

    examples = []
    for line in sys.stdin:
        if len(line.strip()) == 0:
            continue
        examples.append(line.strip())

    for i in range(0, len(examples), args.batch_size):
        inputs = gen_batch_data(examples[i: min(len(examples), i + args.batch_size)], tokenizer, place)
        results = exe.run(
                program=test_program,
                feed=inputs,
                fetch_list=test_fetch_list,
                return_numpy=False)
        pred = np.array(results[0])
        print(pred)
        show_results(examples, pred, task)
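
Example 4 relies on a gen_batch_data helper that is not shown. A rough sketch of what it might do, assuming the static-graph model consumes token ids and positions as int64 LoD tensors and that the feed names 'token' and 'pos' match the model definition (both are assumptions):

def gen_batch_data(examples, tokenizer, place):
    """Hypothetical batch builder: tokenize sequences into a static-graph feed dict."""
    token_ids = [tokenizer.gen_token_ids(example) for example in examples]
    pos = [list(range(1, len(ids) + 1)) for ids in token_ids]
    lod = [[len(ids) for ids in token_ids]]          # per-sequence lengths

    def to_lod_tensor(lists):
        flat = np.concatenate(lists).reshape([-1, 1]).astype('int64')
        return fluid.create_lod_tensor(flat, lod, place)

    # Feed names 'token' and 'pos' are placeholders; they must match the model's inputs.
    return {'token': to_lod_tensor(token_ids), 'pos': to_lod_tensor(pos)}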
Example 5
class SecondaryStructure(object):
    """Class for second structure dataset.
    For more details, please check paper "Evaluating Protein Transfer Learning with TAPE".
    """
    def __init__(self):
        self.tokenizer = ProteinTokenizer()
        self.clear()

    def gen_sequence_data(self, data):
        """Genearte sequence data.
        """
        amino_acids = data['amino_acids']
        token_ids = self.tokenizer.gen_token_ids(amino_acids)
        labels3 = [0] + data['ss3'] + [0]
        labels8 = [0] + data['ss8'] + [0]
        return token_ids, labels3, labels8

    def append(self, data):
        """Append data.
        """
        token_ids, labels3, labels8 = self.gen_sequence_data(data)
        self.token_ids.extend(token_ids)
        self.labels3.extend(labels3)
        self.labels8.extend(labels8)
        self.lengths.append(len(token_ids))

    def clear(self):
        """Clear data.
        """
        self.token_ids = []
        self.labels3 = []
        self.labels8 = []
        self.lengths = []

    def save_npz(self, filename):
        """Save data to npz format file.
        """
        np.savez(filename,
                 token_ids=np.array(self.token_ids, dtype='int8'),
                 labels3=np.array(self.labels3, dtype='int8'),
                 labels8=np.array(self.labels8, dtype='int8'),
                 lengths=np.array(self.lengths, dtype='int64'))
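
A hypothetical record for the class above; the leading and trailing 0 added around ss3/ss8 in gen_sequence_data presumably align the labels with the begin/end tokens produced by gen_token_ids, so labels and token ids keep the same length:

ss = SecondaryStructure()
ss.append({'amino_acids': 'MKTAYIAKQR',                     # toy sequence
           'ss3': [1, 2, 1, 0, 2, 2, 1, 0, 1, 1],           # per-residue 3-class labels
           'ss8': [3, 5, 3, 0, 7, 7, 3, 0, 3, 3]})          # per-residue 8-class labels
ss.save_npz('secondary_structure_train.npz')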
Example 6
def __init__(self):
    self.tokenizer = ProteinTokenizer()
    self.clear()
Example 7
def main():
    """Entry for data preprocessing."""
    tokenizer = ProteinTokenizer()
    for dataset in ['davis', 'kiba']:
        data_dir = os.path.join(args.dataset_root, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        train_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt')))
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        test_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt')))
        ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')),
                            object_pairs_hook=OrderedDict)
        proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')),
                             object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # pylint: disable=E1123
        affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'),
                               encoding='latin1')

        smiles_lst, protein_lst = [], []
        for k in ligands.keys():
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]),
                                      isomericSmiles=True)
            smiles_lst.append(smiles)

        for k in proteins.keys():
            protein_lst.append(proteins[k])

        if dataset == 'davis':
            # Kd data
            affinity = [-np.log10(y / 1e9) for y in affinity]

        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))

            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = np.where(~np.isnan(affinity))
            rows, cols = rows[fold], cols[fold]

            data_lst = []
            for idx in range(len(rows)):
                mol = AllChem.MolFromSmiles(smiles_lst[rows[idx]])
                mol_graph = mol_to_graph_data(mol)
                data = {k: v for k, v in mol_graph.items()}

                seqs = []
                for seq in protein_lst[cols[idx]].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                af = affinity[rows[idx], cols[idx]]
                if dataset == 'davis':
                    data['Log10_Kd'] = np.array([af])
                elif dataset == 'kiba':
                    data['KIBA'] = np.array([af])

                data_lst.append(data)

            random.shuffle(data_lst)
            npz = os.path.join(split_dir, '{}_{}.npz'.format(dataset, split))
            save_data_list_to_npz(data_lst, npz)

        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
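
The -np.log10(y / 1e9) step above is the usual Davis preprocessing: Kd values come in nM, and dividing by 1e9 converts them to molar before taking the negative log10, i.e. pKd. For instance:

import numpy as np

kd_nm = 10.0                     # a dissociation constant of 10 nM
print(-np.log10(kd_nm / 1e9))    # 8.0, the pKd value used as the regression target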
Example 8
def load_davis_dataset(data_path, featurizer):
    """tbd"""
    tokenizer = ProteinTokenizer()
    for dataset in ['davis']:
        data_dir = os.path.join(data_path, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        train_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt')))
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        test_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt')))
        ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')),
                            object_pairs_hook=OrderedDict)
        proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')),
                             object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # pylint: disable=E1123
        affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'),
                               encoding='latin1')

        smiles_lst, protein_lst = [], []
        # print("keys :",ligands.keys())
        for k in ligands.keys():
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]),
                                      isomericSmiles=True)
            smiles_lst.append(smiles)

        for k in proteins.keys():
            protein_lst.append(proteins[k])

        if dataset == 'davis':
            # Kd data
            affinity = [-np.log10(y / 1e9) for y in affinity]

        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)
        train_test_dataset = []
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))

            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = np.where(~np.isnan(affinity))
            rows, cols = rows[fold], cols[fold]
            data_lst = []
            for idx in range(len(rows)):
                mol_graph = mol_to_md_graph_data(
                    Chem.MolFromSmiles(smiles_lst[rows[idx]]), add_3dpos=False)
                data = {k: v for k, v in mol_graph.items()}

                seqs = []
                for seq in protein_lst[cols[idx]].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                af = affinity[rows[idx], cols[idx]]
                if dataset == 'davis':
                    data['Log10_Kd'] = np.array([af])
                elif dataset == 'kiba':
                    data['KIBA'] = np.array([af])

                data_lst.append(data)

            # Shuffle the samples and return the whole split as an in-memory dataset.
            random.shuffle(data_lst)
            train_test_dataset.append(InMemoryDataset(data_lst))
        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
        return train_test_dataset
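
A hypothetical call of the loader above; it returns the train and test splits as InMemoryDataset objects (the featurizer argument is accepted but unused in this version, and the path is illustrative):

train_data, test_data = load_davis_dataset('./dta_datasets', featurizer=None)
print(len(train_data), len(test_data))   # sizes of the train and test splits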