def main(args): """ main function """ model_config = json.load(open(args.model_config, 'r')) if args.use_cuda: paddle.set_device("gpu") else: paddle.set_device("cpu") encoder_model = ProteinEncoderModel(model_config, name='protein') model = ProteinModel(encoder_model, model_config) model.load_dict(paddle.load(args.predict_model)) tokenizer = ProteinTokenizer() examples = [] with codecs.open(args.predict_data) as f_read: for line in f_read: if len(line.strip()) == 0: continue examples.append(line.strip()) example_ids = [tokenizer.gen_token_ids(example) for example in examples] max_seq_len = max([len(example_id) for example_id in example_ids]) pos = [list(range(1, len(example_id) + 1)) for example_id in example_ids] pad_to_max_seq_len(example_ids, max_seq_len) pad_to_max_seq_len(pos, max_seq_len) texts = paddle.to_tensor(example_ids) pos = paddle.to_tensor(pos) pred = model(texts, pos) pred = pred.numpy() show_results(examples, pred, model_config['task'])
class Pfam(object):
    """Class for the Pfam dataset.

    For more details, please check the paper "Evaluating Protein Transfer
    Learning with TAPE".
    """

    def __init__(self):
        self.tokenizer = ProteinTokenizer()
        self.clear()

    def gen_sequence_data(self, data):
        """Generate sequence data (token ids) from the raw amino-acid string."""
        amino_acids = data['amino_acids']
        token_ids = self.tokenizer.gen_token_ids(amino_acids)
        return token_ids

    def append(self, data):
        """Append one example to the buffered dataset."""
        token_ids = self.gen_sequence_data(data)
        self.token_ids.extend(token_ids)
        self.lengths.append(len(token_ids))

    def clear(self):
        """Clear the buffered data."""
        self.token_ids = []
        self.lengths = []

    def save_npz(self, filename):
        """Save the buffered data to an npz file."""
        np.savez(filename,
                 token_ids=np.array(self.token_ids, dtype='int8'),
                 lengths=np.array(self.lengths, dtype='int64'))
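# A minimal usage sketch for the Pfam builder above. The sequences and the output
# filename are illustrative toy values, not taken from the original file:
def _build_pfam_example_npz():
    pfam = Pfam()
    for seq in ['MKTAYIAKQR', 'GSHMKV']:   # toy amino-acid strings
        pfam.append({'amino_acids': seq})
    pfam.save_npz('pfam_example.npz')       # writes token_ids (int8) and lengths (int64)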
def load_pdbbind_dataset(data_path, featurizer):
    """Load the PDBbind dataset from `data_path`/raw.txt.

    Each line is expected to contain `protein_sequence,smiles,affinity`.
    """
    tokenizer = ProteinTokenizer()
    file = os.path.join(data_path, 'raw.txt')
    data_list = []
    with open(file, 'r') as f:
        for line in f:
            protein, smiles, affinity = line.strip().split(',')
            smiles = smiles.split()[0]
            affinity = float(affinity)

            data = {}
            mol_graph = featurizer.gen_features({'smiles': smiles})
            if mol_graph is None:
                continue
            data.update(mol_graph)
            data['protein_token_ids'] = np.array(
                tokenizer.gen_token_ids(protein))
            data['affinity'] = np.array([affinity])
            data_list.append(data)

    dataset = InMemoryDataset(data_list=data_list)
    return dataset
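# Usage sketch for load_pdbbind_dataset. The featurizer only needs to expose
# gen_features({'smiles': ...}) returning a dict of graph arrays (or None to skip
# the entry). The directory and featurizer object below are illustrative
# assumptions, not taken from the original file:
#
#   dataset = load_pdbbind_dataset('./pdbbind_data', my_featurizer)
#   print(len(dataset), dataset[0]['affinity'])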
def main(args): """main""" paddle.enable_static() model_config = json.load(open(args.model_config, 'r')) exe_params = default_exe_params(False, args.use_cuda, args.thread_num) exe = exe_params['exe'] gpu_id = exe_params['gpu_id'] if args.use_cuda: place = fluid.CUDAPlace(gpu_id) else: place = fluid.CPUPlace() task = model_config['task'] model = TAPEModel(model_config=model_config, name=task) test_program = fluid.Program() test_startup = fluid.Program() with fluid.program_guard(test_program, test_startup): with fluid.unique_name.guard(): model.forward(True) exe.run(test_startup) if not args.init_model is None and args.init_model != "": load_partial_params(exe, args.init_model, test_program) else: raise RuntimeError('Please set init_model.') tokenizer = ProteinTokenizer() test_fetch_list = model.get_fetch_list(is_inference=True) examples = [] for line in sys.stdin: if len(line.strip()) == 0: continue examples.append(line.strip()) for i in range(0, len(examples), args.batch_size): inputs = gen_batch_data(examples[i: min(len(examples), i + args.batch_size)], tokenizer, place) results = exe.run( program=test_program, feed=inputs, fetch_list=test_fetch_list, return_numpy=False) pred = np.array(results[0]) print(pred) show_results(examples, pred, task)
class SecondaryStructure(object):
    """Class for the secondary structure dataset.

    For more details, please check the paper "Evaluating Protein Transfer
    Learning with TAPE".
    """

    def __init__(self):
        self.tokenizer = ProteinTokenizer()
        self.clear()

    def gen_sequence_data(self, data):
        """Generate token ids and 3-class / 8-class labels for one sequence."""
        amino_acids = data['amino_acids']
        token_ids = self.tokenizer.gen_token_ids(amino_acids)
        # Labels are padded with a 0 at each end to keep them aligned with the
        # token ids produced by the tokenizer.
        labels3 = [0] + data['ss3'] + [0]
        labels8 = [0] + data['ss8'] + [0]
        return token_ids, labels3, labels8

    def append(self, data):
        """Append one example to the buffered dataset."""
        token_ids, labels3, labels8 = self.gen_sequence_data(data)
        self.token_ids.extend(token_ids)
        self.labels3.extend(labels3)
        self.labels8.extend(labels8)
        self.lengths.append(len(token_ids))

    def clear(self):
        """Clear the buffered data."""
        self.token_ids = []
        self.labels3 = []
        self.labels8 = []
        self.lengths = []

    def save_npz(self, filename):
        """Save the buffered data to an npz file."""
        np.savez(filename,
                 token_ids=np.array(self.token_ids, dtype='int8'),
                 labels3=np.array(self.labels3, dtype='int8'),
                 labels8=np.array(self.labels8, dtype='int8'),
                 lengths=np.array(self.lengths, dtype='int64'))
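# Usage sketch for the SecondaryStructure builder above; ss3/ss8 carry one
# 3-class and one 8-class label per residue. The values below are toy data,
# not taken from the original file:
def _build_secondary_structure_example_npz():
    ss = SecondaryStructure()
    ss.append({
        'amino_acids': 'MKT',
        'ss3': [0, 1, 2],   # one 3-class label per residue
        'ss8': [0, 3, 7],   # one 8-class label per residue
    })
    ss.save_npz('secondary_structure_example.npz')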
def __init__(self):
    self.tokenizer = ProteinTokenizer()
    self.clear()
def main(): """Entry for data preprocessing.""" tokenizer = ProteinTokenizer() for dataset in ['davis', 'kiba']: data_dir = os.path.join(args.dataset_root, dataset) if not os.path.exists(data_dir): print('Cannot find {}'.format(data_dir)) continue train_fold = json.load( open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt'))) train_fold = [ee for e in train_fold for ee in e] # flatten test_fold = json.load( open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt'))) ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')), object_pairs_hook=OrderedDict) proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')), object_pairs_hook=OrderedDict) # Use encoding 'latin1' to load py2 pkl from py3 # pylint: disable=E1123 affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'), encoding='latin1') smiles_lst, protein_lst = [], [] for k in ligands.keys(): smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]), isomericSmiles=True) smiles_lst.append(smiles) for k in proteins.keys(): protein_lst.append(proteins[k]) if dataset == 'davis': # Kd data affinity = [-np.log10(y / 1e9) for y in affinity] affinity = np.asarray(affinity) # pylint: disable=E1123 os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True) for split in ['train', 'test']: print('processing {} set of {}'.format(split, dataset)) split_dir = os.path.join(data_dir, 'processed', split) # pylint: disable=E1123 os.makedirs(split_dir, exist_ok=True) fold = train_fold if split == 'train' else test_fold rows, cols = np.where(np.isnan(affinity) == False) rows, cols = rows[fold], cols[fold] data_lst = [] for idx in range(len(rows)): mol = AllChem.MolFromSmiles(smiles_lst[rows[idx]]) mol_graph = mol_to_graph_data(mol) data = {k: v for k, v in mol_graph.items()} seqs = [] for seq in protein_lst[cols[idx]].split('\x01'): seqs.extend(tokenizer.gen_token_ids(seq)) data['protein_token_ids'] = np.array(seqs) af = affinity[rows[idx], cols[idx]] if dataset == 'davis': data['Log10_Kd'] = np.array([af]) elif dataset == 'kiba': data['KIBA'] = np.array([af]) data_lst.append(data) random.shuffle(data_lst) npz = os.path.join(split_dir, '{}_{}.npz'.format(dataset, split)) save_data_list_to_npz(data_lst, npz) print('==============================') print('dataset:', dataset) print('train_fold:', len(train_fold)) print('test_fold:', len(test_fold)) print('unique drugs:', len(set(smiles_lst))) print('unique proteins:', len(set(protein_lst)))
def load_davis_dataset(data_path, featurizer):
    """Load the Davis drug-target affinity dataset.

    Returns a list [train_dataset, test_dataset] of InMemoryDataset objects.
    """
    tokenizer = ProteinTokenizer()
    for dataset in ['davis']:
        data_dir = os.path.join(data_path, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        # Load the official train/test folds and the raw ligand/protein/affinity data.
        train_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'train_fold_setting1.txt')))
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        test_fold = json.load(
            open(os.path.join(data_dir, 'folds', 'test_fold_setting1.txt')))
        ligands = json.load(open(os.path.join(data_dir, 'ligands_can.txt')),
                            object_pairs_hook=OrderedDict)
        proteins = json.load(open(os.path.join(data_dir, 'proteins.txt')),
                             object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load a py2 pickle from py3.
        # pylint: disable=E1123
        affinity = pickle.load(open(os.path.join(data_dir, 'Y'), 'rb'),
                               encoding='latin1')

        smiles_lst, protein_lst = [], []
        for k in ligands.keys():
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[k]),
                                      isomericSmiles=True)
            smiles_lst.append(smiles)
        for k in proteins.keys():
            protein_lst.append(proteins[k])

        if dataset == 'davis':
            # Davis affinities are Kd values in nM; convert to pKd.
            affinity = [-np.log10(y / 1e9) for y in affinity]

        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)

        train_test_dataset = []
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))
            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            # Keep only the measured (non-NaN) drug-protein pairs in this fold.
            rows, cols = np.where(np.isnan(affinity) == False)
            rows, cols = rows[fold], cols[fold]

            data_lst = []
            for idx in range(len(rows)):
                mol_graph = mol_to_md_graph_data(
                    Chem.MolFromSmiles(smiles_lst[rows[idx]]), add_3dpos=False)
                data = {k: v for k, v in mol_graph.items()}

                seqs = []
                for seq in protein_lst[cols[idx]].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                af = affinity[rows[idx], cols[idx]]
                if dataset == 'davis':
                    data['Log10_Kd'] = np.array([af])
                elif dataset == 'kiba':
                    data['KIBA'] = np.array([af])

                data_lst.append(data)

            # Return the whole split as an in-memory dataset.
            random.shuffle(data_lst)
            train_test_dataset.append(InMemoryDataset(data_lst))

        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))

    return train_test_dataset
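# Usage sketch for load_davis_dataset. It expects the same folds/, ligands_can.txt,
# proteins.txt and Y layout used by the preprocessing script above under
# `data_path`/davis; note that the featurizer argument is not used inside this
# loader. The path below is an illustrative assumption:
#
#   train_ds, test_ds = load_davis_dataset('./dta_datasets', featurizer=None)
#   print(len(train_ds), len(test_ds))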