コード例 #1
0
 def gen_features(self, raw_data):
     """Build graph features for the molecule described by ``raw_data``.

     Args:
         raw_data: mapping that provides the SMILES string under ``'smiles'``.

     Returns:
         The output of ``mol_to_graph_data`` with the SMILES string stored
         under ``'smiles'``, or ``None`` when ``AllChem.MolFromSmiles``
         returns ``None`` for the input.
     """
     smiles_str = raw_data['smiles']
     molecule = AllChem.MolFromSmiles(smiles_str)
     if molecule is not None:
         graph = mol_to_graph_data(molecule)
         graph['smiles'] = smiles_str
         return graph
     # No molecule object could be created from the SMILES string.
     return None
コード例 #2
0
ファイル: featurizer.py プロジェクト: xiaoyao4573/PaddleHelix
 def gen_features(self, raw_data):
     """Build labelled graph features for the molecule in ``raw_data``.

     Args:
         raw_data: mapping that provides ``'smiles'`` and ``'label'``; the
             label must support ``reshape``.

     Returns:
         The output of ``mol_to_graph_data`` extended with the flattened
         label under ``'label'`` and the SMILES string under ``'smiles'``,
         or ``None`` when ``AllChem.MolFromSmiles`` returns ``None``.
     """
     # Both entries are read up front, before the molecule is parsed.
     smiles_str = raw_data['smiles']
     target = raw_data['label']
     molecule = AllChem.MolFromSmiles(smiles_str)
     if molecule is not None:
         graph = mol_to_graph_data(molecule)
         graph['label'] = target.reshape([-1])
         graph['smiles'] = smiles_str
         return graph
     return None
コード例 #3
0
    def gen_features(self, raw_data):
        """Turn the SMILES string in ``raw_data`` into graph data.

        Args:
            raw_data: mapping that provides the SMILES string under
                ``'smiles'``.

        Returns:
            data(dict): the output of ``mol_to_graph_data`` with the SMILES
            string added under ``'smiles'``, or ``None`` when
            ``AllChem.MolFromSmiles`` returns ``None``.
        """
        smiles_str = raw_data['smiles']
        molecule = AllChem.MolFromSmiles(smiles_str)
        if molecule is not None:
            graph = mol_to_graph_data(molecule)
            graph['smiles'] = smiles_str
            return graph
        return None
コード例 #4
0
 def test_gen_features(self):
     """Check that every sample SMILES converts to truthy graph data."""
     raw_data_list = [
         {
             'smiles': 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
         },
         {
             'smiles': 'CCCCCCCCCCOCC(O)CN'
         },
     ]
     # Exercise every fixture entry, not only the first one.
     for raw_data in raw_data_list:
         smiles = raw_data['smiles']
         mol = AllChem.MolFromSmiles(smiles)
         # Fail with a clear assertion instead of handing None downstream.
         self.assertIsNotNone(mol)
         data = mol_to_graph_data(mol)
         self.assertTrue(data)
コード例 #5
0
 def gen_features(self, raw_data):
     """Build context-prediction features for the molecule in ``raw_data``.

     Args:
         raw_data: mapping that provides the SMILES string under ``'smiles'``.

     Returns:
         A dict holding the output of ``transform_contextpred`` under
         ``'transformed'`` and the SMILES string under ``'smiles'``.
         ``None`` is returned when ``AllChem.MolFromSmiles`` or
         ``transform_contextpred`` returns ``None``.
     """
     smiles_str = raw_data['smiles']
     molecule = AllChem.MolFromSmiles(smiles_str)
     if molecule is None:
         return None
     graph = mol_to_graph_data(molecule)
     context = transform_contextpred(graph, self.k, self.l1, self.l2)
     if context is None:
         return None
     return {'transformed': context, 'smiles': smiles_str}
コード例 #6
0
    def __call__(self, raw_data):
        """
        Convert one raw sample into a single graph data object.

        Args:
            raw_data: mapping that provides the SMILES string under
                ``'smiles'``; the string is parsed with rdkit and the
                resulting molecule is converted to graph data.

        Returns:
            data: the output of ``mol_to_graph_data`` for the parsed
            molecule, or ``None`` when ``AllChem.MolFromSmiles`` returns
            ``None``.
        """
        molecule = AllChem.MolFromSmiles(raw_data['smiles'])
        if molecule is not None:
            return mol_to_graph_data(molecule)
        return None
コード例 #7
0
ファイル: mutag_dataset.py プロジェクト: zzsnow/PaddleHelix
def load_mutag_dataset(data_path):
    """Read the MUTAG files under ``data_path`` and build graph data.

    The SMILES strings come from the first column of ``mutag_188_data.can``
    and the labels from the first column of ``mutag_188_target.txt``, with
    every ``-1`` label replaced by ``0``.

    Args:
        data_path: directory that contains the two MUTAG files.

    Returns:
        A ``(data_list, data_smiles_list)`` pair: the graph data (each with
        a flattened ``'label'`` entry) and the matching SMILES strings.
        Entries for which ``AllChem.MolFromSmiles`` returns ``None`` are
        left out of both lists.
    """
    smiles_series = pd.read_csv(
        os.path.join(data_path, 'mutag_188_data.can'),
        sep=' ', header=None)[0]
    label_column = pd.read_csv(
        os.path.join(data_path, 'mutag_188_target.txt'), header=None)[0]
    targets = label_column.replace(-1, 0).values

    graphs = []
    kept_smiles = []
    for idx in range(len(smiles_series)):
        smiles_str = smiles_series[idx]
        molecule = AllChem.MolFromSmiles(smiles_str)
        if molecule is None:
            # ignore invalid mol objects
            continue
        graph = mol_to_graph_data(molecule)
        graph['label'] = targets[idx].reshape([-1])
        graphs.append(graph)
        kept_smiles.append(smiles_str)

    return graphs, kept_smiles
コード例 #8
0
    def gen_features(self, raw_data):
        """
        Convert one raw sample into a single graph data object.

        Args:
            raw_data: mapping that provides the SMILES string under
                ``'smiles'`` and, when ``self.is_inference`` is false, the
                label under ``'label'``. The SMILES string is parsed with
                rdkit and the molecule is converted to graph data.

        Returns:
            data: the graph data with the SMILES string under ``'smiles'``
            and, outside inference, the flattened label under ``'label'``.
            ``None`` when ``AllChem.MolFromSmiles`` returns ``None``.

        """
        smiles_str = raw_data['smiles']
        molecule = AllChem.MolFromSmiles(smiles_str)
        if molecule is not None:
            graph = mol_to_graph_data(molecule)
            if not self.is_inference:
                # The label is only looked up when it is actually needed.
                graph['label'] = raw_data['label'].reshape([-1])
            graph['smiles'] = smiles_str
            return graph
        return None
コード例 #9
0
def preprocess_dataset(name):
    """
    Preprocess a raw dataset into graph data and save the result.

    The SMILES file and label file registered for ``name`` in ``Datasets``
    are read from ``data/<name>/raw``. Every SMILES string that
    ``AllChem.MolFromSmiles`` can turn into a molecule is converted with
    ``mol_to_graph_data`` and given a flattened ``'label'`` entry (labels of
    ``-1`` are replaced by ``0``). The graph data is saved to
    ``data/<name>/processed/data.npz`` and the SMILES strings of exactly
    those molecules, in the same order, to
    ``data/<name>/processed/smiles.txt``.

    If the raw folder does not exist, a message is printed and nothing is
    written.

    Args:
        name (str): name of the dataset.
    """
    data_dir = os.path.join('data', name, 'raw')
    if not os.path.exists(data_dir):
        # Report the dataset that was actually requested.
        print('Ignore %s dataset. Cannot find the corresponding folder: %s.'
              % (name, data_dir))
        return

    can, txt = Datasets[name]
    smiles_path = os.path.join(data_dir, can)
    labels_path = os.path.join(data_dir, txt)
    smiles_list = pd.read_csv(smiles_path, sep=' ', header=None)[0]
    labels = pd.read_csv(labels_path, header=None)[0].replace(-1, 0).values

    data_list, data_smiles_list = [], []
    for i, smiles in enumerate(smiles_list):
        mol = AllChem.MolFromSmiles(smiles)
        if mol is None:
            # Skip SMILES strings that cannot be turned into a molecule.
            continue
        data = mol_to_graph_data(mol)
        data['label'] = labels[i].reshape([-1])
        data_list.append(data)
        data_smiles_list.append(smiles)

    processed_dir = os.path.join('data', name, 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # Write only the SMILES that produced graph data so that line k of
    # smiles.txt always corresponds to entry k of data.npz.
    with open(os.path.join(processed_dir, 'smiles.txt'), 'w') as f:
        f.writelines('%s\n' % smiles for smiles in data_smiles_list)

    save_data_list_to_npz(
        data_list, os.path.join(processed_dir, 'data.npz'))
コード例 #10
0
ファイル: ptc_mr_dataset.py プロジェクト: zzsnow/PaddleHelix
def load_ptc_mr_dataset(data_path):
    """Load the PTC-MR dataset and convert it to graph data.

    The SMILES strings are read from the first column of
    ``ptc_MR_data.can`` and the labels from the first column of
    ``ptc_MR_target.txt``; every ``-1`` label is replaced by ``0``.

    Args:
        data_path (str): directory that contains the two PTC-MR files.

    Returns:
        tuple: ``(data_list, data_smiles_list)`` with the graph data (each
        carrying a flattened ``'label'`` entry) and the matching SMILES
        strings. Entries for which ``AllChem.MolFromSmiles`` returns
        ``None`` are left out of both lists.

    Raises:
        FileNotFoundError: if the SMILES file or the label file is missing.
    """
    smiles_path = os.path.join(data_path, 'ptc_MR_data.can')
    labels_path = os.path.join(data_path, 'ptc_MR_target.txt')

    # Fail early with a clear message instead of hitting an undefined
    # variable further down when the files have not been downloaded.
    missing = [p for p in (smiles_path, labels_path) if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError(
            'PTC-MR files not found: %s' % ', '.join(missing))

    # manually downloaded, separated SMILES and label files
    smiles_list = pd.read_csv(smiles_path, sep=' ', header=None)[0]
    labels = pd.read_csv(labels_path, header=None)[0].replace(-1, 0).values

    data_list, data_smiles_list = [], []
    for i, smiles in enumerate(smiles_list):
        rdkit_mol = AllChem.MolFromSmiles(smiles)
        if rdkit_mol is None:  # ignore invalid mol objects
            continue
        data = mol_to_graph_data(rdkit_mol)
        data['label'] = labels[i].reshape([-1])
        data_list.append(data)
        data_smiles_list.append(smiles)

    return data_list, data_smiles_list
コード例 #11
0
def main():
    """Entry for data preprocessing.

    For each of the ``davis`` and ``kiba`` folders under
    ``args.dataset_root`` this loads the fold settings, the ligands, the
    proteins and the pickled affinity matrix ``Y``. For every (ligand,
    protein) pair selected by a fold it builds one sample that holds the
    ligand's graph data, the protein token ids and the affinity value, then
    shuffles the samples and saves them to
    ``processed/<split>/<dataset>_<split>.npz``. Missing dataset folders
    are reported and skipped.
    """
    tokenizer = ProteinTokenizer()
    for dataset in ['davis', 'kiba']:
        data_dir = os.path.join(args.dataset_root, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        # Open every input with a context manager so the handles are closed.
        folds_dir = os.path.join(data_dir, 'folds')
        with open(os.path.join(folds_dir, 'train_fold_setting1.txt')) as f:
            train_fold = json.load(f)
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        with open(os.path.join(folds_dir, 'test_fold_setting1.txt')) as f:
            test_fold = json.load(f)
        with open(os.path.join(data_dir, 'ligands_can.txt')) as f:
            ligands = json.load(f, object_pairs_hook=OrderedDict)
        with open(os.path.join(data_dir, 'proteins.txt')) as f:
            proteins = json.load(f, object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # NOTE(review): unpickling can execute arbitrary code; only run this
        # on dataset files from a trusted source.
        # pylint: disable=E1123
        with open(os.path.join(data_dir, 'Y'), 'rb') as f:
            affinity = pickle.load(f, encoding='latin1')

        # Re-serialise each ligand SMILES through RDKit (isomeric form).
        smiles_lst = [
            Chem.MolToSmiles(Chem.MolFromSmiles(ligand), isomericSmiles=True)
            for ligand in ligands.values()
        ]
        protein_lst = list(proteins.values())

        if dataset == 'davis':
            # Kd data
            affinity = [-np.log10(y / 1e9) for y in affinity]

        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)

        # Positions of the non-NaN affinity entries; the folds index into
        # this list, and it is the same for both splits.
        valid_rows, valid_cols = np.where(~np.isnan(affinity))
        label_key = 'Log10_Kd' if dataset == 'davis' else 'KIBA'

        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))

            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = valid_rows[fold], valid_cols[fold]

            data_lst = []
            for row, col in zip(rows, cols):
                mol = AllChem.MolFromSmiles(smiles_lst[row])
                mol_graph = mol_to_graph_data(mol)
                # Copy so the extra keys do not touch the graph dict itself.
                data = dict(mol_graph.items())

                seqs = []
                for seq in protein_lst[col].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                data[label_key] = np.array([affinity[row, col]])

                data_lst.append(data)

            random.shuffle(data_lst)
            npz = os.path.join(split_dir, '{}_{}.npz'.format(dataset, split))
            save_data_list_to_npz(data_lst, npz)

        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
コード例 #12
0
 def test_mol_to_graph_data(self, add_self_loop=True):
     """Convert one fixed SMILES string and expect truthy graph data.

     ``add_self_loop`` is accepted but not used by this test.
     """
     molecule = AllChem.MolFromSmiles('CCOc1ccc2nc(S(N)(=O)=O)sc2c1')
     graph = mol_to_graph_data(molecule)
     self.assertTrue(graph)
コード例 #13
0
def load_davis_dataset(data_path, featurizer):
    """Load the Davis dataset as in-memory train and test datasets.

    Reads the fold settings, ligands, proteins and the pickled affinity
    matrix ``Y`` from ``<data_path>/davis``. For every (ligand, protein)
    pair selected by a fold it builds one sample holding the ligand's graph
    data, the protein token ids and the affinity (transformed with
    ``-log10(y / 1e9)``) under ``'Log10_Kd'``. The ``processed/train`` and
    ``processed/test`` folders are created, but nothing is written to them.

    Args:
        data_path: directory that contains the ``davis`` folder.
        featurizer: not used by this function.

    Returns:
        list: ``[train_dataset, test_dataset]`` as ``InMemoryDataset``
        objects, or ``None`` when ``<data_path>/davis`` does not exist.
    """
    tokenizer = ProteinTokenizer()
    for dataset in ['davis']:
        data_dir = os.path.join(data_path, dataset)
        if not os.path.exists(data_dir):
            print('Cannot find {}'.format(data_dir))
            continue

        # Open every input with a context manager so the handles are closed.
        folds_dir = os.path.join(data_dir, 'folds')
        with open(os.path.join(folds_dir, 'train_fold_setting1.txt')) as f:
            train_fold = json.load(f)
        train_fold = [ee for e in train_fold for ee in e]  # flatten
        with open(os.path.join(folds_dir, 'test_fold_setting1.txt')) as f:
            test_fold = json.load(f)
        with open(os.path.join(data_dir, 'ligands_can.txt')) as f:
            ligands = json.load(f, object_pairs_hook=OrderedDict)
        with open(os.path.join(data_dir, 'proteins.txt')) as f:
            proteins = json.load(f, object_pairs_hook=OrderedDict)
        # Use encoding 'latin1' to load py2 pkl from py3
        # NOTE(review): unpickling can execute arbitrary code; only run this
        # on dataset files from a trusted source.
        # pylint: disable=E1123
        with open(os.path.join(data_dir, 'Y'), 'rb') as f:
            affinity = pickle.load(f, encoding='latin1')

        # Re-serialise each ligand SMILES through RDKit (isomeric form).
        smiles_lst = [
            Chem.MolToSmiles(Chem.MolFromSmiles(ligand), isomericSmiles=True)
            for ligand in ligands.values()
        ]
        protein_lst = list(proteins.values())

        if dataset == 'davis':
            # Kd data
            affinity = [-np.log10(y / 1e9) for y in affinity]

        affinity = np.asarray(affinity)

        # pylint: disable=E1123
        os.makedirs(os.path.join(data_dir, 'processed'), exist_ok=True)

        # Positions of the non-NaN affinity entries; the folds index into
        # this list, and it is the same for both splits.
        valid_rows, valid_cols = np.where(~np.isnan(affinity))
        label_key = 'Log10_Kd' if dataset == 'davis' else 'KIBA'

        train_test_dataset = []
        for split in ['train', 'test']:
            print('processing {} set of {}'.format(split, dataset))

            split_dir = os.path.join(data_dir, 'processed', split)
            # pylint: disable=E1123
            os.makedirs(split_dir, exist_ok=True)

            fold = train_fold if split == 'train' else test_fold
            rows, cols = valid_rows[fold], valid_cols[fold]

            samples = []
            for row, col in zip(rows, cols):
                mol_graph = mol_to_graph_data(
                    Chem.MolFromSmiles(smiles_lst[row]))
                # Copy so the extra keys do not touch the graph dict itself.
                data = dict(mol_graph.items())

                seqs = []
                for seq in protein_lst[col].split('\x01'):
                    seqs.extend(tokenizer.gen_token_ids(seq))
                data['protein_token_ids'] = np.array(seqs)

                data[label_key] = np.array([affinity[row, col]])

                samples.append(data)

            # NOTE: samples keep the fold order. No per-sample shuffle is
            # applied here; callers that need one must shuffle themselves.
            # how to deal with the distributed feature ?
            # Now return the whole dataset
            train_test_dataset.append(InMemoryDataset(samples))
        print('==============================')
        print('dataset:', dataset)
        print('train_fold:', len(train_fold))
        print('test_fold:', len(test_fold))
        print('unique drugs:', len(set(smiles_lst)))
        print('unique proteins:', len(set(protein_lst)))
        return train_test_dataset