Ejemplo n.º 1
0
    def __getitem__(self, idx):
        """Build the graph sample for the complex stored under ``self.keys[idx]``.

        The file ``self.data_dir + '/' + key`` is unpickled into a pair of
        molecules (ligand first, protein second, per the original comments).

        Returns a dict with:
            'H'   -- ligand atom features stacked on top of protein features
            'A1'  -- block-diagonal adjacency (identity added) of both molecules
            'A2'  -- copy of 'A1' whose off-diagonal blocks hold the
                     ligand/protein distance matrix (and its transpose)
            'Y'   -- 1 when the key contains 'CHEMBL', otherwise 0
            'V'   -- vector with 1 on ligand rows and 0 on protein rows
            'key' -- the key itself
        """
        key = self.keys[idx]
        # NOTE: pickle.load can execute arbitrary code; only read trusted files.
        with open(self.data_dir+'/'+key, 'rb') as handle:
            ligand, protein = pickle.load(handle)

        # Ligand: atom count, first-conformer coordinates, adjacency, features.
        n_lig = ligand.GetNumAtoms()
        xyz_lig = np.array(ligand.GetConformers()[0].GetPositions())
        adj_lig = GetAdjacencyMatrix(ligand) + np.eye(n_lig)
        feat_lig = get_atom_feature(ligand, True)

        # Protein: same quantities, with the feature flag set to False.
        n_prot = protein.GetNumAtoms()
        xyz_prot = np.array(protein.GetConformers()[0].GetPositions())
        adj_prot = GetAdjacencyMatrix(protein) + np.eye(n_prot)
        feat_prot = get_atom_feature(protein, False)

        # Joint node features and block-diagonal adjacency.
        total = n_lig + n_prot
        features = np.concatenate([feat_lig, feat_prot], 0)
        block_adj = np.zeros((total, total))
        block_adj[:n_lig, :n_lig] = adj_lig
        block_adj[n_lig:, n_lig:] = adj_prot

        # Second adjacency: fill the cross blocks with pairwise distances.
        cross = distance_matrix(xyz_lig, xyz_prot)
        dist_adj = np.copy(block_adj)
        dist_adj[:n_lig, n_lig:] = cross
        dist_adj[n_lig:, :n_lig] = np.transpose(cross)

        # Marks which nodes belong to the ligand.
        ligand_rows = np.zeros((total,))
        ligand_rows[:n_lig] = 1

        # Binary label derived from the key name.
        if 'CHEMBL' in key:
            label = 1
        else:
            label = 0

        return {
            'H': features,
            'A1': block_adj,
            'A2': dist_adj,
            'Y': label,
            'V': ligand_rows,
            'key': key,
        }
Ejemplo n.º 2
0
    def __getitem__(self, idx):
        """Return the masked-token training sample for the SMILES at ``idx``.

        The token list from ``self.random_masking`` is wrapped in start/end
        tokens, the label list in pad tokens and the adjacency mask in zeros.
        All three are truncated to ``self.seq_len`` and right-padded with 0.
        The adjacency matrix of ``self.adj_dataset[idx]`` is padded to
        ``(seq_len, seq_len)`` by ``self.zero_padding`` and the QED value of
        the same molecule is attached.

        Returns:
            dict of ``torch.tensor`` values keyed by ``smiles_bert_input``,
            ``smiles_bert_label``, ``smiles_bert_adj_mask``,
            ``smiles_bert_adjmat`` and ``smiles_bert_value``.
        """
        item = self.smiles_dataset[idx]
        input_random, input_label, input_adj_mask = self.random_masking(item)

        input_data = ([self.vocab.start_index] + input_random
                      + [self.vocab.end_index])
        input_label = ([self.vocab.pad_index] + input_label
                       + [self.vocab.pad_index])
        input_adj_mask = [0] + input_adj_mask + [0]
        if self.mat_pos == 'start':
            # Flag only the first position, but keep the mask as long as the
            # token sequence.  Collapsing it to ``[1]`` made the padded mask
            # shorter than ``seq_len`` because the padding length below is
            # derived from the token sequence, not from the mask.
            input_adj_mask = [1] + [0] * (len(input_adj_mask) - 1)

        smiles_bert_input = input_data[:self.seq_len]
        smiles_bert_label = input_label[:self.seq_len]
        smiles_bert_adj_mask = input_adj_mask[:self.seq_len]

        # Right-pad every sequence with zeros up to seq_len.
        padding = [0] * (self.seq_len - len(smiles_bert_input))
        smiles_bert_input.extend(padding)
        smiles_bert_label.extend(padding)
        smiles_bert_adj_mask.extend(padding)

        mol = Chem.MolFromSmiles(self.adj_dataset[idx])
        adj_mat = GetAdjacencyMatrix(mol)
        smiles_bert_adjmat = self.zero_padding(adj_mat,
                                               (self.seq_len, self.seq_len))

        output = {
            "smiles_bert_input": smiles_bert_input,
            "smiles_bert_label": smiles_bert_label,
            "smiles_bert_adj_mask": smiles_bert_adj_mask,
            "smiles_bert_adjmat": smiles_bert_adjmat,
            "smiles_bert_value": QED.qed(mol),
        }

        return {key: torch.tensor(value) for key, value in output.items()}
Ejemplo n.º 3
0
def get_adj_matrix(mol):
    """Return the adjacency matrix of ``mol`` with self-loops added.

    The identity matrix is added to the result of ``GetAdjacencyMatrix`` and
    the sum is returned as a NumPy array.
    """
    n_atoms = mol.GetNumAtoms()
    with_self_loops = GetAdjacencyMatrix(mol) + np.eye(n_atoms)
    return np.array(with_self_loops)
Ejemplo n.º 4
0
    def __getitem__(self, idx):
        """Return the labelled sample for the SMILES at ``idx``.

        The token list from ``self.CharToNum`` is wrapped in start/end tokens
        and the adjacency mask in zeros; both are truncated to
        ``self.seq_len`` and right-padded with 0.  The adjacency matrix of
        ``self.adj_dataset[idx]`` is padded to ``(seq_len, seq_len)``; when
        the SMILES cannot be parsed an all-zero matrix is used instead.

        Returns:
            dict of ``torch.tensor`` values keyed by ``smiles_bert_input``,
            ``smiles_bert_label``, ``smiles_bert_adj_mask`` and
            ``smiles_bert_adjmat``.
        """
        item = self.smiles_dataset[idx]
        label = self.label[idx]

        input_token, input_adj_masking = self.CharToNum(item)

        input_data = ([self.vocab.start_index] + input_token
                      + [self.vocab.end_index])
        input_adj_masking = [0] + input_adj_masking + [0]
        if self.mat_pos == 'start':
            # Flag only the first position.  The original assigned to (and
            # read from) ``input_adj_mask``, a name that does not exist here,
            # which raised NameError and would never have reached the output.
            input_adj_masking = [1] + [0] * (len(input_adj_masking) - 1)

        smiles_bert_input = input_data[:self.seq_len]
        smiles_bert_adj_mask = input_adj_masking[:self.seq_len]

        # Right-pad both sequences with zeros up to seq_len.
        padding = [0] * (self.seq_len - len(smiles_bert_input))
        smiles_bert_input.extend(padding)
        smiles_bert_adj_mask.extend(padding)

        mol = Chem.MolFromSmiles(self.adj_dataset[idx])
        if mol is not None:
            adj_mat = GetAdjacencyMatrix(mol)
            smiles_bert_adjmat = self.zero_padding(
                adj_mat, (self.seq_len, self.seq_len))
        else:
            # Unparseable SMILES: fall back to an empty adjacency matrix.
            smiles_bert_adjmat = np.zeros((self.seq_len, self.seq_len),
                                          dtype=np.float32)

        output = {
            "smiles_bert_input": smiles_bert_input,
            "smiles_bert_label": label,
            "smiles_bert_adj_mask": smiles_bert_adj_mask,
            "smiles_bert_adjmat": smiles_bert_adjmat,
        }

        return {key: torch.tensor(value) for key, value in output.items()}
Ejemplo n.º 5
0
Archivo: run.py Proyecto: vvhanxing/GCN
    def __getitem__(self, idx):
        """Return the padded graph sample for the molecule at ``idx``.

        Returns:
            dict with
            'X' -- tensor of per-atom feature rows, padded with zero rows of
                   width 28 up to ``self.max_natoms`` rows,
            'A' -- tensor holding the adjacency matrix plus identity,
                   zero-padded to ``(self.max_natoms, self.max_natoms)``,
            'Y' -- ``self.properties[idx]``.
        """
        s = self.smiles[idx]
        m = Chem.MolFromSmiles(s)
        natoms = m.GetNumAtoms()

        # Adjacency matrix with self-loops, embedded in a fixed-size matrix.
        A = GetAdjacencyMatrix(m) + np.eye(natoms)
        A_padding = np.zeros((self.max_natoms, self.max_natoms))
        A_padding[:natoms, :natoms] = A

        # Atom features.  Pad to self.max_natoms -- the same size used for A --
        # rather than relying on a separate module-level ``max_natoms``, so
        # that X and A always have matching first dimensions.
        X = [self.atom_feature(m, i) for i in range(natoms)]
        X.extend(np.zeros(28) for _ in range(natoms, self.max_natoms))
        X = np.array(X)

        return {
            'X': torch.from_numpy(X),
            'A': torch.from_numpy(A_padding),
            'Y': self.properties[idx],
        }
Ejemplo n.º 6
0
    def __getitem__(self, idx):
        """Return the padded graph sample (plus its SMILES) for index ``idx``.

        Returns:
            dict with
            'X'   -- tensor of per-atom feature rows, padded with zero rows of
                     width 28 up to ``self.max_natoms`` rows,
            'A'   -- tensor holding the adjacency matrix plus identity,
                     zero-padded to ``(self.max_natoms, self.max_natoms)``,
            'Y'   -- ``self.properties[idx]``,
            'smi' -- the SMILES string itself.
        """
        s = self.smiles[idx]
        m = Chem.MolFromSmiles(s)
        natoms = m.GetNumAtoms()

        # Adjacency matrix with self-loops.  It is used un-normalised; the
        # degree normalisation that used to live here was disabled.
        A = GetAdjacencyMatrix(m) + np.eye(natoms)
        A_padding = np.zeros((self.max_natoms, self.max_natoms))
        A_padding[:natoms, :natoms] = A
        A_padding = torch.from_numpy(A_padding)

        # Atom features, padded with zero rows.  self.max_natoms is used (not
        # a module-level ``max_natoms``) so X always matches the size of A.
        X = [self.atom_feature(m, i) for i in range(natoms)]
        for _ in range(natoms, self.max_natoms):
            X.append(np.zeros(28))
        X = np.array(X)

        sample = dict()
        sample['X'] = torch.from_numpy(X)
        sample['A'] = A_padding
        sample['Y'] = self.properties[idx]
        sample["smi"] = s
        return sample
Ejemplo n.º 7
0
def get_mol_A_(s):
    """Return the zero-padded adjacency matrix (identity added) for SMILES ``s``.

    The result is a ``(max_natoms, max_natoms)`` array, where ``max_natoms``
    is read from module scope; the molecule occupies the top-left corner.
    """
    mol = Chem.MolFromSmiles(s)
    n = mol.GetNumAtoms()
    padded = np.zeros((max_natoms, max_natoms))
    padded[:n, :n] = GetAdjacencyMatrix(mol) + np.eye(n)
    return padded
Ejemplo n.º 8
0
    def __getitem__(self, idx):
        """Return the padded graph sample for the molecule at ``idx``.

        Returns:
            dict with
            'X' -- tensor of per-atom feature rows, padded with zero rows of
                   width 28 up to ``self.max_natoms`` rows,
            'A' -- tensor holding the adjacency matrix plus identity,
                   zero-padded to ``(self.max_natoms, self.max_natoms)``,
            'Y' -- ``self.properties[idx]``.
        """
        smiles = self.smiles[idx]
        mol = Chem.MolFromSmiles(smiles)
        natoms = mol.GetNumAtoms()
        size = self.max_natoms

        # Adjacency with self-loops in the top-left corner of a size x size
        # matrix.  No degree normalisation is applied (the earlier attempts
        # at it were commented out).
        A_padding = np.zeros((size, size))
        A_padding[:natoms, :natoms] = GetAdjacencyMatrix(mol) + np.eye(natoms)
        A_padding = torch.from_numpy(A_padding)

        # Atom feature rows followed by zero rows.  ``size`` comes from
        # self.max_natoms instead of a module-level ``max_natoms`` so the
        # feature matrix cannot drift out of sync with the adjacency matrix.
        rows = [self.atom_feature(mol, i) for i in range(natoms)]
        rows += [np.zeros(28) for _ in range(natoms, size)]
        X = np.array(rows)

        sample = dict()
        sample['X'] = torch.from_numpy(X)
        sample['A'] = A_padding
        sample['Y'] = self.properties[idx]
        return sample
Ejemplo n.º 9
0
def distance_fix_pair(m):
    """Return a 0/1 matrix marking atom pairs of ``m`` treated as a fixed pair.

    Steps, as implemented:
      1. take the adjacency matrix plus identity and square it,
      2. add the ring and conjugation matrices to the (unsquared) matrix
         and square that as well,
      3. sum both squares and cap every entry above 1 at 1.
    """
    base = GetAdjacencyMatrix(m).astype(float)
    base += np.eye(len(base)).astype(float)

    # Pairs connected through the plain bond graph (identity included).
    two_step = np.matmul(base, base)

    # Extend the graph with ring and conjugation links before squaring again.
    extended = base
    extended += make_ring_matrix(m).astype(float)
    extended += make_conjugate_matrix(m).astype(float)

    result = np.matmul(extended, extended)
    result += two_step
    result[result > 1] = 1
    return result
Ejemplo n.º 10
0
def get_mol_fea(drug_smiles_list):
    """Featurise every SMILES string into node, edge and connectivity arrays.

    Returns a 4-tuple of parallel lists with one entry per input SMILES:
    ``(node features, edge features, atom adjacency matrices,
    atom-to-bond incidence matrices)``.

    A single trailing space on a SMILES string is dropped before parsing.
    If a string cannot be parsed, a message is printed and the process
    exits with status 1.
    """
    node_dim = len(atom_type_list + hybridization_list + num_h_list+formal_charge_list) + 1
    edge_dim = len(bond_type_list) + 1 + 1

    nodes_out, edges_out, n2n_out, e2n_out = [], [], [], []

    progress = tqdm.tqdm(enumerate(drug_smiles_list), total=len(drug_smiles_list))
    for _, smiles in progress:
        if smiles[-1] == ' ':
            smiles = smiles[:-1]

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print("molecule {} is not defined well".format(smiles))
            exit(1)

        atoms = mol.GetAtoms()
        bonds = mol.GetBonds()
        n_node = len(atoms)
        n_edge = len(bonds)

        # One feature row per atom.
        node = np.zeros((n_node, node_dim))
        for row, atom in enumerate(atoms):
            node[row] += get_atom_feature(atom)
        node = np.array(node)

        n2n = GetAdjacencyMatrix(mol)

        # One feature row per bond, numbered in order of the (lower, higher)
        # atom-index pair; e2n marks the two atoms each bond touches.
        edge = np.zeros((n_edge, edge_dim))
        e2n = np.zeros((n_node, n_edge))
        next_edge = 0
        for a in range(n_node):
            for b in range(a + 1, n_node):
                bond = mol.GetBondBetweenAtoms(a, b)
                if bond is None:
                    continue
                edge[next_edge] += bond_features(bond)
                e2n[a, next_edge] += 1
                e2n[b, next_edge] += 1
                next_edge += 1

        nodes_out.append(node)
        n2n_out.append(n2n)
        edges_out.append(edge)
        e2n_out.append(e2n)

    return (nodes_out, edges_out, n2n_out, e2n_out)
Ejemplo n.º 11
0
def molecule_to_adjacency(molecule):
    """
    Build the adjacency matrix of a molecule via RDKit.

    Parameters
    ----------
    molecule: :molLego:`Molecule`
        Molecule whose adjacency matrix is wanted.

    Returns
    -------
    The result of ``GetAdjacencyMatrix`` applied to the RDKit conversion of
    ``molecule`` (obtained with ``molecule_to_rdkit``).

    """
    return GetAdjacencyMatrix(molecule_to_rdkit(molecule))
Ejemplo n.º 12
0
 def getNNAtoms(self, molStr, cAtoms, hight):
     """Grow the atom set ``cAtoms`` by ``hight`` rounds of neighbour expansion.

     ``molStr`` is parsed as a mol block.  In each round every atom not yet
     expanded contributes the neighbours reported by ``findNeighbors``.
     Returns the sorted list of atoms, or ``[]`` (after printing a message)
     when the mol block cannot be parsed.  ``cAtoms`` itself is not modified.
     """
     selected = copy.deepcopy(cAtoms)
     # create an RDKit mol
     mol = Chem.MolFromMolBlock(molStr, True, False)
     if not mol:
         print("Could not create mol for compound ")
         return []
     adjacency = GetAdjacencyMatrix(mol)
     expanded = []
     for _ in range(hight):
         # Iterate over a snapshot: ``selected`` grows inside the loop.
         for atom in copy.deepcopy(selected):
             if atom in expanded:
                 continue
             neighbours = findNeighbors(atom, adjacency)
             expanded.append(atom)
             for neighbour in neighbours:
                 if neighbour not in selected:
                     selected.append(neighbour)
     selected.sort()
     return selected
Ejemplo n.º 13
0
def characteristic_poly(mol_list, useBO=False):
    """Return the zero-padded ``CharacteristicPolynomial`` results per molecule.

    Each molecule's result is converted to a list; shorter lists are
    right-padded with 0 to the longest length and everything is returned
    as one NumPy array (an empty array for an empty ``mol_list``).

    NOTE(review): ``useBO`` is accepted but not used -- the adjacency matrix
    is always requested with ``useBO=True``.  Confirm whether the argument
    was meant to be forwarded.
    """
    rows = []
    for mol in mol_list:
        bond_order_adj = GetAdjacencyMatrix(mol, useBO=True)
        rows.append(list(CharacteristicPolynomial(mol, bond_order_adj)))

    widest = max([len(row) for row in rows] + [0])

    # zero padding
    padded = [row + [0] * (widest - len(row)) for row in rows]
    return np.array(padded)
Ejemplo n.º 14
0
def adjacency_matrix_eigenvalues(mol_list, useBO=False):
    """Return the zero-padded adjacency-matrix eigenvalues of every molecule.

    For each molecule the eigenvalues of ``GetAdjacencyMatrix(mol,
    useBO=useBO)`` are computed with ``np.linalg.eigvals`` (in the order it
    returns them, unsorted).  Shorter lists are right-padded with 0 to the
    longest length and the result is returned as one NumPy array (an empty
    array for an empty ``mol_list``).
    """
    spectra = [
        list(np.linalg.eigvals(GetAdjacencyMatrix(mol, useBO=useBO)))
        for mol in mol_list
    ]

    widest = max([len(spectrum) for spectrum in spectra] + [0])

    # zero padding
    padded = [spectrum + [0] * (widest - len(spectrum)) for spectrum in spectra]
    return np.array(padded)
Ejemplo n.º 15
0
def cal_internal_vdw(m):
    """Sum a 12-6 pair term over selected atom pairs of molecule ``m``.

    A pair is skipped when ``GetUFFVdWParams`` returns ``None`` for it, when
    the two atoms are adjacent, or when their topological distance is below
    4.  For the remaining pairs, with parameters ``(d, e)`` and spatial
    distance ``r`` taken from the first conformer, the contribution is
    ``e * ((d / r)**12 - 2 * (d / r)**6)``.
    """
    total = 0
    n_atoms = m.GetNumAtoms()
    xyz = np.array(m.GetConformers()[0].GetPositions())
    spatial = distance_matrix(xyz, xyz)
    bonded = GetAdjacencyMatrix(m)
    hops = GetDistanceMatrix(m)

    for i in range(n_atoms):
        for j in range(i):
            params = GetUFFVdWParams(m, i, j)
            if params is None:
                continue
            d_param, e_param = params
            d_param = d_param * 1.0
            # Skip bonded pairs and pairs fewer than 4 bonds apart.
            if bonded[i, j] == 1 or hops[i, j] < 4:
                continue
            ratio = d_param / spatial[i, j]
            total += e_param * (ratio**12 - 2 * (ratio**6))
    return total
Ejemplo n.º 16
0
def process_molecule(molecule, max_num_atoms):
    """Return ``(padded_adj, padded_features)`` for ``molecule``.

    Every atom gets a two-entry row: ``[1, 0]`` when its atomic number is at
    most 1 and ``[0, 1]`` otherwise.  Both the feature rows and the adjacency
    matrix are zero-padded so they describe ``max_num_atoms`` atoms.

    Raises:
        AssertionError: if the molecule has more than ``max_num_atoms`` atoms.
    """
    num_atoms = molecule.GetNumAtoms()
    assert num_atoms <= max_num_atoms

    # Atom features
    atomic_numbers = [
        molecule.GetAtomWithIdx(idx).GetAtomicNum() for idx in range(num_atoms)
    ]
    features = [[0, 1] if z > 1 else [1, 0] for z in atomic_numbers]

    # Adjacency matrix
    adj = GetAdjacencyMatrix(molecule)

    # Padding: extra rows for the features, extra rows and columns for adj.
    extra = max_num_atoms - num_atoms
    padded_features = np.pad(features, ((0, extra), (0, 0)))
    padded_adj = np.pad(adj, (0, extra))
    return padded_adj, padded_features
Ejemplo n.º 17
0
def mol_to_feature(m1, m1_uff, m2, interaction_data, pos_noise_std):
    """Build the feature dictionary for one (m1, m2) molecule pair.

    Per the inline comments, ``m1`` is the ligand and ``m2`` the protein.
    Hydrogens are removed from both, Gaussian noise with standard deviation
    ``pos_noise_std`` is added to their first-conformer coordinates, and a
    randomly rotated copy of ``m1`` is used to compute ``dmv_rot``.

    Args:
        m1: first molecule (ligand per the comments below).
        m1_uff: only used as ``cal_sasa(m1_uff)`` when computing ``dsasa``.
            NOTE(review): presumably a UFF-relaxed copy of ``m1`` -- confirm
            with the caller.
        m2: second molecule (protein per the comments below).
        interaction_data: not used by the active code; it only appears in a
            commented-out call.
        pos_noise_std: standard deviation of the noise added to the positions.

    Returns:
        dict with the keys listed in ``sample`` at the end of the function.
        ``charge1``/``charge2`` are all zeros and ``delta_uff`` is ``0.0``
        because the code that would compute them is disabled.
    """
    # Remove hydrogens
    m1 = Chem.RemoveHs(m1)
    m2 = Chem.RemoveHs(m2)

    # extract valid amino acids
    # m2 = extract_valid_amino_acid(m2, self.amino_acids)

    # random rotation
    # This first angle/axis pair is never used (the rotate calls below are
    # commented out), but the two draws still advance NumPy's global random
    # state.
    angle = np.random.uniform(0, 360, 1)[0]
    axis = np.random.uniform(-1, 1, 3)
    # m1 = rotate(m1, angle, axis, False)
    # m2 = rotate(m2, angle, axis, False)

    # Second draw: rotate a deep copy of the ligand, leaving m1 untouched.
    angle = np.random.uniform(0, 360, 1)[0]
    axis = np.random.uniform(-1, 1, 3)
    m1_rot = rotate(copy.deepcopy(m1), angle, axis, True)

    # prepare ligand
    n1 = m1.GetNumAtoms()
    d1 = np.array(m1.GetConformers()[0].GetPositions())
    # Noise is added to the unrotated ligand positions only; d1_rot is not
    # perturbed.
    d1 += np.random.normal(0.0, pos_noise_std, d1.shape)
    d1_rot = np.array(m1_rot.GetConformers()[0].GetPositions())
    adj1 = GetAdjacencyMatrix(m1) + np.eye(n1)  # adjacency plus self-loops
    h1 = get_atom_feature(m1, True)

    # prepare protein
    n2 = m2.GetNumAtoms()
    c2 = m2.GetConformers()[0]
    d2 = np.array(c2.GetPositions())
    d2 += np.random.normal(0.0, pos_noise_std, d2.shape)
    adj2 = GetAdjacencyMatrix(m2) + np.eye(n2)  # adjacency plus self-loops
    h2 = get_atom_feature(m2, True)

    # prepare distance vector
    dmv = dm_vector(d1, d2)
    dmv_rot = dm_vector(d1_rot, d2)

    # get interaction matrix
    # A_int = get_interaction_matrix(d1, d2, interaction_data)
    # One (n1, n2) slice per entry of the module-level `interaction_types`;
    # only slices 1, -2 and -1 are filled, the others stay zero.
    A_int = np.zeros(
        (len(interaction_types), m1.GetNumAtoms(), m2.GetNumAtoms()))
    A_int[-2] = get_A_hydrophobic(m1, m2)
    A_int[1] = get_A_hbond(m1, m2)
    A_int[-1] = get_A_metal_complexes(m1, m2)

    # cal sasa
    sasa = cal_sasa(m1)
    dsasa = sasa - cal_sasa(m1_uff)

    # count rotatable bonds
    rotor = CalcNumRotatableBonds(m1)
    # dm = distance_matrix(d1, d2)
    # rotor = count_active_rotatable_bond(m1, dm)
    # charge
    # charge1 = cal_charge(m1)
    # charge2 = cal_charge(m2)
    # Charges are placeholders: all zeros, one entry per atom.
    charge1 = np.zeros((n1, ))
    charge2 = np.zeros((n2, ))
    # The string literal below is disabled code (an MMFF partial-charge
    # computation); as a bare expression statement it has no effect.
    """
    mp1 = AllChem.MMFFGetMoleculeProperties(m1)
    mp2 = AllChem.MMFFGetMoleculeProperties(m2)
    charge1 = [mp1.GetMMFFPartialCharge(i) for i in range(m1.GetNumAtoms())]
    charge2 = [mp2.GetMMFFPartialCharge(i) for i in range(m2.GetNumAtoms())]
    """

    # Convert to arrays (a no-op for the zero arrays above; needed if the
    # list-based charge code is re-enabled).
    charge1 = np.array(charge1)
    charge2 = np.array(charge2)

    # There is nan for some cases: replace nan and +/-inf with 0.
    charge1 = np.nan_to_num(charge1, nan=0, neginf=0, posinf=0)
    charge2 = np.nan_to_num(charge2, nan=0, neginf=0, posinf=0)

    # valid: every atom of both molecules is flagged with 1
    valid1 = np.ones((n1, ))
    valid2 = np.ones((n2, ))

    # no metal: 1 for atoms whose symbol is not in metal_symbols, else 0
    metal_symbols = ["Zn", "Mn", "Co", "Mg", "Ni", "Fe", "Ca", "Cu"]
    no_metal1 = np.array([
        1 if a.GetSymbol() not in metal_symbols else 0 for a in m1.GetAtoms()
    ])
    no_metal2 = np.array([
        1 if a.GetSymbol() not in metal_symbols else 0 for a in m2.GetAtoms()
    ])
    # vdw radius
    vdw_radius1 = np.array([get_vdw_radius(a) for a in m1.GetAtoms()])
    vdw_radius2 = np.array([get_vdw_radius(a) for a in m2.GetAtoms()])

    vdw_epsilon, vdw_sigma = get_epsilon_sigma(m1, m2, False)

    # uff energy difference (disabled; fixed at 0.0)
    # delta_uff = cal_uff(m1)-cal_uff(m1_uff)
    # delta_uff = get_torsion_energy(m1) - get_torsion_energy(m1_uff)
    # delta_uff = cal_torsion_energy(m1)+cal_internal_vdw(m1)
    delta_uff = 0.0
    sample = {
        "h1": h1,
        "adj1": adj1,
        "h2": h2,
        "adj2": adj2,
        "A_int": A_int,
        "dmv": dmv,
        "dmv_rot": dmv_rot,
        "pos1": d1,
        "pos2": d2,
        "sasa": sasa,
        "dsasa": dsasa,
        "rotor": rotor,
        "charge1": charge1,
        "charge2": charge2,
        "vdw_radius1": vdw_radius1,
        "vdw_radius2": vdw_radius2,
        "vdw_epsilon": vdw_epsilon,
        "vdw_sigma": vdw_sigma,
        "delta_uff": delta_uff,
        "valid1": valid1,
        "valid2": valid2,
        "no_metal1": no_metal1,
        "no_metal2": no_metal2,
    }
    return sample
Ejemplo n.º 18
0
def anal_mols(key, m1, m2):
    """Build the sample dict for a ligand/protein pair, printing each step.

    Hydrogens (with coordinates) are added to both molecules, then adjacency
    matrices, atom features and the ligand-protein distance matrix are
    computed and printed along the way.

    Args:
        key: identifier of the pair; the label ``Y`` is 1 when it contains
            ``'CHEMBL'`` and 0 otherwise.
        m1: ligand molecule (per the comments below).
        m2: protein molecule (per the comments below).

    Returns:
        dict with keys ``'H'``, ``'A1'``, ``'A2'``, ``'Y'``, ``'V'``, ``'key'``.
    """
    # Name used as the prefix of the debug prints below.
    fnm = whoami()

    #
    # prepare ligand
    #
    m1 = Chem.AddHs(m1, addCoords=True, addResidueInfo=True)
    n1 = m1.GetNumAtoms()
    c1 = m1.GetConformers(
    )[0]  # original note: m1.GetConformers() is expected to return a single rdkit.Chem.rdchem.Conformer object
    d1 = np.array(c1.GetPositions())
    #print('{}:#1:n_atoms:{} --> shape:{}\n{}'.format(fnm, n1, d1.shape, d1))
    print('{}:#1:n_atoms:{} --> shape:{}'.format(fnm, n1, d1.shape))

    print('+' * 3)

    # Print the element symbol and coordinates of every ligand atom.
    for i, coord in enumerate(d1):
        symbol = m1.GetAtomWithIdx(i).GetSymbol()
        print('  #{:>3}:{}:{}'.format(i, symbol, coord))
        pass

    print('+' * 10)

    adj1 = GetAdjacencyMatrix(m1) + np.eye(n1)  # adj1.dtype: float64
    print('{}:#2:adj1:shape:{}, dtype:{}\n{}'.format(fnm, adj1.shape,
                                                     adj1.dtype, adj1))
    print('+' * 3)

    print('{}:m1:{}, n1:{}'.format(fnm, m1, n1))
    H1 = get_atom_feature(m1, n1, True)

    print('#' * 80)

    #
    # prepare protein
    #
    m2 = Chem.AddHs(m2, addCoords=True, addResidueInfo=True)
    n2 = m2.GetNumAtoms()
    c2 = m2.GetConformers(
    )[0]  # original note: m2.GetConformers() is expected to return a single rdkit.Chem.rdchem.Conformer object
    d2 = np.array(c2.GetPositions())
    print('{}:#1:n_atoms:{} --> shape:{}\n{}'.format(fnm, n2, d2.shape, d2))

    print('+' * 10)

    adj2 = GetAdjacencyMatrix(m2) + np.eye(n2)
    print('{}:#2:adj2:shape:{}, dtype:{}\n{}'.format(fnm, adj2.shape,
                                                     adj2.dtype, adj2))
    print('+' * 3)

    H2 = get_atom_feature(m2, n2, False)

    print('#' * 80)

    # aggregation: ligand feature rows first, protein rows after
    H = np.concatenate([H1, H2], axis=0)
    print('{}: H:shape:{}, type:{}'.format(fnm, H.shape, H.dtype), flush=True)

    print('+' * 10)
    print('n:{} = n1:{} + n2:{}'.format(n1 + n2, n1, n2))

    #
    # agg_adj1: adjacency matrix (1)
    #
    #    - upper-left  block: adjacency matrix inside the ligand
    #    - lower-right block: adjacency matrix inside the protein
    #    - everything outside those two blocks is padded with 0
    #
    agg_adj1 = np.zeros((n1 + n2, n1 + n2))
    agg_adj1[:n1, :n1] = adj1
    agg_adj1[n1:, n1:] = adj2

    print('{}: agg_adj1:shape:{}, type:{}'.format(fnm, agg_adj1.shape,
                                                  agg_adj1.dtype),
          flush=True)

    #
    # agg_adj2: adjacency matrix (2)
    #
    #    - upper-left  block: adjacency matrix inside the ligand
    #    - upper-right block: ligand-to-protein distances (rows = ligand atoms)
    #    - lower-left  block: protein-to-ligand distances (rows = protein atoms)
    #    - lower-right block: adjacency matrix inside the protein
    #
    agg_adj2 = np.copy(agg_adj1)

    print('{}: agg_adj2:shape:{}, type:{}'.format(fnm, agg_adj2.shape,
                                                  agg_adj2.dtype),
          flush=True)

    dm = distance_matrix(d1, d2)
    print('{}: dm:shape:{}, type:{}, min:{}, max:{}'.format(
        fnm, dm.shape, dm.dtype, dm.min(), dm.max()),
          flush=True)

    agg_adj2[:n1, n1:] = np.copy(dm)
    agg_adj2[n1:, :n1] = np.copy(np.transpose(dm))

    # node indices for aggregation: 1 for ligand atoms, 0 for protein atoms
    valid = np.zeros((n1 + n2, ))
    valid[:n1] = 1
    print('valid:{}, sum(valid):{}'.format(valid, sum(valid)))

    # class label: 1 when the key contains 'CHEMBL', otherwise 0
    Y = 1 if 'CHEMBL' in key else 0

    sample = {
        'H': H,
        'A1': agg_adj1,
        'A2': agg_adj2,
        'Y': Y,
        'V': valid,
        'key': key
    }
    return sample
Ejemplo n.º 19
0
def get_adj(mol):
    """Return the adjacency matrix of ``mol`` with the identity added (self-loops)."""
    n_atoms = mol.GetNumAtoms()
    adjacency = GetAdjacencyMatrix(mol)
    return adjacency + np.eye(n_atoms)
Ejemplo n.º 20
0
    def process(self):
        """Convert the raw QM9 files into one dictionary of tensors and save it.

        Reads ``self.raw_spatial_data`` (an ``.npz`` archive) and
        ``self.raw_qm9_file`` (a CSV with a ``smiles`` column) from
        ``self.qm9_directory``.  For every molecule it collects atom/bond
        features, COO edge indices, Laplacian eigen-decomposition results and
        the regression targets, then stores everything with ``torch.save`` as
        ``<qm9_directory>/processed/<self.processed_file>``.
        """
        print('processing data from ({}) and saving it to ({})'.format(self.qm9_directory,
                                                                       os.path.join(self.qm9_directory, 'processed')))

        # load qm9 data with spatial coordinates
        # NOTE: allow_pickle=True lets np.load unpickle objects; only use
        # trusted archives.
        data_qm9 = dict(np.load(os.path.join(self.qm9_directory, self.raw_spatial_data), allow_pickle=True))
        coordinates = torch.tensor(data_qm9['R'], dtype=torch.float)
        # Read the QM9 data with SMILES information
        molecules_df = pd.read_csv(os.path.join(self.qm9_directory, self.raw_qm9_file))

        # Running offsets: entry k is the number of atoms/edges before molecule k.
        atom_slices = [0]
        edge_slices = [0]
        total_eigvecs = []
        total_eigvals = []
        all_atom_features = []
        all_edge_features = []
        edge_indices = []  # edges of each molecule in coo format
        targets = []  # the 19 properties that should be predicted for the QM9 dataset
        total_atoms = 0
        total_edges = 0
        avg_degree = 0  # average degree in the dataset
        # go through all molecules in the npz file
        for mol_idx, n_atoms in tqdm(enumerate(data_qm9['N'])):
            # get the molecule using the smiles representation from the csv file
            mol = Chem.MolFromSmiles(molecules_df['smiles'][data_qm9['id'][mol_idx]])
            # add explicit hydrogens to the molecule because they are not in the smiles representation
            mol = Chem.AddHs(mol)

            atom_features_list = []
            for atom in mol.GetAtoms():
                atom_features_list.append(atom_to_feature_vector(atom))
            all_atom_features.append(torch.tensor(atom_features_list, dtype=torch.long))

            # Unweighted adjacency -> degree matrix D -> Laplacian L = D - A.
            adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
            max_freqs = 10
            adj = torch.tensor(adj).float()
            D = torch.diag(adj.sum(dim=0))
            L = D - adj
            # N is a 1-D vector of degree ** -0.5 (inf for any zero-degree atom).
            N = adj.sum(dim=0) ** -0.5
            # NOTE(review): `N * L * N` is an elementwise, broadcast product
            # (each column of L scaled twice by N), not the matrix product
            # D^-1/2 L D^-1/2 -- confirm this is intended.
            L_sym = torch.eye(n_atoms) - N * L * N
            # NOTE(review): torch.symeig is deprecated in newer PyTorch
            # releases in favour of torch.linalg.eigh -- check the pinned version.
            eig_vals, eig_vecs = torch.symeig(L_sym, eigenvectors=True)
            idx = eig_vals.argsort()[0: max_freqs]  # Keep up to the maximum desired number of frequencies
            eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

            # Sort, normalize and pad EigenVectors
            eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
            # L2-normalise along dim=1, i.e. each atom's row of eigenvector entries.
            eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
            # Molecules with fewer than max_freqs atoms are padded with NaN
            # so every molecule contributes max_freqs columns.
            if n_atoms < max_freqs:
                eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms), value=float('nan'))
                eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms), value=float('nan'))

            total_eigvecs.append(eig_vecs)
            total_eigvals.append(eig_vals.unsqueeze(0))

            edges_list = []
            edge_features_list = []
            for bond in mol.GetBonds():
                i = bond.GetBeginAtomIdx()
                j = bond.GetEndAtomIdx()
                edge_feature = bond_to_feature_vector(bond)

                # add edges in both directions
                edges_list.append((i, j))
                edge_features_list.append(edge_feature)
                edges_list.append((j, i))
                edge_features_list.append(edge_feature)
            # Graph connectivity in COO format with shape [2, num_edges]
            edge_index = torch.tensor(edges_list, dtype=torch.long).T
            edge_features = torch.tensor(edge_features_list, dtype=torch.long)

            # edges_list holds both directions, so len / 2 is the bond count;
            # this accumulates bonds per atom for the current molecule.
            avg_degree += (len(edges_list) / 2) / n_atoms

            # get all 19 attributes that should be predicted, so we drop the first two entries (name and smiles)
            target = torch.tensor(molecules_df.iloc[data_qm9['id'][mol_idx]][2:], dtype=torch.float)
            targets.append(target)
            edge_indices.append(edge_index)
            all_edge_features.append(edge_features)

            total_edges += len(edges_list)
            total_atoms += n_atoms
            edge_slices.append(total_edges)
            atom_slices.append(total_atoms)

        # convert targets to eV units: scale each column by the matching
        # value of self.unit_conversion (relies on the dict's value order)
        targets = torch.stack(targets) * torch.tensor(list(self.unit_conversion.values()))[None, :]
        data_dict = {'mol_id': data_qm9['id'],
                     'n_atoms': torch.tensor(data_qm9['N'], dtype=torch.long),
                     'atom_slices': torch.tensor(atom_slices, dtype=torch.long),
                     'edge_slices': torch.tensor(edge_slices, dtype=torch.long),
                     'eig_vecs': torch.cat(total_eigvecs).float(),
                     'eig_vals': torch.cat(total_eigvals).float(),
                     'edge_indices': torch.cat(edge_indices, dim=1),
                     'atom_features': torch.cat(all_atom_features, dim=0),
                     'edge_features': torch.cat(all_edge_features, dim=0),
                     'atomic_number_long': torch.tensor(data_qm9['Z'], dtype=torch.long)[:, None],
                     'coordinates': coordinates,
                     'targets': targets,
                     'avg_degree': avg_degree / len(data_qm9['id'])
                     }

        # Create the output directory on first use, then write the dictionary.
        if not os.path.exists(os.path.join(self.qm9_directory, 'processed')):
            os.mkdir(os.path.join(self.qm9_directory, 'processed'))
        torch.save(data_dict, os.path.join(self.qm9_directory, 'processed', self.processed_file))
Ejemplo n.º 21
0
    def createSignImg(self,
                      smi,
                      signature,
                      atomColor,
                      imgPath,
                      endHeight=None):
        """Locate the atoms of `signature` in `smi` and optionally draw them.

        The atoms carrying the signature are found in the compound's signature
        list, expanded by neighbour search once per signature height, and, if
        `imgPath` is given, highlighted in an image written to that path.

        Returns a 4-tuple (image path, mol string, atom indices, colors):
          - ("", "", [], []) when inputs are missing or the molecule /
            signature list cannot be built,
          - ("signatureNOTfound", "", [], []) when no atom has the signature,
          - ("", molStr, atoms, colors) when `imgPath` is empty or drawing
            failed,
          - (imgPath, molStr, atoms, colors) on success.

        NOTE(review): this is Python 2 code (print statements).
        """
        colors = []
        print "Creating signature image..."
        if not signature or not atomColor or not smi:
            print "Missing inputs:", str([smi, signature, atomColor])
            return "", "", [], []
        if hasattr(self.model, "specialType") and self.model.specialType == 1:
            # Create an Orange ExampleTable with a smiles attribute
            smilesAttr = orange.EnumVariable("SMILEStoPred", values=[smi])
            myDomain = orange.Domain([smilesAttr], 0)
            smilesData = dataUtilities.DataTable(myDomain, [[smi]])
            preCalcData = None
            startHeight = 0
            dataSign, cmpdSignDict, cmpdSignList, sdfStr = getSignatures.getSignatures(
                smilesData,
                startHeight,
                endHeight,
                preCalcData,
                returnAtomID=True)
            cmpdSignList = cmpdSignList[0]
            CLabDesc = []
            # create a mol file: copy the first SDF record (everything before
            # the "$$$$" separator) into a temporary file and into molStr
            tmpFile = miscUtilities.generateUniqueFile(desc="NN", ext="mol")
            file = open(tmpFile, "w")
            molStr = ""
            for line in sdfStr[0]:
                if "$$$$" in line:
                    break
                molStr += line
                file.write(line)
            file.close()
        else:
            CLabDesc, cmpdSignList, tmpFile, molStr = self.getClabDescSignList(
                smi, getMolFile=True)
        if not cmpdSignList or not tmpFile:
            print "Couldn't get the cmpd list or the mol file"
            return "", "", [], []
        # create an RDKit mol; if the first attempt fails, retry with the
        # second argument set to False
        mol = Chem.MolFromMolFile(tmpFile, True, False)
        if not mol:
            mol = Chem.MolFromMolFile(tmpFile, False, False)
        if not mol:
            # NOTE(review): tmpFile is not removed on this early return
            print "Could not create mol for: ", smi
            return "", "", [], []
        adj = GetAdjacencyMatrix(mol)
        # find the NN
        # Split the flat signature list into consecutive chunks of one entry
        # per atom; chunk index is used as the signature height below.
        hights = []
        for i in miscUtilities.Range(0, len(cmpdSignList), mol.GetNumAtoms()):
            hList = cmpdSignList[i:i + mol.GetNumAtoms()]
            if len(hList):
                hights.append(cmpdSignList[i:i + mol.GetNumAtoms()])

        # Take the first height at which the signature occurs and collect
        # every atom index carrying it.
        atoms = []
        hight = None
        for idx, h in enumerate(hights):
            if signature in h:
                for i, a in enumerate(h):
                    if a == signature:
                        atoms.append(i)
                hight = idx
                break
        if len(atoms) == 0:
            # NOTE(review): tmpFile is not removed on this early return
            print "ERROR: Could not find the atom for ", signature
            return "signatureNOTfound", "", [], []
        #print "IniAtoms: ",atoms
        # Expand the atom set `hight` times: each not-yet-visited atom adds
        # its neighbours (iterating over a copy because `atoms` grows).
        visitedAtoms = []
        for n in range(hight):
            for atom in copy.deepcopy(atoms):
                if atom not in visitedAtoms:
                    lNN = findNeighbors(atom, adj)
                    visitedAtoms.append(atom)
                    for lnn in lNN:
                        if lnn not in atoms:
                            atoms.append(lnn)
        atoms.sort()
        # NOTE(review): shells out to `rm`; os.remove(tmpFile) would avoid
        # the shell and work on non-POSIX systems.
        os.system("rm " + tmpFile)
        #Specify the atom colors
        colors = [atomColor] * len(atoms)

        if not imgPath:
            return "", molStr, atoms, colors
        try:
            #Draw the image
            # Every element defaults to black so only the highlight is colored.
            MolDrawing.elemDict = defaultdict(lambda: (0, 0, 0))
            Draw.MolToImageFile(mol,
                                imgPath,
                                size=(300, 300),
                                kekulize=True,
                                wedgeBonds=True,
                                highlightAtoms=atoms)
            #Color the highlighted atoms with the chosen atomColor.
            # Only using one color
            if atomColor == 'r':
                rgb = (255, 0, 0)
            elif atomColor == 'g':
                rgb = (0, 255, 0)
            else:
                rgb = (0, 0, 255)  #Blue

            # Recolor: every pure-red pixel (255, 0, 0) of the drawn image is
            # replaced by the chosen color at full opacity.
            img = Image.open(imgPath)
            img = img.convert("RGBA")
            pixdata = img.getdata()
            newData = list()
            for item in pixdata:
                if item[0] == 255 and item[1] == 0 and item[2] == 0:
                    newData.append(rgb + (255, ))
                else:
                    newData.append(item)
            img.putdata(newData)
            img.save(imgPath)

            if os.path.isfile(imgPath):
                return imgPath, molStr, atoms, colors
            else:
                return "", molStr, atoms, colors
        # NOTE(review): bare except hides every drawing/IO error, including
        # KeyboardInterrupt; the caller only sees an empty image path.
        except:
            return "", molStr, atoms, colors
    def process(self):
        """Featurize the molecules listed in ``summary_qm9.json`` and save them.

        Reads ``summary_qm9.json`` from ``self.directory``. Every entry whose
        ``pickle_path`` points to an existing file, and whose pickle contains
        the key ``'ensembleenergy'``, is processed; all other entries are
        skipped silently. For each processed molecule (taken from the first
        conformer's ``'rd_mol'``) the method collects:

        * atom and bond feature vectors and the COO edge index (every bond is
          stored in both directions),
        * eigenvalues/eigenvectors of a Laplacian-derived matrix, truncated to
          the 10 smallest eigenvalues and NaN-padded when the molecule has
          fewer than 10 atoms,
        * the positions of the first 10 conformers, concatenated along dim 1
          (the first conformer is repeated when fewer than 10 exist),
        * the seven ensemble targets read from the pickle.

        The per-molecule pieces are concatenated, with ``atom_slices`` and
        ``edge_slices`` holding the cumulative atom/edge counts, and the
        resulting dict is written with ``torch.save`` to
        ``<self.directory>/processed/<self.processed_file>``.

        Returns:
            None. The result is only written to disk.
        """
        processed_dir = os.path.join(self.directory, 'processed')
        print('processing data from ({}) and saving it to ({})'.format(
            self.directory, processed_dir))

        with open(os.path.join(self.directory, "summary_qm9.json"), "r") as f:
            summary = json.load(f)

        max_freqs = 10  # number of Laplacian eigen-pairs kept per molecule
        n_conformers = 10  # number of conformers stored per molecule

        # torch.symeig is gone from recent PyTorch releases; its documented
        # replacement is torch.linalg.eigh. symeig read the upper triangle by
        # default, hence UPLO='U' to keep the same numbers.
        if hasattr(torch, 'symeig'):

            def symmetric_eig(matrix):
                return torch.symeig(matrix, eigenvectors=True)
        else:

            def symmetric_eig(matrix):
                return torch.linalg.eigh(matrix, UPLO='U')

        atom_slices = [0]
        edge_slices = [0]
        total_eigvecs = []
        total_eigvals = []
        all_atom_features = []
        all_edge_features = []
        targets = {
            'ensembleenergy': [],
            'ensembleentropy': [],
            'ensemblefreeenergy': [],
            'lowestenergy': [],
            'poplowestpct': [],
            'temperature': [],
            'uniqueconfs': []
        }
        edge_indices = []  # edges of each molecule in coo format
        # NOTE(review): never populated below, so it is saved as an empty
        # tensor; kept so the saved dict has the same keys as before.
        atomic_number_long = []
        n_atoms_list = []

        coordinates = []
        smiles_list = []
        total_atoms = 0
        total_edges = 0
        avg_degree = 0  # average degree in the dataset
        for smiles, sub_dic in tqdm(list(summary.items())):
            pickle_path = os.path.join(self.directory,
                                       sub_dic.get("pickle_path", ""))
            if not os.path.isfile(pickle_path):
                continue
            # NOTE(review): pickle.load can execute arbitrary code; only run
            # this on dataset files from a trusted source.
            with open(pickle_path, 'rb') as pickle_file:
                mol_dict = pickle.load(pickle_file)
            if 'ensembleenergy' not in mol_dict:
                continue

            conformers = mol_dict['conformers']
            mol = conformers[0]['rd_mol']
            n_atoms = len(mol.GetAtoms())
            atom_features_list = [
                atom_to_feature_vector(atom) for atom in mol.GetAtoms()
            ]
            all_atom_features.append(
                torch.tensor(atom_features_list, dtype=torch.long))

            adj = GetAdjacencyMatrix(mol, useBO=False, force=True)
            adj = torch.tensor(adj).float()
            D = torch.diag(adj.sum(dim=0))
            L = D - adj
            # NOTE(review): N is 1-D, so `N * L * N` is an element-wise,
            # broadcast product (not a matrix product). Left exactly as it
            # was so that the stored eigen-features do not change.
            N = adj.sum(dim=0)**-0.5
            L_sym = torch.eye(n_atoms) - N * L * N
            try:
                eig_vals, eig_vecs = symmetric_eig(L_sym)
            except Exception:  # if we have disconnected components
                # Zero-degree atoms make N infinite; use degree 1 instead.
                deg = adj.sum(dim=0)
                deg[deg == 0] = 1
                N = deg**-0.5
                L_sym = torch.eye(n_atoms) - N * L * N
                eig_vals, eig_vecs = symmetric_eig(L_sym)

            # Keep up to the maximum desired number of frequencies
            idx = eig_vals.argsort()[0:max_freqs]
            eig_vals, eig_vecs = eig_vals[idx], eig_vecs[:, idx]

            # Sort, normalize and pad EigenVectors
            eig_vecs = eig_vecs[:, eig_vals.argsort()]  # increasing order
            eig_vecs = F.normalize(eig_vecs, p=2, dim=1, eps=1e-12, out=None)
            if n_atoms < max_freqs:
                eig_vecs = F.pad(eig_vecs, (0, max_freqs - n_atoms),
                                 value=float('nan'))
                eig_vals = F.pad(eig_vals, (0, max_freqs - n_atoms),
                                 value=float('nan'))

            total_eigvecs.append(eig_vecs)
            total_eigvals.append(eig_vals.unsqueeze(0))

            edges_list = []
            edge_features_list = []
            for bond in mol.GetBonds():
                i = bond.GetBeginAtomIdx()
                j = bond.GetEndAtomIdx()
                edge_feature = bond_to_feature_vector(bond)

                # add edges in both directions
                edges_list.append((i, j))
                edge_features_list.append(edge_feature)
                edges_list.append((j, i))
                edge_features_list.append(edge_feature)
            # Graph connectivity in COO format with shape [2, num_edges].
            # reshape(-1, 2) keeps the tensor 2-D for a bond-less molecule,
            # where `.T` on the 1-D empty tensor would be deprecated.
            edge_index = torch.tensor(edges_list,
                                      dtype=torch.long).reshape(-1, 2).T
            edge_features = torch.tensor(edge_features_list,
                                         dtype=torch.long)

            avg_degree += (len(edges_list) / 2) / n_atoms

            for target_name in targets:
                targets[target_name].append(mol_dict[target_name])

            positions = [
                torch.tensor(conformer['rd_mol'].GetConformer().GetPositions(),
                             dtype=torch.float)
                for conformer in conformers[:n_conformers]
            ]
            # if there are fewer conformers than requested, repeat the first
            if len(positions) < n_conformers:
                positions.extend([positions[0]] *
                                 (n_conformers - len(positions)))

            all_edge_features.append(edge_features)
            coordinates.append(torch.cat(positions, dim=1))
            edge_indices.append(edge_index)
            total_edges += len(edges_list)
            total_atoms += n_atoms
            smiles_list.append(smiles)
            edge_slices.append(total_edges)
            atom_slices.append(total_atoms)
            n_atoms_list.append(n_atoms)

        # One column tensor per target.
        for key, value in targets.items():
            targets[key] = torch.tensor(value)[:, None]
        data_dict = {
            'smiles':
            smiles_list,
            'n_atoms':
            torch.tensor(n_atoms_list, dtype=torch.long),
            'atom_slices':
            torch.tensor(atom_slices, dtype=torch.long),
            'edge_slices':
            torch.tensor(edge_slices, dtype=torch.long),
            'atom_features':
            torch.cat(all_atom_features, dim=0),
            'edge_features':
            torch.cat(all_edge_features, dim=0),
            'atomic_number_long':
            torch.tensor(atomic_number_long, dtype=torch.long),
            'edge_indices':
            torch.cat(edge_indices, dim=1),
            'coordinates':
            torch.cat(coordinates, dim=0).float(),
            'targets':
            targets,
            'avg_degree':
            avg_degree / len(n_atoms_list)
        }
        # Each target is additionally exposed as a top-level key.
        data_dict.update(targets)
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(processed_dir, exist_ok=True)
        torch.save(data_dict, os.path.join(processed_dir,
                                           self.processed_file))
Ejemplo n.º 23
0
 def get_adjacency_matrix(self):
     """
     Adjacency matrix of the molecular graph (defined atoms).

     The molecule is obtained from ``self.export_mol()`` and handed to
     ``GetAdjacencyMatrix``.

     :return: whatever ``GetAdjacencyMatrix`` returns for the exported molecule
     """
     exported_mol = self.export_mol()
     return GetAdjacencyMatrix(exported_mol)
Ejemplo n.º 24
0
    N = len(atoms)

    if N <= 2:
        continue

    for j, atom in enumerate(atoms):
        X[j] = atom_features(atom)

    for j in range(N):
        for k in range(N):
            bond = mol.GetBondBetweenAtoms(j, k)
            if bond is not None:
                E_idx.append([j, k])
                E_fea.append(bond_features(bond))

    A = GetAdjacencyMatrix(mol)

    Y = [outs[i]]

    # global properties
    g = nx.Graph()
    g.add_nodes_from(list(range(N)))
    g.add_edges_from(E_idx)

    if not nx.is_connected(g):
        print("{} is not connected".format(smile))
        continue

    for j in range(N):
        for k in range(N):
            if j == k:
Ejemplo n.º 25
0
    def __getitem__(self, idx):
        """Build the fixed-size sample for the pair stored under ``self.keys[idx]``.

        The pickle at ``<self.data_dir>/<key>`` holds two molecules: ``m1``
        (ligand) and ``m2`` (protein). For each one the adjacency matrix with
        self-loops is zero-padded, or cropped, to ``N_PADDED_LIGAND`` /
        ``N_PADDED_PROTEIN``, and atom features come from ``get_atom_feature``.

        Returns a dict with:
            'H'   -- ligand and protein atom features concatenated on axis 0
            'A1'  -- block-diagonal matrix of the two padded adjacency matrices
            'A2'  -- copy of 'A1' whose off-diagonal blocks hold the padded
                     ligand-protein distance matrix (and its transpose)
            'Y'   -- 1 if the key contains 'CHEMBL', otherwise 0
            'V'   -- vector with 1 on the first ``N_PADDED_LIGAND`` entries
            'key' -- the key itself

        Side effects: increments ``self.n_queried`` and ``self.n_file_opened``,
        tracks the largest atom counts seen in ``self.n_max_n1``,
        ``self.n_max_n2`` and ``self.n_max_adj``, and prints debug info while
        ``self.proc_info_printed`` is false (it is set to True before
        returning).
        """
        fnm = __class__.__name__ + '.' + whoami()
        self.n_queried += 1

        key = self.keys[idx]
        data_file_path = os.path.join(self.data_dir, key)
        with open(data_file_path, 'rb') as handle:
            m1, m2 = pickle.load(handle)
            self.n_file_opened += 1

        if not self.proc_info_printed:
            print('{}: data_file_path:{}, type(m1):{}, type(m2):{}'.format(
                fnm, data_file_path, type(m1), type(m2)))

        def featurize(mol, pad_to, is_ligand):
            # Atom count, coordinates, (pad_to x pad_to) adjacency and
            # atom features of one molecule.
            n_atoms = mol.GetNumAtoms()
            # Original note: GetConformers() returns only a single Conformer
            # object here, so the first one is used.
            coords = np.array(mol.GetConformers()[0].GetPositions())
            with_self_loops = GetAdjacencyMatrix(mol) + np.eye(n_atoms)
            if n_atoms > pad_to:
                # Too large: keep only the leading pad_to atoms.
                fitted = with_self_loops[:pad_to, :pad_to]
            else:
                fitted = np.zeros((pad_to, pad_to), dtype=np.float64)
                fitted[:n_atoms, :n_atoms] = with_self_loops
            features = get_atom_feature(mol, n_atoms, is_ligand)
            return n_atoms, coords, fitted, features

        # ligand first, then protein
        n1, d1, adj1, H1 = featurize(m1, N_PADDED_LIGAND, True)
        n2, d2, adj2, H2 = featurize(m2, N_PADDED_PROTEIN, False)

        # aggregation
        H = np.concatenate([H1, H2], axis=0)

        # The slice assignments below only fit if N_PADDED_ALL covers the
        # ligand block plus the protein block.
        agg_adj1 = np.zeros((N_PADDED_ALL, N_PADDED_ALL))
        agg_adj1[:N_PADDED_LIGAND, :N_PADDED_LIGAND] = adj1
        agg_adj1[N_PADDED_LIGAND:, N_PADDED_LIGAND:] = adj2
        agg_adj2 = agg_adj1.copy()

        # Original note (2020-03-27): for convenience a matrix of the assumed
        # maximum size is allocated; entries without distance information
        # were once set to a far distance (100.0) and are now simply 0.
        pair_dist = distance_matrix(d1, d2)
        dm_padded = np.zeros((N_PADDED_LIGAND_MAX, N_PADDED_PROTEIN_MAX),
                             dtype=np.float64)
        dm_padded[:n1, :n2] = pair_dist
        dm = dm_padded[:N_PADDED_LIGAND, :N_PADDED_PROTEIN]

        agg_adj2[:N_PADDED_LIGAND, N_PADDED_LIGAND:] = dm
        agg_adj2[N_PADDED_LIGAND:, :N_PADDED_LIGAND] = dm.T

        # node indices for aggregation
        valid = np.zeros((N_PADDED_ALL, ))
        valid[:N_PADDED_LIGAND] = 1

        # pIC50 to class
        Y = int('CHEMBL' in key)

        sample = {
            'H': H,
            'A1': agg_adj1,
            'A2': agg_adj2,
            'Y': Y,
            'V': valid,
            'key': key,
        }

        # running maxima of the atom counts seen so far
        self.n_max_n1 = max(self.n_max_n1, n1)
        self.n_max_n2 = max(self.n_max_n2, n2)
        self.n_max_adj = max(self.n_max_adj, n1 + n2)

        if not self.proc_info_printed:
            print(
                '{}: n1:{}, n2:{}, H.shape:{}, adj1.shape:{}, adj2.shape:{}, type(Y):{}, type(valid):{}, type(key):{}:{}'
                .format(fnm, n1, n2, H.shape, adj1.shape, adj2.shape, type(Y),
                        type(valid), type(key), key))

        self.proc_info_printed = True

        return sample