Esempio n. 1
0
def pair_features(mol, idx1, idx2, max_distance=7):
    """Build the feature vector describing the atom pair (idx1, idx2) of ``mol``.

    Layout of the returned 1-D array (length ``6 + 1 + max_distance``):

    * ``[0:6]`` -- bond flags, left at zero when the atoms are not bonded:
      single, double, triple, aromatic, conjugated, bond-in-ring.
    * ``[6]``   -- 1 when the two distinct atoms appear together in a ring.
    * ``[7:]``  -- graph distance between the atoms encoded by
      ``to_categorical`` (presumably the Keras one-hot helper); every
      distance >= ``max_distance`` is folded into the last class.
    """
    n_bond_slots = 6
    vec = np.zeros(n_bond_slots + 1 + max_distance)

    # Bond descriptors (only when a bond actually joins the two atoms).
    link = mol.GetBondBetweenAtoms(idx1, idx2)
    if link is not None:
        kind = link.GetBondType()
        flags = [
            kind == Chem.rdchem.BondType.SINGLE,
            kind == Chem.rdchem.BondType.DOUBLE,
            kind == Chem.rdchem.BondType.TRIPLE,
            kind == Chem.rdchem.BondType.AROMATIC,
            link.GetIsConjugated(),
            link.IsInRing(),
        ]
        vec[:n_bond_slots] = np.array(flags, dtype=int)

    # Shared ring membership; an atom paired with itself never counts.
    atom_rings = mol.GetRingInfo().AtomRings()
    if idx1 != idx2 and any(idx1 in members and idx2 in members
                            for members in atom_rings):
        vec[n_bond_slots] = 1

    # Graph distance, clipped into the last class before encoding.
    hops = rdmolops.GetDistanceMatrix(mol)[idx1][idx2]
    bucket = hops if hops < max_distance else max_distance - 1
    vec[n_bond_slots + 1:] = to_categorical(bucket, num_classes=max_distance)

    return vec
Esempio n. 2
0
def get_molecules():
    """
    Build an rdkit mol object for each of the first C.N_MOLS .xyz files.

    Six dictionaries are returned, every one keyed by 'mol_name':
        - rdkit mol objects
        - mol ids (unique numerical ids)
        - molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices.
    """
    molecules, ids, features = {}, {}, {}
    coords, euclid_dists, graph_dists = {}, {}, {}
    print('Create molecules and distance matrices.')
    for i in range(C.N_MOLS):
        print_progress(i, C.N_MOLS)
        filepath = xyz_filepath_list[i]
        file_name = filepath.split('/')[-1]
        mol_name = file_name[:-4]  # strip the last four characters ('.xyz')

        # Read the XYZ file: structure, coordinates and distance matrix.
        mol, xyz, dist_matrix = mol_from_xyz(filepath)
        molecules[mol_name] = mol
        coords[mol_name] = xyz
        euclid_dists[mol_name] = dist_matrix
        # The position of the molecule in the dataset doubles as its id.
        ids[mol_name] = i

        # Graph distance matrix (from rdmolops.GetDistanceMatrix), padded
        # with zero columns up to C.MAX_N_ATOMS and stored as a dataframe.
        n_atoms = len(xyz)
        topo = rdmolops.GetDistanceMatrix(mol)
        col_padding = [(0, 0), (0, C.MAX_N_ATOMS - n_atoms)]
        graph_frame = pd.DataFrame(np.pad(topo, col_padding, 'constant'))
        # One id per row, e.g. CH4 -> [0, 0, 0, 0, 0].
        graph_frame['molecule_id'] = [i] * n_atoms
        graph_dists[mol_name] = graph_frame

        # Molecule level features.
        adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
        # Atomic numbers are re-read from the XYZ file.
        atomic_num_list, _, _ = read_xyz_file(filepath)
        # The lower triangle of the adjacency matrix selects each bonded
        # pair once; keep the euclidean distance of those pairs.
        flat_dists = dist_matrix.ravel()
        bonded = np.tril(adj_matrix).ravel() == 1
        bond_lengths = flat_dists[bonded]
        # Mean / std of bonded distances and mean atomic number.
        summary = [
            np.mean(bond_lengths),
            np.std(bond_lengths),
            np.mean(atomic_num_list),
        ]
        features[mol_name] = pd.Series(summary, index=mol_feat_columns)
    return molecules, ids, features, coords, euclid_dists, graph_dists
Esempio n. 3
0
def get_molecules():
    """
    Create rdkit mol objects from the .xyz files, together with:
        - mol ids (unique numerical ids)
        - a set of molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices.
    Everything is returned as dictionaries keyed by 'mol_name', in the order
    mols, ids, features, coordinates, distance matrices, graph distances.
    """
    by_name_mol, by_name_id, by_name_feats = {}, {}, {}
    by_name_xyz, by_name_dist, by_name_graph = {}, {}, {}
    print('Create molecules and distance matrices.')
    for i in range(C.N_MOLS):
        print_progress(i, C.N_MOLS)
        filepath = xyz_filepath_list[i]
        # Last path component without its final four characters.
        mol_name = filepath.split('/')[-1][:-4]
        mol, xyz, dist_matrix = mol_from_xyz(filepath)

        # Graph distances, right-padded with zero columns to C.MAX_N_ATOMS
        # and tagged with the molecule id on every row.
        n_atoms = len(xyz)
        padded = np.pad(
            rdmolops.GetDistanceMatrix(mol),
            [(0, 0), (0, C.MAX_N_ATOMS - n_atoms)],
            'constant',
        )
        graph_df = pd.DataFrame(padded)
        graph_df['molecule_id'] = [i for _ in range(n_atoms)]

        # Molecule level features: statistics over the euclidean distances
        # of bonded pairs (lower triangle of the adjacency matrix) plus the
        # mean atomic number.
        adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
        atomic_num_list, _, _ = read_xyz_file(filepath)
        is_bonded = np.tril(adj_matrix).ravel() == 1
        dists = dist_matrix.ravel()[is_bonded]
        feats = pd.Series(
            [np.mean(dists), np.std(dists), np.mean(atomic_num_list)],
            index=mol_feat_columns,
        )

        # Store everything computed for this molecule.
        by_name_mol[mol_name] = mol
        by_name_xyz[mol_name] = xyz
        by_name_dist[mol_name] = dist_matrix
        by_name_id[mol_name] = i
        by_name_graph[mol_name] = graph_df
        by_name_feats[mol_name] = feats

    return (by_name_mol, by_name_id, by_name_feats, by_name_xyz,
            by_name_dist, by_name_graph)
Esempio n. 4
0
def _parse_smiles_leniently(smi):
    """Parse ``smi`` with RDKit, retrying without sanitization on failure.

    Returns the molecule, or None when both attempts fail.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        return mol
    print("Warning - Could not parse SMILES for", smi,
          "\nEvaluating with looser constrictions.")
    mol = Chem.MolFromSmiles(smi, sanitize=False)
    if mol is None:
        return None
    # Only touch the molecule once the lenient parse is known to have worked.
    mol.UpdatePropertyCache(strict=False)
    return mol


def _atomic_volumes(mol, kind):
    """Return the per-atom contributions of ``mol`` for lower-cased ``kind``."""
    if kind == 'crippen':
        # Crippen's atomic contributions to molar refractivity are used as
        # volumes; the logP contributions returned alongside are discarded.
        molH = Chem.AddHs(mol)
        contribs = Crippen.rdMolDescriptors._CalcCrippenContribs(molH)
        _, mrs = zip(*contribs)
        # crippenHContribs is expected to fold the H contributions onto the
        # attached heavy atom (per the original note) -- helper not shown here.
        _, condensed = crippenHContribs(molH, mrs)
        return condensed
    if kind == 'mcgowan':
        return mcgowanHContribs(Chem.AddHs(mol))
    if kind == 'degree':
        return degreeContribs(mol)
    raise ValueError(
        "Unknown voltype %r: expected 'crippen', 'mcgowan' or 'degree'." %
        (kind,))


def _layer_sums(dist_from_base, vols, fg_atoms, base_id, max_path_length):
    """Sum ``vols`` over the atoms found at each distance 0..max_path_length-1.

    Atoms of the matched functional group are left out, except the base atom
    itself, which is counted in layer 0.
    """
    excluded = set(fg_atoms)
    sums = []
    for level in range(max_path_length):
        total = 0
        for at, dist in enumerate(dist_from_base):
            if dist == level and at not in excluded:
                total += vols[at]
            elif at == base_id and level == 0:
                total += vols[at]
        sums.append(total)
    return sums


def mol_to_vec(smifile, shared_fg, voltype, max_path_length, verbose=False):
    """Sum atomic contributions in graph "layers" around a shared group.

    For every structure listed in ``smifile`` the atoms are binned by their
    topological distance from the first atom matched by ``shared_fg`` and the
    per-atom contributions of each bin are added up.

    Arguments:
        smifile: path of a text file with one SMILES string per line,
            optionally followed by a property value. Blank lines are skipped.
        shared_fg: pattern shared by all structures, used as the origin; it
            is tried as given and, if not found, again in lower case.
        voltype: contribution type: "crippen", "mcgowan" or "degree"
            (case-insensitive).
        max_path_length: number of layers (distance 0 .. max_path_length - 1).
        verbose: print extra information to the terminal.

    Returns:
        pandas DataFrame with one row per non-blank input line and columns
        "<layer>_<voltype>" (e.g. "0_crippen"), "Structure" (the SMILES
        string) and, when at least one line carried a property, "Property"
        (None where a line had none). Structures that cannot be parsed or
        that lack the functional group get a row of NaN layer values.

    Raises:
        SystemExit: when ``shared_fg`` is empty/False.
        ValueError: when ``voltype`` is not a supported type (raised while
            processing the first usable structure).
    """
    kind = voltype.lower()
    columns = [str(col) + '_' + kind for col in range(max_path_length)]
    if not shared_fg:
        sys.exit(
            "Please specify a common functional group pattern as a SMILES string using the --fg argument. Exiting."
        )

    mollist, y_val, vec_df = [], [], []
    has_property = False

    with open(smifile) as handle:
        lines = handle.readlines()

    for line in lines:
        toks = line.split()
        if not toks:
            # A blank line carries no structure; never reuse the previous one.
            continue

        # Each line is a SMILES string, optionally followed by a property.
        smi = toks[0]
        if len(toks) > 1:
            y_val.append(toks[1])
            has_property = True
        else:
            # Placeholder keeps the property column aligned with the rows.
            y_val.append(None)
        mollist.append(smi)

        mol = _parse_smiles_leniently(smi)
        if mol is None:
            print("Parsing Failed. Skipping this structure")
            vec_df.append([np.nan] * max_path_length)
            continue

        mat = rdmolops.GetDistanceMatrix(mol)

        # The origin is the first atom of the functional group match.
        patt = Chem.MolFromSmarts(shared_fg)
        fg_atoms = mol.GetSubstructMatch(patt)
        if len(fg_atoms) == 0:
            print("ERR: Functional group", shared_fg, "not found in molecule:",
                  smi)
            lower_fg = shared_fg.lower()
            patt = Chem.MolFromSmarts(lower_fg)
            fg_atoms = mol.GetSubstructMatch(patt)
            if len(fg_atoms) == 0:
                print(
                    "Parsing functional group from this structure failed. Skipping this structure"
                )
                vec_df.append([np.nan] * max_path_length)
                continue
            if verbose:
                print("Found by parsing functional group as", lower_fg)

        base_id = fg_atoms[0]
        vols = _atomic_volumes(mol, kind)

        # One summed contribution per successive graph layer.
        vec_df.append(
            _layer_sums(mat[base_id], vols, fg_atoms, base_id,
                        max_path_length))

    vec_df = pd.DataFrame(vec_df, columns=columns)
    vec_df['Structure'] = np.array(mollist)
    if has_property:
        vec_df['Property'] = np.array(y_val)
    return vec_df
Esempio n. 5
0
def add_sample(url, features, target, As, sizes, num_molecules):
    """Read one extended .xyz file and append its molecular graph to the data.

    The file is split into whitespace-separated tokens: tokens 6..17 are read
    as the molecular properties and the fourth-from-last token as the SMILES
    string from which the RDKit molecule (heavy atoms only) is built.

    Arguments:
        url: path of the .xyz file.
        features: list of per-atom feature rows; extended in place with one
            11-element row per atom.
        target: list of target rows; one row (heat capacity) is appended.
        As: sequence of three lists receiving, as scipy COO matrices, the
            topological distance matrix, the bond-order matrix (10 on the
            diagonal) and the aromatic-bond matrix.
        sizes: list of molecule sizes; a new list with one more entry is
            returned (the argument itself is not modified).
        num_molecules: number of molecules added so far.

    Returns:
        (features, target, As, sizes, num_molecules) after the update. If
        anything goes wrong the exception message is printed and the
        arguments are returned as received; the containers are only modified
        once all parsing and feature extraction has succeeded.
    """
    try:
        properties = []
        with open(url, 'r') as handle:
            for row in handle:
                properties += row.split()

        SMILES = properties[-4]
        m = Chem.MolFromSmiles(SMILES)
        # m = Chem.AddHs(m)    # uncomment to include hydrogen atoms

        vertices = m.GetAtoms()
        d = len(vertices)

        rdPartialCharges.ComputeGasteigerCharges(m)

        # Targets: tokens 6..17, in this order. All of them are converted so
        # that a malformed property line rejects the whole file.
        target_names = ('dipole_moment', 'polarizability', 'homo', 'lumo',
                        'gap', 'r2', 'zpve', 'U0', 'internal_energy',
                        'enthalpy', 'free_nrg', 'heat_capacity')
        targets = {
            name: float(properties[6 + offset])
            for offset, name in enumerate(target_names)
        }

        # This is where you decide which properties you would like to predict
        # (use [targets[name] for name in target_names] to predict them all).
        target_row = [targets['heat_capacity']]

        # Graph matrices: topological distances, bond orders, aromaticity.
        tempA = rdmolops.GetDistanceMatrix(m)
        tempBO = np.zeros((d, d))
        tempAR = np.zeros((d, d))
        tempfeatures = []

        for atom in vertices:
            v_i = atom.GetIdx()
            atomic_num = atom.GetAtomicNum()

            # Arbitrarily large number to indicate connectivity to self.
            tempBO[v_i][v_i] = 10
            for neighbor in atom.GetNeighbors():
                v_j = neighbor.GetIdx()
                bond = m.GetBondBetweenAtoms(v_i, v_j)
                tempBO[v_i][v_j] = bond.GetBondTypeAsDouble()
                tempAR[v_i][v_j] = bond.GetIsAromatic()

            tempfeatures.append((v_i, [
                atomic_num,
                int(atomic_num == 1),  # H
                int(atomic_num == 6),  # C
                int(atomic_num == 7),  # N
                int(atomic_num == 8),  # O
                int(atomic_num == 9),  # F
                int(atom.GetIsAromatic()),
                int(atom.GetHybridization()),
                atom.GetDegree(),
                atom.GetTotalValence(),
                atom.GetDoubleProp("_GasteigerCharge"),
            ]))

        # Rows are stored by atom index, independent of iteration order.
        feature_rows = [[0] * 11 for _ in range(d)]
        for v_i, row in tempfeatures:
            feature_rows[v_i] = row

        matrices = [sp.coo_matrix(mat) for mat in (tempA, tempBO, tempAR)]

        # Everything succeeded: only now modify the caller's containers.
        target.append(target_row)
        for k, matrix in enumerate(matrices):
            As[k].append(matrix)

        # NOTE(review): the very first molecule is recorded as d - 1; this
        # looks like an index offset expected by the consumer -- confirm.
        if num_molecules == 0:
            sizes = sizes + [d - 1]
        else:
            sizes = sizes + [d]
        num_molecules = num_molecules + 1

        if num_molecules % 5000 == 0:
            print("On the " + str(num_molecules) + "th molecule")

        features += feature_rows
        return features, target, As, sizes, num_molecules
    except Exception as e:
        # Best effort: report the problem and skip this file.
        print(str(e))
        return features, target, As, sizes, num_molecules
Esempio n. 6
0
def add_sample(url, features, target, As, sizes, num_molecules, elements_info):
    """Read one extended .xyz file and append its molecular graph to the data.

    The file is split into whitespace-separated tokens: tokens 6..17 are read
    as the molecular properties and the fourth-from-last token as the SMILES
    string from which the RDKit molecule (with explicit hydrogens) is built.

    Arguments:
        url: path of the .xyz file.
        features: list of per-atom feature rows; extended in place with one
            8-element row per atom (standardised atomic number, H/C/N/O/F
            indicators, aromatic flag, hybridization code).
        target: list of target rows; one row (heat capacity) is appended.
        As: sequence of three lists receiving, as scipy COO matrices, the
            topological distance matrix, the bond-order matrix (10 on the
            diagonal) and the aromatic-bond matrix.
        sizes: list of molecule sizes; a new list with one more entry is
            returned (the argument itself is not modified).
        num_molecules: number of molecules added so far.
        elements_info: indexable whose items 1 and 2 are the mean and the
            standard deviation used to standardise the atomic number.

    Returns:
        (features, target, As, sizes, num_molecules) after the update.

    Raises:
        Any exception from reading or parsing the file propagates to the
        caller; the containers are only modified once all parsing and
        feature extraction has succeeded.
    """
    properties = []
    with open(url, 'r') as handle:
        for row in handle:
            properties += row.split()

    SMILES = properties[-4]
    m = Chem.MolFromSmiles(SMILES)
    m = Chem.AddHs(m)

    vertices = m.GetAtoms()
    d = len(vertices)
    atomic_number_mean = elements_info[1]
    atomic_number_stdev = elements_info[2]

    # Targets: tokens 6..17, in this order. All of them are converted so that
    # a malformed property line is rejected.
    target_names = ('dipole_moment', 'polarizability', 'homo', 'lumo', 'gap',
                    'r2', 'zpve', 'U0', 'internal_energy', 'enthalpy',
                    'free_nrg', 'heat_capacity')
    targets = {
        name: float(properties[6 + offset])
        for offset, name in enumerate(target_names)
    }

    # Only the heat capacity is predicted
    # (use [targets[name] for name in target_names] to predict them all).
    target_row = [targets['heat_capacity']]

    # Graph matrices: topological distances, bond orders, aromaticity.
    tempA = rdmolops.GetDistanceMatrix(m)
    tempBO = np.zeros((d, d))
    tempAR = np.zeros((d, d))
    f = 8  # number of features per atom
    tempfeatures = [[0] * f for _ in range(d)]

    for atom in vertices:
        v_i = atom.GetIdx()
        atomic_num = atom.GetAtomicNum()

        # Arbitrarily large number to indicate connectivity to self.
        tempBO[v_i][v_i] = 10
        for neighbor in atom.GetNeighbors():
            v_j = neighbor.GetIdx()
            bond = m.GetBondBetweenAtoms(v_i, v_j)
            tempBO[v_i][v_j] = bond.GetBondTypeAsDouble()
            tempAR[v_i][v_j] = bond.GetIsAromatic()

        tempfeatures[v_i] = [
            (atomic_num - atomic_number_mean) / atomic_number_stdev,
            int(atomic_num == 1),  # H
            int(atomic_num == 6),  # C
            int(atomic_num == 7),  # N
            int(atomic_num == 8),  # O
            int(atomic_num == 9),  # F
            int(atom.GetIsAromatic()),
            int(atom.GetHybridization()),
        ]

    matrices = [sp.coo_matrix(mat) for mat in (tempA, tempBO, tempAR)]

    # Everything succeeded: only now modify the caller's containers.
    target.append(target_row)
    for k, matrix in enumerate(matrices):
        As[k].append(matrix)

    # NOTE(review): the very first molecule is recorded as d - 1; this looks
    # like an index offset expected by the consumer -- confirm.
    if num_molecules == 0:
        sizes = sizes + [d - 1]
    else:
        sizes = sizes + [d]
    num_molecules = num_molecules + 1

    if num_molecules % 5000 == 0:
        print("On the " + str(num_molecules) + "th molecule")

    features += tempfeatures
    return features, target, As, sizes, num_molecules
Esempio n. 7
0
def distance_matrix(mol):
    """Return the topological distance matrix of ``mol``.

    Thin wrapper around ``rdmolops.GetDistanceMatrix``.
    """
    topological = rdmolops.GetDistanceMatrix(mol)
    return topological