def test_adjacency_matrix(cell_size, threshold, periodic):
    """
    Check that the adjacency matrix produced by a cell list equals
    the one derived from a brute-force all-against-all distance
    matrix.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    if periodic:
        # Orthorhombic box spanning the coordinate extent
        # of the structure
        upper_bound = np.max(atoms.coord, axis=-2)
        lower_bound = np.min(atoms.coord, axis=-2)
        atoms.box = np.diag(upper_bound - lower_bound)

    test_matrix = struc.CellList(
        atoms, cell_size=cell_size, periodic=periodic
    ).create_adjacency_matrix(threshold)

    # Reference: explicit distance between every ordered atom pair
    # Convert to float64 to avoid an erroneous warning
    # https://github.com/ContinuumIO/anaconda-issues/issues/9129
    atoms.coord = atoms.coord.astype(np.float64)
    n_atoms = atoms.array_length()
    atom_indices = np.arange(n_atoms)
    index_pairs = np.stack(
        [np.repeat(atom_indices, n_atoms), np.tile(atom_indices, n_atoms)],
        axis=-1
    )
    pair_distances = struc.index_distance(atoms, index_pairs, periodic)
    distance_matrix = np.reshape(pair_distances, (n_atoms, n_atoms))
    ref_matrix = (distance_matrix <= threshold)

    # Both construction routes must agree element by element
    assert np.array_equal(test_matrix, ref_matrix)
def test_get_atoms(cell_size):
    """
    Exercise `CellList.get_atoms()` on five atoms placed on the z-axis
    at z = 0..4, where the expected neighbors are known by hand.
    """
    atoms = struc.AtomArray(length=5)
    atoms.coord = np.array([[0, 0, z] for z in range(5)])
    cell_list = struc.CellList(atoms, cell_size=cell_size)

    # Single position, single radius: (position, radius, expected)
    single_cases = (
        ([0, 0, 0.1], 1, [0, 1]),
        ([0, 0, 1.1], 1, [1, 2]),
        ([0, 0, 1.1], 2, [0, 1, 2, 3]),
    )
    for position, radius, expected in single_cases:
        found = cell_list.get_atoms(np.array(position), radius)
        assert found.tolist() == expected

    positions = np.array([[0, 0, 0.1], [0, 0, 1.1], [0, 0, 4.1]])

    # Multiple positions, one shared radius;
    # entries equal to -1 are stripped before comparison
    found = cell_list.get_atoms(positions, 2)
    assert found[found != -1].tolist() == [0, 1, 2, 0, 1, 2, 3, 3, 4]

    # Multiple positions, one radius per position
    radii = np.array([1.0, 2.0, 3.0])
    found = cell_list.get_atoms(positions, radii)
    assert found[found != -1].tolist() == [0, 1, 0, 1, 2, 3, 2, 3, 4]
def __getitem__(self, index):
    """
    Load the structure file at position `index` and return the
    adjacency matrix of its CA atoms.

    Parameters
    ----------
    index : int
        Position in `self.files` of the file to load from
        `self.fileDir`.

    Returns
    -------
    adj_matrix : ndarray, dtype=float64
        Square matrix with 1.0 where two CA atoms are adjacent
        according to `CellList.create_adjacency_matrix(self.threshold)`
        and 0.0 elsewhere.
        If the number of CA atoms is odd, one all-zero row and column
        are appended, so both dimensions are even.
    """
    path = os.path.join(self.fileDir, self.files[index])
    print(path)
    array = strucio.load_structure(path)
    # Multi-model files: only the first model is used.
    # isinstance() also accepts subclasses of AtomArrayStack.
    if isinstance(array, biotite.structure.AtomArrayStack):
        array = array[0]

    ca = array[array.atom_name == "CA"]
    cell_list = struc.CellList(ca, cell_size=self.threshold)
    adj_matrix = cell_list.create_adjacency_matrix(
        self.threshold).astype(int)

    if adj_matrix.shape[0] % 2 != 0:
        print(adj_matrix.shape)
        # Pad with one zero row at the bottom and one zero column on
        # the right.
        # NOTE(review): presumably the consumer requires even
        # dimensions -- confirm against the caller.
        adj_matrix = np.pad(adj_matrix, ((0, 1), (0, 1)), mode="constant")
        print(adj_matrix.shape)

    return adj_matrix.astype('double')
def test_outside_location():
    """
    Query a position that lies far outside of every cell and expect
    that no atom is returned.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    amino_acid_mask = struc.filter_amino_acids(atoms)
    atoms = atoms[amino_acid_mask]
    cell_list = struc.CellList(atoms, cell_size=5)

    # 100 below the minimum coordinate in each dimension
    far_away = np.min(atoms.coord, axis=0) - 100
    found = cell_list.get_atoms(far_away, 5)
    # Expect empty array
    assert len(found) == 0
def detect_disulfide_bonds(structure, distance=2.05, distance_tol=0.05,
                           dihedral=90, dihedral_tol=10):
    """
    Find pairs of cysteine S-gamma atoms that satisfy a distance and
    a dihedral angle criterion.

    Parameters
    ----------
    structure : AtomArray
        The structure to search.
    distance, distance_tol : float, optional
        An S-gamma pair qualifies if its distance lies strictly within
        ``distance - distance_tol`` and ``distance + distance_tol``.
    dihedral, dihedral_tol : float, optional
        An S-gamma pair qualifies if the absolute CB-SG-SG-CB dihedral
        angle (in degrees) lies strictly within
        ``dihedral - dihedral_tol`` and ``dihedral + dihedral_tol``.

    Returns
    -------
    bonds : ndarray, dtype=int
        One row per detected bond, containing the two atom indices
        with the lower index first. Each bond appears only once.
    """
    def beta_carbon(sg_atom):
        # C-beta atom(s) sharing chain and residue ID with 'sg_atom'
        return structure[
            (structure.chain_id == sg_atom.chain_id)
            & (structure.res_id == sg_atom.res_id)
            & (structure.atom_name == "CB")
        ]

    # Mask selecting only the S-gamma atoms of cysteines
    sg_mask = (structure.res_name == "CYS") & (structure.atom_name == "SG")
    # The cell list restricts the candidate search to nearby S-gamma
    # atoms
    cell_list = struc.CellList(
        structure, cell_size=distance + distance_tol, selection=sg_mask
    )

    found_bonds = []
    for sulfide_i in np.where(sg_mask)[0]:
        # 'get_atoms_in_cells()' is used instead of 'get_atoms()',
        # because the exact distance is measured below anyway
        candidates = cell_list.get_atoms_in_cells(
            coord=structure.coord[sulfide_i]
        )
        for sulfide_j in candidates:
            if sulfide_i == sulfide_j:
                # A sulfur atom cannot bond with itself
                continue
            sg1 = structure[sulfide_i]
            sg2 = structure[sulfide_j]
            # The dihedral angle additionally requires the
            # corresponding C-beta atoms
            cb1 = beta_carbon(sg1)
            cb2 = beta_carbon(sg2)

            bond_dist = struc.distance(sg1, sg2)
            bond_dihed = np.abs(
                np.rad2deg(struc.dihedral(cb1, sg1, sg2, cb2))
            )
            if not (bond_dist > distance - distance_tol
                    and bond_dist < distance + distance_tol):
                continue
            if not (bond_dihed > dihedral - dihedral_tol
                    and bond_dihed < dihedral + dihedral_tol):
                continue

            # Criteria met: store the pair with the lower index first
            # and skip pairs that are already recorded
            pair = sorted((sulfide_i, sulfide_j))
            if pair not in found_bonds:
                found_bonds.append(pair)
    return np.array(found_bonds, dtype=int)
def find_leaflets(structure, head_atom_mask,
                  cutoff_distance=15.0, periodic=False):
    """
    Group lipid molecules into lipid bilayer leaflets.

    Head group atoms closer than `cutoff_distance` are connected in a
    graph; each connected component with more than one member is
    reported as one leaflet.

    Parameters
    ----------
    structure : AtomArray, shape=(n,)
        The structure containing the membrane.
        May also include other molecules, e.g. water or an embedded
        protein.
    head_atom_mask : ndarray, dtype=bool, shape=(n,)
        A boolean mask that selects atoms from `structure` that
        represent lipid head groups.
    cutoff_distance : float, optional
        When the distance of two head groups is larger than this
        value, they are not (directly) connected in the same leaflet.
    periodic : bool, optional
        If true, periodic boundary conditions are considered.
        This requires that `structure` has an associated `box`.

    Returns
    -------
    leaflets : ndarray, dtype=bool, shape=(m,n)
        Multiple boolean masks, one for each identified leaflet.
        Each mask indicates which atoms of the input `structure`
        are in the leaflet.
    """
    contact_matrix = struc.CellList(
        structure, cell_size=cutoff_distance,
        selection=head_atom_mask, periodic=periodic
    ).create_adjacency_matrix(cutoff_distance)

    head_leaflets = []
    for component in nx.connected_components(nx.Graph(contact_matrix)):
        # A leaflet cannot consist of a single lipid.
        # This also discards the entries for atoms
        # not in 'head_atom_mask'.
        if len(component) <= 1:
            continue
        head_leaflets.append(sorted(component))

    # 'head_leaflets' holds indices of head atoms only:
    # expand each of them to all atoms of the corresponding residue
    leaflet_masks = np.empty(
        (len(head_leaflets), structure.array_length()), dtype=bool
    )
    for row, head_indices in enumerate(head_leaflets):
        residue_masks = struc.get_residue_masks(structure, head_indices)
        leaflet_masks[row] = residue_masks.any(axis=0)
    return leaflet_masks
def test_selection():
    """
    Test whether the `selection` parameter in the constructor works.
    This is tested by comparing the selection done prior to cell list
    creation with the selection done in the cell list construction.
    """
    array = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    # Select every second atom, starting with the second one.
    # The mask is derived from the actual atom count, so it has the
    # correct length for odd atom counts as well.
    selection = np.arange(array.array_length()) % 2 == 1

    # Selection prior to cell list creation
    selected = array[selection]
    cell_list = struc.CellList(selected, cell_size=10)
    ref_near_atoms = selected[cell_list.get_atoms(array.coord[0], 20.0)]

    # Selection in cell list creation
    cell_list = struc.CellList(array, cell_size=10, selection=selection)
    test_near_atoms = array[cell_list.get_atoms(array.coord[0], 20.0)]

    assert test_near_atoms == ref_near_atoms
def water_in_prox(atoms, sele, cutoff):
    """
    Get the atom indices of water oxygen atoms that are in vicinity of
    the selected atoms.

    Parameters
    ----------
    atoms : AtomArray
        The structure; atoms named ``"OW"`` are treated as water
        oxygen atoms.
    sele : index or mask
        Selects the atoms of `atoms` around which water is searched.
    cutoff : float
        Search radius passed to `CellList.get_atoms()`.

    Returns
    -------
    adjacent_atoms : ndarray, dtype=int
        Sorted, unique indices into `atoms` of the water oxygen atoms
        found.
    """
    cell_list = struct.CellList(atoms, cell_size=5,
                                selection=atoms.atom_name == "OW")
    adjacent_atoms = cell_list.get_atoms(atoms[sele].coord, cutoff)
    adjacent_atoms = adjacent_atoms.flatten()
    # Rows are padded with -1 where fewer atoms were found.
    # Only this padding value is removed: index 0 is a valid atom
    # index and must be kept.
    adjacent_atoms = adjacent_atoms[adjacent_atoms != -1]
    return np.unique(adjacent_atoms)
def get_matrices(array):
    """
    Create a non-periodic and a periodic adjacency matrix.

    `CUTOFF` (defined in an enclosing scope) is used both as cell size
    and as adjacency threshold. It is only read here, so no
    ``nonlocal`` declaration is required.

    Parameters
    ----------
    array : AtomArray or AtomArrayStack
        The structure. For a stack, one matrix per model is created.

    Returns
    -------
    matrix, matrix_pbc : ndarray
        The adjacency matrices without and with periodic boundary
        conditions. For a stack the matrices of all models are
        stacked along the first dimension.

    Raises
    ------
    TypeError
        If `array` is neither an `AtomArray` nor an `AtomArrayStack`.
    """
    def adjacency(atoms, periodic):
        # Adjacency matrix of a single model
        return struc.CellList(atoms, CUTOFF, periodic=periodic) \
                    .create_adjacency_matrix(CUTOFF)

    if isinstance(array, struc.AtomArray):
        matrix = adjacency(array, False)
        matrix_pbc = adjacency(array, True)
    elif isinstance(array, struc.AtomArrayStack):
        matrix = np.array([adjacency(model, False) for model in array])
        matrix_pbc = np.array([adjacency(model, True) for model in array])
    else:
        # Previously this fell through to an UnboundLocalError
        raise TypeError(
            f"Expected 'AtomArray' or 'AtomArrayStack', "
            f"but got '{type(array).__name__}'"
        )
    return matrix, matrix_pbc
def test_adjacency_matrix(cell_size, threshold, periodic, use_selection):
    """
    Check that the adjacency matrix produced by a cell list equals
    the one derived from a brute-force all-against-all distance
    matrix, optionally restricted to a random atom selection.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    if periodic:
        # Orthorhombic box spanning the coordinate extent
        # of the structure
        upper_bound = np.max(atoms.coord, axis=-2)
        lower_bound = np.min(atoms.coord, axis=-2)
        atoms.box = np.diag(upper_bound - lower_bound)

    selection = None
    if use_selection:
        # Fixed seed: the random selection is reproducible
        np.random.seed(0)
        selection = np.random.choice((False, True), atoms.array_length())

    test_matrix = struc.CellList(
        atoms, cell_size=cell_size, periodic=periodic, selection=selection
    ).create_adjacency_matrix(threshold)

    # Reference: explicit distance between every ordered atom pair
    n_atoms = atoms.array_length()
    atom_indices = np.arange(n_atoms)
    index_pairs = np.stack(
        [np.repeat(atom_indices, n_atoms), np.tile(atom_indices, n_atoms)],
        axis=-1
    )
    pair_distances = struc.index_distance(atoms, index_pairs, periodic)
    distance_matrix = np.reshape(pair_distances, (n_atoms, n_atoms))
    ref_matrix = (distance_matrix <= threshold)
    if use_selection:
        # Atoms outside the selection are adjacent to nothing:
        # clear their rows and columns
        deselected = ~selection
        ref_matrix[deselected, :] = False
        ref_matrix[:, deselected] = False

    # Both construction routes must agree element by element
    assert np.array_equal(test_matrix, ref_matrix)
def test_adjacency_matrix(cell_size, threshold):
    """
    Check that the adjacency matrix produced by a cell list for the
    amino acid atoms equals the one derived from an explicitly
    computed distance matrix.
    """
    atoms = strucio.load_structure(join(data_dir, "3o5r.mmtf"))
    atoms = atoms[struc.filter_amino_acids(atoms)]
    test_matrix = struc.CellList(
        atoms, cell_size=cell_size
    ).create_adjacency_matrix(threshold)

    # Pairwise difference vectors via broadcasting
    positions = atoms.coord
    displacement = positions[:, np.newaxis, :] - positions[np.newaxis, :, :]
    # Convert to float64 to avoid an erroneous warning
    # https://github.com/ContinuumIO/anaconda-issues/issues/9129
    displacement = displacement.astype(np.float64)
    distance_matrix = np.sqrt(np.square(displacement).sum(axis=-1))
    ref_matrix = (distance_matrix <= threshold)

    # Both construction routes must agree element by element
    assert test_matrix.tolist() == ref_matrix.tolist()
structure = mmtf.get_structure(mmtf_file, model=1) # Separate structure into the DNA and the two identical protein chains dna = structure[np.isin(structure.chain_id, ["A", "B"]) & (structure.hetero == False)] protein_l = structure[(structure.chain_id == "L") & (structure.hetero == False)] protein_r = structure[(structure.chain_id == "R") & (structure.hetero == False)] # Quick check if the two protein chains are really identical assert len(struc.get_residues(protein_l)) == len(struc.get_residues(protein_r)) # Fast identification of contacts via a cell list: # The cell list is initiliazed with the coordinates of the DNA # and later provided with the atom coordinates of the two protein chains cell_list = struc.CellList(dna, cell_size=THRESHOLD_DISTANCE) # Sets to store the residue IDs of contact residues # for each protein chain id_set_l = set() id_set_r = set() for protein, res_id_set in zip((protein_l, protein_r), (id_set_l, id_set_r)): # For each atom in the protein chain, # find all atoms in the DNA that are in contact with it contacts = cell_list.get_atoms(protein.coord, radius=THRESHOLD_DISTANCE) # Only retain atoms in the protein with contact # to at least one atom of the DNA contact_indices = np.where((contacts != -1).any(axis=1))[0] # Get residue IDs for the atoms in the protein contact_res_ids = protein.res_id[contact_indices]
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into a graph with one node per atom.

    Two atoms are connected by an edge if they are adjacent according
    to `CellList.create_adjacency_matrix(cfg.threshold)`.
    Each undirected edge is stored once as ``(i, j)`` with ``i < j``.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file; also used as file name for saving.
    saveDir : str or False, optional
        If truthy, the graph is saved with `torch.save()` into this
        directory.

    Returns
    -------
    data : Data
        The graph. The node features ``x`` are, in this order:
        coordinates, residue ID, B-factor, hetero flag, occupancy,
        followed by the `atomsDict`, `residualesDict` and
        `ssesTypeDict` encodings of atom name, residue name and
        secondary structure element.
        ``edge_index`` contains the edges in COO format.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    # Unique chain IDs in order of first appearance
    chainIdUnique = list(dict.fromkeys(array.chain_id))

    # Secondary structure from the DSSP algorithm
    sse = dssp.DsspApp.annotate_sse(array)
    # Chain and residue IDs of the CA atoms: the SSE annotation is
    # paired with these entry by entry
    caAtoms = array[array.atom_name == 'CA']
    chainMask = caAtoms.chain_id
    resMask = caAtoms.res_id
    # If the annotation is shorter than the CA masks, fill it up
    missing = resMask.shape[0] - sse.shape[0]
    if missing > 0:
        sse = np.append(sse, ['Null'] * missing)
    # Chain ID -> residue ID -> secondary structure element
    sseMaskDict = {chain: {} for chain in chainIdUnique}
    for chainId, resId, sseId in zip(chainMask, resMask, sse):
        sseMaskDict[chainId][resId] = sseId

    # Adjacency matrix
    cell_list = struc.CellList(array, cell_size=cfg.threshold)
    adj_matrix = cell_list.create_adjacency_matrix(cfg.threshold)

    # Adjacency matrix -> COO: the strict upper triangle yields every
    # pair once with i < j; 'nonzero()' returns the pairs in row-major
    # order, i.e. sorted by i, then by j
    rows, cols = np.nonzero(np.triu(adj_matrix, k=1))
    edge_index = [rows.tolist(), cols.tolist()]

    # One feature vector per atom
    nodeFeatures = []
    for i in range(array.shape[0]):
        sseId = sseMaskDict[array.chain_id[i]].get(array.res_id[i], 'Null')
        nodeFeatures.append(
            list(array.coord[i])
            + [
                array.res_id[i],
                array.b_factor[i],
                float(array.hetero[i]),
                array.occupancy[i]
            ]
            + atomsDict.get(array.atom_name[i], atomsDict['Null'])
            + residualesDict.get(array.res_name[i], residualesDict['Null'])
            + ssesTypeDict.get(sseId, ssesTypeDict['Null'])
        )

    # Graph format
    data = Data(x=torch.tensor(nodeFeatures, dtype=torch.float),
                edge_index=torch.tensor(edge_index, dtype=torch.long))

    if saveDir:
        torch.save(data, os.path.join(saveDir, fileName))

    return data
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into a graph with one node per atom.

    Two atoms are connected by an edge if they are adjacent according
    to `CellList.create_adjacency_matrix(cfg.threshold)`.
    Each undirected edge is stored once as ``(i, j)`` with ``i < j``.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file; also used as file name for saving.
    saveDir : str or False, optional
        If truthy, the graph is saved with `torch.save()` into this
        directory.

    Returns
    -------
    data : Data
        The graph. The node features ``x`` are, in this order:
        coordinates, atom name code, element code, residue ID,
        residue name code, hetero flag, occupancy, B-factor and
        secondary structure code. The codes are looked up in
        `atomsDict`, `elementsDict`, `residualesDict` and
        `ssesTypeDict`, each with the ``'Null'`` entry as fallback.
        ``edge_index`` contains the edges in COO format.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    # Unique chain IDs in order of first appearance
    chain_id = list(dict.fromkeys(array.chain_id))

    # Chain ID -> residue ID -> secondary structure element
    sseMaskDict = {}
    for chain in chain_id:
        sse = struc.annotate_sse(array, chain_id=chain)
        # Residue IDs of the CA atoms of this chain: the SSE
        # annotation is paired with these entry by entry
        mask = array[
            (array.chain_id == chain) & (array.atom_name == 'CA')
        ].res_id
        # If the annotation is shorter than the mask, fill it up
        missing = mask.shape[0] - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['Null'] * missing)
        sseMaskDict[chain] = dict(zip(mask, sse))

    # Adjacency matrix
    cell_list = struc.CellList(array, cell_size=cfg.threshold)
    adj_matrix = cell_list.create_adjacency_matrix(cfg.threshold)

    # Adjacency matrix -> COO: the strict upper triangle yields every
    # pair once with i < j; 'nonzero()' returns the pairs in row-major
    # order, i.e. sorted by i, then by j
    rows, cols = np.nonzero(np.triu(adj_matrix, k=1))
    edge_index = [rows.tolist(), cols.tolist()]

    # One feature vector per atom
    nodeFeatures = []
    for i in range(array.shape[0]):
        sseId = sseMaskDict[array.chain_id[i]].get(array.res_id[i], 'Null')
        nodeFeatures.append(
            list(array.coord[i])
            + [
                atomsDict.get(array.atom_name[i], atomsDict['Null']),
                elementsDict.get(array.element[i], elementsDict['Null']),
                array.res_id[i],
                residualesDict.get(array.res_name[i],
                                   residualesDict['Null']),
                float(array.hetero[i]),
                array.occupancy[i],
                array.b_factor[i],
                ssesTypeDict.get(sseId, ssesTypeDict['Null']),
            ]
        )

    nodeFeaturesT = torch.tensor(nodeFeatures, dtype=torch.float)
    edge_indexT = torch.tensor(edge_index, dtype=torch.long)
    data = Data(x=nodeFeaturesT, edge_index=edge_indexT)

    if saveDir:
        torch.save(data, os.path.join(saveDir, fileName))

    return data
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into one CA backbone graph per chain.

    For each chain the non-hetero CA atoms are the nodes. Two nodes
    are connected by an edge if they are adjacent according to
    `CellList.create_adjacency_matrix(cfg.threshold)`; each undirected
    edge is stored once as ``(i, j)`` with ``i < j``.
    Chains with fewer than 5 CA atoms are skipped.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file.
    saveDir : str or False, optional
        If truthy, each graph is saved with `torch.save()` into this
        directory. The chain ID is appended to the part of `fileName`
        before its first ``'.'``.

    Returns
    -------
    data : dict
        Maps chain ID to the graph (`Data`) of that chain.
        The node features ``x`` are the centered and scaled
        coordinates followed by the `residualesDict` and
        `ssesTypeDict` encodings of residue name and secondary
        structure element.
    """
    array = strucio.load_structure(os.path.join(dirName, fileName), model=1)

    data = {}
    # 'np.unique()' yields the chain IDs in sorted order
    for chain in np.unique(array.chain_id):
        # Current chain without hetero atoms
        # (elementwise comparison on the annotation array)
        oneChainArray = array[(array.chain_id == chain)
                              & (array.hetero == False)]
        # CA atoms only
        backbone = oneChainArray[oneChainArray.atom_name == 'CA']
        backboneShp = backbone.shape[0]
        # Skip chains that have no (or too few) CA atoms
        if backboneShp < 5:
            continue

        # Secondary structure from the DSSP algorithm
        sse = dssp.DsspApp.annotate_sse(oneChainArray)
        # If the annotation is shorter than the backbone, fill it up
        # with 'C'
        missing = backboneShp - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['C'] * missing)
        # Residue ID -> secondary structure element
        sseMaskDict = dict(zip(backbone.res_id, sse))

        # Adjacency matrix; must be computed before the coordinates
        # are rescaled below
        cellList = struc.CellList(backbone, cell_size=cfg.threshold)
        adjMatrix = cellList.create_adjacency_matrix(cfg.threshold)

        # Feature normalization: move the centroid to (0, 0, 0) ...
        backbone.coord -= backbone.coord.mean(axis=0)
        # ... and scale by the length of the longest position vector
        maxNorm = np.linalg.norm(backbone.coord, axis=1).max()
        if maxNorm != 0:
            backbone.coord /= maxNorm

        # Adjacency matrix -> COO: the strict upper triangle yields
        # every pair once with i < j; 'nonzero()' returns the pairs in
        # row-major order, i.e. sorted by i, then by j
        rows, cols = np.nonzero(np.triu(adjMatrix, k=1))
        edgeIndex = [rows.tolist(), cols.tolist()]

        # One feature vector per CA atom
        nodeFeatures = []
        for i in range(backboneShp):
            nodeFeatures.append(
                list(backbone.coord[i])
                + residualesDict.get(backbone.res_name[i],
                                     residualesDict['Null'])
                + ssesTypeDict.get(sseMaskDict.get(backbone.res_id[i], 'C'))
            )

        # Graph format
        data[chain] = Data(
            x=torch.tensor(nodeFeatures, dtype=torch.float),
            edge_index=torch.tensor(edgeIndex, dtype=torch.long))

    # Save every graph into a separate file
    if saveDir:
        for chain, graph in data.items():
            # Append the chain ID to the part before the first '.'
            stem, dot, rest = fileName.partition('.')
            torch.save(graph, os.path.join(saveDir, stem + chain + dot + rest))

    return data
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into one all-atom graph per chain.

    For each chain all atoms are the nodes. Two nodes are connected
    by an edge if they are adjacent according to
    `CellList.create_adjacency_matrix(cfg.threshold)`; each undirected
    edge is stored once as ``(i, j)`` with ``i < j``.
    Chains without any CA atom are skipped.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file.
    saveDir : str or False, optional
        If truthy, each graph is saved with `torch.save()` into this
        directory. The chain ID is appended to the part of `fileName`
        before its first ``'.'``.

    Returns
    -------
    data : dict
        Maps chain ID to the graph (`Data`) of that chain.
        The node features ``x`` are, in this order: coordinates,
        residue ID, B-factor, hetero flag, occupancy, followed by the
        `atomsDict`, `residualesDict` and `ssesTypeDict` encodings of
        atom name, residue name and secondary structure element.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    data = {}
    # Unique chain IDs in order of first appearance
    for chain in dict.fromkeys(array.chain_id):
        chainMask = array.chain_id == chain
        # Residue IDs of the CA atoms of this chain
        resMask = array[chainMask & (array.atom_name == 'CA')].res_id
        # No secondary structure (and no graph) for chains without
        # CA atoms
        if resMask.shape[0] == 0:
            continue

        oneChainArray = array[chainMask]
        # Secondary structure from the DSSP algorithm
        sse = dssp.DsspApp.annotate_sse(oneChainArray)
        # If the annotation is shorter than the mask, fill it up
        missing = resMask.shape[0] - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['Null'] * missing)
        # Residue ID -> secondary structure element
        sseByRes = dict(zip(resMask, sse))

        # Adjacency matrix
        cell_list = struc.CellList(oneChainArray, cell_size=cfg.threshold)
        adj_matrix = cell_list.create_adjacency_matrix(cfg.threshold)

        # Adjacency matrix -> COO: the strict upper triangle yields
        # every pair once with i < j; 'nonzero()' returns the pairs in
        # row-major order, i.e. sorted by i, then by j
        rows, cols = np.nonzero(np.triu(adj_matrix, k=1))
        edge_index = [rows.tolist(), cols.tolist()]

        # One feature vector per atom
        nodeFeatures = []
        for i in range(oneChainArray.shape[0]):
            sseId = sseByRes.get(oneChainArray.res_id[i], 'Null')
            nodeFeatures.append(
                list(oneChainArray.coord[i])
                + [
                    oneChainArray.res_id[i],
                    oneChainArray.b_factor[i],
                    float(oneChainArray.hetero[i]),
                    oneChainArray.occupancy[i]
                ]
                + atomsDict.get(oneChainArray.atom_name[i],
                                atomsDict['Null'])
                + residualesDict.get(oneChainArray.res_name[i],
                                     residualesDict['Null'])
                + ssesTypeDict.get(sseId, ssesTypeDict['Null'])
            )

        # Graph format
        data[chain] = Data(
            x=torch.tensor(nodeFeatures, dtype=torch.float),
            edge_index=torch.tensor(edge_index, dtype=torch.long))

    # Save every graph into a separate file
    if saveDir:
        for chain, graph in data.items():
            # Append the chain ID to the part before the first '.'
            stem, dot, rest = fileName.partition('.')
            torch.save(graph, os.path.join(saveDir, stem + chain + dot + rest))

    return data
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.database.rcsb as rcsb
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


# Fetch the structure file and load it
file_name = rcsb.fetch("1aki", "mmtf", biotite.temp_dir())
array = strucio.load_structure(file_name)
# We only consider CA atoms
ca = array[array.atom_name == "CA"]
# 7 Angstrom adjacency threshold
threshold = 7
# Create cell list of the CA atom array
# for efficient measurement of adjacency
cell_list = struc.CellList(ca, cell_size=threshold)
adjacency_matrix = cell_list.create_adjacency_matrix(threshold)

figure = plt.figure()
ax = figure.add_subplot(111)
# Two colors: white for 'not adjacent', green for 'adjacent'
cmap = ListedColormap(["white", biotite.colors["dimgreen"]])
ax.pcolormesh(ca.res_id, ca.res_id, adjacency_matrix, cmap=cmap)
ax.set_aspect("equal")
# Both axes show residue numbers
# (the y axis was previously left unlabeled, because the x label
# was set twice)
ax.set_xlabel("Residue number")
ax.set_ylabel("Residue number")
ax.set_title("Adjacency matrix of the lysozyme crystal structure")
figure.tight_layout()
plt.show()
# Style of the molecule: black atoms, thick red sticks,
# full-size high-quality spheres
pymol_obj.color("black")
ammolite.cmd.set("stick_color", "red")
ammolite.cmd.set("stick_radius", 0.5)
ammolite.cmd.set("sphere_scale", 1.0)
ammolite.cmd.set("sphere_quality", 4)

# Adjust camera
pymol_obj.orient()
pymol_obj.zoom(buffer=10)
ammolite.cmd.rotate("z", 90)
ammolite.show(PNG_SIZE)

########################################################################

CUTOFF = 13

# Find contacts within cutoff distance
# ('CUTOFF' is used both as cell size and as adjacency threshold)
adjacency_matrix = struc.CellList(aptamer, CUTOFF) \
                        .create_adjacency_matrix(CUTOFF)
# Draw one distance object without label for every True entry
# NOTE(review): if the matrix is symmetric with a True diagonal, this
# also visits (j, i) and (i, i) -- confirm whether that is intended
for i, j in zip(*np.where(adjacency_matrix)):
    pymol_obj.distance("", i, j, show_label=False, gap=0)
ammolite.cmd.set("dash_color", "firebrick")

# Add black outlines
ammolite.cmd.bg_color("white")
ammolite.cmd.set("ray_trace_mode", 1)
ammolite.cmd.set("ray_trace_disco_factor", 0.5)
ammolite.show(PNG_SIZE)
# sphinx_gallery_thumbnail_number = 2
def pdb2Gdata(dirName, fileName, saveDir=False):
    """
    Convert a structure file into one all-atom graph per chain with
    normalized node features.

    For each chain all atoms are the nodes. Two nodes are connected
    by an edge if they are adjacent according to
    `CellList.create_adjacency_matrix(cfg.threshold)`; each undirected
    edge is stored once as ``(i, j)`` with ``i < j``.
    Chains with fewer than 5 non-hetero CA atoms are skipped.

    Parameters
    ----------
    dirName : str
        Directory containing the structure file.
    fileName : str
        Name of the structure file.
    saveDir : str or False, optional
        If truthy, each graph is saved with `torch.save()` into this
        directory. The chain ID is appended to the part of `fileName`
        before its first ``'.'``.

    Returns
    -------
    data : dict
        Maps chain ID to the graph (`Data`) of that chain.
        The node features ``x`` are, in this order: centered and
        scaled coordinates, B-factor divided by the maximum B-factor
        of the chain, hetero flag, occupancy, followed by the
        `atomsDict`, `residualesDict` and `ssesTypeDict` encodings of
        atom name, residue name and secondary structure element.
    """
    array = strucio.load_structure(
        os.path.join(dirName, fileName),
        extra_fields=['b_factor', 'occupancy'],
        model=1)

    data = {}
    # 'np.unique()' yields the chain IDs in sorted order
    for chain in np.unique(array.chain_id):
        # Current chain
        oneChainArray = array[array.chain_id == chain]
        # Hetero atoms are excluded for the SSE calculation
        # (elementwise comparison on the annotation array)
        notHeatemChain = oneChainArray[oneChainArray.hetero == False]
        # Residue IDs of the (non-hetero) CA atoms
        resMask = notHeatemChain[notHeatemChain.atom_name == 'CA'].res_id
        # Skip chains that have no (or too few) CA atoms
        if resMask.shape[0] < 5:
            continue

        # Secondary structure from the DSSP algorithm
        sse = dssp.DsspApp.annotate_sse(notHeatemChain)
        # If the annotation is shorter than the mask, fill it up
        missing = resMask.shape[0] - sse.shape[0]
        if missing > 0:
            sse = np.append(sse, ['Null'] * missing)
        # Residue ID -> secondary structure element
        sseByRes = dict(zip(resMask, sse))

        # Adjacency matrix; must be computed before the coordinates
        # are rescaled below
        cellList = struc.CellList(oneChainArray, cell_size=cfg.threshold)
        adjMatrix = cellList.create_adjacency_matrix(cfg.threshold)

        # Feature normalization: move the centroid to (0, 0, 0) ...
        oneChainArray.coord -= oneChainArray.coord.mean(axis=0)
        # ... scale by the length of the longest position vector ...
        maxNorm = np.linalg.norm(oneChainArray.coord, axis=1).max()
        if maxNorm != 0:
            oneChainArray.coord /= maxNorm
        # ... and scale the B-factors by their maximum
        maxBFactor = oneChainArray.b_factor.max()
        if maxBFactor != 0:
            oneChainArray.b_factor /= maxBFactor

        # Adjacency matrix -> COO: the strict upper triangle yields
        # every pair once with i < j; 'nonzero()' returns the pairs in
        # row-major order, i.e. sorted by i, then by j
        rows, cols = np.nonzero(np.triu(adjMatrix, k=1))
        edgeIndex = [rows.tolist(), cols.tolist()]

        # One feature vector per atom
        nodeFeatures = []
        for i in range(oneChainArray.shape[0]):
            sseId = sseByRes.get(oneChainArray.res_id[i], 'Null')
            nodeFeatures.append(
                list(oneChainArray.coord[i])
                + [
                    oneChainArray.b_factor[i],
                    float(oneChainArray.hetero[i]),
                    oneChainArray.occupancy[i]
                ]
                + atomsDict.get(oneChainArray.atom_name[i],
                                atomsDict['Null'])
                + residualesDict.get(oneChainArray.res_name[i],
                                     residualesDict['Null'])
                + ssesTypeDict.get(sseId, ssesTypeDict['Null'])
            )

        # Graph format
        data[chain] = Data(
            x=torch.tensor(nodeFeatures, dtype=torch.float),
            edge_index=torch.tensor(edgeIndex, dtype=torch.long))

    # Save every graph into a separate file
    if saveDir:
        for chain, graph in data.items():
            # Append the chain ID to the part before the first '.'
            stem, dot, rest = fileName.partition('.')
            torch.save(graph, os.path.join(saveDir, stem + chain + dot + rest))

    return data