def get_loop_length(log: Logger, sse1: Frame3D, sse2: Frame3D, loop_step: int,
                    loop_range: int) -> Tuple[int, int]:
    """Calculate the expected number of residues needed to join two SSE.

    :param log: Job Logger.
    :param sse1: N-SSE.
    :param sse2: C-SSE.
    :param loop_step: Assumption on how much distance a residue can cover.
    :param loop_range: Plus-minus range of residue length.
    """
    from SBI.structure import ChainFrame
    from SBI.structure.geometry.basics import distance

    res1 = ChainFrame(PDB(sse1)).last_compound
    res2 = ChainFrame(PDB(sse2)).first_compound
    # Distance between the C atom of the last N-SSE residue and the N atom of
    # the first C-SSE residue (the peptide bond the loop has to close).
    gap = distance(res1[res1['label_atom_id'] == 'C'].coordinates,
                   res2[res2['label_atom_id'] == 'N'].coordinates)
    log.debug(f'Distance between SSE is {gap} Angstroms.')
    residues = math.ceil(gap / loop_step)
    log.debug(f'Assuming the need of {residues} residues with a {loop_range} residue range.')
    candidates = [x for x in range(residues - loop_range - 1, residues + loop_range + 1)
                  if x > 0]
    return max(candidates), min(candidates)
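# A minimal, hypothetical usage sketch for get_loop_length. The fragment file
# paths, the logger setup and the loop_step/loop_range values are illustrative
# assumptions, not values taken from the snippets in this collection.
#
# import logging
# from SBI.structure import PDB  # assumed import path, matching the snippets above
#
# logging.basicConfig(level=logging.DEBUG)
# log = logging.getLogger('loop_length_demo')
#
# # Hypothetical fragment files for the N- and C-side secondary structures.
# sse_n = PDB('fragments/helix_n.pdb')
# sse_c = PDB('fragments/strand_c.pdb')
#
# # Assume a residue covers ~3 Angstroms, with a +/-2 residue margin.
# max_len, min_len = get_loop_length(log, sse_n, sse_c, loop_step=3, loop_range=2)
# print(f'Try loops of {min_len} to {max_len} residues.')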
def make_structure(sse1: dict, sse2: dict, outfile: Path) -> Tuple[PDBFrame, PDBFrame]:
    """Join two SSE dictionaries into a single structure and write it as PDB.

    :param sse1: N-SSE, with its atom records under ``metadata.atoms``.
    :param sse2: C-SSE, with its atom records under ``metadata.atoms``.
    :param outfile: File to which the combined structure is written.
    """
    columns = ['auth_comp_id', 'auth_atom_id', 'auth_seq_id',
               'Cartn_x', 'Cartn_y', 'Cartn_z']
    sse1 = PDB(pd.DataFrame(sse1['metadata']['atoms'], columns=columns)).renumber(1)
    sse2 = PDB(pd.DataFrame(sse2['metadata']['atoms'],
                            columns=columns)).renumber(sse1.iloc[-1]['auth_seq_id'] + 5)
    structure = pd.concat([sse1, sse2])
    structure['id'] = list(range(1, structure.shape[0] + 1))
    if TBcore.get_option('system', 'verbose'):
        sys.stdout.write('-> generating structure {}\n'.format(outfile.resolve()))
    structure.write(output_file=str(outfile), format='pdb', clean=True,
                    force=TBcore.get_option('system', 'overwrite'))
    return sse1, sse2
def assemble(pdb_id):
    struct = PDB(os.path.join(masif_opts['raw_pdb_dir'], '{}.pdb'.format(pdb_id)),
                 header=True)
    try:
        struct_assembly = struct.apply_biomolecule_matrices()[0]
    except Exception:
        return 0
    struct_assembly.write(
        os.path.join(masif_opts['ligand']['assembly_dir'], '{}.pdb'.format(pdb_id)))
    return 1
def assemble(pdb_id):
    # Reads and builds the biological assembly of a structure
    struct = PDB(os.path.join(masif_opts["raw_pdb_dir"], "{}.pdb".format(pdb_id)),
                 header=True)
    try:
        struct_assembly = struct.apply_biomolecule_matrices()[0]
    except Exception:
        return 0
    struct_assembly.write(
        os.path.join(masif_opts["ligand"]["assembly_dir"], "{}.pdb".format(pdb_id)))
    return 1
def make_PDBseq(self, log_file, resolution_threshold=None):
    if not self.has_local:
        raise NameError('A local PDB database must be defined to create a PDBseq database.')
    outdir = self.PDBseq if self.PDBseq is not None else os.curdir
    Path(outdir).mkdir(parents=True, exist_ok=True)
    fasta_file = File(file_name=os.path.join(outdir, 'PDBseq.fa'), action='w', overwrite=True)
    fasta_fd = fasta_file.descriptor
    idx_file = File(file_name=os.path.join(outdir, 'PDBseq.fa.idx'), action='w', overwrite=True)
    idx_fd = idx_file.descriptor
    if resolution_threshold is not None:
        filtered_file_name = self.get_PDBseq_filtered(resolution_threshold)
        filtered_file = File(file_name=filtered_file_name, action='w', overwrite=True)
        filtered_fd = filtered_file.descriptor
        resolutions = self.get_resolutions(resolution_threshold=resolution_threshold)
    log_file = File(file_name=log_file, action='w', overwrite=True)
    log_idx = log_file.descriptor
    for pdb_file in self.localPDBs:
        log_idx.write("Reading File: {0}\n".format(pdb_file))
        newPDB = PDB(pdb_file=pdb_file, dehydrate=True)
        fasta_idx = newPDB.FASTA_IDX(nucleotide=False)
        if len(fasta_idx['FASTA']) != len(fasta_idx['IDX']):
            log_idx.write('ERROR: Number of FASTA sequences and indexes differ for pdb {0}!\n'.format(newPDB.id))
        if len(fasta_idx['FASTA']) > 0:
            log_idx.write('\tPrinting FASTA and IDX...\n')
        else:
            log_idx.write('\tProbably just a nucleotide PDB...\n')
        for c in range(len(fasta_idx['FASTA'])):
            sequence = fasta_idx['FASTA'][c].split('\n')[1]
            sequence = sequence.replace('X', '').replace('x', '')
            if len(sequence) > 0:
                fasta_fd.write(fasta_idx['FASTA'][c] + "\n")
                if (resolution_threshold is not None and newPDB.id in resolutions
                        and not newPDB.is_all_ca):
                    filtered_fd.write(fasta_idx['FASTA'][c] + "\n")
                idx_fd.write(fasta_idx['IDX'][c] + "\n")
        del newPDB
    # Close & end
    fasta_file.close()
    idx_file.close()
    if resolution_threshold is not None:
        filtered_file.close()
def get_loop_length(sse1: PDB, sse2: PDB, loop_step: int,
                    loop_range: int) -> Tuple[int, int]:
    """Calculate the expected number of residues needed to join two SSE.

    :param sse1: N-SSE.
    :param sse2: C-SSE.
    :param loop_step: Assumption on how much distance a residue can cover.
    :param loop_range: Plus-minus range of residue length.
    """
    res1 = ChainFrame(PDB(sse1)).last_compound
    res2 = ChainFrame(PDB(sse2)).first_compound
    gap = SBIgeo.point_distance(res1[res1['label_atom_id'] == 'N'].coordinates,
                                res2[res2['label_atom_id'] == 'N'].coordinates)
    residues = math.ceil(gap / loop_step)
    candidates = [x for x in range(residues - loop_range - 1, residues + loop_range + 1)
                  if x > 0]
    return max(candidates), min(candidates)
def main():
    # Initialize
    options = parse_user_arguments()
    verbose = options.show
    pdb_path = os.path.join(config.get('Paths', 'modppi_path'),
                            config.get('Paths', 'pdb_path'))
    try:
        did_path = os.path.join(config.get('Paths', 'modppi_path'),
                                config.get('Paths', '3did_path'))
        data_path = os.path.join(config.get('Paths', 'modppi_path'),
                                 config.get('Paths', 'data_path'))
    except Exception:
        did_path = options.outdir
        data_path = options.outdir
    if not os.path.exists(did_path):
        sys.stderr.write("No 3DID directory, please check your installation or INPUT\n")
    if not os.path.exists(data_path):
        sys.stderr.write("No DATA directory, please check your installation or INPUT\n")
    # Create the list of interactions and FASTA sequences of 3DID
    did_interactions = open(os.path.join(data_path, options.interactions_file), "w")
    did_fasta = open(os.path.join(data_path, options.seq_file), "w")
    for brk in os.listdir(did_path):
        if verbose:
            sys.stderr.write("\t\t-- Reading %s \n" % os.path.join(did_path, brk))
        try:
            pdb = PDB(os.path.join(did_path, brk))
            id_chain = []
            for c in pdb.chain_identifiers:
                pdb_chain = pdb.get_chain_by_id(c)
                id_chain.append(pdb.id + "_" + c)
                printfasta(did_fasta, pdb.id + "_" + c, pdb_chain.gapped_protein_sequence)
            did_interactions.write("%s\t%s\n" % (id_chain[0], id_chain[1]))
        except Exception as e:
            if verbose:
                sys.stderr.write("\t\t-- %s cannot be read\n\t\t Error: %s\n"
                                 % (os.path.join(did_path, brk), e))
            continue
    did_interactions.close()
    did_fasta.close()
def build_pdb_object(sses: List[Dict],
                     loops: Union[List[int], int]) -> Tuple[Frame3D, List[int]]:
    """Make the parametrically built atoms of the secondary structures into a PDB object.

    :param sses: List of the secondary structures to build. Each SSE dictionary must
        contain the ``metadata.atoms`` keys, already in the final expected position.
    :param loops: Number of residues between SSE.
    """
    if isinstance(loops, int):
        loops = [loops, ] * (len(sses) - 1)
    if len(sses) != len(loops) + 1:
        raise ValueError('Number of loops should equal number of SSE minus one.')
    pieces = []
    columns = ['auth_comp_id', 'auth_atom_id', 'auth_seq_id',
               'Cartn_x', 'Cartn_y', 'Cartn_z']
    start = 1
    for i, sse in enumerate(sses):
        start = 1 if i == 0 else int(sses[i - 1]['length']) + loops[i - 1] + start
        if TBcore.get_option('system', 'verbose'):
            sys.stdout.write('PDB: Building SSE {:02d}:{} starting at {}\n'.format(
                i + 1, sse['id'], start))
        pieces.append(PDB(pd.DataFrame(sse['metadata']['atoms'],
                                       columns=columns)).renumber(start))
    structure = pd.concat(pieces, sort=False).reset_index()
    structure['id'] = list(range(1, structure.shape[0] + 1))
    return structure, [int(p.iloc[-1]['auth_seq_id']) for p in pieces]
def pdb2pin(options):
    filelist = options.listfile
    rootname = options.out
    verbose = options.show
    PPI_distance = options.PPI_distance
    PPI_type = options.PPI_type
    edges = []
    nodes = []
    if fileExist(filelist):
        for line in parse_list_file(filelist):
            pdb = PDB(line)
            complexes = cn.Complex(pdb, PPI_distance=PPI_distance, PPI_type=PPI_type)
            for pair in complexes.PPInterfaces:
                # if not pair.is_empty:
                if len(pair.contacts) > 0:
                    edges.append((pair.interactor_id, pair.protein_id))
                    if verbose:
                        sys.stdout.write("Add {0:s}\t{1:s}\n".format(pair.interactor_id,
                                                                     pair.protein_id))
        nodes = get_nodes(edges)
    else:
        sys.stderr.write("Missing list of PDB files\n")
        exit(1)
    ppi = open(rootname + ".ppi", "w")
    nds = open(rootname + ".dat", "w")
    for (x, y) in edges:
        ppi.write("%s\t%s\n" % (x, y))
    for x in nodes:
        nds.write("%s\n" % (x))
    ppi.close()
    nds.close()
def get_sequence(pdb):
    seqs = []
    sys.stdout.write(pdb[0] + '\n')
    if not os.path.isdir(pdb[0]):
        os.mkdir(pdb[0])
    pdbf = os.path.join(pdb[0], '{}.pdb'.format(pdb[0]))
    if not os.path.isfile(pdbf):
        wget.download('http://files.rcsb.org/view/{}.pdb'.format(pdb[0]), out=pdbf)
    pdbstr = PDB(pdbf)
    qchains = pdb[1] if len(pdb) > 1 else pdbstr.chain_identifiers
    for chain in qchains:
        print('\t--' + chain + '--')
        seqs.append(Sequence('{}_{}'.format(pdb[0], chain),
                             pdbstr.get_chain_by_id(chain).protein_sequence))
    return seqs
def on_ppi(self, receptor_path, ligand_path, fold_name=None):
    '''
    Compute the split potentials for a PPI.
    receptor_path = Complete path to the structure of the receptor
    ligand_path   = Complete path to the structure of the ligand
    fold_name     = Optional parameter, name given to the PPI
    '''
    # Get PDB structures (fold structures)
    receptor = PDB.read_pdb(receptor_path)
    ligand = PDB.read_pdb(ligand_path)
    receptor_name = self.get_fold_name_from_fold_path(receptor_path)
    ligand_name = self.get_fold_name_from_fold_path(ligand_path)
    if not fold_name:
        fold_name = '{}-{}'.format(receptor_name, ligand_name)
    # If read_pdb returns a list, the structure is not a single chain but a
    # complex of several chains; re-read it with the chains merged.
    if isinstance(receptor, list):
        receptor = PDB.read_pdb(receptor_path, merge_chains=True)
    if isinstance(ligand, list):
        ligand = PDB.read_pdb(ligand_path, merge_chains=True)
    receptor.set_dssp()
    receptor.clean()
    receptor.normalize_residues()
    ligand.set_dssp()
    ligand.clean()
    ligand.normalize_residues()
    ppi = Interaction(receptor, ligand)
    # Compute split potentials (CB potentials use a longer cutoff)
    cutoff = 12 if self.pot_type == 'CB' else 5
    split_potentials = SplitPotentialsPPI(c_type=self.pot_type, cutoff=cutoff)  # defined in BioLib.Docking
    # Global energies, with Z-scores
    global_energies = split_potentials.calculate_global_energies(ppi, Zscores=True)
    print('\nGLOBAL ENERGIES')
    # Per-residue energies, with Z-scores
    residues_energies = split_potentials.calculate_residue_energies_between_pairs(ppi, 'R', Zscores=True)
    print('\nRESIDUE ENERGIES')
    # Create the XML results file
    self.ppi_xml(fold_name, global_energies, residues_energies, receptor, ligand)
    return
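# A minimal, hypothetical call sketch for on_ppi. `SplitPotentialsRunner` is a
# stand-in name for the class that owns this method (the real class name is not
# shown in this collection), and the paths and pot_type value are assumptions.
#
# runner = SplitPotentialsRunner(pot_type='CB')  # hypothetical owning class
# runner.on_ppi('structures/receptor.pdb', 'structures/ligand.pdb',
#               fold_name='demo-complex')
# # Results are written by ppi_xml to the runner's configured XML output file.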
def check_pdb_format(self, structure_path):
    file_name = self.get_fold_name_from_fold_path(structure_path)
    try:
        struct = PDB.read_pdb(structure_path)
    except Exception:
        self.print_error(8, file_name)
        self.write_output_file(self.xml_errors, self.xml_err_file)
        # Output an empty results file
        self.write_output_file(self.xml_result, self.xml_out_file)
        sys.exit(10)
    return
def pdb_geometry_from_rules(pdb_file: Union[Path, str, Frame3D],
                            rules: List[Tuple],
                            log: Optional[Logger] = None) -> pd.DataFrame:
    """Calculate the geometry statistics from a PDB.

    :param log: Job Logger.
    :param pdb_file: The pdb file to calculate the geometry from.
    :param rules: The rules to be applied.
    """
    if isinstance(pdb_file, (Path, str)):
        pdb_file = Path(pdb_file)
        if not pdb_file.is_file():
            raise IOError('PDB structure {} not found.'.format(pdb_file))
        pdb3d = PDB(str(pdb_file), format='pdb', clean=True,
                    dehydrate=True, hetatms=False)['AtomTask:PROTEINBACKBONE']
    elif isinstance(pdb_file, Frame3D):
        pdb3d = pdb_file['AtomTask:PROTEINBACKBONE']
    else:
        raise ValueError('Unexpected type for pdb_file.')

    if log:
        log.info(f'PDB:Analyzing geometry of {pdb3d.id}')
        log.debug(f'PDB:Available secondary structures {",".join([x[0] for x in rules])}')
        log.debug(f'PDB:With ranges {",".join(["{}-{}".format(*x[1]) for x in rules])}')
        log.debug(f'PDB:With flip policy {",".join([str(x[2]) for x in rules])}')
    else:
        sys.stdout.write(f'PDB:Analyzing geometry of {pdb3d.id}\n')
        sys.stdout.write(f'PDB:Available secondary structures {",".join([x[0] for x in rules])}\n')
        sys.stdout.write(f'PDB:With ranges {",".join(["{}-{}".format(*x[1]) for x in rules])}\n')
        sys.stdout.write(f'PDB:With flip policy {",".join([str(x[2]) for x in rules])}\n')

    pieces = make_pieces(pdb3d, rules)
    pieces = make_vectors(pieces, rules)
    pieces = make_planes(pieces)
    df = make_angles_and_distances(pieces)
    df = df.assign(pdb_path=[str(pdb_file) if not isinstance(pdb_file, Frame3D)
                             else pdb3d.id, ] * df.shape[0])
    return df
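# A short, hypothetical call sketch for pdb_geometry_from_rules. The rule tuples
# follow the shape the function unpacks above (x[0] = SSE name, x[1] = residue
# range, x[2] = flip policy), but the concrete path and values are assumptions.
#
# from pathlib import Path
#
# rules = [('H1', (5, 22), False),
#          ('E1', (30, 36), True)]
#
# # Without a logger, progress messages fall back to stdout.
# df = pdb_geometry_from_rules(Path('structures/design_0001.pdb'), rules)
# print(df.head())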
def reverse_motif(
    log: Logger,
    source: Union[str, Path],
    selection: List[str],
    attach: List[str],
    hotspot: str,
    identifier: str,
    binder: Optional[str] = None,
) -> Tuple:
    """Process a provided motif so that it can be attached to a :term:`FORM`.

    :param log: Logger from the calling :class:`.Node` to keep requested verbosity.
    :param source: File containing the structural data.
    :param selection: Selection defining the motif of interest.
    :param attach: Selection defining the SSEs the motif attaches to.
    :param hotspot: Single position defining the exposed side.
    :param identifier: Identifier given to the motif.
    :param binder: Selection defining the binder.
    """
    # Load Structure
    pdbSTR = PDB(source, header=False, dehydrate=True)
    # Get the full motif to define its planes
    motif, hotspots = pick_motif(log, pdbSTR, selection, attach, hotspot)
    # Pick Binder
    binder = pick_binder(log, pdbSTR, binder)
    # # Find motif's orientation
    # eigens = dict(map(reversed, zip(motif['AtomType:CA'].eigenvectors(10),
    #                                 ('perpendicular', 'side', 'major'))))
    # edist = [np.linalg.norm(hotspot.coordinates - eigens['perpendicular'][0]),
    #          np.linalg.norm(hotspot.coordinates - eigens['perpendicular'][-1]),
    #          np.linalg.norm(hotspot.coordinates - eigens['side'][0]),
    #          np.linalg.norm(hotspot.coordinates - eigens['side'][-1])]
    # mdist = edist.index(min(edist))
    # if mdist > 1:  # swap side and perpendicular
    #     tmp = eigens['side']
    #     eigens['side'] = eigens['perpendicular']
    #     eigens['perpendicular'] = tmp
    # if mdist == 2:  # Change perpendicular orientation
    #     eigens['perpendicular'] = np.flip(eigens['perpendicular'], axis=0)
    # if mdist == 0:  # Change perpendicular orientation
    #     eigens['perpendicular'] = np.flip(eigens['perpendicular'], axis=0)
    #
    # # Try to identify the orientation of the motif.
    # for k in eigens:
    #     pymol_arrow(f'arrow_{k}', eigens[k][0], eigens[k][-1],
    #                 'white' if k == 'major' else 'red' if k == 'side' else 'green')
    return motif, binder, hotspots, attach, selection, identifier
def get_resolutions(self):
    # Resolutions of -1 are for methods that do not define resolution.
    resolutions = {}
    ftp = ftplib.FTP(PDBftp['address'])
    ftp.login()
    ftp.cwd(PDBftp['derived'])
    resoluIDX = []
    ftp.retrlines('RETR ' + PDBftp['resolution'], resoluIDX.append)
    ftp.quit()
    SBIglobals.alert('debug', self, 'Retrieving resolution data from PDB FTP...')
    active = False
    for line in resoluIDX:
        if line.startswith('-'):
            active = True
            continue
        if active and len(line.strip()) > 0:
            data = [x.strip() for x in line.split(';')]
            if len(data[1]) > 0:
                SBIglobals.alert('debug', self,
                                 '\tResolution for {0[0]} is {0[1]}...'.format(data))
                resolutions[data[0]] = data[1]
    # rsync is accumulative; we might still hold structures that are no longer
    # in residu.idx, so their resolution must be read from the header.
    for pdb_file in self.localPDBs:
        newfile = File(file_name=pdb_file, action='r')
        pdbid = newfile.prefix.lstrip('pdb').upper()
        if pdbid not in resolutions:
            pdbobj = PDB(pdb_file=pdb_file, header=True, onlyheader=True)
            SBIglobals.alert('debug', self,
                             '\tGrabbing resolution for {0}: {1}...'.format(
                                 pdbid, pdbobj.header.resolution))
            resolutions[pdbid] = pdbobj.header.resolution
    return resolutions
def pdb_geometry_from_rules(pdb_file: Union[Path, str, Frame3D],
                            rules: List[Tuple]) -> pd.DataFrame:
    """Calculate the geometry statistics from a PDB.

    :param pdb_file: The pdb file to calculate the geometry from.
    :param rules: The rules to be applied.
    """
    if isinstance(pdb_file, (Path, str)):
        pdb_file = Path(pdb_file)
        if not pdb_file.is_file():
            raise IOError('PDB structure {} not found.'.format(pdb_file))
        pdb3d = PDB(str(pdb_file), format='pdb', clean=True,
                    dehydrate=True, hetatms=False)['AtomTask:PROTEINBACKBONE']
    elif isinstance(pdb_file, Frame3D):
        pdb3d = pdb_file['AtomTask:PROTEINBACKBONE']
    else:
        raise ValueError('Unexpected type for pdb_file.')

    if TBcore.get_option('system', 'verbose'):
        sys.stdout.write('PDB:Analyzing geometry of {}\n'.format(pdb3d.id))
    if TBcore.get_option('system', 'debug'):
        sys.stdout.write('PDB:Available secondary structures {}\n'.format(
            ','.join([x[0] for x in rules])))
        sys.stdout.write('PDB:With ranges {}\n'.format(
            ','.join(['{}-{}'.format(*x[1]) for x in rules])))
        sys.stdout.write('PDB:With flip policy {}\n'.format(
            ','.join([str(x[2]) for x in rules])))

    pieces = make_pieces(pdb3d, rules)
    pieces = make_vectors(pieces, rules)
    pieces = make_planes(pieces)
    df = make_angles_and_distances(pieces)
    df = df.assign(pdb_path=[str(pdb_file) if not isinstance(pdb_file, Frame3D)
                             else pdb3d.id, ] * df.shape[0])
    return df
def toSQL(self):
    pdbheader = self.header
    # if pdbheader.valid_resolution and pdbheader.resolution != 'NULL':
    if pdbheader.experiment.resolution > 0:
        command = "INSERT INTO {0} VALUES ('{1.id}','{2.date}','{2.header}','{2.xpdta}',{2.resolution:.2f},{2.rfactor:.3f},{2.freeR:.3f});\n".format(
            tables['main'], self, pdbheader)
    else:
        command = "INSERT INTO {0} (pdb,date,header,method) VALUES ('{1.id}','{2.date}','{2.header}','{2.xpdta}');\n".format(
            tables['main'], self, pdbheader)
    for deprec in pdbheader.deprecated:
        command += "INSERT INTO {0} VALUES ('{1}','{2.id}');\n".format(
            tables['old'], deprec, self)
    for chain in self.chains:
        m = pdbheader.get_molecule4chain(chain.chain)
        f = chain.first_structure
        l = chain.last_structure
        command += "INSERT INTO {0}(pdb,chain,name,type,start,idxs,end,idxe) VALUES ('{1.pdb}','{1.chain}','{2.name}','{1.chaintype}',{3.number},'{3.version}',{4.number},'{4.version}');\n".format(
            tables['chain'], chain, m, f, l)
        command += "SET @chain = LAST_INSERT_ID();\n"
        for ec in m.ec:
            command += PDB._choose_ec(ec)
            command += "INSERT INTO {0} VALUES (@chain,@ec);\n".format(tables['ec'])
        for tx in m.taxid:
            command += "SET @taxid = (SELECT COALESCE((SELECT taxid FROM {0} WHERE oldid={2}),(SELECT id FROM {1} WHERE id={2})));\n".format(
                tables['taxidold'], tables['taxid'], tx)
            command += "INSERT INTO {0} VALUES (@chain,@taxid);\n".format(tables['ctaxid'])
    for dbr in pdbheader.dbrefs:
        if dbr.chain in self._chain_id:
            command += "SET @chain = (SELECT nid FROM {0} WHERE pdb='{1}' AND chain='{2}');\n".format(
                tables['chain'], self.id, dbr.chain)
            command += "SET @uniprot = (COALESCE((SELECT entry FROM {0} WHERE accession='{1}' LIMIT 1), 'FAKE_PROTEIN'));\n".format(
                tables['ent2acc'], dbr.uniprot)
            command += "INSERT INTO {0} VALUES (@chain,@uniprot,{1.start},'{1.idxs}',{1.end},'{1.idxe}');\n".format(
                tables['pdbuniprot'], dbr)
    for h in pdbheader.hetero:
        if h.chain in self._chain_id:
            command += "INSERT IGNORE INTO {0} VALUES ('{1.id}','{1._name}','{1.form}', 0, 0);\n".format(
                tables['hetatm'], h)
            command += "SET @chain = (SELECT nid FROM {0} WHERE pdb='{1}' AND chain='{2}');\n".format(
                tables['chain'], self.id, h.chain)
            dataset = 0
            if int(h.pos) <= self.get_chain_by_id(h.chain).last_structure.number:
                dataset = 1
            command += "INSERT INTO {0}(chain,position,hetero,inchain) VALUES (@chain, {1.pos}, '{1.id}', {2});\n".format(
                tables['hetchn'], h, dataset)
    for m in pdbheader.molecules:
        command += "INSERT INTO {0} VALUES ();\n".format(tables['repeidx'])
        for c in pdbheader.molecules[m].chains:
            # Sometimes a chain described in the header has no relation in the coordinates.
            if c in self._chain_id:
                command += "SET @chain = (SELECT nid FROM {0} WHERE pdb='{1}' AND chain='{2}');\n".format(
                    tables['chain'], self.id, c)
                command += "INSERT INTO {0} VALUES (LAST_INSERT_ID(), @chain);\n".format(
                    tables['repe'])
    for s in pdbheader.sites:
        if pdbheader.sites[s].bind is not None:
            bdata = pdbheader.sites[s].bind
            if not isinstance(bdata[1], list):
                bdata[1] = bdata[1].split()
                if len(bdata[1]) == 1:
                    bdata[1].append(bdata[1][0][1:].strip())
                    bdata[1][0] = bdata[1][0][0]
            if len(bdata[1]) == 0:  # only one het of this type in the pdb
                for e in pdbheader.hetero:
                    if bdata[0] == e.id:
                        bdata[1].append(e.chain)
                        bdata[1].append(e.pos)
            import re
            posidx = re.compile(r'(\-*\d+)(\w*)')
            m = posidx.match(bdata[1][1])
            pos = m.group(1)
            idx = m.group(2)
            command += "SET @chain = (SELECT nid FROM {0} WHERE pdb='{1}' AND chain='{2[1][0]}');\n".format(
                tables['chain'], self.id, bdata)
            command += "SET @bind = (SELECT nid FROM {0} WHERE chain=@chain AND position={3} AND hetero='{2[0]}');\n".format(
                tables['hetchn'], self.id, bdata, pos)
            command += "INSERT INTO {0}(pdb,name,description,bind) VALUES ('{1}','{2.id}','{2.desc}', @bind);\n".format(
                tables['site'], self.id, pdbheader.sites[s])
        else:
            command += "INSERT INTO {0}(pdb,name,description) VALUES ('{1}','{2.id}','{2.desc}');\n".format(
                tables['site'], self.id, pdbheader.sites[s])
        for p in pdbheader.sites[s].spec:
            command += "SET @chain = (SELECT nid FROM {0} WHERE pdb='{1}' AND chain='{2}');\n".format(
                tables['chain'], self.id, p[1])
            num = p[2]
            try:
                int(num[-1])
                idx = ' '
            except ValueError:
                idx = num[-1]
                num = num[:-1]
            command += "INSERT IGNORE INTO {0} VALUES(LAST_INSERT_ID(),@chain,{1},'{2}','{3}');\n".format(
                tables['sitepos'], num, idx, p[0])
    command += self.innercontacts.toSQL()
    command += self.interfaces.toSQL()
    return command
        'auth_seq_id', 'auth_asym_id', 'sse_id', 'internal_num',
        'Cartn_x', 'Cartn_y', 'Cartn_z']).reset_index(drop=True)
binder = pd.DataFrame(binder, columns=[
    'auth_comp_id', 'auth_atom_id', 'auth_seq_id', 'auth_asym_id',
    'Cartn_x', 'Cartn_y', 'Cartn_z']).reset_index(drop=True)
self.log.debug(f'processing motif id: {identifier}')
mcolumns, bcolumns = motif.columns.tolist(), binder.columns.tolist()
initial = PDB(pd.concat([motif[bcolumns], binder[bcolumns]], sort=False))
# Get segments of interest
segments = []
for i, j, sse in case:
    if sse['id'] in attach:
        segment = pd.DataFrame(sse['metadata']['atoms'],
                               columns=['residue', 'auth_atom_id', 'resi_id',
                                        'Cartn_x', 'Cartn_y', 'Cartn_z'])
        segment = segment.assign(sse_id=[sse['id']] * len(segment))
def build(self, pick_aa: Optional[str] = None):
    """Build the parametric structure."""
    if self._MONO is None or self._PERIODE is None:
        raise NotImplementedError()

    # 1. Locate the center point for each residue we need to build.
    vector_module = float(self._PERIODE * (self.desc['length'] - 1))
    upper_bound = np.copy(np.array([0., 0., 0.], dtype='float64')) + np.array([0, vector_module / 2, 0])
    points = [np.copy(upper_bound) - np.array([0, self._PERIODE * x, 0])
              for x in range(self.desc['length'])]

    # 2. Build. For each point, build one periode at [0, 0, 0]; then rotate and shift.
    self.pdb = []
    _MONO = pd.DataFrame(self._MONO).T
    for i, p in enumerate(points):
        coords = rotate_degrees(_MONO.values, y=self._ROTATION * i)
        coords = translate(coords, p)
        self.pdb.append(coords)
    self.pdb = np.vstack(self.pdb)

    # We want undirected structures to always start looking up.
    self.pdb = rotate_degrees(self.pdb, x=180)

    # Apply the case-defined placements for each structure.
    if TBcore.get_option('system', 'debug'):
        sys.stdout.write('tilt: ' + str(self.desc['tilt']) + '\n')
        sys.stdout.write('move: ' + str(self.desc['coordinates']) + '\n')
    self.pdb = rotate_degrees(self.pdb, x=self.desc['tilt']['x'],
                              y=self.desc['tilt']['y'], z=self.desc['tilt']['z'])
    self.pdb = translate(self.pdb, [self.desc['coordinates']['x'],
                                    self.desc['coordinates']['y'],
                                    self.desc['coordinates']['z']])

    # Prepare other data to create a coordinate entity.
    resis = np.repeat(list(range(1, i + 2)), _MONO.shape[0])
    atoms = np.asarray([_MONO.index.values, ] * (i + 1)).flatten()

    # Prepare the sequence.
    sequence = []
    if pick_aa is not None:
        pick_aa = pick_aa if len(pick_aa) == 3 else alphabet.aminoacids1to3(pick_aa)
        sequence = [pick_aa, ] * self.desc['length']
    else:
        for _ in range(self.desc['length']):
            sequence.append(alphabet.aminoacids1to3(weighted_choice(self._AA_STAT)))
    sequence = np.repeat(np.asarray(sequence), _MONO.shape[0])

    self.pdb = PDB(pd.DataFrame(self.pdb, columns=["Cartn_x", "Cartn_y", "Cartn_z"])
                   .assign(auth_comp_id=sequence)
                   .assign(auth_atom_id=atoms)
                   .assign(auth_seq_id=resis)
                   .assign(id=list(range(1, self.pdb.shape[0] + 1))))
def renumber_pdb(config, path, pdb_name, sequences, dummy_dir):
    '''
    Renumber a PDB file located in the 'path' folder with the real sequences.
    path       Folder where the PDB file is located
    pdb_name   PDB file
    sequences  Dictionary of sequences (ProteinSequence class from SeqIO) that
               define the amino-acid numbering; the chain identifier is the key
    dummy_dir  Dummy directory to create files
    '''
    # Initialize
    from SBI.structure.chain import Chain
    from SBI.sequence import Sequence
    from SBI.structure import PDB
    from Bio import SeqIO
    from Bio import ExPASy
    from Bio import AlignIO
    from Bio.Align import Applications

    clustal_exe = os.path.join(config.get('Paths', 'clustal_path'), 'clustalw2')
    name_pdb = ".".join(pdb_name.split('/')[-1].split('.')[:-1])
    new_pdb = PDB()
    pdb_file = os.path.join(path, pdb_name)
    pdb = PDB(pdb_file)
    pdb.clean()
    for chain_id, chain_seq in sequences.iteritems():
        name_chain = name_pdb + "_" + chain_id
        name_seq = chain_seq.get_identifier()
        pdb_chain = pdb.get_chain_by_id(chain_id)
        new_chain = Chain(name_pdb, chain_id)
        # Define/create files
        infile = dummy_dir + "/tmp_" + name_chain + "_" + name_seq + ".fa"
        outfile = dummy_dir + "/tmp_" + name_chain + "_" + name_seq + ".aln"
        dndfile = dummy_dir + "/tmp_" + name_chain + "_" + name_seq + ".dnd"
        fd = open(infile, "w")
        fd.write(">{0:s}\n{1:s}\n".format(name_chain, pdb_chain.protein_sequence))
        fd.write(">{0:s}\n{1:s}\n".format(name_seq, chain_seq.get_sequence()))
        fd.close()
        try:
            # Run clustalw2
            msa_cline = Applications.ClustalwCommandline(clustal_exe, infile=infile,
                                                         outfile=outfile)
            child = subprocess.Popen(str(msa_cline), stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE, shell=True)
            child.communicate()
            # Store the alignment to compare
            alignment = AlignIO.read(outfile, 'clustal')
            structure = alignment[0].seq
            reference = alignment[1].seq
            try:
                len_3d = len(structure)
                len_ref = len(reference)
            except Exception as e:
                sys.stderr.write("ERROR: %s\n" % e)
                return e
        except Exception as e:
            sys.stderr.write("ERROR: %s\n" % e)
            return e
        # Remove temporary fasta and alignment files
        remove_files([infile, outfile, dndfile])
        # Mapping of residues to the original sequence
        mapping = create_mapping(pdb_chain.protein_idx.split(";"), structure, reference)
        # Fill the new chain with the correct numbering of residues
        for residue in pdb_chain.aminoacids:
            pair = (str(residue.number), residue.version)
            number, version = mapping.get(pair)
            residue.number = number
            residue.version = version
            new_chain.add_residue(residue)
        # Fill the new pdb
        new_pdb.add_chain(new_chain)
    return new_pdb
from default_config.masif_opts import masif_opts

in_fields = sys.argv[1].split('_')
pdb_id = in_fields[0]

if not os.path.exists(masif_opts['ligand']['ligand_coords_dir']):
    os.mkdir(masif_opts['ligand']['ligand_coords_dir'])

# Ligands of interest
ligands = ['ADP', 'COA', 'FAD', 'HEM', 'NAD', 'NAP', 'SAM']
structure_ligands_type = []
structure_ligands_coords = []
try:
    structure = PDB(os.path.join(masif_opts['ligand']['assembly_dir'],
                                 '{}.pdb'.format(pdb_id)))
except Exception:
    print('Problem with opening structure', pdb_id)
for chain in structure.chains:
    for het in chain.heteroatoms:
        # Check all ligands in the structure and save the coordinates of those of interest
        if het.type in ligands:
            structure_ligands_type.append(het.type)
            structure_ligands_coords.append(het.all_coordinates)

np.save(
    os.path.join(masif_opts['ligand']['ligand_coords_dir'],
                 '{}_ligand_types.npy'.format(pdb_id)),
    structure_ligands_type)
np.save(
    os.path.join(masif_opts['ligand']['ligand_coords_dir'],
                 '{}_ligand_coords.npy'.format(pdb_id)),
# Include rotools without adding them to the PYTHONPATH
scrdir = os.path.dirname(os.path.realpath(__file__))
rotools = os.path.join(scrdir, "../..")
sys.path.append(rotools)
from rotools.constraints import ConstraintSet


def similar_to(d, mind, ca, cb, geo, bck):
    # Report which measured distance falls within +/-3 of the expected one.
    if mind >= float(d) - 3 and mind <= float(d) + 3:
        return "MIN"
    if ca >= float(d) - 3 and ca <= float(d) + 3:
        return "CA"
    if cb >= float(d) - 3 and cb <= float(d) + 3:
        return "CB"
    if geo >= float(d) - 3 and geo <= float(d) + 3:
        return "GEO"
    if bck >= float(d) - 3 and bck <= float(d) + 3:
        return "BCK"
    return "NONE"


cs = ConstraintSet.parse(sys.argv[1])
for f in os.listdir(os.getcwd()):
    if f.endswith('pdb'):
        data = []
        inc = InnerContacts(PDB(f), AA=True, AA_distance=35, HT=False)
        for cont in inc.AAcontacts[0].contacts:
            if cs.has_contact(cont.aminoacid1.number, cont.aminoacid2.number):
                d = cs.get_contact(cont.aminoacid1.number, cont.aminoacid2.number).value
                s = similar_to(d, cont.min_distance, cont.ca_distance,
                               cont.cb_distance, cont.geometric_distance,
                               cont.backbone_distance)
                data.append(s)
                print f, cont.aminoacid1.number, cont.aminoacid2.number, d, cont.min_distance, cont.ca_distance, cont.cb_distance, cont.geometric_distance, cont.backbone_distance, s
        cc = Counter(data)
        print "SUMMARY", f, cc, len(data)
# if case.data['metadata']['binder']:
# Binder
if 'binder' in case.data['metadata']:
    # full_structure = [pdb, ]
    binders = []
    for key in case.data['metadata']['binder']:
        binder = case.data['metadata']['binder'][key]
        binders.append(binder)
        binderfile = os.path.dirname(str(pdb_file)) + f'/binder_{key}.pdb'
        binder_chains.extend(binder['auth_asym_id'].drop_duplicates().tolist())
        # full_structure.append(binder)
    log.debug(f'Adding binder chains: {binder_chains}')
    bindersfile = os.path.dirname(str(pdb_file)) + '/binders.pdb'
    log.notice(f'Writing structure {bindersfile}')
    binders = PDB(pd.concat(binders, sort=False))
    binders.write(bindersfile, format='pdb', clean=True,
                  force=TBcore.get_option('system', 'overwrite'))
    full_structure = PDB(pd.concat([pdb[columns], binders[columns]], sort=False))
    log.notice(f'Writing structure {full_file}')
    full_structure.write(str(full_file), format='pdb', clean=True,
                         force=TBcore.get_option('system', 'overwrite'))
# else:
#     pdb.write(str(pdb_file), format='pdb', clean=True,
#               force=TBcore.get_option('system', 'overwrite'))
def main():
    # Initialize
    options = parse_user_arguments()
    verbose = options.show
    pdb_path = os.path.join(config.get('Paths', 'modppi_path'),
                            config.get('Paths', 'pdb_path'))
    dummy_dir = options.dummy_dir
    try:
        did_path = os.path.join(config.get('Paths', 'modppi_path'),
                                config.get('Paths', '3did_path'))
        data_path = os.path.join(config.get('Paths', 'modppi_path'),
                                 config.get('Paths', 'data_path'))
    except Exception:
        did_path = options.outdir
        data_path = options.outdir
    if not os.path.exists(did_path):
        os.makedirs(did_path)
    if not os.path.exists(dummy_dir):
        os.makedirs(dummy_dir)
    if not os.path.exists(data_path):
        sys.stderr.write("No DATA directory, please check your installation or INPUT\n")

    # Parse the 3DID flat file
    did = parse_3did(options)

    # Create PDB files of 3DID interactions
    for dd, cases in did.iteritems():
        for label in xrange(0, len(cases)):
            # Define the name of the PDB output file with domain-domain interactions
            did_file = os.path.join(did_path,
                                    dd[0] + ":" + dd[1] + "#" + str(label) + ".brk.gz")
            if not os.path.exists(did_file.lower()):
                did_file = os.path.join(did_path,
                                        dd[0] + ":" + dd[1] + "#" + str(label) + ".brk")
            if not os.path.exists(did_file.lower()):
                if verbose:
                    sys.stderr.write("\t\t--Create %s\n" % (did_file.lower()))
                pdb_code, d1, d2 = cases[label]
                pdb_file = os.path.join(pdb_path, pdb_code[1:3].lower(),
                                        "pdb" + pdb_code + ".ent")
                if not os.path.exists(pdb_file):
                    pdb_file = os.path.join(pdb_path, pdb_code[1:3].lower(),
                                            "pdb" + pdb_code + ".ent.gz")
                if not os.path.exists(pdb_file):
                    if verbose:
                        sys.stderr.write("\t\t\t-- %s not found\n" % pdb_file)
                    continue
                try:
                    pdb = PDB(pdb_file)
                    brk = PDB()
                    pdb_chain_A = pdb.get_chain_by_id(d1[0])
                    start_A = d1[1]
                    end_A = d1[2]
                    pdb_chain_B = pdb.get_chain_by_id(d2[0])
                    start_B = d2[1]
                    end_B = d2[2]
                    brk_chain_A = pdb_chain_A.extract(init=start_A, end=end_A)
                    brk_chain_A.chain = "A"
                    brk.add_chain(brk_chain_A)
                    brk_chain_B = pdb_chain_B.extract(init=start_B, end=end_B)
                    brk_chain_B.chain = "B"
                    brk.add_chain(brk_chain_B)
                    brk.clean()
                    brk.write(did_file.lower())
                except Exception as e:
                    if verbose:
                        sys.stderr.write("\t\t\t Error: %s\n" % e)
                    continue

    # Create the list of interactions and FASTA sequences of 3DID
    did_interactions = open(os.path.join(data_path, options.interactions_file), "w")
    did_fasta = open(os.path.join(data_path, options.seq_file), "w")
    for brk in os.listdir(did_path):
        if verbose:
            sys.stderr.write("\t\t-- Reading %s \n" % os.path.join(did_path, brk))
        try:
            pdb = PDB(os.path.join(did_path, brk))
            id_chain = []
            for c in pdb.chain_identifiers:
                pdb_chain = pdb.get_chain_by_id(c)
                id_chain.append(pdb.id + "_" + c)
                printfasta(did_fasta, pdb.id + "_" + c, pdb_chain.gapped_protein_sequence)
            did_interactions.write("%s\t%s\n" % (id_chain[0], id_chain[1]))
        except Exception as e:
            if verbose:
                sys.stderr.write("\t\t-- %s cannot be read\n\t\t Error: %s\n"
                                 % (os.path.join(did_path, brk), e))
            continue
    did_interactions.close()
    did_fasta.close()
def add_hydrogens(config, path, inp, out, dummy_dir):
    # Initialize
    from SBI.structure import PDB
    import shutil

    src_path = config.get('Paths', 'modppi_path')
    hbplus = config.get('Paths', 'hbplus_path')
    reduce_exe = config.get('Paths', 'reduce_path')
    reduce_db = config.get('Paths', 'reduce_db_path')
    relax_exe = config.get('Paths', 'relax_exe')
    hydrogen_type = config.get('Parameters', 'hydrogens')
    relax = config.get('Parameters', 'relax')
    cwd = os.getcwd()
    os.chdir(path)
    if fileExist(inp):
        if len(inp.split('.')) > 0:
            output_hbplus = ".".join(inp.split('.')[:-1]) + ".h"
        else:
            output_hbplus = inp.strip() + ".h"
        if hydrogen_type == "full":
            os.system("%s -Quiet %s -DB %s > %s" % (reduce_exe, inp, reduce_db, output_hbplus))
        else:
            os.system("%s -o %s >& hbplus.log" % (hbplus, inp))
        if relax == "yes":
            sys.stdout.write("\t\t\t-- Relaxing the hydrogen-intermediate model %s "
                             "(see Rosetta output in relax.log and score.sc)...\n" % output_hbplus)
            os.system("%s -s %s -in:file:fullatom -nstruct 1 -packing:repack_only "
                      "-relax:jump_move false >& relax.log" % (relax_exe, output_hbplus))
            opt_model = ".".join(output_hbplus.split('.')[:-1]) + "_0001.pdb"
            old_model = ".".join(output_hbplus.split('.')[:-1]) + "_non_optimized.pdb"
            shutil.move(output_hbplus, old_model)
            if fileExist(opt_model):
                check_pdb = PDB(opt_model)
                if check_pdb.has_protein:
                    check_pdb.clean()
                    check_pdb.write(output_hbplus)
                    try:
                        os.remove(opt_model)
                    except OSError:
                        sys.stdout.write("\t\t\t-- Keeping old file %s ...\n" % opt_model)
                else:
                    shutil.copy(old_model, output_hbplus)
            else:
                shutil.copy(old_model, output_hbplus)
            # Clean files
            if fileExist(opt_model):
                if fileExist(os.path.join(dummy_dir, opt_model)):
                    os.remove(opt_model)
                else:
                    shutil.move(opt_model, dummy_dir)
            if fileExist(old_model):
                if fileExist(os.path.join(dummy_dir, old_model)):
                    os.remove(old_model)
                else:
                    shutil.move(old_model, dummy_dir)
        if not fileExist(output_hbplus):
            raise ValueError("Cannot find file with hydrogen atoms")
        else:
            pdb = PDB(output_hbplus)
            pdb.clean()
            pdb.write(out, force=True)
    os.chdir(cwd)
def build_pdb_object(
        log: Logger,
        sses: List[Dict],
        loops: Union[List[int], int],
        concat: Optional[bool] = True,
        outfile: Optional[Union[str, Path]] = None) -> Tuple[Frame3D, List[int]]:
    """Make the parametrically built atoms in a :class:`.Case` into a PDB file.

    :param log: Job logger.
    :param sses: List of the secondary structures to build. Each SSE dictionary must
        contain the ``metadata.atoms`` keys, already in the final expected position.
    :param loops: Number of residues between SSE. It can be one less than the number
        of structures, which assumes no N- or C-terminal, or one more, which assumes
        N- and C-terminal residues.
    :param concat: When :data:`True`, return the full structure as a single object,
        otherwise return a list of the individual parts.
    :param outfile: If provided, write the structure to file.
    """
    if isinstance(loops, int):
        loops = [loops, ] * (len(sses) - 1)
    if len(loops) != len(sses) - 1:
        raise ValueError('Number of loops should equal number of SSE minus one.')

    pieces = []
    columns = ['auth_comp_id', 'auth_atom_id', 'auth_seq_id',
               'Cartn_x', 'Cartn_y', 'Cartn_z']
    start = 1 if len(loops) < len(sses) else loops.pop(0)
    log.debug(f'starting numbering with: {start}')

    for i, sse in enumerate(sses):
        start = start if i == 0 else int(sses[i - 1]['length']) + loops[i - 1] + start
        pdb_numbering = pd.DataFrame(sse['metadata']['atoms'],
                                     columns=columns)['auth_seq_id'].values
        try:
            structure = PDB(pd.DataFrame(sse['metadata']['atoms'],
                                         columns=columns)).renumber(start)
        except Exception:
            structure = PDB(pd.DataFrame(sse['metadata']['atoms'], columns=columns))
            structure['auth_seq_id'] += (start - structure['auth_seq_id'].values[0])
        structure = structure.assign(sse_id=[sse["id"]] * len(structure),
                                     pdb_num=pdb_numbering)
        pieces.append(structure)

    structure = pd.concat(pieces, sort=False).reset_index()
    structure['id'] = list(range(1, structure.shape[0] + 1))
    if outfile is not None:
        structure.write(output_file=str(outfile), format='pdb', clean=True,
                        force=TBcore.get_option('system', 'overwrite'))
    if not concat:
        return pieces
    return structure, [int(p.iloc[-1]['auth_seq_id']) for p in pieces]
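# A minimal, hypothetical usage sketch for build_pdb_object. The SSE dictionaries
# follow the metadata.atoms layout the function reads above, but the atom record
# lists (h1_atoms, h2_atoms), the lengths and the logger are illustrative
# assumptions built elsewhere.
#
# import logging
#
# log = logging.getLogger('build_demo')
#
# # Each SSE dict needs 'id', 'length' and 'metadata.atoms' rows shaped like
# # (auth_comp_id, auth_atom_id, auth_seq_id, Cartn_x, Cartn_y, Cartn_z).
# helix1 = {'id': 'H1', 'length': 10, 'metadata': {'atoms': h1_atoms}}
# helix2 = {'id': 'H2', 'length': 10, 'metadata': {'atoms': h2_atoms}}
#
# # Place 4 loop residues between the two helices and write the result to file.
# structure, sse_ends = build_pdb_object(log, [helix1, helix2], loops=4,
#                                        outfile='h1_h2.pdb')
# log.info(f'SSE terminal residues: {sse_ends}')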
from default_config.masif_opts import masif_opts

in_fields = sys.argv[1].split("_")
pdb_id = in_fields[0]

if not os.path.exists(masif_opts["ligand"]["ligand_coords_dir"]):
    os.mkdir(masif_opts["ligand"]["ligand_coords_dir"])

# Ligands of interest
ligands = ["ADP", "COA", "FAD", "HEM", "NAD", "NAP", "SAM"]
structure_ligands_type = []
structure_ligands_coords = []
try:
    structure = PDB(
        os.path.join(masif_opts["ligand"]["assembly_dir"], "{}.pdb".format(pdb_id))
    )
except Exception:
    print("Problem with opening structure", pdb_id)
for chain in structure.chains:
    for het in chain.heteroatoms:
        # Check all ligands in the structure and save the coordinates of those of interest
        if het.type in ligands:
            structure_ligands_type.append(het.type)
            structure_ligands_coords.append(het.all_coordinates)

np.save(
    os.path.join(
        masif_opts["ligand"]["ligand_coords_dir"], "{}_ligand_types.npy".format(pdb_id)
    ),
    structure_ligands_type,
)
def modelling(queriesA_original, queriesB_original, queriesA, queriesB,
              hit_items_A, hit_items_B, sections_modeled,
              remaining_sections_A, remaining_sections_B, options):
    # Initialize
    verbose = options.show
    output_dir = options.outdir
    dummy_dir = options.dummy_dir
    hydrogens = options.hbplus
    force_model = options.force
    python_path = config.get('Paths', 'python_path')
    src_path = config.get('Paths', 'modppi_path')
    modeller_path = os.path.join(config.get('Paths', 'modeller_path'))
    modpy_path = os.path.join(src_path, config.get('Paths', 'functions_path'), "modpy")
    numMod = options.nmodels
    renumerate = options.renumerate

    # Assign the PID to the dummy modelling folder to avoid overwriting files
    modelling_dummy_name = 'modelling_' + str(os.getpid()) + str(random.randint(0, os.getpid()))
    make_subdirs(dummy_dir, subdirs=[modelling_dummy_name])
    modelling_dir = os.path.join(dummy_dir, modelling_dummy_name)

    # Get items from the hits
    query_A_orig = queriesA_original.get(hit_items_A[0])
    query_B_orig = queriesB_original.get(hit_items_B[0])
    query_A = queriesA.get(hit_items_A[0]).get_sequence()
    query_B = queriesB.get(hit_items_B[0]).get_sequence()
    query_name_A = hit_items_A[0]
    query_name_B = hit_items_B[0]
    query_id_A = query_name_A.split(':')[0]
    query_start = hit_items_A[4][0]
    query_end = int(hit_items_A[4][-1]) + int(hit_items_B[4][-1])
    template_name_A_chain = hit_items_A[1]
    template_name_B_chain = hit_items_B[1]
    template_chain_A_chain = template_name_A_chain.split('_')[-1]
    template_chain_B_chain = template_name_B_chain.split('_')[-1]
    template_A_chain_start = hit_items_A[5][0]
    template_B_chain_start = hit_items_B[5][0]
    template_id_A = "_".join(template_name_A_chain.split('_')[:-1])
    template_id_B = "_".join(template_name_B_chain.split('_')[:-1])
    sequences_complex = {}
    sequences_complex.setdefault("A", query_A_orig)
    sequences_complex.setdefault("B", query_B_orig)

    # Get the positions of the current section
    extension_threshold = int(config.get('Parameters', 'extension_threshold'))
    current_A_section = [hit_items_A[4][0], hit_items_A[4][-1]]
    current_B_section = [hit_items_B[4][0], hit_items_B[4][-1]]
    current_sections = [current_A_section, current_B_section]
    current_interaction = '%s::%s' % (query_name_A, query_name_B)

    # Initialize the 'sections_modeled' dictionary
    if not sections_modeled.get(current_interaction):
        section_group = sections_modeled.setdefault(current_interaction, [])
        section_group.append(current_sections)
    # Check if the segments of the current interaction belong to a previous group
    for section_pair in sections_modeled.get(current_interaction):
        # The segments must be within a given interval
        if (section_pair[0][0] - extension_threshold <= current_sections[0][0] <= section_pair[0][0] + extension_threshold
                and section_pair[0][1] - extension_threshold <= current_sections[0][1] <= section_pair[0][1] + extension_threshold
                and section_pair[1][0] - extension_threshold <= current_sections[1][0] <= section_pair[1][0] + extension_threshold
                and section_pair[1][1] - extension_threshold <= current_sections[1][1] <= section_pair[1][1] + extension_threshold):
            current_sections = section_pair
            break
    else:
        # If the segments are not within the interval, create a new group
        section_group = sections_modeled.setdefault(current_interaction, [])
        section_group.append(current_sections)

    # Get the sections that have not been used in the alignment
    query_A_fragment_used = hit_items_A[2].replace('-', '')
    query_B_fragment_used = hit_items_B[2].replace('-', '')
    remaining_terminus_A = query_A.split(query_A_fragment_used)
    remaining_terminus_B = query_B.split(query_B_fragment_used)
    Nterminus_name_A = '%s_1-%s' % (query_name_A, hit_items_A[4][0] - 1)
    Cterminus_name_A = '%s_%s-%s' % (query_name_A, hit_items_A[4][-1] + 1, len(query_A))
    Nterminus_name_B = '%s_1-%s' % (query_name_B, hit_items_B[4][0] - 1)
    Cterminus_name_B = '%s_%s-%s' % (query_name_B, hit_items_B[4][-1] + 1, len(query_B))
    # If there are remaining sections, store them in the dictionary
    if hit_items_A[4][0] > 1:
        remaining_sections_A[Nterminus_name_A] = ProteinSequence(Nterminus_name_A, remaining_terminus_A[0])
    if hit_items_A[4][-1] < len(query_A):
        remaining_sections_A[Cterminus_name_A] = ProteinSequence(Cterminus_name_A, remaining_terminus_A[-1])
    if hit_items_B[4][0] > 1:
        remaining_sections_B[Nterminus_name_B] = ProteinSequence(Nterminus_name_B, remaining_terminus_B[0])
    if hit_items_B[4][-1] < len(query_B):
        remaining_sections_B[Cterminus_name_B] = ProteinSequence(Cterminus_name_B, remaining_terminus_B[-1])

    # Create a LOG for tests
    if verbose:
        dummy_log_file = "%s/%s.log" % (modelling_dir, template_id_A)
        dummy_log = open(dummy_log_file, "a")

    # Create the PDB file
    if verbose:
        sys.stdout.write('\t\t-- Using templates %s and %s...\n'
                         % (template_name_A_chain, template_name_B_chain))
    pdb_name = template_id_A
    dummy_pdb_file = '%s/%s.pdb' % (modelling_dir, pdb_name.replace(":", "-"))
    # Initialize the PDB object
    pdb_obj = PDB()
    # Check the template among the PDB files
    src_path = config.get('Paths', 'modppi_path')
    pdb_path = os.path.join(src_path, config.get('Paths', 'pdb_path'), template_id_A[1:3].lower())
    pdb_file = os.path.join(pdb_path, 'pdb' + template_id_A.lower() + '.ent')
    if not os.path.exists(pdb_file):
        sys.stderr.write('WARNING: PDB file %s was not found, trying compressed\n' % (pdb_file))
        pdb_file = os.path.join(pdb_path, 'pdb' + template_id_A.lower() + '.ent.gz')
    # Now check the template among the 3DID files
    if not os.path.exists(pdb_file):
        sys.stderr.write('WARNING: PDB file %s was not found, trying 3DID ".brk" suffix\n' % (pdb_file))
        pdb_path = os.path.join(src_path, config.get('Paths', '3did_path'))
        pdb_file = os.path.join(pdb_path, template_id_A.lower() + '.brk')
    if not os.path.exists(pdb_file):
        sys.stderr.write('WARNING: PDB file %s was not found, trying 3DID ".brk" suffix compressed\n' % (pdb_file))
        pdb_file = os.path.join(pdb_path, template_id_A.lower() + '.brk.gz')
    # If the PDB file is not found in the database, skip to the next interaction
    if not os.path.exists(pdb_file):
        sys.stderr.write('WARNING: PDB file %s was not found\n' % (pdb_file))
        raise ModelException
    pdb_chain_obj = PDB(pdb_file)
    pdb_chain_obj.clean()
    # Add only the chains present in the alignment
    pdb_obj.add_chain(pdb_chain_obj.get_chain_by_id(template_chain_A_chain))
    pdb_obj.add_chain(pdb_chain_obj.get_chain_by_id(template_chain_B_chain))
    # Get sequences from the PDB, where 'x' are gaps and 'X' are heteroatoms
    pdb_seqA = pdb_obj.chains[0].gapped_protein_sequence.replace('x', '-').replace('X', '.')
    pdb_seqB = pdb_obj.chains[1].gapped_protein_sequence.replace('x', '-').replace('X', '.')
    # Create the dummy PDB file
    pdb_obj.clean()
    pdb_obj.write(output_file=dummy_pdb_file, force=True)

    # Check contacts
    PPI_threshold_type = config.get('Parameters', 'PPI_threshold_type')
    PPI_distance_threshold = float(config.get('Parameters', 'PPI_distance_threshold'))
    protein_complex = Complex(pdb_obj, PPI_type=PPI_threshold_type,
                              PPI_distance=PPI_distance_threshold)
    # If the proteins do not form a complex, avoid modelling
    if len(protein_complex.PPInterfaces[0].contacts) == 0:
        sys.stderr.write('WARNING: No interaction between %s and %s (for %s %s)\n'
                         % (template_name_A_chain, template_name_B_chain,
                            query_name_A, query_name_B))
        remove_files([dummy_pdb_file])
        raise ModelException
    else:
        if verbose:
            sys.stdout.write('\t\t\t-- Accepted interaction between %s and %s (for %s %s)...\n'
                             % (template_name_A_chain, template_name_B_chain,
                                query_name_A, query_name_B))

    # Correct possible discrepancies between the template sequence found in the FASTA
    # file of the nodes in the PIN and the sequence found in the PDB file,
    # e.g. a protein can have an 'X' in the FASTA file and an 'M' in the newest PDB file
    template_seqA = hit_items_A[3]
    template_seqA_ungapped = re.sub('-', '', template_seqA)
    pdbA_section = pdb_seqA[hit_items_A[5][0] - 1:hit_items_A[5][-1]]
    for pair in itertools.izip(template_seqA_ungapped, pdbA_section):
        if pair[0] == 'X' or pair[0] == 'x':
            template_seqA = re.sub('[xX]', pair[1], template_seqA, 1)
    template_seqB = hit_items_B[3]
    template_seqB_ungapped = re.sub('-', '', template_seqB)
    pdbB_section = pdb_seqB[hit_items_B[5][0] - 1:hit_items_B[5][-1]]
    for pair in itertools.izip(template_seqB_ungapped, pdbB_section):
        if pair[0] == 'X' or pair[0] == 'x':
            template_seqB = re.sub('[xX]', pair[1], template_seqB, 1)
    if verbose:
        dummy_log.write("Hits_items_A: %s\n" % ([str(x) for x in hit_items_A]))
        dummy_log.write("Hits_items_B: %s\n" % ([str(x) for x in hit_items_B]))
        dummy_log.write("pdbA_section %s\n" % pdbA_section)
        dummy_log.write("pdbB_section %s\n" % pdbB_section)
        dummy_log.write("length PDB A: %d\n" % len(pdb_seqA))
        dummy_log.write("length PDB B: %d\n" % len(pdb_seqB))

    # Add the remaining residues at the beginning or end of the template sequences, if needed
    template_seqA = re.sub('[xX]', '-', template_seqA)
    if template_A_chain_start > 1:
        template_A_first_residues = ''.join(pdb_seqA[:hit_items_A[5][0] - 1])
        template_seqA = template_A_first_residues + template_seqA
    if hit_items_A[5][-1] < len(pdb_seqA):
        template_seqA += ''.join(pdb_seqA[hit_items_A[5][-1]:])
    template_seqB = re.sub('[xX]', '-', template_seqB)
    if template_B_chain_start > 1:
        template_B_first_residues = ''.join(pdb_seqB[:hit_items_B[5][0] - 1])
        template_seqB = template_B_first_residues + template_seqB
    if hit_items_B[5][-1] < len(pdb_seqB):
        template_seqB += ''.join(pdb_seqB[hit_items_B[5][-1]:])
    if verbose:
        dummy_log.write("FINAL template_seqA %s\n" % template_seqA)
        dummy_log.write("FINAL template_seqB %s\n" % template_seqB)

    # Add gaps at the beginning of the query sequences, if needed
    gaps_number_A_chain_beginning = 0
    gaps_number_B_chain_beginning = 0
    if template_A_chain_start > 1:
        gaps_number_A_chain_beginning = int(template_A_chain_start) - 1
    if template_B_chain_start > 1:
        gaps_number_B_chain_beginning = int(template_B_chain_start) - 1
    A_chain_query_seq = ''.join(['-' for i in range(gaps_number_A_chain_beginning)]) + re.sub('[xX]', '-', hit_items_A[2])
    B_chain_query_seq = ''.join(['-' for i in range(gaps_number_B_chain_beginning)]) + re.sub('[xX]', '-', hit_items_B[2])
    # Add gaps at the end of the query sequences, if needed
    for pair in itertools.izip_longest(A_chain_query_seq, template_seqA):
        if pair[0] is None:
            A_chain_query_seq += '-'
    for pair in itertools.izip_longest(B_chain_query_seq, template_seqB):
        if pair[0] is None:
            B_chain_query_seq += '-'

    # Create the PIR alignment
    query_whole_seq = A_chain_query_seq + '/' + B_chain_query_seq + '*'
    template_whole_seq = template_seqA + '/' + template_seqB + '*'
    header1 = '>P1;%s\nsequence:%s:%s:.:%s:.:.:.:.:.' % (query_id_A, query_id_A, query_start, query_end)
    header2 = '>P1;%s\nstructureX:%s:1:%s:.:%s:.:.:.:.' % (template_id_A.replace(":", "-"), template_id_A.replace(":", "-"), template_chain_A_chain, template_chain_B_chain)
    lines = []
    lines.append(header1)
    lines.extend([query_whole_seq[i:i + 60] for i in range(0, len(query_whole_seq), 60)])
    lines.append(header2)
    lines.extend([template_whole_seq[i:i + 60] for i in range(0, len(template_whole_seq), 60)])
    pir_alignment = '\n'.join(lines)
    pir_file = open('%s/alignment.pir' % (modelling_dir), 'w+')
    for line in lines:
        pir_file.write('%s\n' % (line))
    pir_file.close()

    # Model
    # Create a folder for the models of each type of interaction
    if '-' in query_name_A:
        query_name_A = query_name_A.rsplit('_', 1)[0]
    if '-' in query_name_B:
        query_name_B = query_name_B.rsplit('_', 1)[0]
    interaction_dir = os.path.join(output_dir, '%s::%s' % (query_name_A, query_name_B))
    if not os.path.exists(interaction_dir):
        make_subdirs(output_dir, subdirs=['./%s::%s' % (query_name_A, query_name_B)])
    # If the models do not yet exist, proceed and add them to the list of MODELS
    do_model = False
    model_path = os.path.abspath(interaction_dir)
    for imodel in xrange(1, numMod + 1):
        model_name = '%s_%s_%d-%d::%s_%s_%d-%d#%d.pdb' % (template_id_A, template_chain_A_chain, current_sections[0][0], current_sections[0][1], template_id_B, template_chain_B_chain, current_sections[1][0], current_sections[1][1], imodel)
        model_path_model = os.path.join(model_path, model_name)
        with open(interaction_dir + '/%s.list' % (current_interaction), 'a+') as paths_to_models_file:
            if model_path_model not in paths_to_models_file.read():
                paths_to_models_file.write(model_path_model + '\n')
        if not do_model and not fileExist(model_path_model):
            do_model = True
    # Complete the set of models
    if do_model or force_model:
        # Keep the current working directory, then change to the modelling folder
        cwd = os.getcwd()
        os.chdir(modelling_dir)
        try:
            if options.optimize:
                process = subprocess.check_output(
                    [os.path.join(modeller_path, 'modpy.sh'),
                     os.path.join(python_path, 'python'),
                     os.path.join(modpy_path, 'simpleModel.py'),
                     '--pir=' + './alignment.pir',
                     '--out=%s-%s' % (template_name_A_chain, template_name_B_chain),
                     '--models=%d' % (numMod), '--optimize'],
                    stderr=subprocess.STDOUT)
            else:
                process = subprocess.check_output(
                    [os.path.join(modeller_path, 'modpy.sh'),
                     os.path.join(python_path, 'python'),
                     os.path.join(modpy_path, 'simpleModel.py'),
                     '--pir=' + './alignment.pir',
                     '--out=%s-%s' % (template_name_A_chain, template_name_B_chain),
                     '--models=%d' % (numMod)],
                    stderr=subprocess.STDOUT)
        except Exception as e:
            sys.stderr.write("ERROR: %s\n" % (e))
            sys.stderr.write("LOCATION: %s\n" % modelling_dir)
            if verbose:
                os.system("grep get_ran %s" % (template_name_A_chain + "-" + template_name_B_chain + ".log"))
                sys.stderr.write("\t\tSkip models with template %s\n" % (model_name))
            os.chdir(cwd)
            raise ModelException
        # Clean and rename all models
        for imodel in xrange(1, numMod + 1):
            label_model = 99990000 + imodel
            input_model = '%s.B%s.pdb' % (query_id_A, str(label_model))
            model_name = '%s_%s_%d-%d::%s_%s_%d-%d#%d.pdb' % (template_id_A, template_chain_A_chain, current_sections[0][0], current_sections[0][1], template_id_B, template_chain_B_chain, current_sections[1][0], current_sections[1][1], imodel)
            model_path_model = os.path.join(model_path, model_name)
            if fileExist(os.path.abspath('%s' % (input_model))):
                # Check contacts
                check_pdb_obj = PDB(os.path.abspath('%s' % (input_model)))
                PPI_threshold_type = config.get('Parameters', 'PPI_threshold_type')
                PPI_distance_threshold = float(config.get('Parameters', 'PPI_distance_threshold'))
                check_protein_complex = Complex(check_pdb_obj, PPI_type=PPI_threshold_type,
                                                PPI_distance=PPI_distance_threshold)
                if len(check_protein_complex.PPInterfaces[0].contacts) == 0:
                    if verbose:
                        sys.stdout.write("\t\t\t-- Skip model without contacts %s\n" % model_name)
                    continue
                else:
                    if verbose:
                        sys.stdout.write("\t\t\t-- Accepted model %s\n" % model_name)
                if hydrogens:
                    if verbose:
                        sys.stdout.write("\t\t\t-- Adding hydrogens and relaxing the model %s\n" % model_name)
                    output_model = model_name
                    try:
                        add_hydrogens(config, os.path.abspath("./"), input_model, output_model, dummy_dir)
                    except ValueError as e:
                        sys.stderr.write("WARNING %s\n" % e)
                        os.rename(input_model, output_model)
                else:
                    output_model = model_name
                    os.rename(input_model, output_model)
                if renumerate:
                    if verbose:
                        sys.stdout.write("\t\t\t-- Renumbering residues as in the original sequence\n")
                    output_model_renumber = model_name + ".re"
                    try:
                        pdb_renumber = PDB()
                        pdb_renumber = renumber_pdb(config, os.path.abspath("./"), output_model, sequences_complex, os.path.abspath("./"))
                        pdb_renumber.write(output_model_renumber)
                        os.rename(output_model_renumber, output_model)
                    except Exception as e:
                        sys.stderr.write("WARNING %s\n" % e)
                shutil.copy(output_model, model_path_model)
        os.chdir(cwd)
    try:
        shutil.rmtree(modelling_dir)
    except Exception as e:
        sys.stderr.write("WARNING: first attempt to remove folder failed: %s\n" % e)
        try:
            os.system("\\rm -r %s" % (modelling_dir))
        except Exception as ee:
            sys.stderr.write("WARNING: last attempt failed: %s\n" % ee)
    return sections_modeled, remaining_sections_A, remaining_sections_B