def get_pdb_sstruct_info(pdb):
    """
    Map the residues of a PDB entry onto their secondary structure serials.

    Reads pdb_dev.res_map for the upper-cased PDB code and returns a dict
    whose keys are (pdb_chain_id, pdb_res_name, pdb_res_num, pdb_ins_code)
    tuples and whose values are the corresponding sstruct_serial.
    """
    query = text("""
        SELECT pdb_chain_id, pdb_res_name, pdb_res_num, pdb_ins_code, sstruct_serial
        FROM pdb_dev.res_map
        WHERE pdb = :pdb
        """)

    records = engine.execute(query, pdb=pdb.upper()).fetchall()

    # each record is turned into a tuple first because the dict key has to be
    # hashable; the last column is the value, the leading columns are the key
    as_tuples = (tuple(record.values()) for record in records)
    sstruct_info = {values[:-1]: values[-1] for values in as_tuples}

    message = "{0} protein fragments were identified through SIFTS."
    app.log.debug(message.format(len(sstruct_info)))

    return sstruct_info
def get_pdb_ligand_info(pdb):
    """
    Return the ligand residues of a PDB entry with their entity type bit mask.

    The result is a dict keyed by (pdb_chain_id, het_id, res_num, ins_code)
    with the entity_type_bm column as value. Rows from pdb_dev.ligands carry
    the value 2; rows from pdb_dev.peptide_ligands carry 34 and have no
    het_id / res_num. A missing or empty insertion code is stored as ' '.
    """
    SQL = text("""
        SELECT pdb_chain_id, het_id, res_num, ins_code, 2 as entity_type_bm
        FROM pdb_dev.ligands
        WHERE pdb = :pdb AND (ins_code = '' OR ins_code = ' ')
              -- AND is_observed = true
        UNION
        SELECT pdb_chain_id, NULL, NULL, ' ', 34 as entity_type_bm
        FROM pdb_dev.peptide_ligands
        WHERE pdb = :pdb
        ORDER BY 1, 3, 4
        """)

    rows = engine.execute(SQL, pdb=pdb.upper()).fetchall()

    # map the pdb identifier of each residue onto its entity type
    ligands = {}
    for record in rows:
        identifier = (record.pdb_chain_id, record.het_id, record.res_num,
                      record.ins_code or ' ')
        ligands[identifier] = record.entity_type_bm

    header = "structure {0} contains {1} ligands according to mmCIF:"
    app.log.debug(header.format(pdb, len(ligands)))

    # only the first three fields of every identifier are printed
    for identifier in ligands:
        app.log.debug(" {0} {1} {2}".format(*identifier))

    return ligands
def get_credo_pdbs(generator=True):
    """
    Return all PDB entries that are currently stored in CREDO, ordered by
    PDB code. Important for doing incremental updates.

    With generator=True (the default) a generator yielding the pdb value of
    every row is returned; otherwise the fetched result rows are returned
    unchanged.
    """
    rows = engine.execute(
        text("SELECT pdb FROM credo.structures ORDER BY 1")).fetchall()

    if not generator:
        return rows

    return (entry.pdb for entry in rows)
def get_pdb_polymer_info(pdb):
    """
    Return the complete polymer sequence residue mapping of a PDB entry
    including the entity type bit mask.

    The result is a dict keyed by (pdb_chain_id, res_name, res_num, ins_code)
    whose values are the entity type bit mask derived from the polymer type
    in the SQL CASE expression below (32 protein, 16 DNA, 8 RNA, 24 DNA/RNA
    hybrid, 4 polysaccharide, 0 otherwise).
    """
    # name of the schema that holds the mmCIF tables
    schema = app.config.get('database', 'mmcif')

    template = """
        SELECT pdb_strand_id as pdb_chain_id, pdb_mon_id as res_name,
               pdb_seq_num::integer AS res_num,
               CASE
                   WHEN pdb_ins_code = '' THEN ' '
                   ELSE pdb_ins_code
               END AS ins_code,
               CASE
                   -- PROTEIN
                   WHEN ep.type = 'polypeptide(L)' OR ep.type = 'polypeptide(D)' THEN 32
                   -- DNA
                   WHEN ep.type = 'polydeoxyribonucleotide' THEN 16
                   -- RNA
                   WHEN ep.type = 'polyribonucleotide' THEN 8
                   -- DNA/RNA HYBRID
                   WHEN ep.type = 'polydeoxyribonucleotide/polyribonucleotide hybrid' THEN 24
                   -- POLYSACCHARIDE
                   WHEN ep.type = 'polysaccharide(D)' THEN 4
                   ELSE 0
               END AS entity_type_bm
        FROM {mmcif}.pdbx_poly_seq_scheme p
        JOIN {mmcif}.entity_poly ep ON p.structure_id = ep.structure_id AND p.entity_id = ep.entity_id
        WHERE p.pdb_mon_id != '' AND p.Structure_ID = :pdb
        ORDER BY 1, 3, 4
        """
    statement = text(template.format(mmcif=schema))

    rows = engine.execute(statement, pdb=pdb.upper()).fetchall()

    # map the pdb identifier of each residue onto its entity type
    residues = {
        (record.pdb_chain_id, record.res_name, record.res_num,
         record.ins_code): record.entity_type_bm
        for record in rows
    }

    message = "structure contains {0} polymer residues according to mmCIF."
    app.log.debug(message.format(len(residues)))

    return residues
def get_current_pdbs(args):
    """
    Return the PDB codes found in the mmCIF entry table that are not listed
    in pdb_dev.banned, ordered by code. Every structure not in this list is
    most likely obsolete.

    The codes are produced lazily. If args.offset is set, the upper-cased
    offset is handed to apply_offset(); if args.limit is set, the result is
    additionally passed through apply_limit().
    """
    schema = app.config.get('database', 'mmcif')

    template = """
        SELECT structure_id as pdb
        FROM {mmcif}.entry e
        LEFT JOIN pdb_dev.banned b ON e.structure_id = b.pdb
        WHERE b.pdb IS NULL
        ORDER BY 1
        """
    statement = text(template.format(mmcif=schema))

    # Forced echo off since SQLAlchemy prints to STDOUT and the output would
    # get mixed with the PDB list.
    engine.echo = False

    rows = engine.execute(statement).fetchall()
    pdbs = (entry['pdb'] for entry in rows)

    if args.offset:
        pdbs = apply_offset(pdbs, args.offset.upper())

    if args.limit:
        pdbs = apply_limit(pdbs, args.limit)

    return pdbs
def get_biomt(pdb):
    """
    Return the biological assembly transformations stored for a PDB entry.

    The rows of pdb_dev.biomt / pdb_dev.biomt_ops for the upper-cased PDB code
    are folded into a nested dict of the form

        {assembly_serial: {pdb_chain_id: {operation_serial: details}}}

    where operation_serial counts from 1 per chain and details holds the keys
    'rotation', 'translation' (both wrapped in OEFloatArray) and
    'is_at_identity'.

    Returns a (biomt, is_monomeric) tuple:

    - If an assembly has more than 26 chains, processing stops and biomt is
      returned empty (the asymmetric unit is meant to be used instead).
    - If an assembly is flagged as monomeric, processing stops as well; that
      assembly is present in biomt with an empty chain mapping and
      is_monomeric is returned as true.
    - Without any rows, ({}, False) is returned.
    """
    statement = text("""
        SELECT assembly_serial, assembly_size, is_monomeric, pdb_chain_id,
               rotation, translation, is_at_identity
        FROM pdb_dev.biomt b
        JOIN pdb_dev.biomt_ops o USING(biomt_id)
        WHERE pdb = :pdb
        ORDER BY 1,3
        """)

    # fetch data from pisa database
    result = engine.execute(statement, pdb=pdb.upper()).fetchall()

    biomt = {}
    is_monomeric = False

    # iterate through assemblies; the rows are ordered by assembly serial so
    # grouping on the first three columns yields one group per assembly
    for (assembly_serial, assembly_size,
         is_monomeric), assembly_rows in groupby(result,
                                                 key=itemgetter(0, 1, 2)):
        chains = biomt[assembly_serial] = {}

        # do not return pisa data for large assemblies / asu will be used instead
        if assembly_size > 26:
            app.log.warn(
                "one of the predicted assemblies contains {} chains - "
                "the asymmetric unit will be used instead.".format(
                    assembly_size))
            biomt = {}
            break

        # the complete ASU is monomeric and has to be split into individual chains
        if is_monomeric:
            break

        # The query does not order the rows by chain, so the rows of one chain
        # are not guaranteed to be adjacent. Operations are therefore
        # accumulated per chain instead of relying on a nested groupby(), which
        # would start a fresh (empty) mapping whenever a chain reappears.
        for row in assembly_rows:
            # the chain identifier is used in a single, consistent form both
            # for creating and for filling the per-chain mapping
            operations = chains.setdefault(str(row[3]), {})

            rotation, translation, is_at_identity = row[4:]

            # operation serials are numbered from 1 within each chain
            operations[len(operations) + 1] = {
                'rotation': OEFloatArray(rotation),
                'translation': OEFloatArray(translation),
                'is_at_identity': is_at_identity
            }

    # debug assembly information; max() raises ValueError on an empty mapping
    try:
        app.log.debug("BIOMT contains {0} assembly/assemblies.".format(
            max(biomt.keys())))

    # PDB entry does not have a REMARK 350
    except ValueError:
        app.log.debug("NO REMARK 350 found.")

    return biomt, is_monomeric
def do(controller):
    """
    Generate FuzCav fingerprints for ligand binding sites and store them.

    Steps:

    1. Select the (ligand_id, biomolecule_id) pairs of all ligands with at
       least 7 heavy atoms that are not flagged as clashing. With
       ``args.incremental`` only ligands whose id is larger than the current
       maximum ligand_id in the binding_site_fuzcav table are selected; an
       empty table is treated as maximum 0 so that all ligands are processed.
    2. For every ligand, fetch the binding site atoms of standard peptides
       that are either the CA atom or listed in bio.fuzcav_rep_sc_atoms.
       Binding sites with fewer than 14 such atoms are skipped.
    3. Build one fingerprint from the CA atoms and one from the
       representative atoms with fuzcav.make_fp() and, unless
       ``args.dry_run`` is set, insert both into binding_site_fuzcav.

    :param controller: command line controller; only ``controller.pargs``
        (incremental, progressbar, dry_run) is read here.

    The database session is closed even if an error occurs while processing.
    """
    # timer to clock functions and parts of the program
    timer = Timer()
    timer.start("app")

    # get the command line arguments and options
    args = controller.pargs

    insert = binding_site_fuzcav.insert()
    tracker = fuzcav.get_tracker()

    # get the fuzcav side chain representative table from the credoscript metadata
    metadata.reflect(schema='bio', only=('fuzcav_rep_sc_atoms', ))
    fuzcav_rep_sc_atoms = Table('bio.fuzcav_rep_sc_atoms', metadata,
                                autoload=True)

    timer.start()

    session = Session()

    try:
        # get all ligands that have at least 7 heavy atoms and no clashes
        # (the "== False" comparison is required to build the SQL expression)
        ligand_query = session.query(Ligand.ligand_id, Ligand.biomolecule_id)
        ligand_query = ligand_query.filter(
            and_(Ligand.num_hvy_atoms >= 7, Ligand.is_clashing == False))

        if args.incremental:
            # subquery to get the current max ligand_id from the
            # binding_site_fuzcav table; max() over an empty table is NULL and
            # "ligand_id > NULL" would silently select nothing, hence coalesce
            sq = session.query(
                func.coalesce(func.max(binding_site_fuzcav.c.ligand_id),
                              0).label('ligand_id')).subquery('sq')

            # only include new ligands
            ligand_query = ligand_query.filter(
                Ligand.ligand_id > sq.c.ligand_id)

        ligand_ids = ligand_query.order_by(Ligand.ligand_id).all()

        # debug how much time it took to get the ligand identifiers
        app.log.debug(
            "all new ligand identifiers retrieved in {0:.2f} seconds.".format(
                timer.elapsed()))

        # base query for the FuzCav atoms of a binding site: the atoms of the
        # binding site peptides, outer-joined to the table of representative
        # side chain atoms
        atom_query = BindingSiteResidue.query.join('Peptide', 'Atoms')
        atom_query = atom_query.outerjoin(
            fuzcav_rep_sc_atoms,
            and_(fuzcav_rep_sc_atoms.c.res_name == Peptide.res_name,
                 fuzcav_rep_sc_atoms.c.atom_name == Atom.atom_name))

        # keep only standard peptides and atoms that are either the CA or have
        # a match in the representative table ("!= None" builds IS NOT NULL)
        atom_query = atom_query.filter(
            and_(
                Peptide.is_non_std == False,
                or_(Atom.atom_name == 'CA',
                    fuzcav_rep_sc_atoms.c.atom_name != None)))

        atom_query = atom_query.with_entities(Peptide.res_name, Atom)

        if args.progressbar:
            bar = ProgressBar(widgets=[
                'Binding Sites: ',
                SimpleProgress(), ' ',
                Percentage(),
                Bar()
            ], maxval=len(ligand_ids)).start()

        # iterate through ligands
        for counter, row in enumerate(ligand_ids, 1):
            if args.progressbar:
                bar.update(counter)

            ligand_id, biomolecule_id = row.ligand_id, row.biomolecule_id

            timer.start()

            # get all the fuzcav atoms (either CA or representative)
            # important to use the proper atom partition!
            atoms = atom_query.filter(
                and_(BindingSiteResidue.ligand_id == ligand_id,
                     Atom.biomolecule_id == biomolecule_id)).all()

            # debug how much time it took to get the atoms
            app.log.debug(
                "all FuzCav atoms retrieved in {0:.2f} seconds.".format(
                    timer.elapsed()))

            # ignore hits with too few FuzCav atoms
            if len(atoms) < 14:
                app.log.debug("Ligand {} has only {} FuzCav atoms and will be "
                              "ignored.".format(ligand_id, len(atoms)))
                continue

            # get the calpha atom and its features for each residue
            calphas = ((np.array(atom.coords, dtype=float),
                        fuzcav.FEATURES[res_name])
                       for res_name, atom in atoms
                       if atom.atom_name == 'CA')

            # get the representative atom and its features for each residue
            representatives = (
                (np.array(atom.coords, dtype=float),
                 fuzcav.FEATURES[res_name])
                for res_name, atom in atoms
                if atom.atom_name == fuzcav.REPRESENTATIVES[res_name])

            timer.start()

            calphafp = fuzcav.make_fp(calphas, tracker)
            repfp = fuzcav.make_fp(representatives, tracker)

            # debug how much time it took to generate the fingerprints
            app.log.debug("fingerprints generated in {0:.2f} seconds.".format(
                timer.elapsed()))

            # insert the fingerprints into the table
            if not args.dry_run:
                engine.execute(insert,
                               ligand_id=ligand_id,
                               calphafp=calphafp.tolist(),
                               repfp=repfp.tolist())

        # finish the optional progress bar
        if args.progressbar:
            bar.finish()

    finally:
        # release the session even if a query or an insert failed
        session.close()