def import_3Ds(db_host='localhost', db_port=27017, rna3dhub=False,
               canonical_only=True, annotate=False, limit=5000):
    """Import RNA-only 3D structures from the PDB into a MongoDB database.

    The PDB is queried for entries that contain RNA chains but no protein,
    DNA or hybrid chains. Every entry that is not yet referenced in the
    ``tertiaryStructures`` collection (``source`` equal to
    ``db:pdb:<pdb_id>``) is fetched with ``PDB.get_entry()``, parsed, and
    each resulting tertiary structure is handed to ``save()``.

    Args:
        db_host: host name of the MongoDB server.
        db_port: port of the MongoDB server.
        rna3dhub: when true, the "RNA3DHub" database is selected instead of
            "PDB"; the code below then returns without importing anything.
        canonical_only: forwarded to ``Rnaview.annotate()``.
        annotate: when true, each tertiary structure is annotated with
            RNAVIEW before being saved; otherwise it is saved with no
            secondary structure.
        limit: forwarded to ``save()``.
    """
    client = MongoClient(db_host, db_port)
    db = client["RNA3DHub" if rna3dhub else "PDB"]
    rnaview = Rnaview()

    if rna3dhub:
        return

    pdb = PDB()
    query = """<orgPdbQuery> <version>head</version> <queryType>org.pdb.query.simple.ChainTypeQuery</queryType> <description>Chain Type: there is a Protein and a RNA chain but not any DNA or Hybrid</description> <containsProtein>N</containsProtein> <containsDna>N</containsDna> <containsRna>Y</containsRna> <containsHybrid>N</containsHybrid> </orgPdbQuery>"""
    pdb_ids = pdb.query(query)
    print("%i 3Ds to process" % len(pdb_ids))

    for pdb_id in pdb_ids:
        # Entries already present in the collection are not processed again.
        if db['tertiaryStructures'].find_one({'source': "db:pdb:%s" % pdb_id}):
            continue
        print("Recover %s" % pdb_id)
        for ts in parsers.parse_pdb(pdb.get_entry(pdb_id)):
            try:
                ss = None
                if annotate:
                    # annotate() returns the secondary structure together
                    # with a tertiary structure that replaces ``ts``.
                    ss, ts = rnaview.annotate(ts, canonical_only=canonical_only)
                save(db, ss, ts, pdb_id, limit)
            except Exception as e:
                # Best effort: report the failure and store the structure
                # without any secondary structure.
                print(e)
                print("No annotation for %s" % pdb_id)
                save(db, None, ts, pdb_id, limit)
def test():
    """Run a small demonstration on PDB entry 1EHZ.

    Fetches and parses the entry, prints the base pairs derived from the
    RNAVIEW annotation of every tertiary structure (tertiary interactions
    kept), then prints each RNA sequence followed by the RNAfold result
    for that molecule.
    """
    print("Recovering entry 1EHZ from Protein Databank...\n")
    pdb_client = PDB()
    structures = parse_pdb(pdb_client.get_entry('1EHZ'))

    print("## 3D annotation ##\n")
    print("List of base-pairs computed with RNAVIEW:\n")
    for structure in structures:
        # annotate() yields a (secondary, tertiary) pair; only the
        # secondary structure is displayed here.
        annotated_2d, _annotated_3d = Rnaview().annotate(structure)
        base_pairs = secondary_structure_to_base_pairs(
            annotated_2d, keep_tertiaries=True)
        print(base_pairs)

    print("\n## 2D prediction ##\n")
    for structure in structures:
        print("RNA sequence from 1EHZ:\n")
        print(structure.rna.sequence)
        print("\nList of base-pairs computed with RNAfold (RNA Vienna Package):\n")
        prediction = Rnafold().fold(molecule=structure.rna)
        print(prediction)
save(db, ss, ts, pdb_id, limit) except Exception, e: print e print "No annotation for %s"%pdb_id else: pdb = PDB() rna3dHub = RNA3DHub() clusters = rna3dHub.get_clusters() print "%i 3Ds to process"%len(clusters) for cluster in clusters['pdb-ids']: if db['tertiaryStructures'].find_one({'source':"db:pdb:%s"%cluster[0]}): continue print "Recover %s"%cluster[0] #we use the first pdb_id in the list of ids making a cluster for ts in parsers.parse_pdb(pdb.get_entry(cluster[0])): try: ss = None if annotate: ss, ts = rnaview.annotate(ts, canonical_only = canonical_only) save(db, ss, ts, cluster[0], limit) except Exception, e: print e print "No annotation for %s"%cluster[0] def save(db, secondary_structure, tertiary_structure, pdbId, limit): if db['junctions'].count() >= limit: print "Limit of %i junctions reached"%limit sys.exit()
except Exception, e: print e print "No annotation for %s"%pdb_id save(db, None, ts, pdb_id, limit) else: pdb = PDB() rna3dHub = RNA3DHub() clusters = rna3dHub.get_clusters() print "%i 3Ds to process"%len(clusters) for cluster in clusters['pdb-ids']: pdb_id = cluster[0].split('|')[0] if db['tertiaryStructures'].find_one({'source':"db:pdb:%s"%pdb_id}): continue print "Recover %s"%pdb_id #we use the first pdb_id in the list of ids making a cluster for ts in parsers.parse_pdb(pdb.get_entry(pdb_id)): try: ss = None if annotate: ss, ts = rnaview.annotate(ts, canonical_only = canonical_only) save(db, ss, ts, pdb_id, limit) except Exception, e: print e print "No annotation for %s"%pdb_id save(db, None, ts, pdb_id, limit) def save(db, secondary_structure, tertiary_structure, pdbId, limit): if db['junctions'].count() >= limit: print "Limit of %i junctions reached"%limit sys.exit()
from pyrna.computations import Cmalign, Rnaview
from bson.objectid import ObjectId
import os

# Script section: for every Rfam family that has a known 3D structure, find
# the matching RNA chain in the PDB entry, annotate it with RNAVIEW, align it
# with cmalign and create the per-family output directories.
# NOTE(review): the layout below was reconstructed from whitespace-collapsed
# source; confirm that the align/mkdir calls really belong under `if ts:`.
pdb = PDB()
cmalign = Cmalign()
rnaview = Rnaview()
# Cache and output locations are hard-coded under /home/fjossinet/tmp.
rfam = Rfam(cache_dir = "/home/fjossinet/tmp/Rfam")
families_with_structures = rfam.get_families_with_structures()

# presumably a pandas DataFrame (iterrows yields (index, row)) - TODO confirm
for index, row in families_with_structures.iterrows():
    rfam_id = row['rfam_id']
    tertiary_structures = parse_pdb(pdb.get_entry(row['pdb_id']))
    reference_rna = None
    ts = None
    # Keep the first tertiary structure whose RNA name equals the chain name
    # recorded for this family.
    for tertiary_structure in tertiary_structures:
        if tertiary_structure.rna.name == row['chain_name']:
            ts = tertiary_structure
            reference_rna = ts.rna
            break
    if ts:
        # Note: this rebinds `tertiary_structure`, previously the loop variable.
        secondary_structure, tertiary_structure = rnaview.annotate(tertiary_structure = ts)
        rnas, orgs, consensus_2d = cmalign.align([reference_rna], rfam_id = rfam_id, rfam = rfam)
        # os.mkdir raises OSError when the directory already exists, so a
        # second run for the same family stops here.
        os.mkdir("/home/fjossinet/tmp/%s"%rfam_id)
        os.mkdir("/home/fjossinet/tmp/%s/Molecules"%rfam_id)
from pyrna.computations import Cmalign, Rnaview from bson.objectid import ObjectId import os pdb = PDB() cmalign = Cmalign() rnaview = Rnaview() rfam = Rfam(cache_dir="/home/fjossinet/tmp/Rfam") families_with_structures = rfam.get_families_with_structures() for index, row in families_with_structures.iterrows(): rfam_id = row['rfam_id'] tertiary_structures = parse_pdb(pdb.get_entry(row['pdb_id'])) reference_rna = None ts = None for tertiary_structure in tertiary_structures: if tertiary_structure.rna.name == row['chain_name']: ts = tertiary_structure reference_rna = ts.rna break if ts: secondary_structure, tertiary_structure = rnaview.annotate( tertiary_structure=ts) rnas, orgs, consensus_2d = cmalign.align([reference_rna], rfam_id=rfam_id,