def load_data(self, dname, dfile, indexing=None, usevep=True, upload=True):
    """ Loads a data file into the PDBMap database.

    dname    -- data-set label (dlabel) recorded with every uploaded row
    dfile    -- path to a VCF, BED, txt, csv file, or a PED/MAP basename
    indexing -- coordinate indexing scheme; defaults to 'ucsc' for BED
                input and 'pdbmap' for txt/csv input
    usevep   -- run variants through the Ensembl VEP pipeline
    upload   -- for VCF input only: also upload the raw VCF records to the
                supplemental database before processing
    Returns the number of genomic rows uploaded.
    """
    if usevep:
        d = PDBMapData(vep=self.vep, vep_cache=self.vep_cache, dname=dname)
    else:
        d = PDBMapData(dname=dname)
    if not os.path.exists(dfile):
        dfile = "%s.ped" % dfile  # Test if PEDMAP basename
        if not os.path.exists(dfile):
            msg = " ERROR (PDBMap) File does not exist: %s" % dfile
            raise Exception(msg)
    # NOTE(review): DB credentials come from the module-level 'args' namespace
    io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname, dlabel=dname)
    # Determine file type from the extension (look beneath a .gz suffix)
    ext = dfile.split('.')[-1].lower()
    if ext == 'gz':
        ext = dfile.split('.')[-2].lower()
    # Dispatch to the loader matching the detected file type
    if ext == 'vcf':
        if upload:
            print "\nUploading VCF to supplemental database..."
            nrows = d.load_vcffile(dfile, io, args.buffer_size)
            print "%d VCF records uploaded to supplemental database before processing" % nrows
        generator = d.load_vcf(dfile, usevep)
    elif ext in ["bed", "txt", "csv"]:
        if usevep:
            print "\nNote: You have provided a %s file and requested VEP analysis." % ext
            print " There are three acceptable values for the 'name' column."
            print " 1. REF/ALT - SNP alleles will be used as input to VEP"
            print " 2. rsID - SNP names will be used as input to VEP."
            print " 3. HGVS - SNPs HGVS will be used as input to VEP"
            print " We highly recommend option 1 when possible. Option 2 may"
            print " exclude rare or otherwise unlabeled SNPs.\n"
        # Determine the column delimiter by file type
        delim = '\t'
        if ext != "bed":
            delim = ' ' if ext == 'txt' else ','
        if not indexing:
            indexing = 'ucsc' if ext == 'bed' else 'pdbmap'
        print "Using %s indexing for %s." % (indexing, dfile)
        dfile, id_type = d.load_bedfile(dfile, io, delim, indexing, usevep)
        print "Creating BED generator..."
        generator = d.load_bed(dfile, id_type, usevep, indexing)
    elif ext in ["ped", "map"]:
        generator = d.load_pedmap(dfile)
    else:
        msg = " ERROR (PDBMap) Unsupported file type: %s" % ext
        raise Exception(msg)
    # Stream the parsed records into the genomic-data tables
    nrows = io.upload_genomic_data(generator, dname)
    return (nrows)
def load_unp(self, unp, label=None, use_pdb=True, use_modbase=True, update=False): """ Loads all known structures associated with UniProt ID """ if self.pdb and use_pdb: pdb_label = label if label else 'pdb' pdb_label = "%s_update" % pdb_label if update else pdb_label io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname, slabel=pdb_label) pdbids = list(set(PDBMapProtein.unp2pdb(unp))) for pdbid in pdbids: print " # Processing (%s) PDB %s # " % (pdb_label, pdbid) self.load_pdb(pdbid, label=pdb_label, io=io) sys.stdout.flush() # Force stdout flush after each PDB if self.modbase and use_modbase: mod_label = label if label else 'modbase' mod_label = "%s_update" % mod_label if update else mod_label io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname, slabel=mod_label) modelids = PDBMapModel.unp2modbase(unp) models = [PDBMapModel.get_info(modelid) for modelid in modelids] for model in models: print " # (%s) Processing ModBase %s #" % (mod_label, model['modelid']) self.load_model(model, label=mod_label, io=io) sys.stdout.flush() # Force stdout flush after each model if not pdbids and not models: msg = " WARNING (PDBMap) No PDB structures or Modbase models found for %s\n" % unp sys.stderr.write(msg)
def intersect_data(self, dname, slabel=None, dtype="Genomic", quick=False):
    """ Intersects a loaded dataset with the PDBMap structural domain.

    Returns the number of intersections recorded. Only all-structures
    <-> genomic data intersections are supported.
    """
    db_io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname,
                     dlabel=dname, slabel=slabel)
    intersector = PDBMapIntersect(db_io)
    # 'quick' trades the full (buffered) intersection for a faster pass
    if quick:
        nrows = intersector.quick_intersect(dname, slabel, dtype)
    else:
        nrows = intersector.intersect(dname, slabel, dtype, args.buffer_size)
    return nrows
def load_model(self, model_summary, label="", io=None): """ Loads a given ModBase model into the PDBMap database """ if not io: # Create a PDBMapIO object io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname, slabel=label) # Check if model is already in the database modelid = model_summary['modelid'] # extract ModBase model ID model_fname = model_summary['filename'] if io.model_in_db(modelid, label): if not update: # silence if updating print " VALID (PDBMap) %s (%s) already in database.\n" % ( modelid, label) return 0 # Query UniProt ID if not provided if 'unp' not in model_summary: unp = PDBMapProtein.ensp2unp( modelid.split('.')[0].split('_')[0])[0] else: unp = model_summary['unp'] # Load the ModBase model if not model_fname: model_fname = PDBMapModel.get_coord_file(modelid.upper()) print " # Fetching %s" % modelid if not os.path.exists(model_fname): model_fname += '.gz' # check for compressed copy if not os.path.exists(model_fname): msg = " ERROR (load_model) %s not in ModBase mirror.\n" % modelid sys.stderr.write(msg) return 1 try: p = PDBMapParser() print " # Loading %s (%s) from %s..." % ( modelid, unp, model_fname.split('/')[-1]) m = p.get_model(model_summary, model_fname, unp=unp) io.set_structure(m) io.upload_model() except Exception as e: msg = " ERROR (load_model) %s: %s\n" % (modelid, str(e)) sys.stderr.write(msg) return 1 msg = " VALID (load_model) %s complete.\n" % modelid sys.stderr.write(msg) return 0
def load_swiss_to_MySQL(self, modelid, label="", io=None):
    """ Loads a given SwissModel entry into the PDBMap database.

    modelid -- SwissModel identifier to load
    label   -- structure label (slabel) used for DB lookups/uploads
    io      -- optional pre-built PDBMapIO; constructed here if absent
    Returns 0 on success (or if already in the database), 1 on error.
    """
    if not io:
        # Create a PDBMapIO object
        io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname,
                      slabel=label)
    # Load the dictionary of information about the modelid
    model_summary = PDBMapSwiss.get_info(modelid)
    # Check if model is already in the database
    model_fname = PDBMapSwiss.get_coord_file(modelid)
    if io.swiss_in_db(modelid, label):
        msg = " VALID (SWISS) %s (%s) already in database.\n" % (modelid, label)
        print msg
        return 0
    print "Will attempt to add new swiss %s. Fetching: %s" % (modelid, model_fname)
    if not os.path.exists(model_fname):
        model_fname += '.gz'  # check for compressed copy
        if not os.path.exists(model_fname):
            msg = " ERROR (load_swiss_to_MySQL) %s not in local Swiss mirror.\nExpected file %s\n" % (
                modelid, model_fname)
            sys.stderr.write(msg)
            return 1
    try:
        # Parse coordinates, wrap them in a PDBMapSwiss object, then run the
        # DSSP/unp2hgnc post-processing before uploading.
        s = PDBMapParser.getBiopythonStructureOrFail(modelid, model_fname)
        m = PDBMapSwiss(s, model_summary)
        s = PDBMapParser.process_structure_dssp_unp2hgnc(
            m, model_summary, model_fname, m.unp)
        io.set_structure(m)
        io.upload_swiss()
    except Exception as e:
        # Report the failure with a short traceback, but do not re-raise
        exc_type, exc_value, exc_traceback = sys.exc_info()
        emsg = str(e)
        print emsg
        msg = " ERROR (pdbmap.py: load_swiss_to_MySQL(%s):\n%s" % (
            modelid, emsg)
        sys.stderr.write(msg)
        traceback.print_tb(exc_traceback, limit=2, file=sys.stderr)
        return 1
    msg = " VALID (pdbmap.py: load_swiss_to_MySQL(%s)\n" % modelid
    sys.stderr.write(msg)
    return 0
def load_pdb(self, pdbid, pdb_fname=None, label="", io=None, update=False): """ Loads a given PDB into the PDBMap database """ if not io: # Create a PDBMapIO object io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname, slabel=label) # Check if PDB is already in the database if io.structure_in_db(pdbid, label): if not update: # silence if updating print " VALID (PDBMap) %s already in database." % pdbid return 0 # Load the PDB structure if not pdb_fname: pdb_fname = "%s/structures/all/pdb/pdb%s.ent.gz" % (self.pdb_dir, pdbid.lower()) print " # Fetching %s" % pdbid if not os.path.exists(pdb_fname): msg = " ERROR (PDBMap) Cannot fetch %s. Not in PDB mirror.\n" % pdbid sys.stderr.write(msg) return 1 # Locate all biological assemblies biounit_fnames = glob.glob("%s/biounit/coordinates/all/%s.pdb*.gz" % (self.pdb_dir, pdbid.lower())) try: # Load the structure p = PDBMapParser() s = p.get_structure(pdbid, pdb_fname, biounit_fnames=biounit_fnames, io=io) io.set_structure(s) io.upload_structure() except Exception as e: msg = " ERROR (PDBMap) %s: %s\n\n" % (pdbid, str(e)) sys.stderr.write(msg) return 1 msg = " VALID (PDBMap) %s complete.\n" % pdbid sys.stderr.write(msg) return 0
def visualize(self, entity, biounits=[], struct_label='pdb', data_label='1kg', anno_list=['maf'], spectrum_range=[], colors=[]):
    """ Visualizes a PDBMap structure, model, or protein.

    entity         -- structure ID, model ID, UniProt ID, HGNC ID, or 'all'
    biounits       -- biological assemblies to render; queried from the DB
                      when empty (for structures), -1 for NMR/models
    struct_label   -- structure label (slabel) to visualize against
    data_label     -- genomic data label (dlabel) supplying annotations
    anno_list      -- annotation names; 'popdaf'/'popmaf' expand to their
                      per-population columns, 'dbscan' switches to clustering
    spectrum_range -- per-annotation color-spectrum ranges (parallel list)
    colors         -- explicit color list passed through to the visualizer

    NOTE(review): biounits/anno_list/spectrum_range/colors are mutable
    defaults, and anno_list is mutated in place below ('.synonymous'
    stripping) — caller-supplied lists are modified; verify no caller
    reuses them.
    """
    io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname,
                  slabel=struct_label, dlabel=data_label)
    v = PDBMapVisualize(io, args.pdb_dir)
    entity_type = io.detect_entity_type(
        entity) if not entity == 'all' else 'all'
    if entity_type == 'structure' and not biounits:
        if io.is_nmr(entity):
            biounits = [-1]
        else:
            # Query all biological assemblies, exclude the asymmetric unit
            query = "SELECT DISTINCT biounit FROM Chain WHERE label=%s AND structid=%s AND biounit>0"
            res = io.secure_query(query, (
                struct_label,
                entity,
            ), cursorclass='Cursor')
            biounits = [r[0] for r in res]
    elif entity_type == 'model' and not biounits:
        biounits = [-1]
    eps, mins = False, False
    synonymous_flag = False
    if any(['.synonymous' in a for a in anno_list]):
        # Replace synonymous with DAF and set the flag
        synonymous_flag = True
        # Index of the first '.synonymous' annotation (enumerate is unused)
        idx = ['.synonymous' in a for i, a in enumerate(anno_list)].index(True)
        anno_list[idx] = anno_list[idx].replace('.synonymous', '')
        print "\n%s will be plotted for synonymous variants." % anno_list[
            idx]
    if 'popdaf' in anno_list:
        # Expand 'popdaf' into the per-population derived allele frequencies,
        # duplicating its spectrum range for each expanded column.
        idx = anno_list.index('popdaf')
        anno_list = anno_list[0:idx] + anno_list[idx + 1:]
        anno_list += [
            'daf', 'amr_daf', 'eas_daf', 'sas_daf', 'afr_daf', 'eur_daf'
        ]
        sr = spectrum_range[idx]
        spectrum_range = spectrum_range[0:idx] + spectrum_range[idx + 1:]
        spectrum_range += [sr for i in range(6)]
    if 'popmaf' in anno_list:
        # Same expansion for per-population minor allele frequencies
        idx = anno_list.index('popmaf')
        anno_list = anno_list[0:idx] + anno_list[idx + 1:]
        anno_list += [
            'maf', 'amr_af', 'eas_af', 'sas_af', 'afr_af', 'eur_af'
        ]
        sr = spectrum_range[idx]
        spectrum_range = spectrum_range[0:idx] + spectrum_range[idx + 1:]
        spectrum_range += [sr for i in range(6)]
    if 'dbscan' in anno_list:
        # DBSCAN clustering: its spectrum slot carries (eps, min_samples)
        idx = anno_list.index('dbscan')
        anno_list = anno_list[0:idx] + anno_list[idx + 1:]
        eps, mins = spectrum_range[idx]
        spectrum_range = spectrum_range[0:idx] + spectrum_range[idx + 1:]
        if len(anno_list):  # any other annotation remains alongside DBSCAN
            msg = "ERROR (PDBMap) Cannot run other annotations with DBSCAN"
            raise Exception(msg)
    try:
        if entity_type in ['structure', 'model']:
            for biounit in biounits:
                v.visualize_structure(entity, biounit, anno_list, eps, mins,
                                      spectrum_range, colors=colors,
                                      syn=synonymous_flag)
        elif entity_type == 'unp':
            v.visualize_unp(entity, anno_list, eps, mins, spectrum_range,
                            colors=colors, syn=synonymous_flag)
        elif entity_type == 'all':
            v.visualize_all(anno_list, eps, mins, spectrum_range,
                            colors=colors, syn=synonymous_flag)
        elif entity_type:
            print "%s matched with UniProt ID: %s" % (entity.upper(),
                                                      entity_type)
            entity = entity_type  # An HGNC ID was detected and converted to UNP ID
            v.visualize_unp(entity, anno_list, eps, mins, spectrum_range,
                            colors=colors, syn=synonymous_flag)
        else:
            msg = "Sorry, but the specified entity is not in the PDBMap database.\n"
            sys.stderr.write(msg)
            return 1
    except Exception as e:
        # NOTE(review): msg is built but never used; the original exception
        # is re-raised unchanged.
        msg = "ERROR (PDBMap) Visualization failed: %s" % str(e)
        raise
args = parser.parse_args(remaining_argv) args.conf_file = conf_file parser.get_default("vep") args.create_new_db = bool(args.create_new_db) args.force = bool(args.force) args.cores = int(args.cores) if args.create_new_db and not args.force: print "You have opted to create a new database: %s." % args.dbname if raw_input("Are you sure you want to do this? (y/n):") != 'y': print "Aborting..." else: print "Creating database tables..." io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname, createdb=True) print "\nDatabase created. Please set create_new_db to False." print "\nIt is strongly recommended that you now refresh the local resource cache." if raw_input( "Would you like to refresh the cache now? (y/n):") == 'y': print "Refreshing local cache..." args.cmd = "refresh" # continue on to cache refresh else: sys.exit(0) # Initialize PDBMap, refresh mirrored data if specified if args.cmd == "refresh": io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname) try: