Example #1
0
 def load_data(self, dname, dfile, indexing=None, usevep=True, upload=True):
     """ Loads a data file into the PDBMap database """
     if usevep:
         d = PDBMapData(vep=self.vep, vep_cache=self.vep_cache, dname=dname)
     else:
         d = PDBMapData(dname=dname)
     if not os.path.exists(dfile):
         dfile = "%s.ped" % dfile  # Test if PEDMAP basename
         if not os.path.exists(dfile):
             msg = "  ERROR (PDBMap) File does not exist: %s" % dfile
             raise Exception(msg)
     io = PDBMapIO(args.dbhost,
                   args.dbuser,
                   args.dbpass,
                   args.dbname,
                   dlabel=dname)
     # Determine file type
     ext = dfile.split('.')[-1].lower()
     if ext == 'gz':
         ext = dfile.split('.')[-2].lower()
     # Process and accordingly
     if ext == 'vcf':
         if upload:
             print "\nUploading VCF to supplemental database..."
             nrows = d.load_vcffile(dfile, io, args.buffer_size)
             print "%d VCF records uploaded to supplemental database before processing" % nrows
         generator = d.load_vcf(dfile, usevep)
     elif ext in ["bed", "txt", "csv"]:
         if usevep:
             print "\nNote: You have provided a %s file and requested VEP analysis." % ext
             print "      There are three acceptable values for the 'name' column."
             print "       1. REF/ALT - SNP alleles will be used as input to VEP"
             print "       2. rsID    - SNP names will be used as input to VEP."
             print "       3. HGVS    - SNPs HGVS will be used as input to VEP"
             print "      We highly recommend option 1 when possible. Option 2 may"
             print "       exclude rare or otherwise unlabeled SNPs.\n"
         # Determine the column delimiter by file type
         delim = '\t'
         if ext != "bed":
             delim = ' ' if ext == 'txt' else ','
         if not indexing:
             indexing = 'ucsc' if ext == 'bed' else 'pdbmap'
         print "Using %s indexing for %s." % (indexing, dfile)
         dfile, id_type = d.load_bedfile(dfile, io, delim, indexing, usevep)
         print "Creating BED generator..."
         generator = d.load_bed(dfile, id_type, usevep, indexing)
     elif ext in ["ped", "map"]:
         generator = d.load_pedmap(dfile)
     else:
         msg = "  ERROR (PDBMap) Unsupported file type: %s" % ext
         raise Exception(msg)
     # Pass the relevant generator to be uploaded
     nrows = io.upload_genomic_data(generator, dname)
     return (nrows)
Example #2
0
 def load_unp(self,
              unp,
              label=None,
              use_pdb=True,
              use_modbase=True,
              update=False):
     """ Loads all known structures associated with UniProt ID

     PDB structures are loaded when both ``self.pdb`` and `use_pdb` are
     truthy; ModBase models when both ``self.modbase`` and `use_modbase`
     are. `label` overrides the default structure labels ('pdb' and
     'modbase'); when `update` is true the label gets an ``_update``
     suffix.

     Writes a warning to stderr when neither source yielded anything.
     Returns None.
     """
     # Start from empty results so the final "nothing found" check is valid
     # even when a source is disabled and its branch never runs (previously
     # that case raised NameError).
     pdbids, models = [], []
     if self.pdb and use_pdb:
         pdb_label = label if label else 'pdb'
         pdb_label = "%s_update" % pdb_label if update else pdb_label
         io = PDBMapIO(args.dbhost,
                       args.dbuser,
                       args.dbpass,
                       args.dbname,
                       slabel=pdb_label)
         # De-duplicate the PDB IDs mapped to this UniProt ID
         pdbids = list(set(PDBMapProtein.unp2pdb(unp)))
         for pdbid in pdbids:
             print(" # Processing (%s) PDB %s # " % (pdb_label, pdbid))
             self.load_pdb(pdbid, label=pdb_label, io=io)
             sys.stdout.flush()  # Force stdout flush after each PDB
     if self.modbase and use_modbase:
         mod_label = label if label else 'modbase'
         mod_label = "%s_update" % mod_label if update else mod_label
         io = PDBMapIO(args.dbhost,
                       args.dbuser,
                       args.dbpass,
                       args.dbname,
                       slabel=mod_label)
         modelids = PDBMapModel.unp2modbase(unp)
         models = [PDBMapModel.get_info(modelid) for modelid in modelids]
         for model in models:
             print(" # (%s) Processing ModBase %s #" % (mod_label,
                                                        model['modelid']))
             self.load_model(model, label=mod_label, io=io)
             sys.stdout.flush()  # Force stdout flush after each model
     if not pdbids and not models:
         msg = "  WARNING (PDBMap) No PDB structures or Modbase models found for %s\n" % unp
         sys.stderr.write(msg)
Example #3
0
 def intersect_data(self, dname, slabel=None, dtype="Genomic", quick=False):
     """ Intersects a loaded dataset with the PDBMap structural domain

     `quick` selects ``quick_intersect``; otherwise the buffered
     ``intersect`` is run with ``args.buffer_size``.
     Returns the number of intersections reported by the intersector.
     """
     db = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname,
                   dlabel=dname, slabel=slabel)
     intersector = PDBMapIntersect(db)
     # Note: Only all-structures <-> genomic data intersections supported
     if not quick:
         return intersector.intersect(dname, slabel, dtype, args.buffer_size)
     return intersector.quick_intersect(dname, slabel, dtype)
Example #4
0
    def load_model(self, model_summary, label="", io=None, update=False):
        """ Loads a given ModBase model into the PDBMap database

        `model_summary` is a dict that must provide 'modelid' and
        'filename' and may provide 'unp'. `label` is the structure label
        used for the database check and for a newly created ``PDBMapIO``
        when `io` is not supplied. When `update` is true, a model that is
        already in the database is processed again silently instead of
        being skipped.

        Returns 0 when the model was uploaded or was already present, and
        1 when the coordinate file is missing or parsing/upload raised.
        """

        if not io:
            # Create a PDBMapIO object
            io = PDBMapIO(args.dbhost,
                          args.dbuser,
                          args.dbpass,
                          args.dbname,
                          slabel=label)

        # Check if model is already in the database
        modelid = model_summary['modelid']  # extract ModBase model ID
        model_fname = model_summary['filename']
        # `update` used to be an undefined name here, so this check raised
        # NameError for every model that was already loaded.
        if io.model_in_db(modelid, label) and not update:  # silence if updating
            print("  VALID (PDBMap) %s (%s) already in database.\n" % (
                modelid, label))
            return 0

        # Query UniProt ID if not provided
        if 'unp' not in model_summary:
            # Lookup key is the model ID up to its first '.' and '_'
            unp = PDBMapProtein.ensp2unp(
                modelid.split('.')[0].split('_')[0])[0]
        else:
            unp = model_summary['unp']

        # Load the ModBase model
        if not model_fname:
            model_fname = PDBMapModel.get_coord_file(modelid.upper())
            print("  # Fetching %s" % modelid)
            if not os.path.exists(model_fname):
                model_fname += '.gz'  # check for compressed copy
            if not os.path.exists(model_fname):
                msg = "  ERROR (load_model) %s not in ModBase mirror.\n" % modelid
                sys.stderr.write(msg)
                return 1
        try:
            p = PDBMapParser()
            print("   # Loading %s (%s) from %s..." % (
                modelid, unp, model_fname.split('/')[-1]))
            m = p.get_model(model_summary, model_fname, unp=unp)
            io.set_structure(m)
            io.upload_model()
        except Exception as e:
            # Report and signal failure so a batch caller can carry on
            msg = "  ERROR (load_model) %s: %s\n" % (modelid, str(e))
            sys.stderr.write(msg)
            return 1
        msg = "  VALID (load_model) %s complete.\n" % modelid
        sys.stderr.write(msg)
        return 0
Example #5
0
    def load_swiss_to_MySQL(self, modelid, label="", io=None):
        if not io:
            # Create a PDBMapIO object
            io = PDBMapIO(args.dbhost,
                          args.dbuser,
                          args.dbpass,
                          args.dbname,
                          slabel=label)

        # Load the dictionary of information about the modelid
        model_summary = PDBMapSwiss.get_info(modelid)

        # Check if model is already in the database
        model_fname = PDBMapSwiss.get_coord_file(modelid)
        if io.swiss_in_db(modelid, label):
            msg = "  VALID (SWISS) %s (%s) already in database.\n" % (modelid,
                                                                      label)
            print msg
            return 0
        print "Will attempt to add new swiss %s.  Fetching: %s" % (modelid,
                                                                   model_fname)
        if not os.path.exists(model_fname):
            model_fname += '.gz'  # check for compressed copy
        if not os.path.exists(model_fname):
            msg = "  ERROR (load_swiss_to_MySQL) %s not in local Swiss mirror.\nExpected file %s\n" % (
                modelid, model_fname)
            sys.stderr.write(msg)
            return 1

        try:
            # import pdb; pdb.set_trace()
            s = PDBMapParser.getBiopythonStructureOrFail(modelid, model_fname)
            m = PDBMapSwiss(s, model_summary)
            s = PDBMapParser.process_structure_dssp_unp2hgnc(
                m, model_summary, model_fname, m.unp)

            io.set_structure(m)
            io.upload_swiss()
        except Exception as e:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            emsg = str(e)
            print emsg
            msg = "  ERROR (pdbmap.py: load_swiss_to_MySQL(%s):\n%s" % (
                modelid, emsg)
            sys.stderr.write(msg)
            traceback.print_tb(exc_traceback, limit=2, file=sys.stderr)
            return 1
        msg = "  VALID (pdbmap.py: load_swiss_to_MySQL(%s)\n" % modelid
        sys.stderr.write(msg)
        return 0
Example #6
0
 def load_pdb(self, pdbid, pdb_fname=None, label="", io=None, update=False):
     """ Loads a given PDB into the PDBMap database """
     if not io:
         # Create a PDBMapIO object
         io = PDBMapIO(args.dbhost,
                       args.dbuser,
                       args.dbpass,
                       args.dbname,
                       slabel=label)
     # Check if PDB is already in the database
     if io.structure_in_db(pdbid, label):
         if not update:  # silence if updating
             print "  VALID (PDBMap) %s already in database." % pdbid
             return 0
     # Load the PDB structure
     if not pdb_fname:
         pdb_fname = "%s/structures/all/pdb/pdb%s.ent.gz" % (self.pdb_dir,
                                                             pdbid.lower())
         print "  # Fetching %s" % pdbid
         if not os.path.exists(pdb_fname):
             msg = "  ERROR (PDBMap) Cannot fetch %s. Not in PDB mirror.\n" % pdbid
             sys.stderr.write(msg)
             return 1
     # Locate all biological assemblies
     biounit_fnames = glob.glob("%s/biounit/coordinates/all/%s.pdb*.gz" %
                                (self.pdb_dir, pdbid.lower()))
     try:  # Load the structure
         p = PDBMapParser()
         s = p.get_structure(pdbid,
                             pdb_fname,
                             biounit_fnames=biounit_fnames,
                             io=io)
         io.set_structure(s)
         io.upload_structure()
     except Exception as e:
         msg = "  ERROR (PDBMap) %s: %s\n\n" % (pdbid, str(e))
         sys.stderr.write(msg)
         return 1
     msg = "  VALID (PDBMap) %s complete.\n" % pdbid
     sys.stderr.write(msg)
     return 0
Example #7
0
 def visualize(self,
               entity,
               biounits=None,
               struct_label='pdb',
               data_label='1kg',
               anno_list=None,
               spectrum_range=None,
               colors=None):
     """ Visualizes a PDBMap structure, model, or protein

     `entity` is an identifier resolved through ``io.detect_entity_type``,
     or the literal 'all'. `biounits` defaults to the biological
     assemblies found in the database for a structure ([-1] for NMR
     structures and for models). `anno_list` defaults to ['maf'];
     `spectrum_range` and `colors` default to empty lists.

     Special annotation names:
       * '<anno>.synonymous' - suffix is stripped from the first such
         entry and the synonymous flag is passed to the visualizer
       * 'popdaf' / 'popmaf' - replaced by the six per-population
         annotations, each reusing the original spectrum range when one
         was given
       * 'dbscan'            - must be the only annotation; its spectrum
         range entry supplies (eps, mins)

     The caller's lists are never modified. Returns 1 when the entity is
     not found, otherwise None. Exceptions raised by the visualizer are
     reported on stderr and re-raised.
     """
     # Work on private copies: the defaults used to be shared mutable lists
     # and the '.synonymous' rewrite below edited the caller's list in place.
     biounits = list(biounits) if biounits else []
     anno_list = ['maf'] if anno_list is None else list(anno_list)
     spectrum_range = list(spectrum_range) if spectrum_range else []
     if colors is None:
         colors = []

     io = PDBMapIO(args.dbhost,
                   args.dbuser,
                   args.dbpass,
                   args.dbname,
                   slabel=struct_label,
                   dlabel=data_label)
     v = PDBMapVisualize(io, args.pdb_dir)
     entity_type = 'all' if entity == 'all' else io.detect_entity_type(entity)
     if entity_type == 'structure' and not biounits:
         if io.is_nmr(entity):
             biounits = [-1]
         else:
             # Query all biological assemblies, exclude the asymmetric unit
             query = "SELECT DISTINCT biounit FROM Chain WHERE label=%s AND structid=%s AND biounit>0"
             res = io.secure_query(query, (
                 struct_label,
                 entity,
             ),
                                   cursorclass='Cursor')
             biounits = [r[0] for r in res]
     elif entity_type == 'model' and not biounits:
         biounits = [-1]

     eps, mins = False, False
     # Strip '.synonymous' from the first annotation carrying it and
     # remember that synonymous variants were requested
     syn_idx = next(
         (i for i, a in enumerate(anno_list) if '.synonymous' in a), None)
     synonymous_flag = syn_idx is not None
     if synonymous_flag:
         anno_list[syn_idx] = anno_list[syn_idx].replace('.synonymous', '')
         print("\n%s will be plotted for synonymous variants." %
               anno_list[syn_idx])

     # Expand the population shorthands into their per-population columns
     population_expansions = (
         ('popdaf',
          ['daf', 'amr_daf', 'eas_daf', 'sas_daf', 'afr_daf', 'eur_daf']),
         ('popmaf',
          ['maf', 'amr_af', 'eas_af', 'sas_af', 'afr_af', 'eur_af']),
     )
     for shorthand, expanded in population_expansions:
         if shorthand not in anno_list:
             continue
         idx = anno_list.index(shorthand)
         anno_list = anno_list[0:idx] + anno_list[idx + 1:] + expanded
         # A spectrum range is optional (the default is an empty list), so
         # only replicate one when the shorthand actually has an entry.
         if idx < len(spectrum_range):
             sr = spectrum_range[idx]
             spectrum_range = spectrum_range[0:idx] + spectrum_range[idx + 1:]
             spectrum_range += [sr] * len(expanded)

     if 'dbscan' in anno_list:
         idx = anno_list.index('dbscan')
         anno_list = anno_list[0:idx] + anno_list[idx + 1:]
         eps, mins = spectrum_range[idx]
         spectrum_range = spectrum_range[0:idx] + spectrum_range[idx + 1:]
         if len(anno_list):  # more than one DBSCAN specification
             msg = "ERROR (PDBMap) Cannot run other annotations with DBSCAN"
             raise Exception(msg)
     try:
         if entity_type in ['structure', 'model']:
             for biounit in biounits:
                 v.visualize_structure(entity,
                                       biounit,
                                       anno_list,
                                       eps,
                                       mins,
                                       spectrum_range,
                                       colors=colors,
                                       syn=synonymous_flag)
         elif entity_type == 'unp':
             v.visualize_unp(entity,
                             anno_list,
                             eps,
                             mins,
                             spectrum_range,
                             colors=colors,
                             syn=synonymous_flag)
         elif entity_type == 'all':
             v.visualize_all(anno_list,
                             eps,
                             mins,
                             spectrum_range,
                             colors=colors,
                             syn=synonymous_flag)
         elif entity_type:
             print("%s matched with UniProt ID: %s" % (entity.upper(),
                                                       entity_type))
             entity = entity_type  # An HGNC ID was detected and converted to UNP ID
             v.visualize_unp(entity,
                             anno_list,
                             eps,
                             mins,
                             spectrum_range,
                             colors=colors,
                             syn=synonymous_flag)
         else:
             msg = "Sorry, but the specified entity is not in the PDBMap database.\n"
             sys.stderr.write(msg)
             return 1
     except Exception as e:
         # The message used to be built and then dropped; report it before
         # letting the original exception propagate unchanged.
         msg = "ERROR (PDBMap) Visualization failed: %s" % str(e)
         sys.stderr.write(msg + "\n")
         raise
Example #8
0
    # Parse `remaining_argv` into the `args` namespace and record which
    # configuration file was used.
    args = parser.parse_args(remaining_argv)
    args.conf_file = conf_file
    # NOTE(review): the return value is discarded, so this call appears to
    # have no effect here - confirm whether it can be removed.
    parser.get_default("vep")
    # Normalize option types before they are used below
    args.create_new_db = bool(args.create_new_db)
    args.force = bool(args.force)
    args.cores = int(args.cores)

    # Creating a new database requires interactive confirmation unless
    # args.force is set (in which case this whole block is skipped).
    if args.create_new_db and not args.force:
        print "You have opted to create a new database: %s." % args.dbname
        # Only an exact 'y' confirms; any other answer declines
        if raw_input("Are you sure you want to do this? (y/n):") != 'y':
            # NOTE(review): nothing exits here, so execution continues with
            # the code after this block - confirm that is intended.
            print "Aborting..."
        else:
            print "Creating database tables..."
            # Presumably the tables are created as a side effect of
            # constructing PDBMapIO with createdb=True - TODO confirm.
            io = PDBMapIO(args.dbhost,
                          args.dbuser,
                          args.dbpass,
                          args.dbname,
                          createdb=True)
            print "\nDatabase created. Please set create_new_db to False."
            print "\nIt is strongly recommended that you now refresh the local resource cache."
            # Either switch the command to "refresh" or stop the program
            if raw_input(
                    "Would you like to refresh the cache now? (y/n):") == 'y':
                print "Refreshing local cache..."
                args.cmd = "refresh"  # continue on to cache refresh
            else:
                sys.exit(0)

    # Initialize PDBMap, refresh mirrored data if specified
    if args.cmd == "refresh":
        io = PDBMapIO(args.dbhost, args.dbuser, args.dbpass, args.dbname)
        try: