Ejemplo n.º 1
0
    def get_pdb_template(
        self,
        sequence,
    ):
        """
        Retrieve candidate template structures from the PDB via a BLAST search.

        The sequence is submitted with ProDy's ``blastPDB`` and the hits of
        the resulting record are returned unfiltered.

        Parameters
        ----------
        sequence: str
            A string of the protein sequence

        Returns
        -------
        hits: dict
            A dictionary generated from ProDy with PDB hits.
        """

        # Function-scope import: prody is only loaded when a search is run.
        from prody import blastPDB

        blast_record = blastPDB(sequence)

        #  TODO need to address when BLAST search times out
        #  TODO add option based on sequence similarity cut off

        return blast_record.getHits()
Ejemplo n.º 2
0
def prody_blast(opt):
    """Blast search PDB based on command line arguments.

    The following attributes are read from *opt*:

    * ``seq`` -- a protein sequence, or the path of a FASTA file whose first
      sequence is used
    * ``identity`` / ``coverage`` -- percent cutoffs, each must lie strictly
      between 0 and 100
    * ``folder`` -- when set, the hits are downloaded into this directory
    * ``gzip`` -- forwarded to ``prody.fetchPDB`` as ``compressed``
    * ``subparser`` -- its ``error`` method is used to report invalid input

    Matching hits are printed one per line (PDB id, chain, percent identity
    and title), ordered by decreasing percent identity.
    """

    import prody
    LOGGER = prody.LOGGER
    seq = opt.seq
    if os.path.isfile(seq):
        # Keep the path so the log message names the file, not the sequence
        # that was just read from it.
        fasta_file = seq
        title, seq = readFirstSequenceFasta(fasta_file)
        LOGGER.info("First sequence ({0:s}) is parsed from {1:s}."
                    .format(title, repr(fasta_file)))
    if not seq.isalpha() or not seq.isupper():
        opt.subparser.error("{0:s} is not a valid sequence or a file"
                            .format(repr(seq)))

    identity, coverage = opt.identity, opt.coverage
    if not 0 < identity < 100:
        opt.subparser.error('identity must be between 0 and 100')
    if not 0 < coverage < 100:
        opt.subparser.error('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(seq)
    hits = blast_results.getHits(percent_identity=identity,
                                 percent_coverage=coverage)

    # Order hits by decreasing percent identity; ties fall back to the
    # PDB identifier so the output order is deterministic.
    ranked = sorted(hits,
                    key=lambda pdb: (-hits[pdb]['percent_identity'], pdb))

    for pdb in ranked:
        hit = hits[pdb]
        print(pdb + ' ' + hit['chain_id'] + ' ' +
              ('%5.1f%%' % (hit['percent_identity'])) + ' ' + hit['title'])

    # download hits if --folder is given
    if opt.folder:
        LOGGER.info('Downloading hits to ' + opt.folder)
        prody.fetchPDB(ranked, opt.folder,
                       compressed=opt.gzip, copy=True)
def get_PDBs(pdbdir, output_root='/kortemmelab/home/james.lucas/Mutant_PDBs'):
    """BLAST every ``.pdb`` file in *pdbdir* and download the shared hits.

    For each PDB file an output directory named after the file (without the
    ``.pdb`` suffix) is created under *output_root*.  If that directory
    cannot be created (typically because it already exists) the file is
    skipped.  Otherwise every sequence returned by ``generate_mut_seq`` is
    BLASTed against the PDB, the hits common to all sequences are downloaded
    into the output directory, and ``alignments`` is run on them.

    :arg pdbdir: directory that is scanned for ``.pdb`` files
    :type pdbdir: str

    :arg output_root: directory under which the per-structure output
        directories are created; defaults to the previously hard-coded path
    :type output_root: str
    """
    for pdb_file in os.listdir(pdbdir):
        if not pdb_file.endswith('.pdb'):
            continue

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\n***Now BLASTing %s***" % pdb_file)
        pdb_name = pdb_file[:-4]
        output_dir = os.path.join(output_root, pdb_name)

        # The skip decision is made per file.  A flag shared across
        # iterations used to stay set after the first existing directory,
        # silently skipping every file that followed.
        try:
            os.makedirs(output_dir)
        except OSError:
            print('File directory %s already exists!!!' % pdb_name)
            print('Blast skipped!')
            continue

        hv = prody.parsePDB(os.path.join(pdbdir, pdb_file)).getHierView()
        blast_dict = generate_mut_seq(pdbdir, hv)

        # One set of hit identifiers per BLASTed sequence.
        hit_sets = []
        for key in blast_dict:
            blast_me = prody.blastPDB(blast_dict[key], timeout=600)
            hit_sets.append(set(blast_me.getHits(percent_identity=95)))

        # Only hits found for every sequence are kept; with no sequences at
        # all there is nothing in common rather than an IndexError.
        common_set = set.intersection(*hit_sets) if hit_sets else set()

        for hit in common_set:
            rcsb.download_pdb(hit, output_dir)

        alignments(pdb_file, common_set, pdbdir)
Ejemplo n.º 4
0
def prody_blast(sequence, **kwargs):
    """Blast search PDB and download hits.

    Hits are ordered by decreasing percent identity.  Unless *stdout* is
    true, they are returned as a list of ``(pdb, chain, identity, title)``
    tuples, where *identity* is a string formatted as ``'%5.1f%%'``.  When
    *stdout* is true, one line per hit is written to :data:`sys.stdout` and
    **None** is returned.

    :arg sequence: sequence or file in fasta format

    :arg identity: percent sequence identity for blast search, default is 90.0,
        must be strictly between 0 and 100
    :type identity: float

    :arg overlap: percent sequence overlap between sequences, default is 90.0,
        must be strictly between 0 and 100
    :type overlap: float

    :arg outdir: download uncompressed PDB files to given directory
    :type outdir: str

    :arg gzip: write compressed PDB file

    :arg stdout: write hits to standard output instead of returning them,
        default is **False**
    :type stdout: bool

    *Blast Parameters*

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    :arg hitlist_size: search parameters, default is 250
    :type hitlist_size: int

    :arg expect: search parameters, default is 1e-10
    :type expect: float

    :arg sleep: how long to wait to reconnect for results, default is 2
                sleep time is doubled when results are not ready.
    :type sleep: int

    :arg timeout: when to give up waiting for results. default is 30
    :type timeout: int

    :raises ValueError: if *sequence* is neither an upper-case alphabetic
        sequence nor a file, or if *identity* / *overlap* is out of range
    :raises IOError: if the blast search returned no results"""

    import sys

    import prody
    LOGGER = prody.LOGGER
    if os.path.isfile(sequence):
        # Keep the path so the log message names the file, not the sequence
        # that was just read from it.
        fasta_file = sequence
        title, sequence = readFirstSequenceFasta(fasta_file)
        LOGGER.info("First sequence ({0}) is parsed from {1}."
                    .format(title, repr(fasta_file)))
    if not sequence.isalpha() or not sequence.isupper():
        raise ValueError("{0} is not a valid sequence or a file"
                         .format(repr(sequence)))

    outdir = kwargs.get('outdir')
    identity, overlap = kwargs.get('identity', 90), kwargs.get('overlap', 90)
    if not 0 < identity < 100:
        raise ValueError('identity must be between 0 and 100')
    if not 0 < overlap < 100:
        raise ValueError('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(sequence,
                                   filename=kwargs.get('filename', None),
                                   hitlist_size=kwargs.get('hitlist_size', 250),
                                   expect=kwargs.get('expect', 1e-10),
                                   sleep=kwargs.get('sleep', 2),
                                   timeout=kwargs.get('timeout', 30))

    if blast_results is None:
        raise IOError('blast search timed out, please try again')

    hits = blast_results.getHits(percent_identity=identity,
                                 percent_overlap=overlap)

    # Order hits by decreasing percent identity; ties fall back to the
    # PDB identifier so the order is deterministic.
    ranked = sorted(hits,
                    key=lambda pdb: (-hits[pdb]['percent_identity'], pdb))

    to_stdout = kwargs.get('stdout', False)
    final_hits = []

    for pdb in ranked:
        hit = hits[pdb]
        chain = hit['chain_id']
        percent = '%5.1f%%' % (hit['percent_identity'])
        if to_stdout:
            # Terminate each record with a newline; without it all hits
            # were concatenated onto a single output line.
            sys.stdout.write(pdb + ' ' + chain + ' ' + percent + ' ' +
                             hit['title'] + '\n')
        else:
            final_hits.append((pdb, chain, percent, hit['title']))

    # download hits if --output-dir is given
    if outdir:
        LOGGER.info('Downloading hits to ' + outdir)
        prody.fetchPDB(ranked, outdir,
                       compressed=kwargs.get('gzip'), copy=True)

    if not to_stdout:
        return final_hits
Ejemplo n.º 5
0
def prody_blast(sequence, **kwargs):
    """Blast search PDB and download hits.

    Hits are ordered by decreasing percent identity.  Unless *stdout* is
    true, they are returned as a list of ``(pdb, chain, identity, title)``
    tuples, where *identity* is a string formatted as ``'%5.1f%%'``.  When
    *stdout* is true, one line per hit is written to :data:`sys.stdout` and
    **None** is returned.

    :arg sequence: sequence or file in fasta format

    :arg identity: percent sequence identity for blast search, default is 90.0,
        must be strictly between 0 and 100
    :type identity: float

    :arg overlap: percent sequence overlap between sequences, default is 90.0,
        must be strictly between 0 and 100
    :type overlap: float

    :arg outdir: download uncompressed PDB files to given directory
    :type outdir: str

    :arg gzip: write compressed PDB file

    :arg stdout: write hits to standard output instead of returning them,
        default is **False**
    :type stdout: bool

    *Blast Parameters*

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    :arg hitlist_size: search parameters, default is 250
    :type hitlist_size: int

    :arg expect: search parameters, default is 1e-10
    :type expect: float

    :arg sleep: how long to wait to reconnect for results, default is 2
                sleep time is doubled when results are not ready.
    :type sleep: int

    :arg timeout: when to give up waiting for results. default is 30
    :type timeout: int

    :raises ValueError: if *sequence* is neither an upper-case alphabetic
        sequence nor a file, or if *identity* / *overlap* is out of range
    :raises IOError: if the blast record does not report success"""

    import sys

    import prody
    LOGGER = prody.LOGGER
    if os.path.isfile(sequence):
        # Keep the path so the log message names the file, not the sequence
        # that was just read from it.
        fasta_file = sequence
        title, sequence = readFirstSequenceFasta(fasta_file)
        LOGGER.info("First sequence ({0}) is parsed from {1}.".format(
            title, repr(fasta_file)))
    if not sequence.isalpha() or not sequence.isupper():
        raise ValueError("{0} is not a valid sequence or a file".format(
            repr(sequence)))

    outdir = kwargs.get('outdir')
    identity, overlap = kwargs.get('identity', 90), kwargs.get('overlap', 90)
    if not 0 < identity < 100:
        raise ValueError('identity must be between 0 and 100')
    if not 0 < overlap < 100:
        raise ValueError('overlap must be between 0 and 100')

    blast_results = prody.blastPDB(sequence,
                                   filename=kwargs.get('filename', None),
                                   hitlist_size=kwargs.get('hitlist_size', 250),
                                   expect=kwargs.get('expect', 1e-10),
                                   sleep=kwargs.get('sleep', 2),
                                   timeout=kwargs.get('timeout', 30))

    if not blast_results.isSuccess:
        raise IOError('blast search timed out, please try again')

    hits = blast_results.getHits(percent_identity=identity,
                                 percent_overlap=overlap)

    # Order hits by decreasing percent identity; ties fall back to the
    # PDB identifier so the order is deterministic.
    ranked = sorted(hits,
                    key=lambda pdb: (-hits[pdb]['percent_identity'], pdb))

    to_stdout = kwargs.get('stdout', False)
    final_hits = []

    for pdb in ranked:
        hit = hits[pdb]
        chain = hit['chain_id']
        percent = '%5.1f%%' % (hit['percent_identity'])
        if to_stdout:
            # Terminate each record with a newline; without it all hits
            # were concatenated onto a single output line.
            sys.stdout.write(pdb + ' ' + chain + ' ' + percent + ' ' +
                             hit['title'] + '\n')
        else:
            final_hits.append((pdb, chain, percent, hit['title']))

    # download hits if --outdir is given
    if outdir:
        LOGGER.info('Downloading hits to ' + outdir)
        prody.fetchPDB(ranked,
                       outdir,
                       compressed=kwargs.get('gzip'),
                       copy=True)

    if not to_stdout:
        return final_hits