Python parsePDB Beispiele, prody.proteins.pdbfile.parsePDB Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: calc_mode_ov_with_translation.py Projekt: victor-gil-sepulveda/PhD-ANMPythonHelpers

def calculate_transition_vecs(first_conf, second_conf):
    """Return the flattened coordinate difference between two conformations.

    Both arguments are handed to ``parsePDB``; the ``.ca`` subset of each
    parsed structure is taken and the coordinates of the first are subtracted
    from those of the second.

    :param first_conf: ``parsePDB`` input for the starting conformation.
    :param second_conf: ``parsePDB`` input for the final conformation.
    :return: ``second - first`` coordinates, flattened with ``.flatten()``.
    """
    # A single pre-formatted argument prints the same text with the Python 2
    # print statement and the Python 3 print function.
    print("%s %s" % (first_conf, second_conf))
    first_struct = parsePDB(first_conf).ca
    second_struct = parsePDB(second_conf).ca
    # From first conf to second conf.
    return (second_struct.getCoords() - first_struct.getCoords()).flatten()

Beispiel #2

0

Datei anzeigen

def get_reference_structure(path, selection_str, expected_num_res):
    """Write the selected part of a PDB to ``ref.pdb.chunk`` and check it for gaps.

    :param path: ``parsePDB`` input for the reference structure.
    :param selection_str: selection string applied to the parsed structure.
    :param expected_num_res: number of residues the selection must contain.
    :raises AssertionError: when the residue count of the selection differs
        from ``expected_num_res``.
    """
    reference = parsePDB(path)
    reference_structure = reference.select(selection_str)
    writePDB("ref.pdb.chunk", reference_structure)
    # The reference chunk must NOT have RESIDUE GAPS
    num_residues = reference_structure.getHierView().numResidues()
    # The message is built from the function's own argument instead of an
    # external ``options`` object that is not defined in this scope.
    assert expected_num_res == num_residues,\
    "[ERROR] There are gaps in the reference structure inside this residue range.%d %d"%(expected_num_res, num_residues)

Beispiel #3

0

Datei anzeigen

Datei: Encounter.py Projekt: hrch3n/cNMA

 def loadUnfilteredReceptor(self, utils):
     """Load the unfiltered reference structure for the queued protein.

     The file name is built from the first entry of ``self._pdbQueueItem``
     with the character at index 5 replaced by ``"r"`` (presumably the
     receptor marker -- TODO confirm), prefixed by
     ``utils.config.pathTo2cFiles`` and suffixed by ``.pdb.ms``.

     Sets ``self._unfilteredReferenceAtomCount`` from ``utils.file_len`` and
     ``self._unfilteredReference`` from ``parsePDB``.
     """
     nameChars = list(self._pdbQueueItem[0])
     nameChars[5] = "r"
     receptorFile = "".join(nameChars) + ".pdb.ms"
     path = utils.config.pathTo2cFiles + receptorFile
     self._unfilteredReferenceAtomCount = utils.file_len(path)
     self._unfilteredReference = parsePDB(path)

Beispiel #4

0

Datei anzeigen

Datei: ModeSpectrumVisualizer.py Projekt: hrch3n/cNMA

def getAllModeInformation(folders, resultsPath):
    """Collect mode memberships and overlaps per result folder and plot them.

    For every folder the reference, counterpart and complex models are
    loaded from the first ``*anms.npz`` entry, the unbound and bound complex
    PDBs are parsed, and the deformation vector between their overall match
    is computed.  Memberships and overlaps are stored under the folder's
    base name and finally passed to ``visualizeModeMemberships`` together
    with the mode range 12..42 and ``resultsPath``.

    :param folders: iterable of result folder paths.
    :param resultsPath: forwarded unchanged to ``visualizeModeMemberships``.
    """

    def firstMatch(pattern):
        # First path matching the glob pattern (IndexError when none match).
        return glob.glob(pattern)[0]

    fromMode, toMode = 12, 42

    membershipsByProtein = OrderedDict()
    overlapsByProtein = OrderedDict()

    for resultFolder in folders:
        # Locate and load the three models.
        npzFolder = firstMatch(resultFolder + "/*anms.npz")
        referencePath = firstMatch(npzFolder + "/*reference_ANM.anm.npz")
        counterpartPath = firstMatch(npzFolder + "/*anm_counterpart.anm.npz")
        complexPath = firstMatch(npzFolder + "/*anm_complex.anm.npz")
        anm_reference = loadModel(referencePath)
        anm_counterpart = loadModel(counterpartPath)
        anm_complex = loadModel(complexPath)
        # The complex array must have as many rows as both parts together.
        assert (anm_reference.getArray().shape[0]
                + anm_counterpart.getArray().shape[0]
                == anm_complex.getArray().shape[0])

        # NOTE(review): the whole ``folders`` collection is passed here, not
        # the current ``resultFolder`` -- confirm this is intended.
        resolution = getResolution(folders)

        # Structures and the deformation vector between their matched parts.
        unboundComplex = parsePDB(firstMatch(resultFolder + "/pdbs/*ucomplex.pdb"))
        boundComplex = parsePDB(firstMatch(resultFolder + "/pdbs/*bcomplex.pdb"))
        overallMatch = getOverallMatch(unboundComplex, boundComplex, resolution)
        defvec = calcDeformVector(overallMatch[0], overallMatch[1])

        # Complex model sliced to the matched unbound atoms.
        slicedComplex = getSlicedANM(unboundComplex, overallMatch[0], anm_complex)

        proteinTitle = os.path.basename(resultFolder)

        complexArray = anm_complex.getArray()
        referenceSize = anm_reference.getArray().shape[0]
        counterpartSize = anm_counterpart.getArray().shape[0]
        membershipsByProtein[proteinTitle] = getModeMemberships(
            complexArray, referenceSize, counterpartSize)

        overlapsByProtein[proteinTitle] = getModeOverlaps(
            slicedComplex[0].getArray(), defvec)

    visualizeModeMemberships(membershipsByProtein, overlapsByProtein,
                             fromMode, toMode, resultsPath)

Beispiel #5

0

Datei anzeigen

    def find_gaps(self):
        """
        Parse ``self.pdb`` and look for backbone gaps with ``CheckforGaps``,
        using 1.50 A (peptide bond distance between the N of a residue and
        the C of the previous one) as the cutoff.

        Returns:
            Only the first of the two values produced by ``CheckforGaps``:
            the residues involved in gaps, described in the original
            documentation as a dictionary keyed by chain with
            (previous, current) residue numbers as values. The second value
            (residues without gaps) is discarded.
        """
        gaps, _ = CheckforGaps(parsePDB(self.pdb), 1.50)
        return gaps

Beispiel #6

0

Datei anzeigen

def main():
    """Project the C-alpha coordinates of a PDB file and write both versions.

    Command line: ``pdbFile P [-outputName NAME]``.  The ``calpha`` selection
    of the parsed PDB is written to ``beforeP_<name>``; its flattened
    coordinates are multiplied by the matrix loaded from ``P``, reshaped to
    the original shape, set on the selection and written to
    ``afterP_<name>``.  ``<name>`` defaults to the base name of ``P`` and the
    base name of the PDB file joined by ``_``.  Finally the center of the
    C-alpha atoms selected by ``segment "R."`` is printed.

    Prints the help text and exits with status 1 when called without
    arguments.

    :raises AssertionError: when either input path is not an existing file.
    """
    parser = argparse.ArgumentParser(
        description=
        'Project the calpha coordinates of a PDB file by a projection matrix and output the new PDB file with only calpahs'
    )
    parser.add_argument('pdbFile', help='The pdb file')
    parser.add_argument('P', help='the projection matrix')
    parser.add_argument(
        '-outputName',
        help=
        'name of the output file, default is beforeP_P_pdbFile and afterP_P_pdbFile'
    )

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    assert os.path.isfile(args.pdbFile), \
        "pdbFile is not an existing file: %s" % (args.pdbFile,)
    assert os.path.isfile(args.P), \
        "P is not an existing file: %s" % (args.P,)

    if args.outputName:
        outputName = args.outputName
    else:
        outputName = os.path.basename(args.P) + "_" + os.path.basename(
            args.pdbFile)
    beforeName = "beforeP_" + outputName
    afterName = "afterP_" + outputName

    # read files
    pdbFile = parsePDB(args.pdbFile)
    pdbFile_ca = pdbFile.select('calpha')

    P = np.loadtxt(args.P)

    writePDB(beforeName, pdbFile_ca)
    coord_shape = pdbFile_ca.getCoords().shape
    # The matrix acts on the flattened coordinate vector; the result is
    # folded back into the original coordinate shape.
    coords_P = P.dot(pdbFile_ca.getCoords().flatten())
    coords_P = coords_P.reshape(coord_shape)
    pdbFile_ca.setCoords(coords_P)
    writePDB(afterName, pdbFile_ca)

    referenceSegment = "R"
    referenceOfComplex = pdbFile_ca.select('segment \"' + referenceSegment +
                                           '.\"')
    # Each message is pre-formatted into one string so the output is the same
    # under the Python 2 print statement and the Python 3 print function
    # (the double space reproduces the separator the print statement added).
    print("Center of fixed (receptor) frame is:  %s" % (
        calcCenter(referenceOfComplex.select('calpha')),))

    print("calphas before projection written to:  %s" % (beforeName,))
    print("calphas after projection written to:  %s" % (afterName,))

Beispiel #7

0

Datei anzeigen

Datei: blastpdb.py Projekt: uibcdf/ProDy

def blastPDB(sequence, filename=None, **kwargs):
    """Build a :class:`PDBBlastRecord` for *sequence*.

    *sequence* is normalized to a plain sequence string first:

    * ``'runexample'`` selects a built-in example sequence,
    * an :class:`Atomic` contributes the sequence of its ``calpha`` subset,
    * a :class:`Sequence` is converted with ``str``,
    * a str of length 4, 5 or 6 is handed to ``parsePDB`` and the sequence
      of the parsed structure's ``calpha`` subset is used; every str has its
      whitespace removed.

    :arg sequence: an object with an associated sequence string
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: passed to :class:`PDBBlastRecord` as its first argument
    :type filename: str

    Remaining keyword arguments are forwarded to :class:`PDBBlastRecord`
    unchanged.

    :raises TypeError: when *sequence* is none of the supported types.
    """

    if sequence == 'runexample':
        query_sequence = (
            'ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
            'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
            'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')
    elif isinstance(sequence, Atomic):
        query_sequence = sequence.calpha.getSequence()
    elif isinstance(sequence, Sequence):
        query_sequence = str(sequence)
    elif isinstance(sequence, str):
        query_sequence = sequence
        # Short strings are handed to parsePDB instead of being used as a
        # sequence directly.
        if 4 <= len(query_sequence) <= 6:
            structure = parsePDB(query_sequence)
            query_sequence = structure.calpha.getSequence()
        query_sequence = ''.join(query_sequence.split())
    else:
        message = 'sequence must be Atomic, Sequence, or str not {0}'.format(
            type(sequence))
        raise TypeError(message)

    return PDBBlastRecord(filename, query_sequence, **kwargs)

Beispiel #8

0

Datei anzeigen

Datei: main.py Projekt: victor-gil-sepulveda/PhD-GPCR

    open(TABLE_PATH,"w").write(tabulate.tabulate(table, headers, tablefmt="rst"))
     
    #--------------------------------
    # Get Residues < 2A of the ligand
    #--------------------------------
    CLUSTERS_PATH = os.path.join(CLUSTERING_PATH, "clusters")
    CONTACTS_FILE = os.path.join(RESULTS_PATH,"residue_contacts.txt")
    CONTACTS_DISTANCE = 2.0

    cluster_pdbs = os.listdir(CLUSTERS_PATH)
    residues_file = open(CONTACTS_FILE,"w")
    contacts_per_cluster = {}
    for cluster_pdb in cluster_pdbs:
        if ".pdb" in cluster_pdb:
            cluster_id,_ = os.path.splitext(os.path.basename(cluster_pdb))
            struct = parsePDB(os.path.join(CLUSTERS_PATH,cluster_pdb))
            protein = struct.select("protein")
            ligand = struct.select("chain C")
            residues = []
            for i in range(protein.numCoordsets()):
                protein.setCoords(protein.getCoordsets()[i])
                ligand.setCoords(ligand.getCoordsets()[i])
                prot_contacts = Contacts(protein)
                
                contacts =  prot_contacts.select(CONTACTS_DISTANCE, ligand)
                
                if contacts is not None:
                    # Get residue names
                    for atom in iter(contacts):
                        residues.append("%s:%s:%s"%(atom.getResnum(), atom.getResname(), atom.getChid()))

Beispiel #9

0

Datei anzeigen

Datei: domain_distance.py Projekt: victor-gil-sepulveda/PhD-ANMPythonHelpers

        distances = {}
        for workspace in workspaces:
            distances[workspace] = {}
            for folder in folders:
                T = int(folder.split("_")[1])
                inner_folders = glob.glob(
                    os.path.join(folder, workspace, "*rmsg*"))
                distances[workspace][T] = {}
                for inner_folder in inner_folders:
                    name = os.path.basename(inner_folder)
                    prefix, p1, v1, p2, v2 = name.split("_")

                    try:
                        traj_file = os.path.join(inner_folder,
                                                 "trajectory.pdb")
                        pdb = parsePDB(traj_file, subset='ca')
                        res_coords = pdb.select(
                            "resid 277 or resid 387").getCoordsets()
                        if max_confs is not None:
                            res_coords = res_coords[0:max_confs]

                        ds = []
                        for cys_coords, leu_coords in res_coords:
                            ds.append(norm(leu_coords - cys_coords))
                        distances[workspace][T][v1, v2] = ds
                    except IOError:
                        print "Trajectory not read", traj_file
                        pass
        pickle.dump(
            distances,
            open(os.path.join(options.results, "domain_dist_data"), "w"))

Beispiel #10

0

Datei anzeigen

'''
Created on 25/9/2015

@author: victor
'''
import sys
import numpy
from prody.proteins.pdbfile import parsePDB
from prody.measure.measure import calcPhi, calcPsi

if __name__ == '__main__':
    # Usage: <script> trajectory.pdb
    # Writes "<trajectory>.ang": one row per coordinate set, holding for each
    # residue its phi and then its psi angle (radians), 0. where an angle
    # could not be computed.
    trajectory_path = sys.argv[1]

    structure = parsePDB(trajectory_path)
    all_angles = []
    for coords in structure.getCoordsets():
        angles = []
        structure.setCoords(coords)
        hv = structure.getHierView()
        for residue in hv.iterResidues():
            for calc_angle in (calcPhi, calcPsi):
                try:
                    angles.append(calc_angle(residue, radian=True))
                except Exception:
                    # Best effort: an angle that cannot be computed is stored
                    # as 0.  ``Exception`` (not a bare except) lets
                    # KeyboardInterrupt and SystemExit stop the script.
                    angles.append(0.)
        all_angles.append(angles)
    all_angles = numpy.array(all_angles)
    numpy.savetxt(trajectory_path + ".ang", all_angles, fmt="%.4f")

Beispiel #11

0

Datei anzeigen

    omega_angles = []
    for residue in conformation.iterResidues():
        try:
            omega_angles.append((residue.getResname(),calcOmega(residue, radian=True)))
        except ValueError:
            omega_angles.append(("--",0))
    return numpy.array(omega_angles)

if __name__ == '__main__':
    # Options: -i input PDB, -o output file, -w write omega angles instead of
    # the dihedrals returned by get_dihedrals_for_conformation.
    parser = OptionParser()
    parser.add_option("-i", dest="input")
    parser.add_option("-o", dest="output")
    parser.add_option("-w", action= "store_true", dest="omega")
    (options, args) = parser.parse_args()

    pdb = parsePDB(options.input)

    if not options.omega:
        angles = get_dihedrals_for_conformation(pdb)
        numpy.savetxt(options.output, angles, delimiter = "\n")
    else:
        angles = get_omegas_for_conformation(pdb)
        # The context manager closes (and flushes) the file deterministically.
        with open(options.output, "w") as output_file:
            output_file.write("\n".join([str(omega) for omega in angles]))
    
    
#    
#open = numpy.loadtxt("5XHK_helix.dih")
#closed = numpy.loadtxt("9WVG_helix.dih")
#numpy.savetxt("diff.dih", open-closed)

Beispiel #12

0

Datei anzeigen

Datei: blastpdb.py Projekt: fongchun/ProDy

def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``1e-10``)
    search parameters can be adjusted by the user.  *sleep* keyword argument
    (default is ``2`` seconds) determines how long to wait to reconnect for
    results.  Sleep time is multiplied by 1.5 when results are not ready.  
    *timeout* (default is 120 s) determines when to give up waiting for the results.

    Returns **None** when the search does not finish within *timeout*.

    :raises TypeError: when *sequence* is none of the supported types
    :raises ValueError: when *expect* or *hitlist_size* is not positive
    :raises Exception: when the submission response contains no ``RID =``
    """

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        # Strings of length 4-6 are handed to parsePDB instead of being used
        # as a sequence directly.
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    headers = {'User-agent': 'ProDy'}
    query = [('DATABASE', 'pdb'), ('ENTREZ_QUERY', '(none)'),
             ('PROGRAM', 'blastp'),]

    # NOTE(review): 10e-10 equals 1e-9, while the docstring documents 1e-10
    # as the default -- confirm which one is intended.
    expect = float(kwargs.pop('expect', 10e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    # Python 3 needs the encoded form data as bytes; Python 2 has no
    # urllib.parse and uses urllib.urlencode directly.
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'
                .format(sequence[:5]))
    handle = openURL(url, data=data, headers=headers)

    # The submission response carries the request id used to fetch results.
    html = handle.read()
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    last = html.find(b'\n', index)
    rid = html[index + len('RID ='):last].strip()

    index = html.find(b'RTOE =')
    if index == -1:
        rtoe = None # This is not used
    else:
        last = html.find(b'\n', index)
        rtoe = int(html[index + len('RTOE ='):last].strip())

    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        index = results.find(b'Status=')
        LOGGER.clear()
        # No status line in the response: treated as the final results.
        if index < 0:
            break
        last = results.index(b'\n', index)
        status = results[index+len('Status='):last].strip()
        # ``results`` is searched with bytes patterns, so the status is
        # compared against a bytes literal; a str literal never matches
        # bytes on Python 3 (on Python 2 both are the same type).
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None
    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        # filename is None (or not a string): results are not saved.
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        # Binary mode: the response body is written exactly as received
        # (text mode rejects bytes on Python 3); the context manager closes
        # the file even when the write fails.
        with open(filename, 'wb') as out:
            out.write(results)
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)

Beispiel #13

0

Datei anzeigen

Datei: psiblast.py Projekt: fongchun/ProDy

def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    a single cycle of EBI psiblast.

    :arg sequence: an object with an associated sequence string 
         or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as 
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is the placeholder address defined in the code
        (it appears redacted in this copy of the source)
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70' 
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments. 
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap. 
        Increasing the gap extension penalty favors short gaps in the final alignment, 
        conversely decreasing the gap extension penalty favors long gaps in the final alignment. 
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported. 
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for 
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with 
        low complexity sequences where matches are found due to composition rather than 
        meaningful sequence similarity. However, in some cases filtering also masks 
        regions of interest and so should be used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST 
        to only use residues 34 to 89, inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration. 
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the 
        hits from the previous iteration to use to construct the search PSSM 
        for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration. 
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results 
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    Returns ``(job_id, PsiBlastRecord)`` when *cycle* is not 1, only the job
    id when *cycle* is 1, and **None** when the job is still running after
    *timeout*.
    """
    cycle = kwargs.get('cycle',0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        # Strings of length 4-6 are handed to parsePDB instead of being used
        # as a sequence directly.
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        # Without a sequence the call cannot be an initial (cycle 0) search.
        if cycle == 0: 
            cycle = 1
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    # NOTE(review): the default address looks redacted in this copy of the
    # source -- restore the project's real contact address.
    email = kwargs.get('email','*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2:
        raise ValueError('email must be a valid email address with at least one . and exactly one @ sign')
    elif email.rfind('.') < email.find('@'):
        # The last dot must come after the @ sign.  Comparing positions
        # directly avoids searching for the text after the last dot, which
        # may also occur earlier in the address.
        raise ValueError('email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    # Strings are compared by value; identity checks against a literal are
    # implementation dependent.
    previousjobid = kwargs.get('previousjobid','')
    if previousjobid != '':
        query.append(('previousjobid',previousjobid))

    selectedHits = kwargs.get('selectedHits','')
    if selectedHits != '':
        query.append(('selectedHits',selectedHits))

    database = kwargs.get('database','pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database',database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix',matrix))

    gapopen = kwargs.get('gapopen',11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen',gapopen))

    gapext = kwargs.get('gapext',1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext',gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr',expthr))

    psithr = kwargs.get('psithr',1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr',psithr))

    scores = kwargs.get('scores',500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores',scores))

    alignments = kwargs.get('alignments',500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments',alignments))

    query.append(('alignView',0))

    dropoff = kwargs.get('dropoff',15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff',dropoff))

    finaldropoff = kwargs.get('finaldropoff',25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff',finaldropoff))

    # Local name chosen so the ``filter`` builtin is not shadowed; the
    # keyword and query key are still 'filter'.
    filter_flag = kwargs.get('filter','F')
    checkPsiBlastParameter('filter', filter_flag)
    query.append(('filter',filter_flag))

    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        # Both ends must parse as integers; the values are only validated.
        try:
            for bound in seqrange.split('-'):
                int(bound)
        except ValueError:
            raise ValueError('seqrange should be START-END with START and END being integers')
        query.append(('seqrange',seqrange))

    headers = { 'User-Agent' : 'ProDy' }

    # Python 3 needs the encoded form data as bytes; Python 2 has no
    # urllib.parse and uses urllib.urlencode directly.
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    def _as_text(raw):
        # Responses read on Python 3 are bytes; they are concatenated and
        # compared with str below, so decode them (a no-op on Python 2).
        if isinstance(raw, str):
            return raw
        return raw.decode('utf-8')

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'
                    .format(sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching PDB database, cycle={0}'
                    .format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = _as_text(handle.read())
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = _as_text(handle.read())
    handle.close()

    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = _as_text(handle.read())
        handle.close()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')

    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()

        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            # filename is None (or not a string): results are not saved.
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            # Binary mode: the response body is written exactly as received
            # (text mode rejects bytes on Python 3).
            with open(filename, 'wb') as f_out:
                f_out.write(results)
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id

Beispiel #14

0

Datei anzeigen

Datei: motif_rmsd.py Projekt: victor-gil-sepulveda/PhD-GPCR

    if options.motifs is None:
        parser.error("You must specify the motifs definition file.")
    else:
        motifs = parse_motifs(options.motifs)
        ordered_motifs = ["F/I-II", "A/IV", "B/V", "C/VI", "D", "E/VII", "Priming Loop"]

    ordered_proteins = ["JEV", "WNV", "TBEV", "BVDV", "HCV", "Polio"]
    if options.data is None:
        rmsd_results = {}
        for protein in ordered_proteins:
            rmsd_results[protein] = {"Drug": [], "RMSD": [], "Motif": []}
            for drug in ["CMA", "CMC", "DMA"]:
                path = os.path.join(drug, "ca_%s_%s.pdb" % (protein, drug))
                print "Working with", path
                pdb = parsePDB(path, subset="ca", csets=range(1000))
                print "Loaded"
                for motif in ordered_motifs:
                    if motif in motifs[protein]:
                        cas = pdb.select("name CA")
                        motif_cas = pdb.select(
                            "resid %d to %d" % (motifs[protein][motif][0], motifs[protein][motif][1])
                        )
                        calculator = RMSDCalculator(
                            calculatorType="QCP_OMP_CALCULATOR",
                            fittingCoordsets=cas.getCoordsets(),
                            calculationCoordsets=motif_cas.getCoordsets(),
                        )
                        rmsds = calculator.oneVsFollowing(0)
                        rmsd_results[protein]["RMSD"].extend(rmsds)
                        rmsd_results[protein]["Drug"].extend([drug] * len(rmsds))

Beispiel #15

0

Datei anzeigen

Datei: extract_motif_structure.py Projekt: victor-gil-sepulveda/PhD-GPCR

'''
Created on Dec 10, 2015

@author: victor
'''
from prody.proteins.pdbfile import parsePDB, writePDB
import sys
from histogram import parse_motifs
import urllib

if __name__ == '__main__':
    # Usage: <script> structure.pdb motifs_file protein_name
    # For every motif of the protein, writes the selected residue range to
    # "<url-quoted motif name>.pdb" and prints the residue numbers found.

    # ``urllib.quote`` exists only on Python 2; Python 3 provides it in
    # ``urllib.parse``.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    pdb = parsePDB(sys.argv[1])
    motifs = parse_motifs(sys.argv[2])
    prot_name = sys.argv[3]
    for motif in motifs[prot_name]:
        motif_struct = pdb.select("resid %d to %d"%motifs[prot_name][motif])
        # safe="" also quotes "/" so the motif name is a single file name.
        writePDB("%s.pdb"%quote(motif, safe=""),motif_struct)
        # Pre-formatted into one string: same output under the Python 2
        # print statement and the Python 3 print function.
        print("%s %s" % (motif, sorted(set(motif_struct.getResnums()))))

Beispiel #16

0

Datei anzeigen

Datei: mode_analysis.py Projekt: victor-gil-sepulveda/PhD-ANMPythonHelpers

        "ubi_cut.fixed.pdb", "ubi_start.pdb"
    ]
    protein_ids = {
        "1ddt.fixed.pdb": "1ddt",
        "1ex6.fixed.pdb": "1ex6",
        "1ggg.fixed.pdb": "1ggg",
        "4ake.fixed.pdb": "4ake",
        "src_kin.fixed.pdb": "1y57",
        "src_kin2.fixed.pdb": "1y57_MD",
        "2lzm.fixed.pdb": "2lzm",
        "ubi_cut.fixed.pdb": "1ubq_cut",
        "ubi_start.pdb": "1ubq"
    }

    structs = [
        parsePDB(os.path.join("structs", pdb_file)) for pdb_file in proteins
    ]
    structs_dict = dict(zip(proteins, structs))
    sizes = [struct.numResidues() for struct in structs]
    size_per_protein = dict(zip(proteins, sizes))
    # order proteins per size
    size_ordered_proteins = [s[1] for s in sorted(zip(sizes, proteins))]

    nmd_file_name = {
        "CC": "normalized_modes.1.nmd",
        "IC": "normalized_modes_cc.1.nmd",
        "IC_FULL": "normalized_modes_cc_full.1.nmd"
    }
    prefixes = {"CC": "CC", "IC": "IC", "IC_FULL": "IC"}
    workspace_folder = {"CC": "cc", "IC": "ic", "IC_FULL": "ic"}

Beispiel #17

0

Datei anzeigen

Datei: dihedral_validation.py Projekt: victor-gil-sepulveda/PhD-ANMPythonHelpers

'''
Created on Nov 26, 2015

@author: victor
'''
from prody.proteins.pdbfile import parsePDB
import sys
from calculate_dihedrals import get_dihedrals_for_conformation
import numpy
import math

if __name__ == '__main__':
    # Command line: <multi-model PDB file>
    structure = parsePDB(sys.argv[1])

    frames = structure.getCoordsets()
    # Walk over every pair of consecutive coordinate sets.
    for current, following in zip(frames[:-1], frames[1:]):
        structure.setCoords(current)
        angles_now = get_dihedrals_for_conformation(structure)
        structure.setCoords(following)
        angles_next = get_dihedrals_for_conformation(structure)

        # sin(a - b) = sin(a) cos(b) - cos(a) sin(b); taking the arcsin of
        # that yields the angle change folded into [-pi/2, pi/2].
        # NOTE(review): assumes the dihedrals are in radians -- confirm.
        sin_of_change = (
            numpy.sin(angles_next) * numpy.cos(angles_now) -
            numpy.cos(angles_next) * numpy.sin(angles_now))
        largest_change = numpy.max(numpy.abs(numpy.arcsin(sin_of_change)))
        print(largest_change)

Beispiel #18

0

Datei anzeigen

Datei: vmd_visualization_repr.py Projekt: victor-gil-sepulveda/PhD-GPCR

    import seaborn as sns
    # Fixed palette requested with 15 entries; colors[i] is indexed per
    # coordinate set below, so this presumably caps the number of clusters
    # that can be coloured -- TODO confirm.
    colors = sns.hls_palette(15, l=.3, s=.8)
    
    # VMD execution template
    # NOTE(review): absolute, user-specific path -- the script only runs
    # where this file exists.
    template = open("/home/victor/git/PhD-GPCR/PhD-GPCR-2/data/load_script_representatives.tcl").read()
    
    # Each input line holds three whitespace-separated fields:
    # protein, drug and folder.
    for line in open(options.input):
        protein, drug, folder = line.strip().split()

        # sorted clusters and same color generation always make the same cluster_id, color pair
        representatives_file = os.path.join(folder, "representatives.pdb")
        
        # Results go to <output_folder>/<drug>/<protein>.
        output_folder = os.path.join(options.output_folder, drug, protein)
        create_directory(output_folder)
        
        # Split the representatives file into a protein file (coordinate set
        # 0 only) and a file with the atoms whose residue name is the drug.
        pdb = parsePDB(representatives_file)
        writePDB(os.path.join(output_folder,"protein.pdb"), pdb.select("protein"), csets = [0])
        writePDB(os.path.join(output_folder,"ligands.pdb"), pdb.select("resname %s"%drug))
        
        # One "r g b" line per coordinate set, with no newline after the
        # last one.
        num_clusters = pdb.numCoordsets()
        clusters_file = open(os.path.join(output_folder,"cluster_colors"), "w")
        for i in range(num_clusters):
            clusters_file.write("%.2f %.2f %.2f%s"%(   colors[i][0],
                                                       colors[i][1],
                                                       colors[i][2],
                                ("\n" if i <(num_clusters-1) else "")))
        clusters_file.close()
        
        # Defaults used when no camera option is given; the "#" values
        # presumably comment out the matching template lines -- TODO confirm
        # against the .tcl template.
        camera_settings = ""; camera_settings_zoomed = ""; option_camera = "#"; pre_render_file = ""; rendered_file = ""; option_zoom = "#"
        if options.camera is not None:
            camera_settings = camera[protein][0]

Beispiel #19

0

Datei anzeigen

Datei: prepare_clusters_for_visualization.py Projekt: victor-gil-sepulveda/PhD-GPCR

 # Both the output file and the results folder are mandatory options;
 # parser.error reports the problem to the user.
 if options.output is None:
     parser.error('Output file is needed. The new path for clusters will be stored there.')
 
 if options.results is None:
     parser.error('You must define the results folder')
 
 # The output file receives one "protein drug results_folder" line per
 # input line.
 output = open(options.output,"w")
 if not options.prototypes:
     # Input file contains folders holding one "cluster_*.pdb" file per
     # cluster.
     for line in open(options.input):
         protein, drug, folder = line.strip().split()
         files = glob.glob(os.path.join(folder, "cluster_*.pdb"))
         results_folder = os.path.join(options.results, drug, protein)
         create_directory(results_folder)
         output.write("%s %s %s\n"%(protein, drug, results_folder))
         # Sorting the file names makes the processing order deterministic.
         for i, filename in enumerate(sorted(files)):
             pdb = parsePDB(filename)
             if i == 0:
                 # Extract first frame
                 # (the protein is written once, from the first cluster file)
                 prot = pdb.select("protein")
                 writePDB(os.path.join(results_folder, "%s.pdb"%protein), prot, csets=[0])
             # Extract ligands
             # (atoms whose residue name equals the drug name, written to
             # "ligand_<original file name>")
             ligands = pdb.select("resname %s"%drug)
             writePDB(os.path.join(results_folder, "ligand_%s"%os.path.basename(filename)), ligands)
 else:
     # Input file contains the folders with prototypes file
     for line in open(options.input):
         protein, drug, folder = line.strip().split()
         filename  = os.path.join(folder, "representatives.pdb")
         results_folder = os.path.join(options.results, drug, protein)
         # Look for "REMARK cluster id :"
         # Parse once to get correct ordering of clusters

Beispiel #20

0

Datei anzeigen

    (options, args) = parser.parse_args()

    sequences = []
    all_ca_coords = []
    # Select the CA atoms of every residue id in [from_res, to_res] from the
    # reference file; the last argument is the number of residues expected
    # in that range.
    reference_structure = get_reference_structure(
        options.reference, "name CA and resid " +
        " ".join([str(i)
                  for i in range(options.from_res, options.to_res + 1)]),
        options.to_res - options.from_res + 1)

    # options.db_list is a text file with one PDB path per line.
    for pdb_path in open(options.db_list).readlines():
        pdb_path = pdb_path.rstrip('\r\n')

        # presumably maps reference residue ids to the matching residue ids
        # of this structure (it is indexed that way below) -- TODO confirm
        res_mapping = get_best_res_mapping(options.reference, pdb_path)
        pdb = parsePDB(pdb_path)

        num_res = options.to_res - options.from_res + 1
        # Start with every position marked as a gap / infinite coordinates.
        sequence = ["GAP"] * num_res
        res_selection = []
        coords = [inf, inf, inf] * num_res

        index = 0
        center = numpy.array([0., 0., 0.])
        for r_id in range(options.from_res, options.to_res + 1):
            if r_id in res_mapping:
                t_id = res_mapping[r_id]
                res_selection.append(t_id)
                # Copy the CA selection of the mapped residue.
                residue_ca = pdb.select("name CA and resid %d" % t_id).copy()
                residue_coordsets = residue_ca.getCoordsets()
                # Record the residue name at this position of the range.
                sequence[index] = str(residue_ca.getResnames()[0])

Beispiel #21

0

Datei anzeigen

Datei: calc_confs_mode_overlap.py Projekt: victor-gil-sepulveda/PhD-ANMPythonHelpers

from anmichelpers.tools.tools import norm

if __name__ == '__main__':
    # Each value names a "<NN>.pdb" file (two digits, zero padded) inside
    # base_folder.
    # NOTE(review): 07 and 04 are Python 2 octal-style literals (values 7 and
    # 4); this line is a syntax error under Python 3.
    distances = [33, 29, 24, 17, 13, 07, 04]
    # NOTE(review): absolute, user-specific path.
    base_folder = "/home/victor/Desktop/1AKE_dyn/"
    reference_open = "33.pdb"
    reference_closed = "04.pdb"

    # Calculate eigenvectors
    # (file name -> array with one row per mode)
    all_eigenvectors = {}
    NUM_MODES = 8
    for distance in distances:
        pdb_file = "%02d.pdb" % distance
        pdb_path = os.path.join(base_folder, pdb_file)
        print pdb_path
        pdb_struct = parsePDB(pdb_path)
        # Restrict the model to the alpha carbons.
        pdb_struct_ca = pdb_struct.ca
        # The file name is used as the title of the ANM object.
        pdb_struct_ca_anm = ANM(pdb_file)
        pdb_struct_ca_anm.buildHessian(pdb_struct_ca)  #cutoff 15
        pdb_struct_ca_anm.calcModes(n_modes=NUM_MODES)  #cutoff 15

        # Collect the eigenvector of each mode, rounded to 3 decimals.
        eigenvectors = []
        for i in range(NUM_MODES):
            mode = pdb_struct_ca_anm[i]
            eigenvectors.append(mode.getEigvec().round(3))

        all_eigenvectors[pdb_file] = numpy.array(eigenvectors)

    # File names in reverse lexicographic order ("33.pdb" first).
    others = list(sorted(all_eigenvectors.keys(), reverse=True))

    # Calculate transition vectors

Beispiel #22

0

Datei anzeigen

def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Run a single cycle of EBI PSI-BLAST.

    Returns ``(job_id, PsiBlastRecord)`` when *cycle* is not ``1``, only the
    *job_id* string when *cycle* is ``1``, and **None** when the search does
    not finish within *timeout* seconds.

    :arg sequence: an object with an associated sequence string
         or a sequence string itself.  A string of 4 to 6 characters is
         passed to :func:`parsePDB` and the sequence of the parsed
         structure is used.  When **None** and *cycle* is ``0``, *cycle*
         is set to ``1`` and no sequence is submitted.
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems; must contain exactly
        one @ and a . after it
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and 'PAM70'
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in sequence alignments.
        Increasing the gap opening penalty will decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue in the gap.
        Increasing the gap extension penalty favors short gaps in the final alignment,
        conversely decreasing the gap extension penalty favors long gaps in the final alignment.
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and alignments reported.
        This is the maximum number of times the match is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5, 1.0e-4, 1.0e-3,
        1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of matched sequences for
        inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3, 2.0e-3, 5.0e-3,
        1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity. This can avoid issues with
        low complexity sequences where matches are found due to composition rather than
        meaningful sequence similarity. However, in some cases filtering also masks
        regions of interest and so should be used with caution.
        The value ``'no'`` is sent when this argument is omitted.
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use in the search.
        Example: Specifying '34-89' in an input sequence of total length 100, will tell BLAST
        to only use residues 34 to 89, inclusive.  Only used when neither
        *previousjobid* nor *selectedHits* is given; defaults to
        ``'0-<length of sequence>'``.
    :type seqrange: str of form START-END

    :arg database: a database name from those available. See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST iteration.
        Not sent unless given.
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of the
        hits from the previous iteration to use to construct the search PSSM
        for this iteration.
        Not sent unless given.
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration.
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
         Sleep time is multiplied by 1.5 when results are not ready.
         default is 2 seconds
    :type sleep: float

    :arg timeout:  when to give up waiting for the results
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int

    :raises TypeError: if *sequence*, *email* or *seqrange* has the wrong
        type, or if a default *seqrange* is needed but no sequence is given
    :raises ValueError: if *email* or *seqrange* is malformed
    """
    cycle = kwargs.get('cycle', 0)

    # --- normalise the query sequence -----------------------------------
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        # Very short strings are treated as structure identifiers.
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0:
            cycle = 1
    else:
        raise TypeError(
            'sequence must be Atomic, Sequence, or str not {0}'.format(
                type(sequence)))

    # The sequence itself is only submitted for cycle 0.
    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    # --- validate the contact email -------------------------------------
    # NOTE(review): the default below looks like a redacted placeholder;
    # callers should pass a real address.
    email = kwargs.get('email', '*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.count('@') != 1 or '.' not in email:
        raise ValueError(
            'email must be a valid email address with at least one . and exactly one @ sign'
        )
    elif email.rfind('.') < email.find('@'):
        # Compare the position of the LAST dot with the @ sign.  Searching
        # for the text after the last dot instead would hit an earlier
        # occurrence of that text (e.g. "com@example.com") and reject
        # valid addresses.
        raise ValueError(
            'email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    # --- optional continuation parameters -------------------------------
    previousjobid = kwargs.get('previousjobid', '')
    if previousjobid != '':
        query.append(('previousjobid', previousjobid))

    selectedHits = kwargs.get('selectedHits', '')
    if selectedHits != '':
        query.append(('selectedHits', selectedHits))

    # --- checked search parameters --------------------------------------
    database = kwargs.get('database', 'pdb')
    checkPsiBlastParameter('database', database)
    query.append(('database', database))

    matrix = kwargs.get('matrix', 'BLOSUM62')
    checkPsiBlastParameter('matrix', matrix)
    query.append(('matrix', matrix))

    gapopen = kwargs.get('gapopen', 11)
    checkPsiBlastParameter('gapopen', gapopen)
    query.append(('gapopen', gapopen))

    gapext = kwargs.get('gapext', 1)
    checkPsiBlastParameter('gapext', gapext)
    query.append(('gapext', gapext))

    expthr = kwargs.get('expthr', 10.)
    checkPsiBlastParameter('expthr', expthr)
    query.append(('expthr', expthr))

    psithr = kwargs.get('psithr', 1.0e-3)
    checkPsiBlastParameter('psithr', psithr)
    query.append(('psithr', psithr))

    scores = kwargs.get('scores', 500)
    checkPsiBlastParameter('scores', scores)
    query.append(('scores', scores))

    alignments = kwargs.get('alignments', 500)
    checkPsiBlastParameter('alignments', alignments)
    query.append(('alignments', alignments))

    query.append(('alignView', 0))

    dropoff = kwargs.get('dropoff', 15)
    checkPsiBlastParameter('dropoff', dropoff)
    query.append(('dropoff', dropoff))

    finaldropoff = kwargs.get('finaldropoff', 25)
    checkPsiBlastParameter('finaldropoff', finaldropoff)
    query.append(('finaldropoff', finaldropoff))

    # Local name avoids shadowing the ``filter`` builtin.
    seq_filter = kwargs.get('filter', 'no')
    checkPsiBlastParameter('filter', seq_filter)
    query.append(('filter', seq_filter))

    # --- sequence range (fresh searches only) ---------------------------
    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            if sequence is None:
                raise TypeError(
                    'sequence is required to build the default seqrange')
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            # Only validating that both ends parse as integers.
            for bound in seqrange.split('-')[:2]:
                int(bound)
        except ValueError:
            raise ValueError(
                'seqrange should be START-END with START and END being integers'
            )
        query.append(('seqrange', seqrange))

    headers = {'User-Agent': 'ProDy'}

    # Python 3 needs the encoded form data as bytes; Python 2 has no
    # urllib.parse and falls back to urllib.urlencode.
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'.format(
            sequence[:5]))
    else:
        LOGGER.info(
            'PSI-Blast searching PDB database, cycle={0}'.format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    if PY3K:
        job_id = job_id.decode()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    if PY3K:
        status = status.decode()
    handle.close()

    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        if PY3K:
            status = status.decode()
        # Close every polling connection instead of leaking one per retry.
        handle.close()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')

    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()

        # A filename without a ``lower`` method (e.g. None) means
        # "do not save".
        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            # The file is opened in text mode, so bytes read on Python 3
            # must be decoded before writing.
            if PY3K and isinstance(results, bytes):
                text = results.decode()
            else:
                text = results
            with open(filename, 'w') as f_out:
                f_out.write(text)
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id

Beispiel #23

0

Datei anzeigen

Datei: blastpdb.py Projekt: emalacs/FF_creator

def blastPDB(sequence, filename=None, **kwargs):
    """Returns a :class:`PDBBlastRecord` instance that contains results from
    blast searching *sequence* against the PDB using NCBI blastp.  Returns
    **None** when the search does not finish within *timeout* seconds.

    :arg sequence: an object with an associated sequence string
         or a sequence string itself.  A string of 4 to 6 characters is
         passed to :func:`parsePDB` and the sequence of the parsed
         structure is used.
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    *hitlist_size* (default is ``250``) and *expect* (default is ``10e-10``,
    i.e. ``1e-9``) search parameters can be adjusted by the user.  *sleep*
    keyword argument (default is ``2`` seconds) determines how long to wait
    to reconnect for results.  Sleep time is multiplied by 1.5 when results
    are not ready.  *timeout* (default is 120 s) determines when to give up
    waiting for the results.

    :raises TypeError: if *sequence* is not one of the accepted types
    :raises ValueError: if *expect* or *hitlist_size* is not positive
    :raises Exception: if the NCBI response contains no request id
    """

    # --- normalise the query sequence -----------------------------------
    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        # Very short strings are treated as structure identifiers.
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    else:
        raise TypeError(
            'sequence must be Atomic, Sequence, or str not {0}'.format(
                type(sequence)))

    # --- build the submission ("Put") request ---------------------------
    headers = {'User-agent': 'ProDy'}
    query = [
        ('DATABASE', 'pdb'),
        ('ENTREZ_QUERY', '(none)'),
        ('PROGRAM', 'blastp'),
    ]

    expect = float(kwargs.pop('expect', 10e-10))
    if expect <= 0:
        raise ValueError('expect must be a positive number')
    query.append(('EXPECT', expect))
    hitlist_size = int(kwargs.pop('hitlist_size', 250))
    if hitlist_size <= 0:
        # Name the parameter that is actually wrong in the message.
        raise ValueError('hitlist_size must be a positive integer')
    query.append(('HITLIST_SIZE', hitlist_size))
    query.append(('QUERY', sequence))
    query.append(('CMD', 'Put'))

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    # Python 3 needs the encoded form data as bytes; Python 2 has no
    # urllib.parse and falls back to urllib.urlencode.
    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

    data = urlencode(query)
    LOGGER.timeit('_prody_blast')
    LOGGER.info('Blast searching NCBI PDB database for "{0}..."'.format(
        sequence[:5]))
    handle = openURL(url, data=data, headers=headers)
    html = handle.read()
    handle.close()

    # --- extract the request id from the response -----------------------
    index = html.find(b'RID =')
    if index == -1:
        raise Exception('NCBI did not return expected response.')
    last = html.find(b'\n', index)
    if last == -1:
        # No trailing newline: take the rest of the response instead of
        # letting the -1 slice silently drop the final character.
        last = len(html)
    rid = html[index + len('RID ='):last].strip()

    # --- poll for the results ("Get" request) ---------------------------
    query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
             ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
    data = urlencode(query)

    while True:
        LOGGER.sleep(int(sleep), 'to reconnect NCBI for search results.')
        LOGGER.write('Connecting to NCBI for search results...')
        handle = openURL(url, data=data, headers=headers)
        results = handle.read()
        # Close every polling connection instead of leaking one per retry.
        handle.close()
        index = results.find(b'Status=')
        LOGGER.clear()
        if index < 0:
            # No status marker in the response: treat it as the results.
            break
        last = results.find(b'\n', index)
        if last == -1:
            last = len(results)
        status = results[index + len('Status='):last].strip()
        # ``results`` is bytes on Python 3, so compare against a bytes
        # literal (identical to a str literal on Python 2).
        if status.upper() == b'READY':
            break
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_blast') > timeout:
            LOGGER.warn('Blast search time out.')
            return None

    LOGGER.clear()
    LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

    # A filename without a ``lower`` method (e.g. None) means "do not save".
    try:
        ext_xml = filename.lower().endswith('.xml')
    except AttributeError:
        pass
    else:
        if not ext_xml:
            filename += '.xml'
        # The file is opened in text mode, so decode bytes on Python 3.
        with open(filename, 'w') as out:
            if PY3K:
                out.write(results.decode())
            else:
                out.write(results)
        LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

    return PDBBlastRecord(results, sequence)