Example #1
0
def queryUniprot(id, loop_through=None):
    """Query Uniprot with *id* and return a `dictionary` containing the results

    :arg id: identifier of the Uniprot record, inserted into the record URL
    :type id: str

    :arg loop_through: entries through which you want to loop dictElements
        until there aren't any elements left, nothing is expanded when this
        is empty or **None**
    :type loop_through: list

    :raises TypeError: if *id* is not a string
    :raises ValueError: if the record could not be opened
    """

    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL(
            'http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not reported as a missing record
        raise ValueError('No Uniprot record found with that id')

    try:
        data = record_file.read()
    finally:
        # release the handle even when reading fails
        record_file.close()
    data = ET.XML(data)

    # Element.getchildren() was removed in Python 3.9, indexing the element
    # returns the same first child on every Python version
    data = dictElement(data[0],
                       '{http://uniprot.org/uniprot}',
                       number_multiples=True)

    if loop_through:
        data = dictElementLoop(data, loop_through,
                               '{http://uniprot.org/uniprot}')

    return data
Example #2
0
def queryUniprot(id, expand=None, regex=True):
    """Query Uniprot with *id* and return a `dict` containing the raw results.
    Regular users should use :func:`searchUniprot` instead.

    :arg id: identifier of the Uniprot record, inserted into the record URL
    :type id: str

    :arg expand: entries through which you want to loop dictElements
        until there aren't any elements left, nothing is expanded when this
        is empty or **None**
    :type expand: list

    :arg regex: when **True**, each entry of *expand* is compiled as a
        regular expression and every key it matches from the start is
        expanded, otherwise the entries are used as keys directly
    :type regex: bool

    :raises TypeError: if *id* is not a string
    :raises ValueError: if the record could not be opened
    """

    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL(
            'http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not reported as a missing record
        raise ValueError('No Uniprot record found with that id')

    try:
        data = record_file.read()
    finally:
        # release the handle even when reading fails
        record_file.close()
    data = XML(data)

    # Element.getchildren() was removed in Python 3.9, indexing the element
    # returns the same first child on every Python version
    data = dictElement(data[0],
                       '{http://uniprot.org/uniprot}',
                       number_multiples=True)

    # iterate over a snapshot, entries are replaced while looping
    for key, value in list(data.items()):
        if not key.startswith('dbReference'):
            continue

        try:
            if value.get('type') != 'PDB':
                continue
        except AttributeError:
            # values without a get method cannot be PDB references
            continue

        # flatten the PDB reference: its id plus the type/value pair of
        # each of its children
        refdata = {'PDB': value.get('id')}
        for prop in value:
            refdata[prop.get('type')] = prop.get('value')
        data[key] = refdata

    if expand:
        if regex:
            keys = []
            for pattern in expand:
                pattern_re = re.compile(pattern)
                keys.extend(key for key in data if pattern_re.match(key))
        else:
            keys = expand
        data = dictElementLoop(data, keys, '{http://uniprot.org/uniprot}')

    return data
Example #3
0
def checkPsiBlastParameter(parameter, value):
    """Checks that the value provided for a parameter is in the xml page for that parameter
    and raises an error if it isn't.

    :arg parameter: parameter name
    :type parameter: str

    :arg value: value being checked
    :type value: any

    :raises TypeError: if *value* is not an instance of the type declared
        for the parameter
    :raises ValueError: if the declared type is not STRING, INTEGER or
        DOUBLE, if no permitted values are listed, or if *value* is not one
        of the listed values
    """
    info_file = urllib2.urlopen(
        'http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/' +
        parameter)
    try:
        data = info_file.read()
    finally:
        # release the connection even when reading fails
        info_file.close()

    data = dictElement(ET.XML(data))

    # declared type name -> Python type used to validate and convert values
    converters = {'STRING': str, 'INTEGER': int, 'DOUBLE': float}
    declared = data['type']
    value_type = converters.get(declared)
    if value_type is None:
        # previously an unbound local; fail with an explicit message instead
        raise ValueError(parameter + ' has unsupported type ' + str(declared))

    if not isinstance(value, value_type):
        # report the parameter being checked (the old code used an
        # undefined name here and died with NameError)
        raise TypeError(parameter + ' should be of type ' +
                        value_type.__name__)

    values = []
    str_values = []
    for element in data['values']:
        raw = dictElement(element)['value']
        values.append(value_type(raw))
        str_values.append(str(raw))

    if not str_values:
        raise ValueError(parameter + ' has no permitted values listed')

    if value not in values:
        if len(str_values) > 1:
            allowed = ', '.join(str_values[:-1]) + ', or ' + str_values[-1]
        else:
            allowed = str_values[0]
        raise ValueError(parameter + ' should be one of ' + allowed)

    return
Example #4
0
def queryUniprot(id, expand=None, regex=True):
    """Query Uniprot with *id* and return a `dictionary` containing the results

    :arg id: identifier of the Uniprot record, inserted into the record URL
    :type id: str

    :arg expand: entries through which you want to loop dictElements
        until there aren't any elements left, nothing is expanded when this
        is empty or **None**
    :type expand: list

    :arg regex: when **True**, each entry of *expand* is compiled as a
        regular expression and every key it matches from the start is
        expanded, otherwise the entries are used as keys directly
    :type regex: bool

    :raises TypeError: if *id* is not a string
    :raises ValueError: if the record could not be opened
    """

    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL('http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not reported as a missing record
        raise ValueError('No Uniprot record found with that id')

    try:
        data = record_file.read()
    finally:
        # release the handle even when reading fails
        record_file.close()
    data = XML(data)

    # Element.getchildren() was removed in Python 3.9, indexing the element
    # returns the same first child on every Python version
    data = dictElement(data[0], '{http://uniprot.org/uniprot}', number_multiples=True)

    # iterate over a snapshot, entries are replaced while looping
    for key, value in list(data.items()):
        if not key.startswith('dbReference'):
            continue

        try:
            if value.get('type') != 'PDB':
                continue
        except AttributeError:
            # values without a get method cannot be PDB references
            continue

        # flatten the PDB reference: its id plus the type/value pair of
        # each of its children
        refdata = {'PDB': value.get('id')}
        for prop in value:
            refdata[prop.get('type')] = prop.get('value')
        data[key] = refdata

    if expand:
        if regex:
            keys = []
            for pattern in expand:
                pattern_re = re.compile(pattern)
                keys.extend(key for key in data if pattern_re.match(key))
        else:
            keys = expand
        data = dictElementLoop(data, keys, '{http://uniprot.org/uniprot}')

    return data
Example #5
0
def checkPsiBlastParameter(parameter, value):
    """Checks that the value provided for a parameter is in the xml page for that parameter
    and raises an error if it isn't.

    :arg parameter: parameter name
    :type parameter: str

    :arg value: value being checked
    :type value: any

    :raises TypeError: if *value* is not an instance of the type declared
        for the parameter
    :raises ValueError: if the declared type is not STRING, INTEGER or
        DOUBLE, if no permitted values are listed, or if *value* is not one
        of the listed values
    """
    info_file = urllib2.urlopen('http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/' + parameter)
    try:
        data = info_file.read()
    finally:
        # release the connection even when reading fails
        info_file.close()

    data = dictElement(ET.XML(data))

    # declared type name -> Python type used to validate and convert values
    converters = {'STRING': str, 'INTEGER': int, 'DOUBLE': float}
    declared = data['type']
    value_type = converters.get(declared)
    if value_type is None:
        # previously an unbound local; fail with an explicit message instead
        raise ValueError(parameter + ' has unsupported type ' + str(declared))

    if not isinstance(value, value_type):
        # report the parameter being checked (the old code used an
        # undefined name here and died with NameError)
        raise TypeError(parameter + ' should be of type ' +
                        value_type.__name__)

    values = []
    str_values = []
    for element in data['values']:
        raw = dictElement(element)['value']
        values.append(value_type(raw))
        str_values.append(str(raw))

    if not str_values:
        raise ValueError(parameter + ' has no permitted values listed')

    if value not in values:
        if len(str_values) > 1:
            allowed = ', '.join(str_values[:-1]) + ', or ' + str_values[-1]
        else:
            allowed = str_values[0]
        raise ValueError(parameter + ' should be one of ' + allowed)

    return
Example #6
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Sequence queries are submitted to the hmmscan service and the hits found
    in the returned XML are returned directly.  All other queries are looked
    up as Pfam protein entries; **None** is returned when the server reports
    a system error for such a lookup."""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except Exception:
            # could not be read as an MSA, use the file content with all
            # whitespace removed as the sequence
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        # cElementTree was removed in Python 3.9
        import xml.etree.ElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")

        # NOTE: the submission below used to be indented under the raise
        # above and never ran, which left url undefined
        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)

        url = urllib2.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError("failed to parse results XML, check URL: " + url)

        # collect every hit before returning, one entry per Pfam accession
        # (version suffix stripped)
        matches = {}
        for child in root[0]:
            if child.tag != "hits":
                continue
            accession = child.get("acc")
            pfam_id = accession.split(".")[0]
            domain = child[0]
            matches[pfam_id] = {
                "accession": accession,
                "class": "Domain",
                "id": child.get("name"),
                "locations": {
                    "ali_end": domain.get("alisqto"),
                    "ali_start": domain.get("alisqfrom"),
                    "bitscore": domain.get("bitscore"),
                    "end": domain.get("alisqto"),
                    "evalue": child.get("evalue"),
                    "evidence": "hmmer v3.0",
                    "hmm_end": domain.get("alihmmto"),
                    "hmm_start": domain.get("alihmmfrom"),
                    "significant": domain.get("significant"),
                    "start": domain.get("alisqfrom"),
                },
                "type": "Pfam-A",
            }
        return matches

    if len(seq) <= 5:
        # short queries are treated as a PDB identifier with an optional
        # chain identifier as the fifth character
        idcode = None
        from prody import parsePDBHeader

        try:
            polymers = parsePDBHeader(seq[:4], "polymers")
        except Exception as err:
            LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != "UniProt":
                        continue
                    idcode = dbref.idcode
                    LOGGER.info(
                        "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid)
                    )
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
        else:
            url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"

    else:
        url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    # keep retrying until data arrives or the '_pfam' timer reaches timeout
    # (presumably LOGGER.timing returns seconds elapsed since timeit)
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError("failed to parse results XML, check URL: " + url)

    # only identifier queries reach this point, sequence queries returned
    # above
    results = dictElement(root[0], prefix)
    try:
        xml_matches = results["matches"]
    except KeyError:
        raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        # repeated accessions share one entry and accumulate locations
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = "Query " + repr(query)

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
Example #7
0
    def __init__(self, results, sequence=None):
        """Instantiate a PDBBlastRecord object instance.

        :arg results: psi-blast search results as an XML string
        :type results: str

        :arg sequence: query sequence
        :type sequence: str

        :raises TypeError: if *sequence* is not a string
        :raises ValueError: if *sequence* contains non-alphabetic characters
            or its length differs from the query length found in *results*
        """

        if sequence:
            try:
                sequence = ''.join(sequence.split())
                valid = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            if not valid:
                raise ValueError('not a valid protein sequence')
        self._sequence = sequence

        namespace = '{http://www.ebi.ac.uk/schema}'

        # Element.getchildren() was removed in Python 3.9, elements are
        # indexed directly instead
        tree = ET.fromstring(results)
        header = tree[0]
        parameters = dictElement(header[2], namespace)

        # dict views cannot be indexed on Python 3, take a list once
        param_items = list(parameters.items())
        first_attrs = list(param_items[0][1][0].items())
        query_len = int(first_attrs[0][1])

        # flat list of (name, value) pairs: attributes of the first child of
        # the first and of the second-to-last parameter entry, plus the
        # remaining entries as they are
        self._params = (first_attrs +
                        param_items[1:-2] +
                        list(param_items[-2][1][0].items()) +
                        [param_items[-1]])

        result = tree[1]
        hits = []
        for hit in result[0]:
            alignments = dictElement(hit[0], namespace)
            data = dictElement(alignments['alignment'], namespace)

            # missing counts are treated as zero
            for key in ['gaps', 'score']:
                data[key] = int(data[key]) if key in data else 0

            data['query-len'] = query_len

            for key in ['bits', 'expectation', 'identity']:
                data[key] = float(data[key])

            p_identity = data['identity']
            data['percent_identity'] = p_identity
            p_overlap = (100.0 * (len(data['querySeq']) - data['gaps']) /
                         query_len)
            data['percent_coverage'] = p_overlap

            hit_attrs = hit.items()
            pdbch = dict(data)
            # first attribute value starts with the PDB id and ends with
            # the chain id, third attribute value is the description
            pdbch['pdb_id'] = hit_attrs[0][1][:4]
            pdbch['chain_id'] = hit_attrs[0][1][-1]
            pdbch['description'] = hit_attrs[2][1]
            hits.append((p_identity, p_overlap, pdbch))

        # highest identity first
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits

        if sequence and len(sequence) != query_len:
            raise ValueError('xml sequence length and the length of the provided '
                             'sequence do not match')
Example #8
0
    def __init__(self, xml, sequence=None):
        """Instantiate a PDBlast object instance.

        :arg xml: blast search results in XML format or an XML file that
            contains the results, strings shorter than 100 characters are
            treated as filenames
        :type xml: str

        :arg sequence: query sequence
        :type sequence: str

        :raises TypeError: if *sequence* is not a string
        :raises ValueError: if *sequence* is not alphabetic, if *xml* is
            neither an existing file nor long enough to be XML, if the
            search database is not ``'swissprot'`` or the program is not
            ``'blastp'``, or if the query length does not match *sequence*
        """

        if sequence:
            try:
                sequence = ''.join(sequence.split())
                valid = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            if not valid:
                raise ValueError('not a valid protein sequence')
        self._sequence = sequence

        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            # cElementTree was removed in Python 3.9
            import xml.etree.ElementTree as ET

        if len(xml) < 100:
            if os.path.isfile(xml):
                root = ET.parse(xml).getroot()
            else:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
        else:
            root = ET.XML(xml)

        root = dictElement(root, 'BlastOutput_')
        if root['db'] != 'swissprot':
            # message now names the database that is actually required
            raise ValueError('blast search database in xml must be '
                             '"swissprot"')
        if root['program'] != 'blastp':
            raise ValueError('blast search program in xml must be "blastp"')

        self._param = dictElement(root['param'][0], 'Parameters_')

        query_len = int(root['query-len'])
        if sequence and len(sequence) != query_len:
            raise ValueError('query-len and the length of the sequence do not '
                             'match, xml data may not be for given sequence')

        int_keys = ['align-len', 'gaps', 'hit-frame', 'hit-from',
                    'hit-to', 'identity', 'positive', 'query-frame',
                    'query-from', 'query-to']
        float_keys = ['evalue', 'bit-score', 'score']

        hits = []
        for iteration in root['iterations']:
            for hit in dictElement(iteration, 'Iteration_')['hits']:
                hit = dictElement(hit, 'Hit_')
                # only the first HSP of each hit is used
                data = dictElement(hit['hsps'][0], 'Hsp_')
                for key in int_keys:
                    data[key] = int(data[key])
                data['query-len'] = query_len
                for key in float_keys:
                    data[key] = float(data[key])
                # identity relative to the aligned query span
                p_identity = 100.0 * data['identity'] / (data['query-to'] -
                                                    data['query-from'] + 1)
                data['percent_identity'] = p_identity
                # coverage relative to the whole query
                p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                              query_len)
                data['percent_coverage'] = p_overlap
                data['percent_overlap'] = p_overlap
                for item in (hit['id'] + ' ' + hit['def']).split('>gi'):
                    # the first whitespace-separated token is a '|' separated
                    # header, its second-to-last field is read as
                    # 'accession.version' and its last field as
                    # 'protName_species'; the rest of the item is the title
                    head = item.split(None, 1)[0].split('|')
                    name_parts = head[-1].split("_")
                    seqInfo = dict(data)
                    seqInfo['accession'] = head[-2].split(".")[0]
                    seqInfo['protName'] = name_parts[0]
                    seqInfo['species'] = name_parts[1]
                    hits.append((p_identity, p_overlap, seqInfo))
        # highest identity first
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits
Example #9
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Sequence queries are submitted to the hmmscan service and the hits found
    in the returned XML are returned directly.  All other queries are looked
    up as Pfam protein entries; **None** is returned when the server reports
    a system error for such a lookup."""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # could not be read as an MSA, use the file content with all
            # whitespace removed as the sequence
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        # cElementTree was removed in Python 3.9
        import xml.etree.ElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        # NOTE: the submission below used to be indented with tabs, which
        # placed it after the raise above (or is a TabError on Python 3)
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan',
                                  enc_params)

        url = urllib2.urlopen(request).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        # collect every hit before returning, one entry per Pfam accession
        # (version suffix stripped)
        matches = {}
        for child in root[0]:
            if child.tag != 'hits':
                continue
            accession = child.get('acc')
            pfam_id = accession.split('.')[0]
            domain = child[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child.get('name'),
                'locations': {
                    'ali_end': domain.get('alisqto'),
                    'ali_start': domain.get('alisqfrom'),
                    'bitscore': domain.get('bitscore'),
                    'end': domain.get('alisqto'),
                    'evalue': child.get('evalue'),
                    'evidence': 'hmmer v3.0',
                    'hmm_end': domain.get('alihmmto'),
                    'hmm_start': domain.get('alihmmfrom'),
                    'significant': domain.get('significant'),
                    'start': domain.get('alisqfrom'),
                },
                'type': 'Pfam-A',
            }
        return matches

    if len(seq) <= 5:
        # short queries are treated as a PDB identifier with an optional
        # chain identifier as the fifth character
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'
                        .format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
        else:
            url = ('http://pfam.xfam.org/protein/' +
                   idcode + '?output=xml')

    else:
        url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # keep retrying until data arrives or the '_pfam' timer reaches timeout
    # (presumably LOGGER.timing returns seconds elapsed since timeit)
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    # only identifier queries reach this point, sequence queries returned
    # above
    results = dictElement(root[0], prefix)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        # repeated accessions share one entry and accumulate locations
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = 'Query ' + repr(query)

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
Example #10
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary keyed by Pfam accession.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least
        ``MINSEQLEN`` characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Sequence queries are submitted to the EBI HMMER ``hmmscan`` service and
    every match maps to ``accession``, ``class``, ``id``, ``type`` and a
    ``locations`` dictionary.  Identifier queries are retrieved from
    ``prefix + 'protein/<id>?output=xml'`` and every match maps to the
    attributes of the match element plus a ``locations`` list.  **None** is
    returned when the server reports a system error for the request.

    :raises ValueError: when the query is not a valid sequence, the results
        cannot be parsed, or no valid UniProt accession or ID is found
    :raises IOError: when no results are retrieved before *timeout*"""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # not readable as an alignment, treat the file as raw sequence
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # cElementTree was removed in Python 3.9, fall back to ElementTree
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        # the submission redirects to the results page; the same job is
        # downloaded as tab separated values from the "download" endpoint
        results_url = urllib2.urlopen(request).geturl()
        enc_res_params = urllib.urlencode({'format': 'tsv'})
        modified_res_url = (results_url.replace('results', 'download') +
                            '?' + enc_res_params)
        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        tsv = urllib2.urlopen(result_request).read()
        # the payload is bytes on Python 3 and must be text before splitting
        if not isinstance(tsv, str):
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')

        # NOTE(review): a family hit more than once keeps only its last row,
        # and 'locations' is a dict here but a list for identifier queries
        matches = {}
        for line in lines[1:]:
            if not line.strip():
                continue
            child = dict(zip(keys, line.split('\t')))
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child['Family id'],
                'locations': {
                    'ali_end': child['Ali. End'],
                    'ali_start': child['Ali. Start'],
                    'bitscore': child['Bit Score'],
                    'end': child['Env. End'],
                    'cond_evalue': child['Cond. E-value'],
                    'ind_evalue': child['Ind. E-value'],
                    'evidence': 'hmmer v3.0',
                    'hmm_end': child['Model End'],
                    'hmm_start': child['Model Start'],
                    'start': child['Env. Start'],
                },
                'type': 'Pfam-A',
            }
        return matches

    # identifier query: *seq* is a PDB identifier or a UniProt ID
    accession = None
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'
                        .format(seq[:4], str(err)))
        else:
            # only walk the polymers when the header was actually parsed
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = prefix + 'protein/' + seq + '?output=xml'
        else:
            url = prefix + 'protein/' + idcode + '?output=xml'
    else:
        url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            # best effort: keep retrying until the time budget is spent
            continue
        # the response is bytes, so compare against byte strings
        if xml not in (b'PEND', b'RUN'):
            break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if b'There was a system error on your last request.' in xml:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif b'No valid UniProt accession or ID' in xml:
        # retry with the UniProt accession, then fall back to searching with
        # the sequence parsed from the structure itself
        retrieved = False
        if accession is not None:
            try:
                url = prefix + 'protein/' + accession + '?output=xml'
                xml = openURL(url, timeout=timeout).read()
                retrieved = True
            except Exception:
                pass
        if not retrieved:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq, **kwargs)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    try:
        xml_matches = dictElement(root[0], '{' + prefix + '}')['matches']
    except (IndexError, KeyError):
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search(r'^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    # only identifier queries reach this point
    label = 'Query ' + repr(query)
    if matches:
        LOGGER.info(label + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(label + ' did not match any Pfam families.')
    return matches
Example #11
0
File: pfam.py Project: uibcdf/ProDy
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary keyed by Pfam accession.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least
        ``MINSEQLEN`` characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Sequence queries are submitted to the EBI HMMER ``hmmscan`` service and
    every match maps to ``accession``, ``class``, ``id``, ``type`` and a
    ``locations`` dictionary.  Identifier queries are retrieved from
    ``prefix + 'protein/<id>?output=xml'`` and every match maps to the
    attributes of the match element plus a ``locations`` list.  **None** is
    returned when the server reports a system error for the request.

    :raises ValueError: when the query is not a valid sequence, the hmmscan
        results cannot be downloaded, the results cannot be parsed, or no
        valid UniProt accession or ID is found
    :raises IOError: when no results are retrieved before *timeout*"""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # not readable as an alignment, treat the file as raw sequence
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # cElementTree was removed in Python 3.9, fall back to ElementTree
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        # the submission redirects to the results page; the same job is
        # downloaded as tab separated values from the "download" endpoint
        results_url = urllib2.urlopen(request).geturl()
        enc_res_params = urllib.urlencode({'format': 'tsv'})
        modified_res_url = (results_url.replace('results', 'download') +
                            '?' + enc_res_params)
        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except Exception:
            raise ValueError('No matching Pfam domains were found.')

        # the payload is bytes on Python 3 and must be text before splitting
        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')

        # NOTE(review): a family hit more than once keeps only its last row,
        # and 'locations' is a dict here but a list for identifier queries
        matches = {}
        for line in lines[1:]:
            if not line.strip():
                continue
            child = dict(zip(keys, line.split('\t')))
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child['Family id'],
                'locations': {
                    'ali_end': child['Ali. End'],
                    'ali_start': child['Ali. Start'],
                    'bitscore': child['Bit Score'],
                    'end': child['Env. End'],
                    'cond_evalue': child['Cond. E-value'],
                    'ind_evalue': child['Ind. E-value'],
                    'evidence': 'hmmer v3.0',
                    'hmm_end': child['Model End'],
                    'hmm_start': child['Model Start'],
                    'start': child['Env. Start'],
                },
                'type': 'Pfam-A',
            }
        return matches

    # identifier query: *seq* is a PDB identifier or a UniProt ID
    accession = None
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'.format(
                seq[:4], str(err)))
        else:
            # only walk the polymers when the header was actually parsed
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'.format(
                                    idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = prefix + 'protein/' + seq + '?output=xml'
        else:
            url = prefix + 'protein/' + idcode + '?output=xml'
    else:
        url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            # best effort: keep retrying until the time budget is spent
            continue
        # the response is bytes, so compare against byte strings
        if xml not in (b'PEND', b'RUN'):
            break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if b'There was a system error on your last request.' in xml:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif b'No valid UniProt accession or ID' in xml:
        # retry with the UniProt accession, then fall back to searching with
        # the sequence parsed from the structure itself
        retrieved = False
        if accession is not None:
            try:
                url = prefix + 'protein/' + accession + '?output=xml'
                xml = openURL(url, timeout=timeout).read()
                retrieved = True
            except Exception:
                pass
        if not retrieved:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq, **kwargs)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    try:
        xml_matches = dictElement(root[0], '{' + prefix + '}')['matches']
    except (IndexError, KeyError):
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search(r'^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    # only identifier queries reach this point
    label = 'Query ' + repr(query)
    if matches:
        LOGGER.info(label + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(label + ' did not match any Pfam families.')
    return matches
Example #12
0
File: pfam.py Project: npabon/ProDy
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accessions
    are used as keys and map to the attributes of the match together with a
    ``locations`` list.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at least
        ``MINSEQLEN`` characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**, this also
        turns *search_b* on
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must not be larger than 10.0,
        when given it is used instead of *ga*
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    **None** is returned when the server reports a system error.

    :raises ValueError: when the query or an option is invalid, or the
        results cannot be parsed
    :raises IOError: when no results are retrieved before *timeout*"""

    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # not readable as an alignment, treat the file as raw sequence
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # cElementTree was removed in Python 3.9, fall back to ElementTree
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if ga not in (0, 1):
                raise ValueError('ga must be either 0 or 1')

            # an explicit e-value cutoff takes precedence over *ga*
            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        if skip_a == 1:
            # skipping Pfam-A only makes sense when Pfam-B is searched
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        urlextension = urlextension + '&skipAs=' + str(skip_a)

        url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        # the submission response carries the URL of the actual results
        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' +
                       idcode + '?output=xml')

        else:
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # NOTE(review): this loop has no early exit, so the URL is fetched
    # repeatedly until *timeout* elapses and the last successful response is
    # used; confirm whether breaking on the first non-empty response is safe
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if b'There was a system error on your last request.' in xml:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search(r'^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        label = 'Query ' + repr(query)
    else:
        label = 'Query sequence'

    if matches:
        LOGGER.info(label + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(label + ' did not match any Pfam families.')
    return matches
Example #13
0
    def __init__(self, xml, sequence=None):
        """Parse blast search results and store the hits on the instance.

        :arg xml: blast search results in XML format or an XML file that
            contains the results
        :type xml: str

        :arg sequence: query sequence
        :type sequence: str

        Hits are stored in :attr:`_hits` as ``(percent_identity,
        percent_overlap, info)`` tuples sorted by decreasing identity.

        :raises TypeError: when *sequence* is not a string
        :raises ValueError: when *sequence* is not alphabetic, *xml* is
            neither a file name nor an XML string, the search was not a
            ``blastp`` run against ``swissprot``, or the query length in
            *xml* does not match the length of *sequence*"""

        if sequence:
            try:
                sequence = ''.join(sequence.split())
                is_alpha = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            if not is_alpha:
                raise ValueError('not a valid protein sequence')
        self._sequence = sequence

        # cElementTree was removed in Python 3.9, fall back to ElementTree
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            import xml.etree.ElementTree as ET

        if len(xml) < 100:
            # anything this short is treated as a file name, not as XML
            if os.path.isfile(xml):
                root = ET.parse(xml).getroot()
            else:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
        else:
            root = ET.XML(xml)

        root = dictElement(root, 'BlastOutput_')
        if root['db'] != 'swissprot':
            raise ValueError('blast search database in xml must be '
                             '"swissprot"')
        if root['program'] != 'blastp':
            raise ValueError('blast search program in xml must be "blastp"')

        self._param = dictElement(root['param'][0], 'Parameters_')

        query_len = int(root['query-len'])
        if sequence and len(sequence) != query_len:
            raise ValueError('query-len and the length of the sequence do not '
                             'match, xml data may not be for given sequence')

        int_keys = ['align-len', 'gaps', 'hit-frame', 'hit-from',
                    'hit-to', 'identity', 'positive', 'query-frame',
                    'query-from', 'query-to']
        float_keys = ['evalue', 'bit-score', 'score']

        hits = []
        for iteration in root['iterations']:
            for hit in dictElement(iteration, 'Iteration_')['hits']:
                hit = dictElement(hit, 'Hit_')
                # only the first HSP of each hit is used
                data = dictElement(hit['hsps'][0], 'Hsp_')
                for key in int_keys:
                    data[key] = int(data[key])
                data['query-len'] = query_len
                for key in float_keys:
                    data[key] = float(data[key])

                # identity is relative to the aligned part of the query
                p_identity = 100.0 * data['identity'] / (data['query-to'] -
                                                    data['query-from'] + 1)
                data['percent_identity'] = p_identity
                # overlap is relative to the whole query
                p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                              query_len)
                data['percent_coverage'] = p_overlap
                data['percent_overlap'] = p_overlap

                # a hit may list several entries separated by '>gi'
                for item in (hit['id'] + ' ' + hit['def']).split('>gi'):
                    if not item.strip():
                        # empty fragment from a leading or doubled separator
                        continue
                    # the text after the first whitespace is the entry title
                    head, title = item.split(None, 1)
                    head = head.split('|')
                    # last field is split on '_' into protName and species
                    name_parts = head[-1].split("_")
                    seqInfo = dict(data)
                    seqInfo['accession'] = head[-2].split(".")[0]
                    seqInfo['protName'] = name_parts[0]
                    seqInfo['species'] = name_parts[1]
                    hits.append((p_identity, p_overlap, seqInfo))
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits
Example #14
0
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at
        least 16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Queries with at least ``MINSEQLEN`` characters are submitted as a
    sequence to the hmmscan service; every ``hits`` element of the response
    becomes one entry whose ``'locations'`` value is a dictionary.  Shorter
    queries are looked up as identifiers on the Pfam protein page; there
    ``'locations'`` is a list holding one dictionary per location element.

    :returns: dictionary of matches keyed by Pfam accession (empty when
        nothing matched), or **None** when the identifier lookup returns the
        server's system-error message

    :raises ValueError: when the query is not a valid gap-free sequence,
        when the results XML cannot be parsed, or when a returned accession
        does not look like a Pfam accession
    :raises IOError: when no identifier lookup result is obtained within
        *timeout* seconds"""

    prefix = '{http://pfam.xfam.org/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # not readable as an alignment: use the file content, with all
            # whitespace removed, as the sequence
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # cElementTree was removed in Python 3.9; ElementTree uses the C
    # accelerator automatically where it is available
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        # resolve the URL helpers locally so the same code runs on
        # Python 3 and Python 2
        try:
            from urllib.parse import urlencode
            from urllib.request import Request, urlopen
        except ImportError:
            from urllib import urlencode
            from urllib2 import Request, urlopen

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        # POST data must be bytes on Python 3
        enc_params = urlencode(parameters).encode('utf-8')
        # NOTE(review): confirm that this endpoint is still being served
        request = Request('http://hmmer.janelia.org/search/hmmscan',
                          enc_params)

        # the service redirects to the results page; ask for it as XML
        url = urlopen(request, timeout=timeout).geturl() + '?output=xml'
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        matches = {}
        for child in root[0]:
            if child.tag != 'hits':
                continue
            accession = child.get('acc')
            pfam_id = accession.split('.')[0]
            domain = child[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child.get('name'),
                'locations': {
                    'ali_end': domain.get('alisqto'),
                    'ali_start': domain.get('alisqfrom'),
                    'bitscore': domain.get('bitscore'),
                    'end': domain.get('alisqto'),
                    'evalue': child.get('evalue'),
                    'evidence': 'hmmer v3.0',
                    'hmm_end': domain.get('alihmmto'),
                    'hmm_start': domain.get('alihmmfrom'),
                    'significant': domain.get('significant'),
                    'start': domain.get('alisqfrom'),
                },
                'type': 'Pfam-A',
            }
        # return only after every hit has been collected
        return matches

    # identifier query: use it as typed unless a UniProt ID can be resolved
    # from a PDB header below
    url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml'
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'
                        .format(seq[:4], str(err)))
        else:
            # optional fifth character selects the chain
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
        else:
            url = ('http://pfam.xfam.org/protein/' +
                   idcode + '?output=xml')

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # keep retrying until a non-empty response arrives or time runs out
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            # deliberate best effort: a failed attempt is simply retried
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    results = dictElement(root[0], prefix)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        # repeated accessions share one entry and accumulate locations
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    label = 'Query ' + repr(query)
    if matches:
        LOGGER.info(label + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(label + ' did not match any Pfam families.')
    return matches
Example #15
0
    def __init__(self, xml, sequence=None):
        """Instantiate a PDBlast object instance.

        :arg xml: blast search results in XML format or an XML file that
            contains the results; values shorter than 100 characters are
            treated as a filename
        :type xml: str

        :arg sequence: query sequence
        :type sequence: str

        :raises TypeError: when *sequence* is not a string
        :raises ValueError: when *sequence* contains non-letter characters,
            when *xml* is neither an existing file nor an XML string, when
            the results are not a ``blastp`` search of the ``pdb`` database,
            or when the query length in the results differs from the length
            of *sequence*"""

        if sequence:
            try:
                sequence = "".join(sequence.split())
                is_alpha = sequence.isalpha()
            except AttributeError:
                raise TypeError("sequence must be a string")
            else:
                if not is_alpha:
                    raise ValueError("not a valid protein sequence")
        self._sequence = sequence

        # cElementTree was removed in Python 3.9; ElementTree uses the C
        # accelerator automatically where it is available
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            import xml.etree.ElementTree as ET

        if len(xml) < 100:
            if os.path.isfile(xml):
                xml = ET.parse(xml)
                root = xml.getroot()
            else:
                raise ValueError("xml is not a filename and does not look like" " a valid XML string")
        else:
            root = ET.XML(xml)

        root = dictElement(root, "BlastOutput_")
        if root["db"] != "pdb":
            raise ValueError('blast search database in xml must be "pdb"')
        if root["program"] != "blastp":
            raise ValueError('blast search program in xml must be "blastp"')
        self._param = dictElement(root["param"][0], "Parameters_")

        query_len = int(root["query-len"])
        if sequence and len(sequence) != query_len:
            raise ValueError(
                "query-len and the length of the sequence do not " "match, xml data may not be for given sequence"
            )
        hits = []
        for iteration in root["iterations"]:
            for hit in dictElement(iteration, "Iteration_")["hits"]:
                hit = dictElement(hit, "Hit_")
                # only the first HSP of each hit is used
                data = dictElement(hit["hsps"][0], "Hsp_")
                for key in [
                    "align-len",
                    "gaps",
                    "hit-frame",
                    "hit-from",
                    "hit-to",
                    "identity",
                    "positive",
                    "query-frame",
                    "query-from",
                    "query-to",
                ]:
                    data[key] = int(data[key])
                data["query-len"] = query_len
                for key in ["evalue", "bit-score", "score"]:
                    data[key] = float(data[key])
                # identity is relative to the aligned query span, coverage
                # to the full query length
                p_identity = 100.0 * data["identity"] / (data["query-to"] - data["query-from"] + 1)
                data["percent_identity"] = p_identity
                p_overlap = 100.0 * (data["align-len"] - data["gaps"]) / query_len
                data["percent_coverage"] = p_overlap
                data["percent_overlap"] = p_overlap
                # join id and definition with a space so that the chain
                # identifier is not fused with the first word of the title
                for item in (hit["id"] + " " + hit["def"]).split(">gi"):
                    # >gi|1633462|pdb|4AKE|A Chain A, Adenylate Kinase
                    #                        __________TITLE__________
                    head, title = item.split(None, 1)
                    head = head.split("|")
                    pdb_id = head[-2].lower()
                    chain_id = head[-1][:1]
                    # every chain gets its own copy of the alignment data
                    pdbch = dict(data)
                    pdbch["pdb_id"] = pdb_id
                    pdbch["chain_id"] = chain_id
                    pdbch["title"] = (head[-1][1:] + title).strip()
                    hits.append((p_identity, p_overlap, pdbch))
        # best identity first
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits
Example #16
0
    def fetch(self, xml=None, sequence=None, **kwargs):
        """Get Blast record from url or file.

        :arg sequence: an object with an associated sequence string 
            or a sequence string itself
        :type sequence: :class:`Atomic`, :class:`Sequence`, or str

        :arg xml: blast search results in XML format or an XML file that
            contains the results or a filename for saving the results or None
        :type xml: str

        :arg expect: expectation value passed to the search, must be
            positive, default is 10e-10
        :type expect: float

        :arg hitlist_size: maximum number of hits requested, must be
            positive, default is 250
        :type hitlist_size: int

        :arg sleep: initial number of seconds to wait between polls for
            results, default is 2; the wait grows by a factor of 1.5 after
            every poll
        :type sleep: float

        :arg timeout: amount of time until the query times out in seconds,
            defaults to the timeout currently stored on the instance
        :type timeout: int

        When *xml* is shorter than 100 characters it must name an existing
        file, which is parsed instead of running a search.  Otherwise a
        search is submitted to NCBI, and when *xml* is a string the raw
        results are also written to that filename (``.xml`` is appended if
        missing).

        :returns: **True** when a record is available, **False** when the
            search timed out

        :raises ValueError: for invalid *expect* or *hitlist_size*, when a
            short *xml* is not an existing file, when the results are not a
            ``blastp`` search of the ``pdb`` database, or when the query
            length does not match *sequence*
        :raises Exception: when NCBI's response carries no request ID
        """
        if self.isSuccess:
            LOGGER.warn(
                "The record already exists so not further search is performed")
            return True

        if sequence is None:
            sequence = self._sequence

        if xml is None:
            xml = self._xml

        # cElementTree was removed in Python 3.9; ElementTree uses the C
        # accelerator automatically where it is available
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            import xml.etree.ElementTree as ET

        if xml is not None and len(xml) < 100:
            if not os.path.isfile(xml):
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
            root = ET.parse(xml).getroot()
        else:
            # NOTE(review): a long XML string passed as *xml* ends up here
            # and is used as the output filename rather than being parsed --
            # confirm the intended handling against the docstring

            headers = {'User-agent': 'ProDy'}
            query = [
                ('DATABASE', 'pdb'),
                ('ENTREZ_QUERY', '(none)'),
                ('PROGRAM', 'blastp'),
            ]

            expect = float(kwargs.pop('expect', 10e-10))
            if expect <= 0:
                raise ValueError('expect must be a positive number')
            query.append(('EXPECT', expect))
            hitlist_size = int(kwargs.pop('hitlist_size', 250))
            if hitlist_size <= 0:
                raise ValueError('hitlist_size must be a positive integer')
            query.append(('HITLIST_SIZE', hitlist_size))
            query.append(('QUERY', sequence))
            query.append(('CMD', 'Put'))

            sleep = float(kwargs.pop('sleep', 2))
            timeout = float(kwargs.pop('timeout', self._timeout))
            self._timeout = timeout

            try:
                import urllib.parse
                urlencode = lambda data: bytes(urllib.parse.urlencode(data),
                                               'utf-8')
            except ImportError:
                from urllib import urlencode

            url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

            # step 1: submit the search and read the request ID
            data = urlencode(query)
            LOGGER.timeit('_prody_blast')
            LOGGER.info(
                'Blast searching NCBI PDB database for "{0}..."'.format(
                    sequence[:5]))
            handle = openURL(url, data=data, headers=headers)

            html = handle.read()
            index = html.find(b'RID =')
            if index == -1:
                raise Exception('NCBI did not return expected response.')
            else:
                last = html.find(b'\n', index)
                rid = html[index + len('RID ='):last].strip()

            # step 2: poll with the request ID until results are ready
            query = [('ALIGNMENTS', 500), ('DESCRIPTIONS', 500),
                     ('FORMAT_TYPE', 'XML'), ('RID', rid), ('CMD', 'Get')]
            data = urlencode(query)

            while True:
                LOGGER.sleep(int(sleep),
                             'to reconnect to NCBI for search results.')
                LOGGER.write('Connecting to NCBI for search results...')
                handle = openURL(url, data=data, headers=headers)
                results = handle.read()
                index = results.find(b'Status=')
                LOGGER.clear()
                if index < 0:
                    # no status marker: the response is the result itself
                    break
                last = results.index(b'\n', index)
                status = results[index + len('Status='):last].strip()
                if status.upper() == b'READY':
                    break
                sleep = int(sleep * 1.5)
                if LOGGER.timing('_prody_blast') > timeout:
                    LOGGER.warn('Blast search time out.')
                    return False

            LOGGER.clear()
            LOGGER.report('Blast search completed in %.1fs.', '_prody_blast')

            filename = xml
            root = ET.XML(results)
            try:
                ext_xml = filename.lower().endswith('.xml')
            except AttributeError:
                # xml is None: results are not saved
                pass
            else:
                if not ext_xml:
                    filename += '.xml'
                with open(filename, 'w') as out:
                    if PY3K:
                        out.write(results.decode())
                    else:
                        out.write(results)
                LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        # results read from a file and results of a fresh search are parsed
        # the same way, so a file-backed record gets its hits too
        root = dictElement(root, 'BlastOutput_')
        if root['db'] != 'pdb':
            raise ValueError('blast search database in xml must be "pdb"')
        if root['program'] != 'blastp':
            raise ValueError(
                'blast search program in xml must be "blastp"')
        self._param = dictElement(root['param'][0], 'Parameters_')

        query_len = int(root['query-len'])
        if sequence and len(sequence) != query_len:
            raise ValueError(
                'query-len and the length of the sequence do not '
                'match, xml data may not be for given sequence')
        hits = []
        for iteration in root['iterations']:
            for hit in dictElement(iteration, 'Iteration_')['hits']:
                hit = dictElement(hit, 'Hit_')
                # only the first HSP of each hit is used
                data = dictElement(hit['hsps'][0], 'Hsp_')
                for key in [
                        'align-len', 'gaps', 'hit-frame', 'hit-from',
                        'hit-to', 'identity', 'positive', 'query-frame',
                        'query-from', 'query-to'
                ]:
                    data[key] = int(data[key])
                data['query-len'] = query_len
                for key in ['evalue', 'bit-score', 'score']:
                    data[key] = float(data[key])
                p_identity = 100.0 * data['identity'] / (
                    data['query-to'] - data['query-from'] + 1)
                data['percent_identity'] = p_identity
                p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                             query_len)
                data['percent_coverage'] = p_overlap

                # join id and definition with a space so that the chain
                # identifier is not fused with the first word of the title
                for item in (hit['id'] + ' ' + hit['def']).split('>gi'):
                    head, title = item.split(None, 1)
                    head = head.split('|')
                    pdb_id = head[-2].lower()
                    chain_id = head[-1][:1]
                    # every chain gets its own copy of the alignment data
                    pdbch = dict(data)
                    pdbch['pdb_id'] = pdb_id
                    pdbch['chain_id'] = chain_id
                    pdbch['title'] = (head[-1][1:] + title).strip()
                    hits.append((p_identity, p_overlap, pdbch))
        # best identity first
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits

        return True
Example #17
0
    def __init__(self, results, sequence=None):
        """Instantiate a PDBBlastRecord object instance.

        :arg results: psi-blast search results as an XML string (the value
            is parsed directly, it is not opened as a file)
        :type results: str

        :arg sequence: query sequence
        :type sequence: str

        :raises TypeError: when *sequence* is not a string
        :raises ValueError: when *sequence* contains non-letter characters
            or its length differs from the query length in *results*
        """

        if sequence:
            try:
                sequence = ''.join(sequence.split())
                is_alpha = sequence.isalpha()
            except AttributeError:
                raise TypeError('sequence must be a string')
            else:
                if not is_alpha:
                    raise ValueError('not a valid protein sequence')
        self._sequence = sequence

        schema = '{http://www.ebi.ac.uk/schema}'

        # Elements are indexed directly: Element.getchildren() was removed
        # in Python 3.9
        tree = ET.fromstring(results)
        header = tree[0]
        parameters = dictElement(header[2], schema)

        # items() is materialised because dictionary views cannot be indexed
        param_items = list(parameters.items())
        first_attrs = list(param_items[0][1][0].items())
        query_len = int(first_attrs[0][1])

        # flat list of (name, value) pairs: attributes of the first child of
        # the first and of the second-to-last parameter are expanded in
        # place, the remaining parameters are kept as they are
        self._params = (first_attrs +
                        param_items[1:-2] +
                        list(param_items[-2][1][0].items()) +
                        [param_items[-1]])

        result = tree[1]
        hits = []
        for hit in result[0]:
            alignments = dictElement(hit[0], schema)
            data = dictElement(alignments['alignment'], schema)

            # these two fields may be absent and then count as zero
            for key in ['gaps', 'score']:
                if key in data:
                    data[key] = int(data[key])
                else:
                    data[key] = 0

            data['query-len'] = query_len

            for key in ['bits', 'expectation', 'identity']:
                data[key] = float(data[key])

            p_identity = data['identity']
            data['percent_identity'] = p_identity
            p_overlap = (100.0 * (len(data['querySeq']) - data['gaps']) /
                         query_len)
            data['percent_coverage'] = p_overlap

            # hit attributes are read by position
            hit_attrs = list(hit.items())
            pdbch = dict(data)
            pdbch['pdb_id'] = hit_attrs[0][1][:4]
            pdbch['chain_id'] = hit_attrs[0][1][-1]
            pdbch['description'] = hit_attrs[2][1]
            hits.append((p_identity, p_overlap, pdbch))

        # best identity first
        hits.sort(key=lambda hit: hit[0], reverse=True)
        self._hits = hits

        if sequence and len(sequence) != query_len:
            raise ValueError(
                'xml sequence length and the length of the provided '
                'sequence do not match')
Example #18
0
    def __init__(self, xml, sequence=None):
        """Instantiate a PDBlast object instance.
        
        :arg xml: blast search results in XML format or an XML file that 
            contains the results; values shorter than 100 characters are
            treated as a filename
        :type xml: str
        :arg sequence: query sequence
        :type sequence: str

        :raises ValueError: when *sequence* fails the sequence check, when
            *xml* is neither an existing file nor an XML string, when the
            results are not a ``blastp`` search of the ``pdb`` database, or
            when the query length in the results differs from the length of
            *sequence*"""

        if sequence:
            if not checkSequence(sequence):
                raise ValueError('not a valid protein sequence')
        self._sequence = sequence

        # cElementTree was removed in Python 3.9; ElementTree uses the C
        # accelerator automatically where it is available
        try:
            import xml.etree.cElementTree as ET
        except ImportError:
            import xml.etree.ElementTree as ET
        assert isinstance(xml, str), 'xml must be a string'
        if len(xml) < 100:
            if os.path.isfile(xml):
                xml = ET.parse(xml)
                root = xml.getroot()
            else:
                raise ValueError('xml is not a filename and does not look like'
                                 ' a valid XML string')
        else:
            root = ET.XML(xml)

        root = dictElement(root, 'BlastOutput_')
        if root['db'] != 'pdb':
            raise ValueError('blast search database in xml must be "pdb"')
        if root['program'] != 'blastp':
            raise ValueError('blast search program in xml must be "blastp"')
        self._param = dictElement(root['param'][0], 'Parameters_')

        query_len = int(root['query-len'])
        if sequence and len(sequence) != query_len:
            raise ValueError('query-len and the length of the sequence do not '
                             'match, xml data may not be for given sequence')
        hits = []
        for iteration in root['iterations']:
            for hit in dictElement(iteration, 'Iteration_')['hits']:
                hit = dictElement(hit, 'Hit_')
                # only the first HSP of each hit is used
                data = dictElement(hit['hsps'][0], 'Hsp_')
                for key in ['align-len', 'gaps', 'hit-frame', 'hit-from',
                            'hit-to', 'identity', 'positive', 'query-frame',
                            'query-from', 'query-to']:
                    data[key] = int(data[key])
                data['query-len'] = query_len
                for key in ['evalue', 'bit-score', 'score']:
                    data[key] = float(data[key])
                # identity is relative to the aligned query span, coverage
                # to the full query length
                p_identity = 100.0 * data['identity'] / (data['query-to'] -
                                                    data['query-from'] + 1)
                data['percent_identity'] = p_identity
                p_overlap = (100.0 * (data['align-len'] - data['gaps']) /
                              query_len)
                data['percent_coverage'] = p_overlap
                data['percent_overlap'] = p_overlap
                # join id and definition with a space so that the chain
                # identifier is not fused with the first word of the title
                for item in (hit['id'] + ' ' + hit['def']).split('>gi'):
                    #>gi|1633462|pdb|4AKE|A Chain A, Adenylate Kinase
                    #                        __________TITLE__________
                    head, title = item.split(None, 1)
                    head = head.split('|')
                    pdb_id = head[-2].lower()
                    chain_id = head[-1][:1]
                    # every chain gets its own copy of the alignment data
                    pdbch = dict(data)
                    pdbch['pdb_id'] = pdb_id
                    pdbch['chain_id'] = chain_id
                    pdbch['title'] = (head[-1][1:] + title).strip()
                    hits.append((p_identity, p_overlap, pdbch))
        # sort on the two numbers only: chains of one hit tie on both, and
        # comparing the trailing dictionaries raises TypeError on Python 3
        hits.sort(key=lambda hit: (hit[0], hit[1]), reverse=True)
        self._hits = hits