Exemple #1
0
def _make_position(location_string, offset=0):
    """Convert a Swiss-Prot location token into a SeqFeature position (PRIVATE).

    ``location_string`` is either "?", a plain integer such as "13", or an
    integer prefixed with "<", ">" or "?".  ``offset`` is added to the
    parsed number; an offset of -1 is used with a start location to make
    it pythonic.  The shifted value is never allowed to drop below zero.

    Raises NotImplementedError when the token fits none of these forms.
    """
    if location_string == "?":
        return SeqFeature.UnknownPosition()

    def shifted(digits):
        # Clamp at zero so that a feature from 0 to 0 stays 0 to 0
        # rather than turning into -1 to 0.
        return max(0, offset + int(digits))

    # Plain number: an exact position.
    try:
        return SeqFeature.ExactPosition(shifted(location_string))
    except ValueError:
        pass

    # Fuzzy positions: one marker character followed by the number.
    try:
        if location_string.startswith("<"):
            return SeqFeature.BeforePosition(shifted(location_string[1:]))
        if location_string.startswith(">"):  # e.g. ">13"
            return SeqFeature.AfterPosition(shifted(location_string[1:]))
        if location_string.startswith("?"):  # e.g. "?22"
            return SeqFeature.UncertainPosition(shifted(location_string[1:]))
    except ValueError:
        # Marker recognised but the remainder is not a number.
        pass
    raise NotImplementedError("Cannot parse location '%s'" % location_string)
Exemple #2
0
 def _parse_feature(element):
     """Build a SeqFeature from a feature element and store it on the record.

     Every XML attribute of ``element`` is copied into the feature's
     qualifiers; the 'type' attribute (default '') becomes the feature
     type and the 'id' attribute, when present, the feature id.

     A child 'location' element supplies the feature location: either a
     single 'position' child (used for both ends) or a 'begin'/'end'
     pair.  The start is parsed with an offset of -1, the end without.
     Any other child is stored as a qualifier keyed by its tag with the
     namespace prefix removed.

     The finished feature is appended to self.ParsedSeqRecord.features.
     """
     feature = SeqFeature.SeqFeature()
     for k, v in element.attrib.items():
         feature.qualifiers[k] = v
     feature.type = element.attrib.get('type', '')
     if 'id' in element.attrib:
         feature.id = element.attrib['id']
     for feature_element in element:
         if feature_element.tag == NS + 'location':
             position_elements = feature_element.findall(NS + 'position')
             if position_elements:
                 # Single-residue feature: one element gives both ends.
                 position_element = position_elements[0]
                 start_position = _parse_position(position_element, -1)
                 end_position = _parse_position(position_element)
             else:
                 begin_element = feature_element.findall(NS + 'begin')[0]
                 start_position = _parse_position(begin_element, -1)
                 end_element = feature_element.findall(NS + 'end')[0]
                 end_position = _parse_position(end_element)
             feature.location = SeqFeature.FeatureLocation(start_position, end_position)
         else:
             try:
                 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text
             except Exception:
                 # Best effort: skip an unparsable tag, but do not swallow
                 # KeyboardInterrupt/SystemExit as a bare except would.
                 pass
     self.ParsedSeqRecord.features.append(feature)
Exemple #3
0
def _make_seqfeature(name, from_res, to_res, description, ft_id):
    """Build a SeqFeature from one feature tuple produced by the parser (PRIVATE).

    ``from_res`` and ``to_res`` are location strings understood by
    _make_position; ``name`` becomes the feature type and ``description``
    is stored under the "description" qualifier.  A false ``ft_id`` is
    replaced by "<unknown id>".
    """
    # The start is shifted by -1, the end is taken as given.
    start = _make_position(from_res, -1)
    end = _make_position(to_res, 0)
    # "<unknown id>" is the default in the SeqFeature object.
    feature_id = ft_id if ft_id else "<unknown id>"
    return SeqFeature.SeqFeature(
        SeqFeature.FeatureLocation(start, end),
        type=name,
        id=feature_id,
        qualifiers={"description": description},
    )
Exemple #4
0
        def _parse_reference(element):
            """Convert a reference element into a SeqFeature.Reference annotation.

            Reads the 'citation' child (publication type, journal name,
            date, volume, pages, title, authors and database
            cross-references), plus any 'scope' and 'source'/'tissue'
            children.  Cross-references found inside the citation are also
            appended to self.ParsedSeqRecord.dbxrefs as "<type>:<id>".

            The resulting Reference is handed to
            append_to_annotations('references', ...).
            """
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            # Initialised here so they are always bound, even if no
            # citation element is encountered.
            j_volume = ''
            j_first = ''
            j_last = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    pub_type = ref_element.attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + ref_element.attrib['db']
                    if 'name' in ref_element.attrib:
                        journal_name = ref_element.attrib['name']
                    pub_date = ref_element.attrib.get('date', '')
                    j_volume = ref_element.attrib.get('volume', '')
                    j_first = ref_element.attrib.get('first', '')
                    j_last = ref_element.attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            xref_type = cit_element.attrib['type']
                            xref_id = cit_element.attrib['id']
                            self.ParsedSeqRecord.dbxrefs.append(xref_type + ':' + xref_id)
                            # The database name lives on the dbReference
                            # itself, not on the enclosing citation (whose
                            # 'type' is the publication type).
                            if xref_type == 'PubMed':
                                reference.pubmed_id = xref_id
                            elif xref_type == 'MEDLINE':
                                reference.medline_id = xref_id
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            if scopes:
                scopes_str = 'Scope: ' + ', '.join(scopes)
            else:
                scopes_str = ''
            if tissues:
                tissues_str = 'Tissue: ' + ', '.join(tissues)
            else:
                tissues_str = ''

            # locations cannot be parsed since they are actually written in
            # free text inside scopes so all the references are put in the
            # annotation.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                # Use the full journal template only when every piece is
                # available; otherwise fall back to the bare journal name.
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                        volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)
Exemple #5
0
 def _parse_position(element, offset=0):
     """Turn a position-bearing XML element into a SeqFeature position object.

     The element's 'position' attribute, when present, is converted to an
     integer and shifted by ``offset``; when absent the position is None.
     The 'status' attribute (default '') selects the position class:
     'unknown', no status (exact), 'greater than', 'less than' or
     'uncertain'.

     Raises NotImplementedError for any other status.
     """
     attributes = element.attrib
     if 'position' in attributes:
         position = int(attributes['position']) + offset
     else:
         position = None

     status = attributes.get('status', '')
     if status == 'unknown':
         # An unknown position is not expected to carry a coordinate.
         assert position is None
         return SeqFeature.UnknownPosition()
     if not status:
         return SeqFeature.ExactPosition(position)

     # Remaining statuses map one-to-one onto fuzzy position classes.
     fuzzy_class_names = {
         'greater than': 'AfterPosition',
         'less than': 'BeforePosition',
         'uncertain': 'UncertainPosition',
     }
     if status not in fuzzy_class_names:
         raise NotImplementedError("Position status %r" % status)
     return getattr(SeqFeature, fuzzy_class_names[status])(position)
Exemple #6
0
        def _parse_dbReference(element):
            """Record a database cross-reference and scan PDB chain details.

            Appends "<type>:<id>" built from the element's attributes to
            self.ParsedSeqRecord.dbxrefs.

            For PDB references the 'method', 'resolution' and 'chains'
            properties are read and a SeqFeature is assembled for each
            chain group, but these features are currently NOT stored on
            the record (see the TODO below).  Because that work is
            discarded, chain entries that do not look like "A/B=1-192"
            are skipped instead of aborting the parse.
            """
            db_type = element.attrib['type']
            db_id = element.attrib['id']
            self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
            #e.g.
            # <dbReference type="PDB" key="11" id="2GEZ">
            #   <property value="X-ray" type="method"/>
            #   <property value="2.60 A" type="resolution"/>
            #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
            # </dbReference>
            if db_type == 'PDB':
                method = ""
                resolution = ""
                for ref_element in element:
                    if ref_element.tag != NS + 'property':
                        continue
                    dat_type = ref_element.attrib['type']
                    if dat_type == 'method':
                        method = ref_element.attrib['value']
                    elif dat_type == 'resolution':
                        resolution = ref_element.attrib['value']
                    elif dat_type == 'chains':
                        for elem in ref_element.attrib['value'].split(','):
                            pair = elem.strip().split('=')
                            # Skip entries without a "chains=range" shape
                            # and the "-" placeholder for an unknown range.
                            if len(pair) < 2 or pair[1] == '-':
                                continue
                            bounds = pair[1].split('-')
                            try:
                                start = int(bounds[0]) - 1
                                end = int(bounds[1])
                                location = SeqFeature.FeatureLocation(start, end)
                            except (IndexError, ValueError):
                                # Range is not a plain "a-b" pair; the
                                # feature would be discarded anyway.
                                continue
                            #TODO - How best to store these, do SeqFeatures make sense?
                            feature = SeqFeature.SeqFeature()
                            feature.type = db_type
                            feature.qualifiers['name'] = db_id
                            feature.qualifiers['method'] = method
                            feature.qualifiers['resolution'] = resolution
                            feature.qualifiers['chains'] = pair[0].split('/')
                            feature.location = location
                            #self.ParsedSeqRecord.features.append(feature)

            # NOTE: 'property' children of non-PDB references are ignored:
            # this data cannot be fitted in a seqrecord object with a simple
            # list. However at least ensembl and EMBL parsing can be improved
            # to add entries in dbxrefs.
Exemple #7
0
def SwissIterator(handle):
    """Iterate over a Swiss-Prot/UniProt flat file, yielding SeqRecord objects.

    Each entry, running from its ID line to the closing //, is turned
    into one SeqRecord carrying the entry's annotation and features.

    The flat file "swiss" format handled here is used by:
     * Swiss-Prot aka SwissProt
     * TrEMBL
     * UniProtKB aka UniProt Knowledgebase

    The format name "swiss" was chosen for consistency with BioPerl and
    EMBOSS. See also the SeqIO support for "uniprot-xml" format.
    """
    for entry in SwissProt.parse(handle):
        # Build the SeqRecord skeleton from the SwissProt record.
        protein = Seq.Seq(entry.sequence, Alphabet.generic_protein)
        parsed_features = [_make_seqfeature(*item) for item in entry.features]
        record = SeqRecord.SeqRecord(
            protein,
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=parsed_features,
        )
        # Repeats the constructor argument; kept as in the original.
        record.description = entry.description

        # Cross references become "database:accession" strings, without
        # duplicates and in order of first appearance.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                database, accession = xref[:2]
                dbxref = "%s:%s" % (database, accession)
                if dbxref not in record.dbxrefs:
                    record.dbxrefs.append(dbxref)

        annotations = record.annotations
        annotations['accessions'] = entry.accessions

        # Each date field is a sequence whose first item is stored.
        dated_fields = (
            ('date', entry.created),
            ('date_last_sequence_update', entry.sequence_update),
            ('date_last_annotation_update', entry.annotation_update),
        )
        for annotation_key, stamp in dated_fields:
            if stamp:
                annotations[annotation_key] = stamp[0]

        if entry.gene_name:
            annotations['gene_name'] = entry.gene_name
        annotations['organism'] = entry.organism.rstrip(".")
        annotations['taxonomy'] = entry.organism_classification
        annotations['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            annotations['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            annotations['host_ncbi_taxid'] = entry.host_taxonomy_id
        if entry.comments:
            annotations['comment'] = "\n".join(entry.comments)

        if entry.references:
            converted = []
            for source_ref in entry.references:
                citation = SeqFeature.Reference()
                citation.comment = " ".join(
                    "%s=%s;" % pair for pair in source_ref.comments)
                for key, value in source_ref.references:
                    if key == 'PubMed':
                        citation.pubmed_id = value
                    elif key == 'MEDLINE':
                        citation.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are accepted but not stored;
                        # anything else is rejected.
                        raise ValueError("Unknown key %s found in references" %
                                         key)
                citation.authors = source_ref.authors
                citation.title = source_ref.title
                citation.journal = source_ref.location
                converted.append(citation)
            annotations['references'] = converted

        if entry.keywords:
            record.annotations['keywords'] = entry.keywords
        yield record