예제 #1
0
        def _parse_reference(element):
            """Build a ``SeqFeature.Reference`` from a reference element.

            The children of *element* are handled as follows:

            * ``citation`` -- publication type, journal name, date, volume
              and page range are read from its attributes; its ``title``,
              ``authorList`` and ``dbReference`` children fill in the title,
              authors and PubMed/MEDLINE identifiers.  Every ``dbReference``
              is also appended to ``self.ParsedSeqRecord.dbxrefs`` as
              ``"type:id"``.
            * ``scope`` -- text collected into the reference comment.
            * ``source`` -- ``tissue`` children collected into the comment.

            The finished reference is handed to
            ``append_to_annotations('references', reference)``.

            ``self``, ``NS``, ``REFERENCE_JOURNAL`` and
            ``append_to_annotations`` are not defined in this function and
            must be provided by the surrounding scope.
            """
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            # Initialised up front so the journal formatting below never
            # depends on a citation element having been visited.
            j_volume = ''
            j_first = ''
            j_last = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    citation_attrib = ref_element.attrib
                    pub_type = citation_attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + citation_attrib['db']
                    if 'name' in citation_attrib:
                        journal_name = citation_attrib['name']
                    pub_date = citation_attrib.get('date', '')
                    j_volume = citation_attrib.get('volume', '')
                    j_first = citation_attrib.get('first', '')
                    j_last = citation_attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            authors.extend(person_element.attrib['name']
                                           for person_element in cit_element)
                        elif cit_element.tag == NS + 'dbReference':
                            db_type = cit_element.attrib['type']
                            db_id = cit_element.attrib['id']
                            self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
                            # The database name lives on the dbReference
                            # itself; the enclosing citation's 'type' is the
                            # publication type and never names a database.
                            if db_type == 'PubMed':
                                reference.pubmed_id = db_id
                            elif db_type == 'MEDLINE':
                                reference.medline_id = db_id
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            scopes_str = 'Scope: ' + ', '.join(scopes) if scopes else ''
            tissues_str = 'Tissue: ' + ', '.join(tissues) if tissues else ''

            # locations cannot be parsed since they are actually written in
            # free text inside scopes so all the references are put in the
            # annotation.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                if pub_date and j_volume and j_first and j_last:
                    # Full citation only when every component is available.
                    reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                        volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)
예제 #2
0
파일: SwissIO.py 프로젝트: cbirdlab/sap
def SwissIterator(handle):
    """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.

    Every section from the ID line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    This parser is for the flat file "swiss" format as used by:
     * Swiss-Prot aka SwissProt
     * TrEMBL
     * UniProtKB aka UniProt Knowledgebase

    For consistency with BioPerl and EMBOSS we call this the "swiss"
    format. See also the SeqIO support for "uniprot-xml" format.

    Arguments:
     - handle - passed unchanged to ``SwissProt.parse``.

    Yields one SeqRecord per record produced by ``SwissProt.parse``.

    Raises ValueError if a literature reference carries a cross-reference
    key other than PubMed, MEDLINE, DOI or AGRICOLA.
    """
    for swiss_record in SwissProt.parse(handle):
        # Convert the SwissProt record to a SeqRecord
        seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(
            seq,
            id=swiss_record.accessions[0],
            name=swiss_record.entry_name,
            description=swiss_record.description,
            features=[_make_seqfeature(*f) for f in swiss_record.features],
        )

        # Add each "database:accession" cross reference once, keeping first
        # occurrence order.  The set gives O(1) duplicate checks instead of
        # rescanning the growing list for every entry.
        seen_dbxrefs = set(record.dbxrefs)
        for cross_reference in swiss_record.cross_references:
            if len(cross_reference) < 2:
                continue
            database, accession = cross_reference[:2]
            dbxref = "%s:%s" % (database, accession)
            if dbxref not in seen_dbxrefs:
                seen_dbxrefs.add(dbxref)
                record.dbxrefs.append(dbxref)

        annotations = record.annotations
        annotations['accessions'] = swiss_record.accessions
        # For the three date fields only the first item is stored.
        if swiss_record.created:
            annotations['date'] = swiss_record.created[0]
        if swiss_record.sequence_update:
            annotations['date_last_sequence_update'] = (
                swiss_record.sequence_update[0])
        if swiss_record.annotation_update:
            annotations['date_last_annotation_update'] = (
                swiss_record.annotation_update[0])
        if swiss_record.gene_name:
            annotations['gene_name'] = swiss_record.gene_name
        annotations['organism'] = swiss_record.organism.rstrip(".")
        annotations['taxonomy'] = swiss_record.organism_classification
        annotations['ncbi_taxid'] = swiss_record.taxonomy_id
        if swiss_record.host_organism:
            annotations['organism_host'] = swiss_record.host_organism
        if swiss_record.host_taxonomy_id:
            annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
        if swiss_record.comments:
            annotations['comment'] = "\n".join(swiss_record.comments)
        if swiss_record.references:
            annotations['references'] = []
            for reference in swiss_record.references:
                feature = SeqFeature.Reference()
                # Each comment item is formatted as "key=value;".
                feature.comment = " ".join("%s=%s;" % k_v
                                           for k_v in reference.comments)
                for key, value in reference.references:
                    if key == 'PubMed':
                        feature.pubmed_id = value
                    elif key == 'MEDLINE':
                        feature.medline_id = value
                    elif key in ('DOI', 'AGRICOLA'):
                        # Recognised keys that are deliberately not stored.
                        pass
                    else:
                        raise ValueError("Unknown key %s found in references" %
                                         key)
                feature.authors = reference.authors
                feature.title = reference.title
                feature.journal = reference.location
                annotations['references'].append(feature)
        if swiss_record.keywords:
            annotations['keywords'] = swiss_record.keywords
        yield record