def _parse_reference(element):
    """Parse a reference element and store it as a 'references' annotation.

    Walks the children of ``element`` (an ElementTree-style node exposing
    ``tag``, ``attrib``, ``text`` and iterable children) and builds a
    ``SeqFeature.Reference`` from them:

    * ``citation`` children supply the publication type, journal name,
      date, volume and page range, plus nested ``title``, ``authorList``
      and ``dbReference`` elements.
    * ``scope`` children are collected into the reference comment.
    * ``tissue`` children of ``source`` are collected into the comment.

    Side effects: every ``dbReference`` found under a citation is appended
    to ``self.ParsedSeqRecord.dbxrefs`` as ``"type:id"``, and the finished
    reference is handed to ``append_to_annotations('references', ...)``.
    NOTE(review): ``self`` and ``append_to_annotations`` are not defined in
    this function; they are presumably provided by the enclosing scope.

    Raises KeyError if a citation lacks its ``type`` attribute, a
    submission lacks ``db``, an author entry lacks ``name`` or a
    dbReference lacks ``type``/``id``.
    """
    reference = SeqFeature.Reference()
    authors = []
    scopes = []
    tissues = []
    journal_name = ''
    pub_type = ''
    pub_date = ''
    # Initialised here so the journal formatting at the end never depends on
    # a citation element having been visited first.
    j_volume = ''
    j_first = ''
    j_last = ''

    for ref_element in element:
        if ref_element.tag == NS + 'citation':
            citation_attrib = ref_element.attrib
            pub_type = citation_attrib['type']
            if pub_type == 'submission':
                pub_type += ' to the ' + citation_attrib['db']
            if 'name' in citation_attrib:
                journal_name = citation_attrib['name']
            pub_date = citation_attrib.get('date', '')
            j_volume = citation_attrib.get('volume', '')
            j_first = citation_attrib.get('first', '')
            j_last = citation_attrib.get('last', '')
            for cit_element in ref_element:
                if cit_element.tag == NS + 'title':
                    reference.title = cit_element.text
                elif cit_element.tag == NS + 'authorList':
                    authors.extend(person_element.attrib['name']
                                   for person_element in cit_element)
                elif cit_element.tag == NS + 'dbReference':
                    db_type = cit_element.attrib['type']
                    db_id = cit_element.attrib['id']
                    self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
                    # The database name lives on the dbReference element
                    # itself; the enclosing citation's 'type' is the
                    # publication type and is never 'MEDLINE'.
                    if db_type == 'PubMed':
                        reference.pubmed_id = db_id
                    elif db_type == 'MEDLINE':
                        reference.medline_id = db_id
        elif ref_element.tag == NS + 'scope':
            scopes.append(ref_element.text)
        elif ref_element.tag == NS + 'source':
            for source_element in ref_element:
                if source_element.tag == NS + 'tissue':
                    tissues.append(source_element.text)

    scopes_str = 'Scope: ' + ', '.join(scopes) if scopes else ''
    tissues_str = 'Tissue: ' + ', '.join(tissues) if tissues else ''

    # locations cannot be parsed since they are actually written in
    # free text inside scopes so all the references are put in the
    # annotation.
    reference.location = []
    reference.authors = ', '.join(authors)
    if journal_name:
        if pub_date and j_volume and j_first and j_last:
            # Full citation only when every component is available.
            reference.journal = REFERENCE_JOURNAL % dict(
                name=journal_name, volume=j_volume, first=j_first,
                last=j_last, pub_date=pub_date)
        else:
            reference.journal = journal_name
    reference.comment = ' | '.join(
        (pub_type, pub_date, scopes_str, tissues_str))
    append_to_annotations('references', reference)
def SwissIterator(handle):
    """Break up a Swiss-Prot/UniProt file into SeqRecord objects.

    Every section from the ID line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    This parser is for the flat file "swiss" format as used by:
     * Swiss-Prot aka SwissProt
     * TrEMBL
     * UniProtKB aka UniProt Knowledgebase

    For consistency with BioPerl and EMBOSS we call this the "swiss"
    format. See also the SeqIO support for "uniprot-xml" format.

    This is a generator: it yields one SeqRecord per record produced by
    ``SwissProt.parse(handle)``. The first accession becomes the record
    id, cross-references become de-duplicated "database:accession"
    dbxrefs, and the remaining fields are copied into the annotations.

    Raises ValueError if a literature reference carries a cross-reference
    key other than PubMed, MEDLINE, DOI or AGRICOLA.
    """
    swiss_records = SwissProt.parse(handle)
    for swiss_record in swiss_records:
        # Convert the SwissProt record to a SeqRecord
        seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(
            seq,
            id=swiss_record.accessions[0],
            name=swiss_record.entry_name,
            description=swiss_record.description,
            features=[_make_seqfeature(*f) for f in swiss_record.features],
        )

        # Track what has already been added in a set so de-duplication
        # does not rescan the dbxrefs list for every cross-reference.
        seen_dbxrefs = set(record.dbxrefs)
        for cross_reference in swiss_record.cross_references:
            if len(cross_reference) < 2:
                # Need at least a database name and an accession.
                continue
            database, accession = cross_reference[:2]
            dbxref = "%s:%s" % (database, accession)
            if dbxref not in seen_dbxrefs:
                seen_dbxrefs.add(dbxref)
                record.dbxrefs.append(dbxref)

        annotations = record.annotations
        annotations['accessions'] = swiss_record.accessions
        # Only the first item of each date field is stored.
        if swiss_record.created:
            annotations['date'] = swiss_record.created[0]
        if swiss_record.sequence_update:
            annotations['date_last_sequence_update'] = \
                swiss_record.sequence_update[0]
        if swiss_record.annotation_update:
            annotations['date_last_annotation_update'] = \
                swiss_record.annotation_update[0]
        if swiss_record.gene_name:
            annotations['gene_name'] = swiss_record.gene_name
        annotations['organism'] = swiss_record.organism.rstrip(".")
        annotations['taxonomy'] = swiss_record.organism_classification
        annotations['ncbi_taxid'] = swiss_record.taxonomy_id
        if swiss_record.host_organism:
            annotations['organism_host'] = swiss_record.host_organism
        if swiss_record.host_taxonomy_id:
            annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
        if swiss_record.comments:
            annotations['comment'] = "\n".join(swiss_record.comments)

        if swiss_record.references:
            annotations['references'] = []
            for reference in swiss_record.references:
                feature = SeqFeature.Reference()
                feature.comment = " ".join(
                    "%s=%s;" % k_v for k_v in reference.comments)
                for key, value in reference.references:
                    if key == 'PubMed':
                        feature.pubmed_id = value
                    elif key == 'MEDLINE':
                        feature.medline_id = value
                    elif key in ('DOI', 'AGRICOLA'):
                        # Recognised, but there is nowhere to store them.
                        pass
                    else:
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                feature.authors = reference.authors
                feature.title = reference.title
                feature.journal = reference.location
                annotations['references'].append(feature)

        if swiss_record.keywords:
            annotations['keywords'] = swiss_record.keywords
        yield record