def _retrieve_reference(adaptor, primary_id):
    """Load the literature references stored for one bioentry.

    Runs a single query over bioentry_reference joined with reference
    (and, where a row exists, dbxref) for ``primary_id``, ordered by rank,
    and converts every returned row into a SeqFeature.Reference.

    Returns ``{'references': [...]}`` when at least one row came back and
    an empty dict otherwise.
    """
    # XXX dbxref_qualifier_value
    sql = (
        "SELECT start_pos, end_pos, "
        " location, title, authors,"
        " dbname, accession"
        " FROM bioentry_reference"
        " JOIN reference USING (reference_id)"
        " LEFT JOIN dbxref USING (dbxref_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank"
    )
    rows = adaptor.execute_and_fetchall(sql, (primary_id,))

    collected = []
    for start_pos, end_pos, journal, title, authors, db_name, acc in rows:
        ref = SeqFeature.Reference()

        # With neither coordinate stored, ref.location is left at whatever
        # Reference() initialised it to.
        if not (start_pos is None and end_pos is None):
            if start_pos is not None:
                start_pos -= 1  # shift the stored start to Python counting
            ref.location = [SeqFeature.FeatureLocation(start_pos, end_pos)]

        # Only overwrite authors/title when the row carries a real value,
        # so the Reference defaults are not replaced by None.
        if authors:
            ref.authors = authors
        if title:
            ref.title = title
        ref.journal = journal

        if db_name == 'PUBMED':
            ref.pubmed_id = acc
        elif db_name == 'MEDLINE':
            ref.medline_id = acc

        collected.append(ref)

    return {'references': collected} if collected else {}
def _parse_reference(element):
    """Convert one XML reference element into a SeqFeature.Reference.

    Walks the children of ``element``:

    * ``citation`` supplies the publication type, journal name, date,
      volume and page range, plus nested ``title``, ``authorList`` and
      ``dbReference`` children.
    * ``scope`` and ``source``/``tissue`` texts are collected for the
      reference comment.

    Every ``dbReference`` is also appended to ``self.ParsedSeqRecord.dbxrefs``
    as ``"type:id"``. The finished reference is handed to
    ``append_to_annotations('references', ...)``; nothing is returned.

    ``self``, ``NS``, ``REFERENCE_JOURNAL`` and ``append_to_annotations`` are
    not parameters; they are resolved from the enclosing scope, which is
    outside this view.
    """
    reference = SeqFeature.Reference()
    authors = []
    scopes = []
    tissues = []
    journal_name = ''
    pub_type = ''
    pub_date = ''
    # Pre-set so the journal formatting below never depends on a citation
    # child having been visited first.
    j_volume = ''
    j_first = ''
    j_last = ''
    for ref_element in element:
        if ref_element.tag == NS + 'citation':
            pub_type = ref_element.attrib['type']
            if pub_type == 'submission':
                pub_type += ' to the ' + ref_element.attrib['db']
            if 'name' in ref_element.attrib:
                journal_name = ref_element.attrib['name']
            pub_date = ref_element.attrib.get('date', '')
            j_volume = ref_element.attrib.get('volume', '')
            j_first = ref_element.attrib.get('first', '')
            j_last = ref_element.attrib.get('last', '')
            for cit_element in ref_element:
                if cit_element.tag == NS + 'title':
                    reference.title = cit_element.text
                elif cit_element.tag == NS + 'authorList':
                    for person_element in cit_element:
                        authors.append(person_element.attrib['name'])
                elif cit_element.tag == NS + 'dbReference':
                    db_type = cit_element.attrib['type']
                    db_id = cit_element.attrib['id']
                    self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
                    # Both branches must look at the dbReference's own type.
                    # The MEDLINE test used to read the citation's type
                    # (ref_element), so medline_id could never be assigned.
                    if db_type == 'PubMed':
                        reference.pubmed_id = db_id
                    elif db_type == 'MEDLINE':
                        reference.medline_id = db_id
        elif ref_element.tag == NS + 'scope':
            scopes.append(ref_element.text)
        elif ref_element.tag == NS + 'source':
            for source_element in ref_element:
                if source_element.tag == NS + 'tissue':
                    tissues.append(source_element.text)
    if scopes:
        scopes_str = 'Scope: ' + ', '.join(scopes)
    else:
        scopes_str = ''
    if tissues:
        tissues_str = 'Tissue: ' + ', '.join(tissues)
    else:
        tissues_str = ''

    # locations cannot be parsed since they are actually written in
    # free text inside scopes so all the references are put in the
    # annotation.
    reference.location = []
    reference.authors = ', '.join(authors)
    if journal_name:
        # The full citation string is only built when every piece is known;
        # otherwise fall back to the bare journal name.
        if pub_date and j_volume and j_first and j_last:
            reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                                                         volume=j_volume,
                                                         first=j_first,
                                                         last=j_last,
                                                         pub_date=pub_date)
        else:
            reference.journal = journal_name
    reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
    append_to_annotations('references', reference)
def to_ref(self):
    """Return a new SeqFeature.Reference mirroring this object's citation.

    The title, authors, journal, medline_id and pubmed_id attributes are
    copied from ``self`` onto the new Reference, in that order.
    """
    ref = SeqFeature.Reference()
    for field in ("title", "authors", "journal", "medline_id", "pubmed_id"):
        setattr(ref, field, getattr(self, field))
    return ref
def _format_publications(self):
    """Turn the genome object's publications into SeqFeature.Reference objects.

    Reads ``self.genome_object['publications']`` (treated as empty when the
    key is absent). Each publication is expected to be a 7-item sequence,
    of which index 0 is used as the PubMed id (when truthy), index 2 as the
    title, index 5 as the authors and index 6 as the journal.

    Entries that do not have exactly 7 items are logged and skipped.

    Returns the list of references, in input order.
    """
    references = []
    for pub in self.genome_object.get('publications', []):
        if len(pub) != 7:
            log('Skipping unparseable publication {}'.format(pub))
            # Actually skip: without this the malformed entry was still
            # indexed below, raising IndexError or emitting a bogus reference.
            continue
        ref = SeqFeature.Reference()
        if pub[0]:
            ref.pubmed_id = str(pub[0])
        ref.title = pub[2]
        ref.authors = pub[5]
        ref.journal = pub[6]
        references.append(ref)
    return references
def _parse_reference(element):
    """Convert one XML reference element into a SeqFeature.Reference.

    Walks the children of ``element``:

    * ``citation`` supplies the publication type, journal name, date,
      volume and page range, plus nested ``title``, ``authorList`` and
      ``dbReference`` children.
    * ``scope`` and ``source``/``tissue`` texts are collected for the
      reference comment.

    Every ``dbReference`` is also appended to ``self.ParsedSeqRecord.dbxrefs``
    as ``"type:id"``. The finished reference is handed to
    ``append_to_annotations("references", ...)``; nothing is returned.

    ``self``, ``NS``, ``REFERENCE_JOURNAL`` and ``append_to_annotations`` are
    not parameters; they are resolved from the enclosing scope, which is
    outside this view.
    """
    reference = SeqFeature.Reference()
    authors = []
    scopes = []
    tissues = []
    journal_name = ""
    pub_type = ""
    pub_date = ""
    # Pre-set so the journal formatting below never depends on a citation
    # child having been visited first.
    j_volume = ""
    j_first = ""
    j_last = ""
    for ref_element in element:
        if ref_element.tag == NS + "citation":
            pub_type = ref_element.attrib["type"]
            if pub_type == "submission":
                pub_type += " to the " + ref_element.attrib["db"]
            if "name" in ref_element.attrib:
                journal_name = ref_element.attrib["name"]
            pub_date = ref_element.attrib.get("date", "")
            j_volume = ref_element.attrib.get("volume", "")
            j_first = ref_element.attrib.get("first", "")
            j_last = ref_element.attrib.get("last", "")
            for cit_element in ref_element:
                if cit_element.tag == NS + "title":
                    reference.title = cit_element.text
                elif cit_element.tag == NS + "authorList":
                    for person_element in cit_element:
                        authors.append(person_element.attrib["name"])
                elif cit_element.tag == NS + "dbReference":
                    db_type = cit_element.attrib["type"]
                    db_id = cit_element.attrib["id"]
                    self.ParsedSeqRecord.dbxrefs.append(db_type + ":" + db_id)
                    # Both branches must look at the dbReference's own type.
                    # The MEDLINE test used to read the citation's type
                    # (ref_element), so medline_id could never be assigned.
                    if db_type == "PubMed":
                        reference.pubmed_id = db_id
                    elif db_type == "MEDLINE":
                        reference.medline_id = db_id
        elif ref_element.tag == NS + "scope":
            scopes.append(ref_element.text)
        elif ref_element.tag == NS + "source":
            for source_element in ref_element:
                if source_element.tag == NS + "tissue":
                    tissues.append(source_element.text)
    if scopes:
        scopes_str = "Scope: " + ", ".join(scopes)
    else:
        scopes_str = ""
    if tissues:
        tissues_str = "Tissue: " + ", ".join(tissues)
    else:
        tissues_str = ""

    # locations cannot be parsed since they are actually written in
    # free text inside scopes so all the references are put in the
    # annotation.
    reference.location = []
    reference.authors = ", ".join(authors)
    if journal_name:
        # The full citation string is only built when every piece is known;
        # otherwise fall back to the bare journal name.
        if pub_date and j_volume and j_first and j_last:
            reference.journal = REFERENCE_JOURNAL % {
                "name": journal_name,
                "volume": j_volume,
                "first": j_first,
                "last": j_last,
                "pub_date": pub_date,
            }
        else:
            reference.journal = journal_name
    reference.comment = " | ".join(
        (pub_type, pub_date, scopes_str, tissues_str)
    )
    append_to_annotations("references", reference)
def SwissIterator(handle):
    """Yield a SeqRecord for every entry of a Swiss-Prot/UniProt flat file.

    Each stretch from an ID line up to the closing // is turned into one
    SeqRecord, complete with annotation and features.

    The flat file "swiss" format handled here is the one used by:

     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The name "swiss" was chosen for consistency with BioPerl and EMBOSS.

    For the XML flavour, see the SeqIO support for "uniprot-xml".
    """
    for entry in SwissProt.parse(handle):
        # Wrap the parsed SwissProt entry in a SeqRecord.
        record = SeqRecord.SeqRecord(
            Seq.Seq(entry.sequence, Alphabet.generic_protein),
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=[_make_seqfeature(*feat) for feat in entry.features],
        )
        record.description = entry.description

        # Database cross references become "database:accession" strings,
        # each listed once; entries with fewer than two fields are ignored.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                db_name, acc = xref[:2]
                tag = "%s:%s" % (db_name, acc)
                if tag not in record.dbxrefs:
                    record.dbxrefs.append(tag)

        ann = record.annotations
        ann['accessions'] = entry.accessions

        # Only the first item of each date tuple is kept here.
        dated = (
            ('date', entry.created),
            ('date_last_sequence_update', entry.sequence_update),
            ('date_last_annotation_update', entry.annotation_update),
        )
        for ann_key, stamp in dated:
            if stamp:
                ann[ann_key] = stamp[0]

        if entry.gene_name:
            ann['gene_name'] = entry.gene_name
        ann['organism'] = entry.organism.rstrip(".")
        ann['taxonomy'] = entry.organism_classification
        ann['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            ann['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            ann['host_ncbi_taxid'] = entry.host_taxonomy_id
        if entry.comments:
            ann['comment'] = "\n".join(entry.comments)

        if entry.references:
            converted = []
            ann['references'] = converted
            for cited in entry.references:
                ref = SeqFeature.Reference()
                ref.comment = " ".join(
                    "%s=%s;" % pair for pair in cited.comments)
                for key, value in cited.references:
                    if key == 'PubMed':
                        ref.pubmed_id = value
                    elif key == 'MEDLINE':
                        ref.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are recognised but not stored;
                        # anything else is rejected outright.
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                ref.authors = cited.authors
                ref.title = cited.title
                ref.journal = cited.location
                converted.append(ref)

        if entry.keywords:
            record.annotations['keywords'] = entry.keywords
        yield record
def SwissIterator(handle):
    """Yield a SeqRecord for every entry of a Swiss-Prot/UniProt flat file.

    Each stretch from an ID line up to the closing // is turned into one
    SeqRecord, complete with annotation and features.

    The flat file "swiss" format handled here is the one used by:

     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The name "swiss" was chosen for consistency with BioPerl and EMBOSS.

    For the XML flavour, see the SeqIO support for "uniprot-xml".

    This function is not meant to be called directly; go through
    Bio.SeqIO.parse(..., format="swiss") instead.
    """
    # NOTE(review): Python's built-in open() rejects the "U" mode flag from
    # 3.11 on. Whether as_handle forwards "rU" to open() is not visible
    # here; confirm before running this on a recent interpreter.
    with as_handle(handle, "rU") as stream:
        for entry in SwissProt.parse(stream):
            # Wrap the parsed SwissProt entry in a SeqRecord.
            record = SeqRecord.SeqRecord(
                Seq.Seq(entry.sequence, Alphabet.generic_protein),
                id=entry.accessions[0],
                name=entry.entry_name,
                description=entry.description,
                features=[_make_seqfeature(*feat) for feat in entry.features],
            )
            record.description = entry.description

            # Database cross references become "database:accession"
            # strings, each listed once; entries with fewer than two
            # fields are ignored.
            for xref in entry.cross_references:
                if len(xref) >= 2:
                    db_name, acc = xref[:2]
                    tag = "%s:%s" % (db_name, acc)
                    if tag not in record.dbxrefs:
                        record.dbxrefs.append(tag)

            ann = record.annotations
            ann["accessions"] = entry.accessions
            if entry.protein_existence:
                ann["protein_existence"] = entry.protein_existence

            created = entry.created
            if created:
                ann["date"] = created[0]
                ann["sequence_version"] = created[1]
            seq_update = entry.sequence_update
            if seq_update:
                ann["date_last_sequence_update"] = seq_update[0]
                # Replaces the value taken from ``created`` just above.
                ann["sequence_version"] = seq_update[1]
            ann_update = entry.annotation_update
            if ann_update:
                ann["date_last_annotation_update"] = ann_update[0]
                ann["entry_version"] = ann_update[1]

            if entry.gene_name:
                ann["gene_name"] = entry.gene_name
            ann["organism"] = entry.organism.rstrip(".")
            ann["taxonomy"] = entry.organism_classification
            ann["ncbi_taxid"] = entry.taxonomy_id
            if entry.host_organism:
                ann["organism_host"] = entry.host_organism
            if entry.host_taxonomy_id:
                ann["host_ncbi_taxid"] = entry.host_taxonomy_id
            if entry.comments:
                ann["comment"] = "\n".join(entry.comments)

            if entry.references:
                converted = []
                ann["references"] = converted
                for cited in entry.references:
                    ref = SeqFeature.Reference()
                    ref.comment = " ".join(
                        "%s=%s;" % pair for pair in cited.comments)
                    for key, value in cited.references:
                        if key == "PubMed":
                            ref.pubmed_id = value
                        elif key == "MEDLINE":
                            ref.medline_id = value
                        elif key not in ("DOI", "AGRICOLA"):
                            # DOI and AGRICOLA are recognised but not
                            # stored; anything else is rejected outright.
                            raise ValueError(
                                "Unknown key %s found in references" % key)
                    ref.authors = cited.authors
                    ref.title = cited.title
                    ref.journal = cited.location
                    converted.append(ref)

            if entry.keywords:
                record.annotations["keywords"] = entry.keywords
            yield record
def project_to_genbank(filename, project, allblocks, construct_id=None):
    """Write the constructs of a project to a GenBank file.

    Args:
        filename: Path of the GenBank file to create (opened in "w" mode).
        project: Mapping whose "components" entry lists the ids of the
            constructs to export. Only read when ``construct_id`` is None.
        allblocks: Iterable of block mappings; blocks are looked up in it
            by their "id" entry.
        construct_id: When given, only this single construct is exported.

    One SeqRecord is built per construct: its sequence comes from
    ``build_sequence``, it gets a "source" feature spanning the whole
    sequence, any annotations/references/feature annotations stored under
    ``block["metadata"]["genbank"]`` are restored, and child components are
    added through ``add_features``. Construct ids with no matching block
    are skipped. All records are written in one ``SeqIO.write`` call.
    """
    if construct_id is not None:
        blocks = [construct_id]
    else:
        blocks = project["components"]

    seq_obj_lst = []

    # For each of the construct in the project
    for block_id in blocks:
        # next() with a default lets the guard below actually skip unknown
        # ids; indexing [0] on an empty match list raised IndexError first.
        block = next((b for b in allblocks if b["id"] == block_id), None)
        if not block:
            continue

        metadata = block["metadata"]
        has_genbank = "genbank" in metadata
        genbank_meta = metadata["genbank"] if has_genbank else {}

        # Grab the original ID that came from genbank before if available,
        # otherwise the GD Name as the name
        if "id" in genbank_meta:
            genbank_id = genbank_meta["id"]
        elif "name" in genbank_meta:
            genbank_id = genbank_meta["name"]
        else:
            genbank_id = "GC_DNA"

        sequence = build_sequence(block, allblocks)
        seq_obj = SeqIO.SeqRecord(
            Seq.Seq(sequence, Seq.Alphabet.DNAAlphabet()), genbank_id)

        # Create a 'source' feature
        sf = SeqFeature.SeqFeature()
        sf.type = "source"
        sf.location = SeqFeature.FeatureLocation(0, len(seq_obj.seq))
        add_GC_info(sf, block, allblocks)

        if has_genbank:
            # Set up all the annotations in the genbank record. These came
            # originally from genbank.
            if "annotations" in genbank_meta:
                # .items() instead of .iteritems(): works on Python 2 and 3.
                for annot_key, annot_value in genbank_meta["annotations"].items():
                    seq_obj.annotations[annot_key] = annot_value

            # Set up all the references in the genbank record. These came
            # originally from genbank.
            if "references" in genbank_meta:
                for ref in genbank_meta["references"]:
                    genbank_ref = SeqFeature.Reference()
                    genbank_ref.authors = ref['authors']
                    genbank_ref.comment = ref['comment']
                    genbank_ref.consrtm = ref['consrtm']
                    genbank_ref.journal = ref['journal']
                    genbank_ref.medline_id = ref['medline_id']
                    genbank_ref.pubmed_id = ref['pubmed_id']
                    genbank_ref.title = ref['title']
                    if "references" not in seq_obj.annotations:
                        seq_obj.annotations["references"] = []
                    seq_obj.annotations["references"].append(genbank_ref)

            # Add the original annotations to the source feature
            if "feature_annotations" in genbank_meta:
                for annot_key, annot_value in genbank_meta["feature_annotations"].items():
                    sf.qualifiers[annot_key] = annot_value

        seq_obj.features.append(sf)

        if "description" in metadata:
            seq_obj.description = metadata["description"]

        if "name" in genbank_meta:
            seq_obj.name = genbank_meta["name"]
        elif "name" in metadata:
            # Fallback name: spaces removed, truncated to five characters.
            seq_obj.name = metadata["name"].replace(" ", "")[:5]
        else:
            seq_obj.name = "GC_DNA"

        convert_annotations(block, seq_obj, 0)

        # Add a block for each of the features, recursively
        start = 0
        for child_id in block['components']:
            child_block = [b for b in allblocks if b["id"] == child_id][0]
            start = add_features(child_block, allblocks, seq_obj, start)

        seq_obj_lst.append(seq_obj)

    # Context manager so the output file is flushed and closed even if
    # writing fails; the handle used to be opened inline and never closed.
    with open(filename, "w") as handle:
        SeqIO.write(seq_obj_lst, handle, "genbank")