Esempio n. 1
0
def _retrieve_reference(adaptor, primary_id):
    """Fetch the literature references of one bioentry.

    Runs a query through ``adaptor.execute_and_fetchall`` for the rows
    linked to ``primary_id`` (ordered by rank) and turns each row into a
    ``SeqFeature.Reference``.

    Returns ``{'references': [...]}`` when at least one row came back,
    otherwise an empty dict.
    """
    # XXX dbxref_qualifier_value

    rows = adaptor.execute_and_fetchall(
        "SELECT start_pos, end_pos, "
        " location, title, authors,"
        " dbname, accession"
        " FROM bioentry_reference"
        " JOIN reference USING (reference_id)"
        " LEFT JOIN dbxref USING (dbxref_id)"
        " WHERE bioentry_id = %s"
        " ORDER BY rank", (primary_id,))

    # Which Reference attribute receives the accession for each database.
    id_attribute = {'PUBMED': 'pubmed_id', 'MEDLINE': 'medline_id'}

    collected = []
    for row in rows:
        start, end, journal, title, authors, dbname, accession = row
        entry = SeqFeature.Reference()
        # With neither coordinate stored, the default location is kept.
        if not (start is None and end is None):
            # The stored start is shifted down by one (python counting).
            begin = start if start is None else start - 1
            entry.location = [SeqFeature.FeatureLocation(begin, end)]
        # Falsy authors/title must not overwrite the defaults with None.
        if authors:
            entry.authors = authors
        if title:
            entry.title = title
        entry.journal = journal
        target = id_attribute.get(dbname)
        if target is not None:
            setattr(entry, target, accession)
        collected.append(entry)

    return {'references': collected} if collected else {}
Esempio n. 2
0
        def _parse_reference(element):
            """Parse one reference element into a SeqFeature.Reference.

            Walks the children of ``element`` and collects:

            - from ``citation``: publication type, journal name, date,
              volume, first/last page, title, author names and database
              cross-references;
            - from ``scope``: the scope texts;
            - from ``source``: the tissue texts.

            Each citation cross-reference is also appended to
            ``self.ParsedSeqRecord.dbxrefs`` as ``"type:id"``. The finished
            reference is handed to ``append_to_annotations('references', ...)``;
            nothing is returned.
            """
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            # Defined up front so the journal formatting below never depends
            # on a citation element having been seen.
            j_volume = ''
            j_first = ''
            j_last = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    pub_type = ref_element.attrib['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + ref_element.attrib['db']
                    if 'name' in ref_element.attrib:
                        journal_name = ref_element.attrib['name']
                    pub_date = ref_element.attrib.get('date', '')
                    j_volume = ref_element.attrib.get('volume', '')
                    j_first = ref_element.attrib.get('first', '')
                    j_last = ref_element.attrib.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            db_type = cit_element.attrib['type']
                            db_id = cit_element.attrib['id']
                            self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
                            # The database name lives on the dbReference
                            # element itself; the enclosing citation's 'type'
                            # is the publication type, so testing it here
                            # never selected the MEDLINE id.
                            if db_type == 'PubMed':
                                reference.pubmed_id = db_id
                            elif db_type == 'MEDLINE':
                                reference.medline_id = db_id
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            if scopes:
                scopes_str = 'Scope: ' + ', '.join(scopes)
            else:
                scopes_str = ''
            if tissues:
                tissues_str = 'Tissue: ' + ', '.join(tissues)
            else:
                tissues_str = ''

            # locations cannot be parsed since they are actually written in
            # free text inside scopes so all the references are put in the
            # annotation.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                # The full journal string is only built when every piece is
                # present; otherwise just the journal name is kept.
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                        volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)
Esempio n. 3
0
 def to_ref(self):
     """Return a new SeqFeature.Reference carrying this object's citation fields."""
     ref = SeqFeature.Reference()
     # Copied one by one, in this order, from self onto the new reference.
     for field in ("title", "authors", "journal", "medline_id", "pubmed_id"):
         setattr(ref, field, getattr(self, field))
     return ref
Esempio n. 4
0
 def _format_publications(self):
     """Convert the genome object's publications into SeqFeature.Reference objects.

     Reads ``self.genome_object['publications']`` (treated as empty when the
     key is absent). Every entry must have exactly 7 fields; an entry of any
     other length is logged and skipped. From a valid entry, field 0 is used
     as the PubMed id (only when truthy, stored as a string), field 2 as the
     title, field 5 as the authors and field 6 as the journal.

     Returns the list of references, in input order.
     """
     references = []
     for pub in self.genome_object.get('publications', []):
         if len(pub) != 7:
             log('Skipping unparseable publication {}'.format(pub))
             # Actually skip it: falling through would index a malformed
             # entry and either raise IndexError or emit a bogus reference.
             continue
         ref = SeqFeature.Reference()
         if pub[0]:
             ref.pubmed_id = str(pub[0])
         ref.title = pub[2]
         ref.authors = pub[5]
         ref.journal = pub[6]
         references.append(ref)
     return references
Esempio n. 5
0
        def _parse_reference(element):
            """Parse one reference element into a SeqFeature.Reference.

            Walks the children of ``element`` and collects:

            - from ``citation``: publication type, journal name, date,
              volume, first/last page, title, author names and database
              cross-references;
            - from ``scope``: the scope texts;
            - from ``source``: the tissue texts.

            Each citation cross-reference is also appended to
            ``self.ParsedSeqRecord.dbxrefs`` as ``"type:id"``. The finished
            reference is handed to ``append_to_annotations("references", ...)``;
            nothing is returned.
            """
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ""
            pub_type = ""
            pub_date = ""
            # Defined up front so the journal formatting below never depends
            # on a citation element having been seen.
            j_volume = ""
            j_first = ""
            j_last = ""
            for ref_element in element:
                if ref_element.tag == NS + "citation":
                    pub_type = ref_element.attrib["type"]
                    if pub_type == "submission":
                        pub_type += " to the " + ref_element.attrib["db"]
                    if "name" in ref_element.attrib:
                        journal_name = ref_element.attrib["name"]
                    pub_date = ref_element.attrib.get("date", "")
                    j_volume = ref_element.attrib.get("volume", "")
                    j_first = ref_element.attrib.get("first", "")
                    j_last = ref_element.attrib.get("last", "")
                    for cit_element in ref_element:
                        if cit_element.tag == NS + "title":
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + "authorList":
                            for person_element in cit_element:
                                authors.append(person_element.attrib["name"])
                        elif cit_element.tag == NS + "dbReference":
                            db_type = cit_element.attrib["type"]
                            db_id = cit_element.attrib["id"]
                            self.ParsedSeqRecord.dbxrefs.append(
                                db_type + ":" + db_id
                            )
                            # The database name lives on the dbReference
                            # element itself; the enclosing citation's "type"
                            # is the publication type, so testing it here
                            # never selected the MEDLINE id.
                            if db_type == "PubMed":
                                reference.pubmed_id = db_id
                            elif db_type == "MEDLINE":
                                reference.medline_id = db_id
                elif ref_element.tag == NS + "scope":
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + "source":
                    for source_element in ref_element:
                        if source_element.tag == NS + "tissue":
                            tissues.append(source_element.text)
            if scopes:
                scopes_str = "Scope: " + ", ".join(scopes)
            else:
                scopes_str = ""
            if tissues:
                tissues_str = "Tissue: " + ", ".join(tissues)
            else:
                tissues_str = ""

            # locations cannot be parsed since they are actually written in
            # free text inside scopes so all the references are put in the
            # annotation.
            reference.location = []
            reference.authors = ", ".join(authors)
            if journal_name:
                # The full journal string is only built when every piece is
                # present; otherwise just the journal name is kept.
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % {
                        "name": journal_name,
                        "volume": j_volume,
                        "first": j_first,
                        "last": j_last,
                        "pub_date": pub_date,
                    }
                else:
                    reference.journal = journal_name
            reference.comment = " | ".join(
                (pub_type, pub_date, scopes_str, tissues_str)
            )
            append_to_annotations("references", reference)
Esempio n. 6
0
def SwissIterator(handle):
    """Iterate over a Swiss-Prot/UniProt flat file, yielding SeqRecord objects.

    Each entry, from its ID line to the terminating //, is turned into
    one SeqRecord carrying the entry's annotation and features.

    The flat file "swiss" format handled here is used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The name "swiss" is kept for consistency with BioPerl and EMBOSS.
    The "uniprot-xml" format has its own SeqIO support.
    """
    for entry in SwissProt.parse(handle):
        # Build the SeqRecord from the parsed SwissProt entry.
        record = SeqRecord.SeqRecord(
            Seq.Seq(entry.sequence, Alphabet.generic_protein),
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=[_make_seqfeature(*f) for f in entry.features],
        )
        record.description = entry.description

        # Database cross-references become unique "database:accession" strings.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                database, accession = xref[:2]
                dbxref = "%s:%s" % (database, accession)
                if dbxref not in record.dbxrefs:
                    record.dbxrefs.append(dbxref)

        info = record.annotations
        info['accessions'] = entry.accessions
        # Only the first item of each date field is stored.
        dated_fields = (
            ('date', entry.created),
            ('date_last_sequence_update', entry.sequence_update),
            ('date_last_annotation_update', entry.annotation_update),
        )
        for label, stamp in dated_fields:
            if stamp:
                info[label] = stamp[0]
        if entry.gene_name:
            info['gene_name'] = entry.gene_name
        info['organism'] = entry.organism.rstrip(".")
        info['taxonomy'] = entry.organism_classification
        info['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            info['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            info['host_ncbi_taxid'] = entry.host_taxonomy_id
        if entry.comments:
            info['comment'] = "\n".join(entry.comments)

        if entry.references:
            info['references'] = []
            for citation in entry.references:
                ref = SeqFeature.Reference()
                ref.comment = " ".join(
                    ["%s=%s;" % pair for pair in citation.comments])
                for key, value in citation.references:
                    if key == 'PubMed':
                        ref.pubmed_id = value
                    elif key == 'MEDLINE':
                        ref.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are accepted but not stored;
                        # anything else is rejected.
                        raise ValueError("Unknown key %s found in references" %
                                         key)
                ref.authors = citation.authors
                ref.title = citation.title
                ref.journal = citation.location
                info['references'].append(ref)

        if entry.keywords:
            info['keywords'] = entry.keywords
        yield record
Esempio n. 7
0
def SwissIterator(handle):
    """Iterate over a Swiss-Prot/UniProt flat file, yielding SeqRecord objects.

    Each entry, from its ID line to the terminating //, is turned into
    one SeqRecord carrying the entry's annotation and features.

    The flat file "swiss" format handled here is used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The name "swiss" is kept for consistency with BioPerl and EMBOSS.
    The "uniprot-xml" format has its own SeqIO support.

    This function is meant to be reached through
    Bio.SeqIO.parse(..., format="swiss") rather than called directly.
    """
    # NOTE(review): the "rU" mode is passed to as_handle unchanged; confirm it
    # is still accepted by the Python versions this module targets.
    with as_handle(handle, "rU") as stream:
        for entry in SwissProt.parse(stream):
            # Build the SeqRecord from the parsed SwissProt entry.
            record = SeqRecord.SeqRecord(
                Seq.Seq(entry.sequence, Alphabet.generic_protein),
                id=entry.accessions[0],
                name=entry.entry_name,
                description=entry.description,
                features=[_make_seqfeature(*f) for f in entry.features],
            )
            record.description = entry.description

            # Cross-references become unique "database:accession" strings.
            for xref in entry.cross_references:
                if len(xref) >= 2:
                    database, accession = xref[:2]
                    dbxref = "%s:%s" % (database, accession)
                    if dbxref not in record.dbxrefs:
                        record.dbxrefs.append(dbxref)

            info = record.annotations
            info["accessions"] = entry.accessions
            if entry.protein_existence:
                info["protein_existence"] = entry.protein_existence

            created = entry.created
            if created:
                info["date"] = created[0]
                info["sequence_version"] = created[1]
            seq_update = entry.sequence_update
            if seq_update:
                info["date_last_sequence_update"] = seq_update[0]
                # Replaces the value taken from the creation field above.
                info["sequence_version"] = seq_update[1]
            annot_update = entry.annotation_update
            if annot_update:
                info["date_last_annotation_update"] = annot_update[0]
                info["entry_version"] = annot_update[1]

            if entry.gene_name:
                info["gene_name"] = entry.gene_name
            info["organism"] = entry.organism.rstrip(".")
            info["taxonomy"] = entry.organism_classification
            info["ncbi_taxid"] = entry.taxonomy_id
            if entry.host_organism:
                info["organism_host"] = entry.host_organism
            if entry.host_taxonomy_id:
                info["host_ncbi_taxid"] = entry.host_taxonomy_id
            if entry.comments:
                info["comment"] = "\n".join(entry.comments)

            if entry.references:
                info["references"] = []
                for citation in entry.references:
                    ref = SeqFeature.Reference()
                    ref.comment = " ".join(
                        ["%s=%s;" % pair for pair in citation.comments])
                    for key, value in citation.references:
                        if key == "PubMed":
                            ref.pubmed_id = value
                        elif key == "MEDLINE":
                            ref.medline_id = value
                        elif key not in ("DOI", "AGRICOLA"):
                            # DOI and AGRICOLA are accepted but not stored;
                            # anything else is rejected.
                            raise ValueError(
                                "Unknown key %s found in references" % key)
                    ref.authors = citation.authors
                    ref.title = citation.title
                    ref.journal = citation.location
                    info["references"].append(ref)

            if entry.keywords:
                info["keywords"] = entry.keywords
            yield record
Esempio n. 8
0
def project_to_genbank(filename, project, allblocks, construct_id=None):
    """Write the constructs of a project to ``filename`` in GenBank format.

    One record is built per construct: the single block named by
    ``construct_id`` when it is given, otherwise every id listed in
    ``project["components"]``. Ids with no matching block in ``allblocks``
    are skipped.

    Each record gets a 'source' feature spanning the whole sequence. When the
    block has ``metadata["genbank"]``, its stored annotations, references and
    source-feature qualifiers are copied onto the record. Child blocks are
    then added as features through ``add_features``.

    Args:
        filename: path of the GenBank file to create (opened with mode "w").
        project: mapping that provides "components" when ``construct_id`` is None.
        allblocks: iterable of block mappings, each with an "id" key.
        construct_id: optional id of the only construct to export.
    """
    if construct_id is not None:
        blocks = [construct_id]
    else:
        blocks = project["components"]

    seq_obj_lst = []

    # For each of the construct in the project
    for block_id in blocks:
        # next() with a default lets the guard below skip an unknown id;
        # indexing an empty list raised IndexError before the guard could run.
        block = next((b for b in allblocks if b["id"] == block_id), None)
        if not block:
            continue

        metadata = block["metadata"]
        # An absent "genbank" entry behaves like an empty one in every test below.
        genbank_meta = metadata.get("genbank", {})

        # Grab the original ID that came from genbank before if available, otherwise the GD Name as the name
        if "id" in genbank_meta:
            genbank_id = genbank_meta["id"]
        elif "name" in genbank_meta:
            genbank_id = genbank_meta["name"]
        else:
            genbank_id = "GC_DNA"

        sequence = build_sequence(block, allblocks)
        seq_obj = SeqIO.SeqRecord(
            Seq.Seq(sequence, Seq.Alphabet.DNAAlphabet()), genbank_id)

        # Create a 'source' feature
        sf = SeqFeature.SeqFeature()
        sf.type = "source"
        sf.location = SeqFeature.FeatureLocation(0, len(seq_obj.seq))

        add_GC_info(sf, block, allblocks)

        # Set up all the annotations in the genbank record. These came originally from genbank.
        # .items() is used because .iteritems() exists only on Python 2 dicts.
        if "annotations" in genbank_meta:
            for annot_key, annot_value in genbank_meta["annotations"].items():
                seq_obj.annotations[annot_key] = annot_value
        # Set up all the references in the genbank record. These came originally from genbank.
        if "references" in genbank_meta:
            for ref in genbank_meta["references"]:
                genbank_ref = SeqFeature.Reference()
                genbank_ref.authors = ref['authors']
                genbank_ref.comment = ref['comment']
                genbank_ref.consrtm = ref['consrtm']
                genbank_ref.journal = ref['journal']
                genbank_ref.medline_id = ref['medline_id']
                genbank_ref.pubmed_id = ref['pubmed_id']
                genbank_ref.title = ref['title']
                seq_obj.annotations.setdefault("references", []).append(genbank_ref)
        # Add the original annotations to the source feature
        if "feature_annotations" in genbank_meta:
            for annot_key, annot_value in genbank_meta["feature_annotations"].items():
                sf.qualifiers[annot_key] = annot_value

        seq_obj.features.append(sf)

        if "description" in metadata:
            seq_obj.description = metadata["description"]
        if "name" in genbank_meta:
            seq_obj.name = genbank_meta["name"]
        elif "name" in metadata:
            # Spaces are removed and the name is cut to its first 5 characters.
            seq_obj.name = metadata["name"].replace(" ", "")[:5]
        else:
            seq_obj.name = "GC_DNA"

        convert_annotations(block, seq_obj, 0)

        # Add a block for each of the features, recursively
        start = 0
        for child_id in block['components']:
            child_block = [b for b in allblocks if b["id"] == child_id][0]
            start = add_features(child_block, allblocks, seq_obj, start)

        seq_obj_lst.append(seq_obj)

    # The context manager closes (and flushes) the file even if writing fails.
    with open(filename, "w") as out_handle:
        SeqIO.write(seq_obj_lst, out_handle, "genbank")