예제 #1
0
 def start_li(self, attrs):
     """Handle an opening ``<li>`` tag.

     Only acts while the parser state is ``'references'``: the reference
     sub-state is switched to ``'pubmed_id'``, pending text is flushed, the
     reference collected so far is stored and a fresh one is started.
     ``attrs`` is accepted for the tag-handler signature but is not used.
     """
     if self._state != 'references':
         return

     self._reference_state = 'pubmed_id'
     self._flush_text()

     # NOTE(review): the current reference is compared against the empty
     # string; presumably a Reference instance never equals '' - confirm.
     finished = self._current_reference
     if finished != '':
         self._references.append(finished)
     self._current_reference = Reference()
예제 #2
0
 def reset(self):
     """Reset the underlying SGML parser and all per-record parsing state."""
     sgmllib.SGMLParser.reset(self)

     # Container for the record being built.
     self.ndb_dict = Record()

     # Text accumulators start out empty.
     self.text = self._space_group = ''

     # Initial positions of the two state trackers.
     self._state = 'id'
     self._reference_state = 'authors'

     # Reference currently being filled in.
     self._current_reference = Reference()
예제 #3
0
 def rebuild_references(annotations: Dict) -> Dict[str, List[Reference]]:
     """Rebuild the SeqRecord 'references' annotation from JSON.

     Each entry of ``annotations["references"]`` is expected to be a mapping
     of Reference attribute names to values, with a "location" entry holding
     a list of location strings understood by ``location_from_string``.

     The entries are copied onto fresh ``Reference`` objects; the input
     mappings themselves are left unmodified. ``annotations`` is updated in
     place and also returned. If it has no "references" key it is returned
     unchanged.
     """
     if "references" not in annotations:
         return annotations

     refs = []
     for ref in annotations["references"]:
         new_reference = Reference()
         # Copy the stored attributes instead of rebinding __dict__ to the
         # JSON mapping: rebinding would alias the caller's data (so that
         # setting .location below rewrites it) and would discard the
         # defaults Reference() sets for attributes missing from the JSON.
         new_reference.__dict__.update(ref)
         new_reference.location = [
             location_from_string(loc) for loc in ref["location"]
         ]
         refs.append(new_reference)
     annotations["references"] = refs
     return annotations
예제 #4
0
 def __init__(self):
     """Populate the record with an empty default for every known field."""
     # Order matters: it defines the key order of the record.
     defaults = (
         ('Id', ''),
         ('Features', ''),
         ('Name', ''),
         ('Sequence', Crystal({})),
         ('Citation', Reference()),
         ('Space Group', ''),
         ('Cell Constants', {}),
         ('Crystallization Conditions', []),
         ('Refinement', ''),
         ('Coordinates', ''),
     )
     for field, empty_value in defaults:
         self[field] = empty_value
예제 #5
0
    def reference_number(self, line):
        """RN line, reference number (start of new reference).

        Stores the reference collected so far (if any) in the record's
        'references' annotation, then starts a new, empty reference.
        The content of ``line`` itself is not used.
        """
        from Bio.SeqFeature import Reference

        annotations = self.data.annotations
        pending = self._current_ref
        if pending is None:
            # No reference in progress: start with an empty list.
            annotations['references'] = []
        else:
            # A reference was being built but not stored yet: keep it.
            annotations['references'].append(pending)

        self._current_ref = Reference()
예제 #6
0
                    f.qualifiers["label"].remove(color)
                f.qualifiers["label"].append("color: #ff8eff")

        # sort features by start location, source always first
        gb_archive.features.sort(
            key=lambda f: (-len(gb.seq)) * (f.type == "source") + f.location.start
        )

        # translate color from notes to ApEinfo
        for feature in gb_archive.features:
            translate_color(feature)

        # Fix the direct submission reference: reuse the last reference if it
        # already is a "Direct Submission", otherwise add a new one.
        # `annotations` is a dict, so the new reference must be appended to
        # its "references" list (created here if the record has none).
        references = gb_archive.annotations.setdefault("references", [])
        if references and references[-1].title == "Direct Submission":
            ref = references[-1]
        else:
            ref = Reference()
            ref.title = "Direct Submission"
            references.append(ref)
        ref.authors = "Larralde M"
        ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo"
        # write the final record
        dst_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "moclo-plant", "registry", "plant")
        )
        with fs.open_fs(os.path.join(__file__, "..", ".."), create=True) as dst_fs:
            dir_fs = dst_fs.makedirs(fs.path.join("moclo-plant", "registry", "plant"), recreate=True)
            with dir_fs.open("{}.gb".format(info["id"]), "w") as dst_file:
                write(gb_archive, dst_file, "gb")
예제 #7
0
    def to_seq_record(self) -> SeqRecord:
        """Convert the cluster to a single record.

        Annotations of the source sequence are kept intact if they don't
        overlap with the cluster boundaries. Component genes are added on the
        record as *CDS* features. Annotated protein domains are added as
        *misc_feature*.

        The ``references`` and ``structured_comment`` annotations are copied
        before being extended, so the annotations of the source record are
        not modified by this method.

        """
        # store time of record creation
        now = datetime.datetime.now()

        # NB(@althonos): we use inclusive 1-based ranges in the data model
        # but slicing expects 0-based ranges with exclusive ends
        bgc = self.source[self.start - 1:self.end]
        bgc.id = bgc.name = self.id

        # copy sequence annotations (shallow copy: nested containers are
        # still shared with the source record at this point)
        bgc.annotations = self.source.annotations.copy()
        bgc.annotations["topology"] = "linear"
        bgc.annotations["molecule_type"] = "DNA"
        with patch_locale("C"):
            bgc.annotations['date'] = now.strftime("%d-%b-%Y").upper()

        # Only the leading numeric components of the version are needed for
        # the comparison; stop at the first non-numeric one (e.g. "dev0")
        # instead of failing on `int()`.
        version_components = []
        for component in Bio.__version__.split("."):
            if not component.isdigit():
                break
            version_components.append(int(component))
        biopython_version = tuple(version_components)
        if biopython_version < (1, 77):
            from Bio import Alphabet

            bgc.seq.alphabet = Alphabet.generic_dna

        # add GECCO preprint as a reference
        ref = Reference()
        ref.title = "Accurate de novo identification of biosynthetic gene clusters with GECCO"
        ref.journal = "bioRxiv (2021.05.03.442509)"
        ref.comment = "doi:10.1101/2021.05.03.442509"
        ref.authors = ", ".join([
            "Laura M Carroll", "Martin Larralde", "Jonas Simon Fleck",
            "Ruby Ponnudurai", "Alessio Milanese", "Elisa Cappio Barazzone",
            "Georg Zeller"
        ])
        # Extend a copy of the reference list: appending to the list shared
        # with the source record would add one more GECCO reference to the
        # source (and to every record derived from it) on each call.
        references = list(bgc.annotations.get("references", []))
        references.append(ref)
        bgc.annotations["references"] = references

        # add GECCO-specific annotations as a structured comment, again on a
        # copy so that the source record's comment is left untouched
        structured_comment = OrderedDict(
            bgc.annotations.get("structured_comment", {}))
        structured_comment['GECCO-Data'] = {
            "version":
            f"GECCO v{__version__}",
            "creation_date":
            now.isoformat(),
            "biosyn_class":
            ",".join(ty.name for ty in self.type.unpack()),
            "alkaloid_probability":
            self.type_probabilities.get(ProductType.Alkaloid, 0.0),
            "polyketide_probability":
            self.type_probabilities.get(ProductType.Polyketide, 0.0),
            "ripp_probability":
            self.type_probabilities.get(ProductType.RiPP, 0.0),
            "saccharide_probability":
            self.type_probabilities.get(ProductType.Saccharide, 0.0),
            "terpene_probability":
            self.type_probabilities.get(ProductType.Terpene, 0.0),
            "nrp_probability":
            self.type_probabilities.get(ProductType.NRP, 0.0),
            "other_probability":
            self.type_probabilities.get(ProductType.Other, 0.0),
        }
        bgc.annotations["structured_comment"] = structured_comment

        # add proteins as CDS features
        for gene in self.genes:
            # write gene as a /cds GenBank record, shifted into the
            # coordinate system of the sliced record
            cds = gene.to_seq_feature()
            cds.location += -self.start
            bgc.features.append(cds)
            # write domains as /misc_feature annotations, positioned
            # relative to the start of their gene
            for domain in gene.protein.domains:
                misc = domain.to_seq_feature(protein_coordinates=False)
                misc.location += cds.location.start
                bgc.features.append(misc)

        # return the complete BGC
        return bgc
예제 #8
0
        RN   [1] ok
        RA   Submitter, A.; ok
        RT   "Bacullis sp. strain XYZ genome annotated using Prokka."; ok
        RL   Submitted (18-Apr-2016) to the INSDC. ok
        XX
        '''

        record.id = "XXX"
        record.name = 'XXX'
        contig_name = record.description.split('Contig ')[1].split(' ')[0]
        contig_list.append(contig_name)
        record.description = args.description
        record.dbxrefs.append("Project:%s" % args.project)
        record.annotations['accessions'] = ['XXX', 'contig']
        record.annotations["data_file_division"] = 'XXX'
        record.annotations["references"] = [Reference()]
        record.annotations["references"][0].authors = 'XXX'
        record.annotations["references"][0].location = [
            FeatureLocation(0, len(record))
        ]
        record.annotations["references"][0].title = ''
        record.annotations["references"][
            0].journal = 'Submitted (%s) to the INSDC.' % today.strftime(
                '%d-%b-%Y')
        new_features = []
        for i in range(0, len(record.features)):
            type_list.append(record.features[i].type)
            if record.features[i].type == 'source':
                del record.features[i].qualifiers['project']
                del record.features[i].qualifiers['genome_md5']
                del record.features[i].qualifiers['genome_id']
예제 #9
0
 def reference_number(self, line):
     """Parse a reference-number line and start a new reference.

     The text after the first five characters must be the number wrapped
     in square brackets (e.g. ``[1]``). A new Reference carrying that
     number is appended to ``self.data.references``.
     """
     bracketed = line[5:].rstrip()
     assert bracketed[0] == '[' and bracketed[-1] == ']', \
         "Missing brackets %s" % bracketed
     new_ref = Reference()
     new_ref.number = int(bracketed[1:-1])
     self.data.references.append(new_ref)
예제 #10
0
def create_reference(author_string=None):
    """Build a mock Reference whose ``authors`` field is ``author_string``."""
    mock_reference = Reference()
    setattr(mock_reference, "authors", author_string)
    return mock_reference
예제 #11
0
def reformat_gbk(gbk_file,
                 study,
                 publication_title,
                 publication_authors,
                 publication_journal,
                 locus_tag_prefix,
                 taxon_id,
                 scaffold_prefix,
                 strain,
                 plasmid=False,
                 locus_count_start=1):
    '''
    Reformat the records of a GenBank file for submission.

    For every record of the input file:

    - the record is renamed ``<scaffold_prefix>_NN`` (numbered from 01)
    - source / taxonomy / organism annotations are set from ``taxon_id``
    - a leading ``source`` feature is guaranteed and its qualifiers are set
    - a single publication reference spanning the source feature is attached
    - ``protein_id`` qualifiers are removed
    - ``gene`` features receive consecutive locus tags, which are also
      copied to the feature that directly follows each gene

    :param gbk_file: path of the GenBank file to read
    :param study: BioProject identifier, stored as ``BioProject:<study>``
    :param publication_title: title of the attached reference
    :param publication_authors: authors of the attached reference
    :param publication_journal: journal of the attached reference
    :param locus_tag_prefix: prefix of the generated locus tags
    :param taxon_id: taxon identifier passed to ``taxon_id2taxonomy``
    :param scaffold_prefix: prefix of the new record ids / names
    :param strain: strain name (``source`` qualifier and description)
    :param plasmid: when true, add a ``plasmid`` qualifier ``p<strain>``
    :param locus_count_start: number used for the first locus tag
    :return: list of reformatted records
    '''
    from Bio import SeqIO
    from Bio.SeqFeature import FeatureLocation
    from Bio.SeqFeature import Reference
    from Bio.SeqFeature import SeqFeature

    source, taxonomy, organism = taxon_id2taxonomy(taxon_id)

    print(source)
    print()
    print(taxonomy)
    print()

    # Everything is read up front, so the file can be closed right away.
    with open(gbk_file, 'r') as f:
        records = list(SeqIO.parse(f, 'genbank'))

    # Work on full-length slices of the parsed records, renamed with the
    # scaffold prefix and a running two-digit counter.
    contig_records = []
    for contig_count, scaffold in enumerate(records, start=1):
        contig = scaffold[0:len(scaffold.seq)]
        contig.id = "%s_%02d" % (scaffold_prefix, contig_count)
        contig.name = "%s_%02d" % (scaffold_prefix, contig_count)
        contig_records.append(contig)

    new_records = []
    for n, record in enumerate(contig_records):

        ref = Reference()
        ref.authors = publication_authors
        ref.journal = publication_journal
        ref.title = publication_title

        record.annotations['source'] = source
        record.annotations['taxonomy'] = taxonomy
        record.annotations['organism'] = organism
        record.description = '%s %s scaffold_%s' % (organism, strain, n + 1)

        # Make sure the first feature is a `source` feature covering the
        # whole sequence; this also covers records without any feature.
        if not record.features or record.features[0].type != 'source':
            print('NOT SOURCE-------------------')
            source_feature = SeqFeature(
                FeatureLocation(0, len(record.seq)),
                type='source',
                qualifiers={})
            record.features = [source_feature] + record.features
        else:
            print('SOURCE!!!!!!!!!!!!!!!!')
        source_feature = record.features[0]
        source_feature.qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
        source_feature.qualifiers['mol_type'] = ["genomic DNA"]
        source_feature.qualifiers['organism'] = ["%s" % organism]
        source_feature.qualifiers['strain'] = ["%s" % strain]

        if plasmid:
            source_feature.qualifiers['plasmid'] = ["p%s" % strain]

        # NOTE(review): Biopython's own writers appear to read the
        # 'molecule_type' annotation; confirm that 'mol_type' is what the
        # downstream consumer expects.
        record.annotations['mol_type'] = ["genomic DNA"]
        ref.location = [source_feature.location]
        record.annotations['references'] = [ref]
        record.dbxrefs = ['BioProject:%s' % study]

        features = record.features
        for i, feature in enumerate(features):
            feature.qualifiers.pop('protein_id', None)
            if feature.type == 'gene':
                locus = "%s_%05d" % (locus_tag_prefix, locus_count_start)
                locus_count_start += 1
                feature.qualifiers['locus_tag'] = locus
                # The feature right after a gene is given the same tag
                # (presumably its CDS/RNA product - confirm). A gene that
                # is the last feature has nothing after it to tag.
                if i + 1 < len(features):
                    features[i + 1].qualifiers['locus_tag'] = locus
        new_records.append(record)

    return new_records