Ejemplo n.º 1
0
 def rebuild_references(annotations: Dict) -> Dict[str, List[Reference]]:
     """Restore the SeqRecord 'references' annotation from its JSON form.

     Every entry under ``annotations["references"]`` is turned into a
     ``Reference`` instance whose ``location`` holds the objects produced by
     ``location_from_string`` for each item of the entry's "location" list.
     The ``annotations`` mapping is modified in place and returned.
     """
     def revive(raw):
         reference = Reference()
         # The decoded entry is adopted as the instance namespace itself, so
         # the location assignment below also overwrites raw["location"].
         reference.__dict__ = raw
         reference.location = [location_from_string(text) for text in raw["location"]]
         return reference

     annotations["references"] = [revive(raw) for raw in annotations["references"]]
     return annotations
Ejemplo n.º 2
0
                                                 type="mRNA",
                                                 qualifiers=mrna_quals)
                cds_joined_feature = SeqFeature(joined_loc,
                                                type="CDS",
                                                qualifiers=cds_quals)

                new_feats.append(mrna_joined_feature)
                new_feats.append(cds_joined_feature)

                new_feats += utr_feats

    if keep_rec:
        rec.features = new_feats

        rec.description = args.description

        rec.annotations['organism'] = args.species
        rec.annotations['taxonomy'] = lineage
        rec.annotations['data_file_division'] = args.division

        ref.location = [FeatureLocation(0, len(rec))]
        rec.annotations['references'] = [ref]

        rec.dbxrefs = [('Project:%s' % args.project)]

        rec.annotations['keywords'] = [
            'CON.'
        ]  # CON is appropriate for scaffolds: https://www.ebi.ac.uk/training/online/course/nucleotide-sequence-data-resources-ebi/what-ena/how-sequence-assembled

        SeqIO.write(rec, args.out, "embl")
Ejemplo n.º 3
0
def reformat_gbk(gbk_file,
                 study,
                 publication_title,
                 publication_authors,
                 publication_journal,
                 locus_tag_prefix,
                 taxon_id,
                 scaffold_prefix,
                 strain,
                 plasmid=False,
                 locus_count_start=1):
    """Rewrite every record of a GenBank file with submission metadata.

    For each record parsed from ``gbk_file`` the function:

    - takes a full-length slice of the record and names it
      ``<scaffold_prefix>_<NN>`` (1-based, zero-padded to two digits),
    - sets the 'source', 'taxonomy' and 'organism' annotations from the
      values returned by ``taxon_id2taxonomy(taxon_id)``,
    - guarantees the first feature is a 'source' feature spanning the whole
      sequence and fills its db_xref / mol_type / organism / strain
      (and, for plasmids, plasmid) qualifiers,
    - attaches one publication reference and a ``BioProject:<study>`` dbxref,
    - removes every 'protein_id' qualifier,
    - gives each 'gene' feature a sequential locus tag and copies that tag to
      the feature that follows it, when there is one.

    :param gbk_file: path of the GenBank file to read
    :param study: project identifier written as ``BioProject:<study>``
    :param publication_title: title stored on the reference
    :param publication_authors: authors stored on the reference
    :param publication_journal: journal stored on the reference
    :param locus_tag_prefix: prefix of the generated locus tags
    :param taxon_id: taxon identifier, passed to ``taxon_id2taxonomy`` and
        written as ``taxon:<taxon_id>``
    :param scaffold_prefix: prefix of the generated record id / name
    :param strain: strain name used in the description and qualifiers
    :param plasmid: when true, add a ``plasmid`` qualifier ``p<strain>``
    :param locus_count_start: number used for the first locus tag; numbering
        continues across records
    :return: list of the rewritten records, in file order
    """
    from Bio import SeqIO
    from Bio.SeqFeature import FeatureLocation
    from Bio.SeqFeature import Reference
    from Bio.SeqFeature import SeqFeature

    source, taxonomy, organism = taxon_id2taxonomy(taxon_id)

    print(source)
    print()
    print(taxonomy)
    print()

    # Only parsing needs the file handle; everything else works on the
    # in-memory records.
    with open(gbk_file, 'r') as handle:
        parsed_records = list(SeqIO.parse(handle, 'genbank'))

    new_records = []
    locus_count = locus_count_start

    for number, parsed in enumerate(parsed_records, start=1):
        # A full-length slice gives a separate record object to rename and
        # annotate instead of editing the parsed record directly.
        record = parsed[0:len(parsed.seq)]
        record.id = "%s_%02d" % (scaffold_prefix, number)
        record.name = "%s_%02d" % (scaffold_prefix, number)

        ref = Reference()
        ref.authors = publication_authors
        ref.journal = publication_journal
        ref.title = publication_title

        record.annotations['source'] = source
        record.annotations['taxonomy'] = taxonomy
        record.annotations['organism'] = organism
        record.description = '%s %s scaffold_%s' % (organism, strain, number)

        # Make sure features[0] is a 'source' feature. A record without any
        # feature is handled too (it used to raise IndexError).
        if not record.features or record.features[0].type != 'source':
            print('NOT SOURCE-------------------')
            whole_sequence = FeatureLocation(0, len(record.seq))
            record.features = [
                SeqFeature(whole_sequence, type='source', qualifiers={})
            ] + record.features
        else:
            print('SOURCE!!!!!!!!!!!!!!!!')

        source_feature = record.features[0]
        source_feature.qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
        source_feature.qualifiers['mol_type'] = ["genomic DNA"]
        source_feature.qualifiers['organism'] = ["%s" % organism]
        source_feature.qualifiers['strain'] = ["%s" % strain]
        if plasmid:
            source_feature.qualifiers['plasmid'] = ["p%s" % strain]

        # NOTE(review): Biopython writers appear to read the 'molecule_type'
        # annotation; confirm that 'mol_type' is consumed downstream.
        record.annotations['mol_type'] = ["genomic DNA"]
        ref.location = [source_feature.location]
        record.annotations['references'] = [ref]
        record.dbxrefs = ['BioProject:%s' % study]

        features = record.features
        for i, feature in enumerate(features):
            feature.qualifiers.pop('protein_id', None)
            if feature.type != 'gene':
                continue
            locus = "%s_%05d" % (locus_tag_prefix, locus_count)
            locus_count += 1
            feature.qualifiers['locus_tag'] = locus
            # The feature right after a gene receives the same tag
            # (presumably its CDS/RNA product - verify against the input
            # files). Guard the index: a gene may be the last feature.
            if i + 1 < len(features):
                features[i + 1].qualifiers['locus_tag'] = locus

        new_records.append(record)

    return new_records