def rebuild_references(annotations: Dict) -> Dict[str, List[Reference]]:
    """Rebuild the SeqRecord 'references' annotation from JSON.

    Every entry of ``annotations["references"]`` is a mapping of attribute
    names to values. Its ``"location"`` value is an iterable of location
    strings, each of which is converted with ``location_from_string``.

    Args:
        annotations: the annotations mapping; its ``"references"`` entry is
            replaced in place by a list of ``Reference`` objects.

    Returns:
        The same ``annotations`` object that was passed in.

    Raises:
        KeyError: if ``"references"`` is missing from ``annotations`` or an
            entry has no ``"location"`` key.
    """
    refs = []
    for ref in annotations["references"]:
        new_reference = Reference()
        # Copy the stored attributes onto the instance rather than rebinding
        # __dict__ to the JSON dict itself: rebinding would make the location
        # assignment below write into the caller's dict and would discard any
        # attribute that Reference() initialised but the JSON entry lacks.
        new_reference.__dict__.update(ref)
        new_reference.location = [location_from_string(loc) for loc in ref["location"]]
        refs.append(new_reference)
    annotations["references"] = refs
    return annotations
type="mRNA", qualifiers=mrna_quals) cds_joined_feature = SeqFeature(joined_loc, type="CDS", qualifiers=cds_quals) new_feats.append(mrna_joined_feature) new_feats.append(cds_joined_feature) new_feats += utr_feats if keep_rec: rec.features = new_feats rec.description = args.description rec.annotations['organism'] = args.species rec.annotations['taxonomy'] = lineage rec.annotations['data_file_division'] = args.division ref.location = [FeatureLocation(0, len(rec))] rec.annotations['references'] = [ref] rec.dbxrefs = [('Project:%s' % args.project)] rec.annotations['keywords'] = [ 'CON.' ] # CON is appropriate for scaffolds: https://www.ebi.ac.uk/training/online/course/nucleotide-sequence-data-resources-ebi/what-ena/how-sequence-assembled SeqIO.write(rec, args.out, "embl")
def reformat_gbk(gbk_file,
                 study,
                 publication_title,
                 publication_authors,
                 publication_journal,
                 locus_tag_prefix,
                 taxon_id,
                 scaffold_prefix,
                 strain,
                 plasmid=False,
                 locus_count_start=1):
    '''
    Read every record of a GenBank file and return reformatted copies.

    For each record, in file order:

    - id and name become "<scaffold_prefix>_NN" (NN = 1-based record number)
    - the source / taxonomy / organism annotations are set from
      taxon_id2taxonomy(taxon_id)
    - the first feature is made a 'source' feature (one spanning the whole
      sequence is inserted when it is missing) and its db_xref, mol_type,
      organism, strain (and plasmid) qualifiers are filled in
    - a single publication reference and a BioProject dbxref are attached
    - protein_id qualifiers are removed
    - locus tags are renumbered: every 'gene' feature receives
      "<locus_tag_prefix>_NNNNN" and the feature right after it gets the
      same tag

    :param gbk_file: path of the GenBank file to read
    :param study: identifier written to dbxrefs as "BioProject:<study>"
    :param publication_title: title of the attached reference
    :param publication_authors: authors of the attached reference
    :param publication_journal: journal of the attached reference
    :param locus_tag_prefix: prefix of the generated locus tags
    :param taxon_id: taxon identifier, passed to taxon_id2taxonomy and
        written to the source feature as "taxon:<taxon_id>"
    :param scaffold_prefix: prefix of the new record ids/names
    :param strain: strain name, used in the description and qualifiers
    :param plasmid: when true, add a plasmid qualifier "p<strain>"
    :param locus_count_start: number used for the first generated locus tag;
        the counter keeps increasing across records
    :return: list of the reformatted records
    '''
    from Bio import SeqIO
    from Bio.SeqFeature import FeatureLocation
    from Bio.SeqFeature import Reference
    from Bio.SeqFeature import SeqFeature

    source, taxonomy, organism = taxon_id2taxonomy(taxon_id)
    print(source)
    print()
    print(taxonomy)
    print()

    # The records are fully materialised, so the file can be closed before
    # any further processing.
    with open(gbk_file, 'r') as f:
        records = list(SeqIO.parse(f, 'genbank'))

    contig_records = []
    for contig_count, new_record in enumerate(records, start=1):
        print(dir(new_record))
        # Work on a full-length slice of the parsed record, not the record
        # itself.
        contig = new_record[0:len(new_record.seq)]
        contig.id = "%s_%02d" % (scaffold_prefix, contig_count)
        contig.name = "%s_%02d" % (scaffold_prefix, contig_count)
        contig_records.append(contig)

    new_records = []
    for n, record in enumerate(contig_records):
        ref = Reference()
        ref.authors = publication_authors
        ref.journal = publication_journal
        ref.title = publication_title

        record.annotations['source'] = source
        record.annotations['taxonomy'] = taxonomy
        record.annotations['organism'] = organism
        record.description = '%s %s scaffold_%s' % (organism, strain, n + 1)

        if record.features and record.features[0].type == 'source':
            print('SOURCE!!!!!!!!!!!!!!!!')
        else:
            print('NOT SOURCE-------------------')
            # Build a fresh source feature covering the whole sequence. This
            # also works for records without any feature and avoids cloning
            # an unrelated feature just to overwrite its fields.
            whole_sequence = FeatureLocation(0, len(record.seq))
            record.features = [SeqFeature(whole_sequence,
                                          type='source',
                                          qualifiers={})] + record.features

        source_feature = record.features[0]
        source_feature.qualifiers['db_xref'] = ["taxon:%s" % taxon_id]
        source_feature.qualifiers['mol_type'] = ["genomic DNA"]
        source_feature.qualifiers['organism'] = ["%s" % organism]
        source_feature.qualifiers['strain'] = ["%s" % strain]
        if plasmid:
            source_feature.qualifiers['plasmid'] = ["p%s" % strain]
        # NOTE(review): the original layout does not make clear whether this
        # annotation was nested under `if plasmid:`; it is applied to every
        # record here -- confirm.
        record.annotations['mol_type'] = ["genomic DNA"]

        ref.location = [source_feature.location]
        record.annotations['references'] = [ref]
        record.dbxrefs = ['BioProject:%s' % study]

        features = record.features
        for i, feature in enumerate(features):
            feature.qualifiers.pop('protein_id', None)
            if feature.type != 'gene':
                continue
            locus = "%s_%05d" % (locus_tag_prefix, locus_count_start)
            locus_count_start += 1
            feature.qualifiers['locus_tag'] = locus
            # The feature right after a gene (presumably its CDS/RNA product
            # -- verify against the input files) shares the gene's tag. A
            # gene that is the last feature has no follower to tag.
            if i + 1 < len(features):
                features[i + 1].qualifiers['locus_tag'] = locus

        new_records.append(record)

    return new_records