if 5 in _RunExercise:
    # Exercise 5: read a single FASTA record, annotate it and convert it to
    # GenBank format.
    print('\n---Exercise 5---')
    id = "PAX-6.5"
    format = "fasta"
    # Use context managers so the handles are closed (and the output is
    # flushed) deterministically instead of whenever they are collected.
    with open(id + "." + format) as in_handle:
        record = SeqIO.read(in_handle, format)
    print("Record:\n", record)

    record.seq.alphabet = IUPAC.unambiguous_dna
    print("\nAlphabet altered to IUPAC.unambiguous_dna !!!")

    # The accession is taken from the fourth "|"-separated field of the id.
    accNb = record.id.split("|")[3]
    print("Access number: ", accNb)
    record.name = accNb
    record.id = accNb
    print("record.name and record.id have been altered !!!")

    # Attach a reverse-strand "gene" feature covering positions 18..200.
    feature = SeqFeature()
    feature.type = "gene"
    feature.location = FeatureLocation(18, 200)
    feature.strand = -1
    record.features.append(feature)
    print("record.features: ", record.features)
    print("\nRecord:\n", record)

    with open(id + ".gb", "w") as out_handle:
        count = SeqIO.write(record, out_handle, "genbank")
    print("Converted %i records" % count)

if 6 in _RunExercise:
    print('\n---ORF---')
    mail = ''
    id = "NC_009926"
# GUI drop-down text -> Biopython SeqIO format name.
_REDIGEST_FORMATS = {"Fasta": "fasta", "Genbank": "genbank"}


def _redigest_choice(widget, choices, what):
    """Return the value ``choices`` maps the widget's current text to.

    Raises:
        ValueError: if the selected text is not a key of ``choices``.
    """
    selected = widget.get()
    if selected not in choices:
        raise ValueError("Unsupported %s: %r" % (what, selected))
    return choices[selected]


def _redigest_genes(out_file, outfile):
    """Write the terminal restriction fragment of every gene record.

    For each input record the optional forward primer is prepended and the
    reverse-complemented reverse primer is appended, the construct is
    optionally reverse-complemented (when the reverse primer is the tagged
    one), and the sequence up to the first restriction site (or the whole
    sequence when uncut) is written to ``out_file``.  A line per record is
    also written to ``<outfile>.csv``.
    """
    informat = _redigest_choice(entry8_input, _REDIGEST_FORMATS, "input format")
    outformat = _redigest_choice(entry9_input, _REDIGEST_FORMATS, "output format")
    tagged = _redigest_choice(
        entry4_input, {"Forward": "F", "Reverse": "R"}, "tagged primer")
    enzyme_RE = RestrictionBatch([entry3_input.get()])

    forward = entry5_input.get()
    # The reverse primer is appended to the forward-strand sequence, so it is
    # reverse-complemented once here.  The original code computed this value
    # but then overwrote it with the raw entry text inside the loop.
    reverse = entry6_input.get()
    if reverse != "":
        reverse = str(Seq(reverse).reverse_complement())

    report_file = outfile + '.csv'
    with open(report_file, 'wt+') as RF, \
            open(entry1_input.get(), 'r') as infile:
        # ``count`` numbers the records and is part of each locus NAME.
        for count, record in enumerate(SeqIO.parse(infile, informat), start=1):
            header = record.id
            array = str(record.seq)
            NAME = 'RED' + TIME + str(count)
            if informat == 'genbank':
                desc = ', '.join(record.annotations.get("taxonomy", []))
            else:
                desc = ''

            # forward primer + sequence + reverse-complemented reverse primer
            FarrayR = (forward + array).strip('\n') + reverse

            # Orient the construct so that the tagged end comes first.
            if tagged == 'R':
                FarrayRseqFR = str(Seq(FarrayR).reverse_complement())
                SubFeat = "TRF_RevComp"
            else:
                FarrayRseqFR = FarrayR
                SubFeat = "TRF"

            # Restriction site positions for the (single) selected enzyme.
            FarrayRseqFR_RE = enzyme_RE.search(Seq(FarrayRseqFR))
            index = list(FarrayRseqFR_RE.values())[0]
            # Uncut sequences are reported at their full length.
            fragment = index[0] if index else len(FarrayRseqFR)

            FastaHeader = NAME + "|" + str(fragment) + "_bp" + "|" + header
            FastaSeq = FarrayRseqFR[:fragment]
            Feat = SeqFeature(FeatureLocation(start=0, end=fragment),
                              type="REDigest", ref=SubFeat)

            # Progress on the terminal: header plus all cut positions.
            print(" ", FastaHeader, '\t', FarrayRseqFR_RE)

            FastaSequence = SeqRecord(
                Seq(FastaSeq, IUPAC.IUPACAmbiguousDNA()),
                FastaHeader, description=desc, name=NAME)
            FastaSequence.features.append(Feat)

            if outformat == 'genbank':
                SeqIO.write(FastaSequence, out_file, outformat)
            else:
                SeqIO.write(FastaSequence, out_file, "fasta-2line")
            # The same progress line goes to the report file.
            print(FastaHeader, '\t', FarrayRseqFR_RE, file=RF)


def _redigest_genome(out_file, outfile):
    """Digest genome record(s) and write every restriction fragment.

    All fragments (start -> first cut, cut -> next cut, last cut -> end) are
    written to ``out_file`` and listed in ``<outfile>_RF.csv``.  The lengths
    of the terminal fragments (nucleotide 1 -> each cut) are listed in
    ``<outfile>_TRF.csv``.  A sequence without any restriction site is
    written as one uncut fragment; the original code raised ``ValueError``
    from ``min()`` in that case.
    """
    outformat = _redigest_choice(entry9_input, _REDIGEST_FORMATS, "output format")
    informat = _redigest_choice(entry8_input, _REDIGEST_FORMATS, "input format")
    enzyme_RE = RestrictionBatch([entry3_input.get()])

    report_file = outfile + '_RF.csv'
    report_file2 = outfile + '_TRF.csv'
    # Both reports are opened once, so a multi-record input accumulates in
    # them; the TRF report used to be truncated again for every record.
    with open(report_file, 'wt+') as RF, \
            open(report_file2, 'wt+') as TRF2, \
            open(entry1_input.get(), 'r') as infile:
        print("Individual restriction fragments", file=RF)
        print("[WRITING:] Individual restriction fragments to file:",
              report_file)
        print("Terminal restriction fragments: from nucleotide 1 to "
              "respective cuts", file=TRF2)

        for record in SeqIO.parse(infile, informat):
            Gen_header = record.id
            Gen_array = str(record.seq)
            if informat == 'genbank':
                desc = ', '.join(record.annotations.get("taxonomy", []))
            else:
                desc = ''

            # Cut positions for the (single) selected enzyme.
            cuts = list(enzyme_RE.search(Seq(Gen_array)).values())[0]

            if cuts:
                fragments = [Gen_array[0:min(cuts)]]
                fragments.extend(
                    Gen_array[a:b] for a, b in zip(cuts, cuts[1:]))
                fragments.append(Gen_array[max(cuts):])
            else:
                fragments = [Gen_array]

            for GenFastaSeq in fragments:
                GenFastaHeader = (Gen_header + "|" + str(len(GenFastaSeq))
                                  + "_bp|" + Gen_header)
                print(" ", GenFastaHeader)
                GenSeqRec = SeqRecord(
                    Seq(GenFastaSeq, IUPAC.IUPACAmbiguousDNA()),
                    GenFastaHeader, description=desc)
                SeqIO.write(GenSeqRec, out_file, outformat)
                print(GenFastaHeader, file=RF)

            # Terminal fragments: only their headers are reported, no
            # sequence is written for them.
            print("[WRITING:] Terminal restriction fragments, from nucleotide "
                  "1 to respective cuts to file:", report_file2)
            for GenomeFragment in cuts:
                GenFastaHeader = (Gen_header + "|" + str(GenomeFragment)
                                  + "_bp|" + Gen_header)
                print(" ", GenFastaHeader)
                print(GenFastaHeader, file=TRF2)


def redigest_code():
    """Run the in-silico restriction digest configured in the GUI widgets.

    Calls ``argscheck()`` first, then reads the settings from the
    ``entry*_input`` widgets: input file (1), output file (2), enzyme (3),
    tagged primer (4), forward/reverse primer (5/6), sequence type (7) and
    input/output format (8/9).  When no output file name is given,
    ``redigest.<TIME>.out`` is used.  All files are closed on return, also
    when an error occurs.

    Raises:
        ValueError: if a drop-down holds an unrecognised selection, or if
            Biopython rejects the enzyme name.
    """
    argscheck()

    outfile = entry2_input.get()
    if outfile == "":
        outfile = 'redigest.' + TIME + '.out'

    genome_mode = _redigest_choice(
        entry7_input,
        {"Multifasta gene file": False, "Single genome sequence": True},
        "sequence type")

    with open(outfile, 'wt+') as out_file:
        if genome_mode:
            _redigest_genome(out_file, outfile)
        else:
            _redigest_genes(out_file, outfile)
def run(self, record):
    """Detect BGC candidate clusters in ``record`` and add them as features.

    Every Pfam feature is scored with ``self.model``; each protein gets the
    mean score of its Pfam domains.  Consecutive proteins scoring at or
    above ``self.score_threshold`` form a candidate, neighbouring
    candidates separated by a small enough gap are merged, and candidates
    passing the minimum-size filters are appended to ``record.features``
    as ``cluster`` features.  The record is modified in place; nothing is
    returned.

    :param record: SeqRecord with protein and Pfam features.
    """
    logging.info('Detecting BGCs using %s model in %s', self.detector_label, record.id)

    protein_features = util.get_protein_features(record)
    proteins_by_id = util.get_proteins_by_id(protein_features)
    pfam_features = util.get_pfam_features(record)

    if not len(pfam_features):
        logging.warning('Warning: No Pfam domains in record %s, skipping BGC detection', record.id)
        return

    # Filter out previous clusters detected with the same detector label
    num_prev_features = len(record.features)
    record.features = [f for f in record.features
                       if not (f.type == 'cluster'
                               and f.qualifiers.get('detector_label') == [self.detector_label])]
    num_removed_features = num_prev_features - len(record.features)
    if num_removed_features:
        logging.warning('Warning: Removed %s previously detected clusters with same label "%s". '
                        'Use --label DeepBGCMyLabel to preserve original clusters and add second set of clusters detected '
                        'with same model but different parameters.', num_removed_features, self.detector_label)

    # Create DataFrame with Pfam sequence
    pfam_sequence = util.create_pfam_dataframe_from_features(pfam_features, proteins_by_id)

    # Predict BGC score of each Pfam
    pfam_sequence[self.score_column] = self.model.predict(pfam_sequence)
    pfam_scores = pfam_sequence[self.score_column]

    # Get average BGC score for each protein
    protein_scores = pfam_sequence.groupby('protein_id', sort=False)[self.score_column].mean()

    # Add score to all Pfam features (row i of the frame is paired with
    # pfam_features[i])
    for i, feature in enumerate(pfam_features):
        feature.qualifiers[self.score_column] = ['{:.5f}'.format(pfam_scores.iloc[i])]

    # Add score to all protein features
    for protein_id, score in protein_scores.items():
        proteins_by_id[protein_id].qualifiers[self.score_column] = ['{:.5f}'.format(score)]

    clusters = []
    active_proteins = []
    gap_proteins = []

    # Create a list of cluster features by merging consecutive proteins with score satisfying given threshold
    # Neighboring clusters within given number of nucleotides/proteins are merged
    for protein in protein_features:
        if self.score_column not in protein.qualifiers:
            # TODO: Should proteins with no Pfam domains also be considered?
            # Current protein did not have any Pfam domains, therefore it has no BGC score, ignore it
            continue
        score = float(protein.qualifiers[self.score_column][0])
        if score < self.score_threshold:
            # Inactive protein, add to gap
            gap_proteins.append(protein)
            # We just changed from active to inactive, add current list of active proteins as a cluster
            if active_proteins:
                clusters.append(active_proteins)
                active_proteins = []
        else:
            # Active protein
            # If no cluster is open, check if we should merge with the previous cluster
            if not active_proteins and clusters:
                prev_cluster_proteins = clusters[-1]
                prev_end = prev_cluster_proteins[-1].location.end
                if len(gap_proteins) <= self.merge_max_protein_gap or \
                        (protein.location.start - prev_end) <= self.merge_max_nucl_gap:
                    # Remove previous candidate and continue where it left off
                    clusters.pop()
                    active_proteins = prev_cluster_proteins + gap_proteins
            # Add current protein to cluster
            active_proteins.append(protein)
            gap_proteins = []

    # Last protein was active, add list of active proteins as a cluster
    if active_proteins:
        clusters.append(active_proteins)

    # Add detected clusters as features
    record_num_detected = 0
    for cluster_proteins in clusters:
        start = cluster_proteins[0].location.start
        end = cluster_proteins[-1].location.end
        candidate_id = '{}_{}-{}.1'.format(record.id, int(start), int(end))

        if self.min_nucl > 1:
            nucl_length = end - start
            if nucl_length < self.min_nucl:
                logging.debug('Skipping cluster %s with %s < %s nucleotides', candidate_id, nucl_length, self.min_nucl)
                continue

        if self.min_proteins > 1:
            num_proteins = len(cluster_proteins)
            if num_proteins < self.min_proteins:
                logging.debug('Skipping cluster %s with %s < %s proteins', candidate_id, num_proteins, self.min_proteins)
                continue

        if self.min_domains > 1 or self.min_bio_domains > 0:
            # NOTE(review): both counts below are taken from the whole record
            # (``record`` / ``pfam_features``), not from the proteins of this
            # candidate -- confirm whether a per-cluster count was intended.
            pfam_ids = util.get_pfam_feature_ids(record)
            num_domains = len(pfam_features)
            if num_domains < self.min_domains:
                logging.debug('Skipping cluster %s with %s < %s protein domains', candidate_id, num_domains, self.min_domains)
                continue
            num_bio_domains = len(util.filter_biosynthetic_pfam_ids(pfam_ids))
            if num_bio_domains < self.min_bio_domains:
                logging.debug('Skipping cluster %s with %s < %s known biosynthetic protein domains', candidate_id, num_bio_domains, self.min_bio_domains)
                continue

        # The cluster score is the mean of its proteins' (rounded) scores
        scores = [float(feature.qualifiers[self.score_column][0]) for feature in cluster_proteins]
        location = FeatureLocation(start, end)
        qualifiers = {
            self.score_column: ['{:.5f}'.format(np.mean(scores))],
            'detector': [self.detector_name],
            'detector_label': [self.detector_label],
            'detector_version': [self.model.version],
            'detector_version_timestamp': [self.model.timestamp],
            'product': ['{}_putative'.format(self.detector_name)],
            'bgc_candidate_id': [candidate_id]
        }
        record.features.append(SeqFeature(
            location=location,
            type="cluster",
            qualifiers=qualifiers
        ))
        record_num_detected += 1
        self.num_detected += 1

    # Sort all features by location
    util.sort_record_features(record)

    # Add detector metadata to the record as a structured comment
    if 'structured_comment' not in record.annotations:
        record.annotations['structured_comment'] = {}

    comment_key = util.format_detector_meta_key(self.detector_label)
    record.annotations['structured_comment'][comment_key] = collections.OrderedDict(
        name=self.detector_name,
        label=self.detector_label,
        version=self.model.version,
        version_timestamp=self.model.timestamp,
        detection_timestamp_utc=datetime.utcnow().isoformat(),
        score_threshold=self.score_threshold,
        merge_max_nucl_gap=self.merge_max_nucl_gap,
        merge_max_protein_gap=self.merge_max_protein_gap,
        min_proteins=self.min_proteins,
        min_domains=self.min_domains,
        min_bio_domains=self.min_bio_domains
    )
    logging.info('Detected %s BGCs using %s model in %s', record_num_detected, self.detector_label, record.id)
sys.stderr.write(repr(parts) + "\n") raise flip = False if q_start > q_end: flip = not flip q_start, q_end = q_end, q_start if s_start > s_end: flip = not flip s_start, s_end = s_end, s_start if flip: c = colors.Color(0, 0, 1, alpha=0.25) b = False else: c = colors.Color(1, 0, 0, alpha=0.25) b = False q_feature = q_set.add_feature(SeqFeature( FeatureLocation(q_start - 1, q_end)), color=c, border=b) s_feature = s_set.add_feature(SeqFeature( FeatureLocation(s_start - 1, s_end)), color=c, border=b) gd_diagram.cross_track_links.append( CrossLink(q_feature, s_feature, c, b)) # NOTE: We are using the same colour for all the matches, # with transparency. This means overlayed matches will appear darker. # It also means the drawing order not very important. # Note ACT puts long hits at the back, and colours by hit score print("Drawing CDS features...") for f, format in genomes:
def convert_gbk(gb_dir,
                gb_out_dir,
                rodeo_output,
                bg_domains,
                max_intergenic_distance=100,
                product_class='thiopeptide'):
    """Convert a common genbank file to the genbank that mimics antiSMASH output.

    Adds a feature 'cluster' with information about the class of the product.
    The coordinates of this feature are boundaries of the group of adjacent
    genes on the same strand that includes RODEO query. Marks genes with
    given domains as biosynthetic.

    The input is read from ``gb_dir + <query>.gbk`` and the result is written
    to ``gb_out_dir + <query>.gbk``; both directory arguments are
    concatenated as-is, so they must end with a path separator.

    Parameters
    ----------
    gb_dir : str
        Directory with input genbank files.
    gb_out_dir : str
        Directory to store the output.
    rodeo_output: RodeoOutput
        RODEO output to use as a reference.
    bg_domains : list
        List of Pfam or TIGRFAMs IDs for domains that are important for your
        product biosynthesis.
    max_intergenic_distance : int, optional
        Maximum distance (nt) between genes within the biosynthetic gene
        cluster (default: 100).
    product_class : string, optional
        A putative class of the final product (default: thiopeptide).

    Returns
    -------
    bool
        True if successful, False otherwise (an error occurred or the input
        file contained no record).

    """
    rodeo_output.table_proccessing(bg_domains, max_intergenic_distance)
    operon_border_accs = (rodeo_output.operon_accs[0],
                          rodeo_output.operon_accs[-1])
    biosynthetic_genes = rodeo_output.biosynthetic_genes
    contig_edge = False
    prot_id = rodeo_output.query

    try:
        genbank = SeqIO.parse('%s%s.gbk' % (gb_dir, prot_id), 'genbank')
        # Every file is expected to contain only one record
        for record in genbank:
            # Default to the whole record; the borders are overwritten when
            # the operon border genes are found.
            cluster_coords = OrderedDict([('start', 1),
                                          ('end', len(record))])
            for feature in record.features:
                if feature.type != 'CDS':
                    continue
                border_check = check_if_border(feature, operon_border_accs)
                if border_check is not None:
                    cluster_coords[border_check[0]] = border_check[1]
                protein_ids = feature.qualifiers.get('protein_id')
                if protein_ids is not None and \
                        protein_ids[0] in biosynthetic_genes:
                    feature.qualifiers['sec_met'] = ['Kind: biosynthetic']

            start, end = cluster_coords.values()
            cluster = SeqFeature(
                location=FeatureLocation(start, end),
                type='cluster',
                qualifiers=OrderedDict([('contig_edge', str(contig_edge)),
                                        ('product', product_class)]))
            # The cluster feature goes first in the feature table.
            record.features = [cluster] + record.features
            SeqIO.write(record, '%s%s.gbk' % (gb_out_dir, prot_id),
                        'genbank')
            return True
        # No record in the input file: nothing was written.
        return False
    except Exception as e:
        # Best-effort conversion: report the problem and signal failure.
        # print(e) is valid on both Python 2 and Python 3.
        print(e)
        return False
def mockup(features_list,
           write=False,
           pagesize="A4",
           scale_fontsize=3,
           label_size=2,
           greytrack_fontsize=7,
           x=0.05,
           y=0.01,
           track_size=0.2,
           track_names="",
           scale_ticks=False,
           format="linear",
           total_len=3000,
           output_path="/home/chymera/src/AutoTransGeno/output/test.pdf"):
    """Draw a construct diagram with one track per entry of ``features_list``.

    Each track entry is a sequence of feature tuples laid out end to end:
    ``(name, length, strand, sigil)``.  A tuple whose name is ``"skip"``
    only advances the position by ``length`` without drawing anything.
    "LoxP", "STOP" and "Restriction" features get fixed colours; all other
    features take the next colour of a repeating palette.

    :param features_list: list of tracks (see above).
    :param write: when true, also write the diagram as PDF to ``output_path``.
    :param track_names: track labels, indexed like ``features_list``; tracks
        without a label get an empty name.
    :param format: GenomeDiagram drawing format passed to ``draw``.
    :param total_len: end coordinate of the drawn region.
    :param output_path: PDF destination used when ``write`` is true.
    :returns: the drawn ``GenomeDiagram.Diagram``.
    """
    colors_cycle = cycle([
        colors.orchid, colors.cornflower, colors.lightseagreen,
        colors.cornflower, colors.salmon
    ])
    fixed_colors = {
        "LoxP": colors.yellow,
        "STOP": colors.red,
        "Restriction": colors.chartreuse,
    }

    gdd = GenomeDiagram.Diagram('Construct Diagram',
                                x=x,
                                y=y,
                                track_size=track_size)

    for ix, track_info in enumerate(features_list):
        track_len = sum(info[1] for info in track_info)
        # Fall back to an empty label instead of raising IndexError when
        # fewer names than tracks were given (e.g. the default "").
        track_name = track_names[ix] if ix < len(track_names) else ""
        track, features = new_track(gdd,
                                    " " + track_name,
                                    smalltick=10,
                                    scale_fontsize=scale_fontsize,
                                    greytrack_fontsize=greytrack_fontsize,
                                    scale_ticks=scale_ticks,
                                    end=track_len)

        feature_start = 0
        for feature_info in track_info:
            name, length = feature_info[0], feature_info[1]
            if name == "skip":
                feature_start += length
                continue

            feature = SeqFeature(FeatureLocation(feature_start,
                                                 feature_start + length),
                                 strand=feature_info[2])
            feature_color = fixed_colors.get(name)
            if feature_color is None:
                # next() works on Python 2.6+ and 3; the iterator's .next()
                # method exists on Python 2 only.
                feature_color = next(colors_cycle)

            features.add_feature(feature,
                                 name=name,
                                 label=True,
                                 color=feature_color,
                                 label_size=label_size,
                                 label_color=feature_color,
                                 label_angle=30,
                                 sigil=feature_info[3],
                                 arrowshaft_height=1)
            feature_start += length

    gdd.draw(format=format,
             pagesize=pagesize,
             fragments=1,
             start=0,
             end=total_len)
    if write:
        gdd.write(output_path, "PDF")
    return gdd
def test_reverse_complement_seq(self):
    """Check which attributes ``SeqRecord.reverse_complement`` carries over.

    Each attribute is checked twice: once requested with ``True`` (the value
    is taken from, or derived from, the source record) and once supplied as
    an explicit replacement value.
    """
    organism = {"organism": "bombyx"}
    source = SeqRecord(
        Seq("ACTG"),
        id="TestID",
        name="TestName",
        description="TestDescription",
        dbxrefs=["TestDbxrefs"],
        features=[SeqFeature(FeatureLocation(0, 3), type="Site")],
        annotations={"organism": "bombyx"},
        letter_annotations={"test": "abcd"},
    )
    keep_everything = dict.fromkeys(
        [
            "id",
            "name",
            "description",
            "dbxrefs",
            "features",
            "annotations",
            "letter_annotations",
        ],
        True,
    )
    flipped = source.reverse_complement(**keep_everything)

    self.assertEqual("CAGT", str(flipped.seq))

    # Plain text attributes.
    self.assertEqual("TestID", flipped.id)
    with_id = source.reverse_complement(id="TestID")
    self.assertEqual("TestID", with_id.id)

    self.assertEqual("TestName", flipped.name)
    with_name = source.reverse_complement(name="TestName")
    self.assertEqual("TestName", with_name.name)

    self.assertEqual("TestDescription", flipped.description)
    with_description = source.reverse_complement(description="TestDescription")
    self.assertEqual("TestDescription", with_description.description)

    self.assertEqual(["TestDbxrefs"], flipped.dbxrefs)
    with_dbxrefs = source.reverse_complement(dbxrefs=["TestDbxrefs"])
    self.assertEqual(["TestDbxrefs"], with_dbxrefs.dbxrefs)

    # The feature at 0..3 is expected at 1..4 on the reverse complement.
    expected_features = "[SeqFeature(FeatureLocation(ExactPosition(1), ExactPosition(4)), type='Site')]"
    self.assertEqual(expected_features, repr(flipped.features))
    with_features = source.reverse_complement(
        features=[SeqFeature(FeatureLocation(1, 4), type="Site")]
    )
    self.assertEqual(expected_features, repr(with_features.features))

    self.assertEqual({"organism": "bombyx"}, flipped.annotations)
    with_annotations = source.reverse_complement(annotations=organism)
    self.assertEqual({"organism": "bombyx"}, with_annotations.annotations)

    # Per-letter annotations are reversed when carried over, but used as
    # given when supplied explicitly.
    self.assertEqual({"test": "dcba"}, flipped.letter_annotations)
    with_letters = source.reverse_complement(letter_annotations={"test": "abcd"})
    self.assertEqual({"test": "abcd"}, with_letters.letter_annotations)
i += 1 genes_track = GenomeDiagram.Track('genes', greytrack=False, scale=False) genes_track.add_set(feature_set1) genes_track.add_set(feature_set2) #%% from Bio.SeqFeature import SeqFeature, FeatureLocation snv_df = pd.read_csv(tab_dir + '/SNV_HUMAN_YFV_RESULTS.csv') snv_series = snv_df.iloc[:, 2] feature_set_SNV = GenomeDiagram.FeatureSet() for position in snv_series: snv = SeqFeature(FeatureLocation(position, position), strand=+1) feature_set_SNV.add_feature(snv, color='red', strand=None) SNV_track = GenomeDiagram.Track('SNV', greytrack=False, scale=True, scale_format='SInt', scale_fontsize=10, scale_fontangle=90, scale_largetick_interval=5000, scale_smalltick_interval=1000, scale_largeticks=0.5, scale_smallticks=0.2) SNV_track.add_set(feature_set_SNV)
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
    """Convert BLAST XML output into SeqRecords carrying match features.

    One SeqRecord is produced per BLAST query.  Every HSP becomes a
    top-level ``match`` feature (typed by the BLAST program) whose
    ``sub_features`` are the ``match_part``s produced by ``generate_parts``.

    :param blastxml: handle to BLAST XML output.
    :param min_gap: passed to ``generate_parts`` as ``ignore_under``.
    :param trim: clamp both ends of the parent match to the query hit.
    :param trim_end: clamp only the end of the parent match.
    :returns: list of SeqRecord objects, one per query.
    """
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    blast_records = NCBIXML.parse(blastxml)
    records = []
    for record in blast_records:
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = {  # Currently we can only handle BLASTN, BLASTP
            'BLASTN': 'nucleotide_match',
            'BLASTP': 'protein_match',
        }.get(record.application, 'match')

        # The sequence itself is a placeholder; only the features matter.
        rec = SeqRecord(Seq("ACTG"), id=record.query)
        for hit in record.alignments:
            hit_titles = hit.title.split(' >')
            for hsp in hit.hsps:
                qualifiers = {
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit_titles
                }
                # The description is the first title without its leading
                # identifier.  A title without any space has no separate
                # description, so it is used whole instead of letting
                # str.index() raise ValueError.
                desc = hit_titles[0]
                first_space = desc.find(' ')
                qualifiers['description'] = (
                    desc[first_space:] if first_space != -1 else desc)

                # This required a fair bit of sketching out/match to figure out
                # the first time.
                #
                # the match_start location must account for queries and
                # subjects that start at locations other than 1
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the supplied
                # subject/hit length to calculate the real ending of the target
                # protein.
                parent_match_end = (hsp.query_start + hit.length
                                    + hsp.query.count('-'))

                # However, if the user requests that we trim the feature, then
                # we need to cut the ``match`` start to 0 to match the parent feature.
                # We'll also need to cut the end to match the query's end. It (maybe)
                # should be the feature end? But we don't have access to that data, so
                # We settle for this.
                if trim and parent_match_start < 1:
                    parent_match_start = 0

                if (trim or trim_end) and parent_match_end > hsp.query_end:
                    parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(parent_match_start, parent_match_end),
                    type=match_type, strand=0,
                    qualifiers=qualifiers)

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {
                    "source": "blast",
                }
                top_feature.sub_features = []
                for start, end, cigar in generate_parts(hsp.query, hsp.match,
                                                        hsp.sbjct,
                                                        ignore_under=min_gap):
                    part_qualifiers['Gap'] = cigar
                    part_qualifiers['ID'] = hit.hit_id

                    if trim:
                        # If trimming, then we start relative to the
                        # match's start
                        match_part_start = parent_match_start + start
                    else:
                        # Otherwise, we have to account for the subject start's location
                        match_part_start = (parent_match_start
                                            + hsp.sbjct_start + start - 1)

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors in weird places
                    # So we just use (end-start) for simplicity
                    match_part_end = match_part_start + (end - start)

                    # Each part gets its own copy because the shared dict is
                    # updated on every iteration.
                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part", strand=0,
                            qualifiers=copy.deepcopy(part_qualifiers)))

                rec.features.append(top_feature)
        rec.annotations = {}
        records.append(rec)
    return records
feature].location.start and strand != gene_list[ feature].strand: break if gene != True: qualifiers['gene'] = gene qualifiers['note'] = 'TransTerm HP conf=%s, tail_score=%s' % ( conf, tail_score) else: qualifiers['note'] = 'TransTerm HP conf=%s, tail_score=%s' % ( conf, tail_score) qualifiers['regulatory_class'] = 'terminator' qualifiers['TermGen_check'] = ['Checked!'] feature_location = FeatureLocation(start, end) my_feature = SeqFeature(location=feature_location, type='regulatory', strand=strand, qualifiers=qualifiers) # adding terminators to genbank features list for i in reversed(xrange(len(record.features))): if record.features[i].location.start < start: record.features.insert(i + 1, my_feature) break # editing features to add U-tail new_features_list = [] for feature in record.features: if feature.type == 'regulatory' and feature.qualifiers.has_key( 'TermGen_check'): u_tail = 0 tail = ''
def prepare_visualization(options, seq_record):
    """Copy stored (Sub/Known)ClusterBlast results onto ``seq_record``.

    Makes sure the record has a ``source`` feature, then, if
    ``options.extrarecord`` holds data for this record id, copies the
    result dictionaries of every stored ClusterBlast / SubClusterBlast /
    KnownClusterBlast object onto ``seq_record`` and switches on the
    matching ``options`` flag.  Finally calls ``load_genecluster_info``.

    :param options: run options; ``extrarecord`` maps record ids to objects
        with an ``extradata`` dict.
    :param seq_record: record to prepare; modified in place.
    """
    # Check, whether (Sub)ClusterBlast data is encoded in source feature
    sourceFeatures = utils.get_all_features_of_type(seq_record, 'source')
    if len(sourceFeatures) == 0:
        loc = FeatureLocation(0, len(seq_record.seq))
        source_feature = SeqFeature(loc, type="source")
        seq_record.features.append(source_feature)
        sourceFeatures = utils.get_all_features_of_type(seq_record, 'source')

    # ``in`` replaces dict.has_key(), which exists on Python 2 only.
    if 'extrarecord' in options and seq_record.id in options.extrarecord:
        extradata = options.extrarecord[seq_record.id].extradata
        for key, stored in extradata.items():
            if key == 'ClusterBlastData':
                logging.debug(
                    "prepare_visualization: Found ClusterBlastData storage object"
                )
                options.clusterblast = True
                seq_record.internalhomologygroupsdict = stored.internalhomologygroupsdict
                seq_record.known_compound_dict = stored.known_compound_dict
                seq_record.nrhitgeneclusters = stored.nrhitgeneclusters
                seq_record.qgeneclusterdata = stored.qgeneclusterdata
                seq_record.queryclusterdata = stored.queryclusterdata
                seq_record.pubchem_dict = stored.pubchem_dict
                seq_record.pubmed_dict = stored.pubmed_dict
            elif key == 'SubClusterBlastData':
                logging.debug(
                    "prepare_visualization: Found SubClusterBlastData storage object"
                )
                options.subclusterblast = True
                # sc_qgeneclusterdata is not copied (the assignment was
                # commented out in the original code).
                seq_record.internalhomologygroupsdict = stored.internalhomologygroupsdict
                seq_record.sc_nrhitgeneclusters = stored.sc_nrhitgeneclusters
                seq_record.sc_queryclusterdata = stored.sc_queryclusterdata
                seq_record.pubchem_dict = stored.pubchem_dict
                seq_record.pubmed_dict = stored.pubmed_dict
            elif key == 'KnownClusterBlastData':
                logging.debug(
                    "prepare_visualization: Found KnownClusterBlastData storage object"
                )
                options.knownclusterblast = True
                # kc_qgeneclusterdata is not copied (the assignment was
                # commented out in the original code).
                seq_record.internalhomologygroupsdict = stored.internalhomologygroupsdict
                seq_record.kc_nrhitgeneclusters = stored.kc_nrhitgeneclusters
                seq_record.kc_queryclusterdata = stored.kc_queryclusterdata
                seq_record.pubchem_dict = stored.pubchem_dict
                seq_record.pubmed_dict = stored.pubmed_dict
            # Any other key (e.g. 'MetabolicModelDataObj') is ignored.

    load_genecluster_info(seq_record, options)
if label in f.qualifiers.get("label", [])) def get_features_from_note(note): return (f for f in features if note in f.qualifiers.get("note", [])) # Add direct BsaI site features.append( SeqFeature( type="protein_bind", qualifiers={ # "label": ["BsaI"], "bound_moiety": ["BsaI"], "note": ["color: #ff0000; direction: RIGHT"], # "note": ["This forward directional feature has 2 segments:\n" # "1: 3 .. 8 / #ff0000\n" # "2: 10 .. 13 / #ff0000\n"] }, location=CompoundLocation([ FeatureLocation(2, 8, strand=1), FeatureLocation(9, 13, strand=1) ]), )) # Add reversed BsaI site BsaI_site_r = Seq(BsaI.site).reverse_complement() pos = (gba.seq + gba.seq).find(BsaI_site_r) features.append( SeqFeature( type="protein_bind", qualifiers={
for record in SeqIO.parse(fh, "fasta"): sdf = pd.DataFrame({ 'length': len(record), 'ID': record.id }, index=[record.id]) df = df.append(sdf) df.length = df.length.astype(int) max_len = df.length.max() #scale用ラダー染色体を書く lad_rec = SeqRecord(RandomSeq(scale_max), "ladder_" + str(scale_max) + " bp") lad_feature = SeqFeature() lad_feature_s = [] for i in range(1, scale_split, 2): region_unit = int(scale_max / scale_split) tmp_featur = SeqFeature(FeatureLocation(region_unit * i, region_unit * (i + 1)), type="gene", strand=1) tmp_start = region_unit * i tmp_featur.qualifiers["locus_tag"] = str(tmp_start) lad_feature_s.append(tmp_featur) lad_rec.features = lad_feature_s cur_chromosome = BasicChromosome.Chromosome(lad_rec.id)
# We add dummy features to the tracks for each cross-link BEFORE we add the # arrow features for the genes. This ensures the genes appear on top: for X, Y, X_vs_Y in [("NC_002703", "AF323668", A_vs_B), ("AF323668", "NC_003212", B_vs_C)]: features_X = records[X].features features_Y = records[Y].features set_X = feature_sets[X] set_Y = feature_sets[Y] for score, x, y in X_vs_Y: color = colors.linearlyInterpolatedColor(colors.white, colors.firebrick, 0, 100, score) border = colors.lightgrey f_x = get_feature(features_X, x) F_x = set_X.add_feature(SeqFeature( FeatureLocation(f_x.location.start, f_x.location.end, strand=0)), color=color, border=border) f_y = get_feature(features_Y, y) F_y = set_Y.add_feature(SeqFeature( FeatureLocation(f_y.location.start, f_y.location.end, strand=0)), color=color, border=border) gd_diagram.cross_track_links.append(CrossLink(F_x, F_y, color, border)) for record, gene_colors in zip([A_rec, B_rec, C_rec], [A_colors, B_colors, C_colors]): gd_feature_set = feature_sets[record.name] i = 0 for feature in record.features:
def json2seqrecord(self, json_record):
    """Convert one UniProtKB JSON entry into a ``SeqRecord``.

    The record id is the primary accession, the description is the
    recommended (or first submitted) full name, and ``dbxrefs`` collects
    gene names and synonyms (``UnipGene:``), short alternative names
    (``UnipName:``), database cross-references, accessions (``UnipAcc:``)
    and EC numbers (``EC:``).  Entries under ``features`` become
    ``SeqFeature`` objects.

    Side effect: ``self.dbx_dict`` gets one entry per cross-reference
    (a dict of its properties) and per EC number (a
    ``[["description", desc]]`` list).

    :param json_record: dict decoded from a UniProtKB JSON entry.
    :return: the populated ``SeqRecord``.
    """
    # UniProt prefixes every GO term with a one-letter aspect code, e.g.
    # "C:cytoplasm", "F:ATP binding", "P:translation".  The previous code
    # tested lower-case "b"/"c"/"f" and had "c" and "f" swapped, so no term
    # was ever mapped correctly.  "B" is kept as an alias for the process
    # aspect because the old code recognised "b".
    go_aspects = {
        "P": "biological_process",
        "B": "biological_process",
        "F": "molecular_function",
        "C": "cellular_component",
    }

    protein_desc = json_record["proteinDescription"]
    uid = json_record["primaryAccession"]
    if "recommendedName" in protein_desc:
        desc = protein_desc["recommendedName"]["fullName"]["value"]
    else:
        desc = protein_desc["submissionNames"][0]["fullName"]["value"]

    # EC numbers of the protein itself and of every contained component.
    ecs = [x["value"]
           for x in protein_desc.get("recommendedName", {}).get("ecNumbers", [])]
    for component in protein_desc.get("contains", []):
        ecs += [x["value"]
                for x in component.get("recommendedName", {}).get("ecNumbers", [])]

    r = SeqRecord(id=uid, name="", description=desc,
                  seq=Seq(json_record["sequence"]["value"]))

    for gene in json_record.get("genes", []):
        if "geneName" in gene:
            r.dbxrefs.append("UnipGene:" + gene["geneName"]["value"])
        # Synonyms live on each gene entry; the old code looked them up on
        # the enclosing list, which could never succeed.
        for syn in gene.get("synonyms", []):
            r.dbxrefs.append("UnipGene:" + syn["value"])

    for alt_name in protein_desc.get("alternativeNames", []):
        for short_name in alt_name.get("shortNames", []):
            r.dbxrefs.append("UnipName:" + short_name["value"])

    for ref in json_record.get("uniProtKBCrossReferences", []):
        db_prefix = ref["database"] + ":"
        # Some ids (e.g. "GO:0005737") already carry their database prefix.
        dbx = ref["id"] if db_prefix in ref["id"] else db_prefix + ref["id"]
        r.dbxrefs.append(dbx)
        props = {x["key"]: x["value"] for x in ref.get("properties", [])}
        self.dbx_dict[dbx] = props
        if ref["database"] == "GO" and "GoTerm" in props:
            aspect, _, term = props["GoTerm"].partition(":")
            props["GoTerm"] = term
            # Unknown aspect codes are stored unchanged, as before.
            props["database"] = go_aspects.get(aspect.upper(), aspect)

    accessions = list(json_record.get("secondaryAccessions", []))
    accessions += [json_record["uniProtkbId"], json_record["primaryAccession"]]
    for acc in accessions:
        r.dbxrefs.append("UnipAcc:" + acc.replace(" ", "_"))

    for ec in ecs:
        dbx = "EC:" + ec
        r.dbxrefs.append(dbx)
        self.dbx_dict[dbx] = [["description", desc]]

    # Drop duplicates but keep first-seen order so the output is reproducible
    # (a plain set() gave an arbitrary order).
    r.dbxrefs = list(dict.fromkeys(r.dbxrefs))

    for f in json_record.get("features", []):
        # NOTE(review): UniProt positions are presumably 1-based inclusive,
        # whereas FeatureLocation is 0-based/half-open; the values are passed
        # through unchanged, as before -- confirm whether start needs "- 1".
        location = FeatureLocation(start=f["location"]["start"]["value"],
                                   end=f["location"]["end"]["value"])
        description = f.get("description", "")
        # Descriptions that are empty or consist only of dashes are dropped.
        qual = {"description": description} if description.replace("-", "").strip() else {}
        if "featureId" in f:
            qual["featureId"] = f["featureId"]
        r.features.append(SeqFeature(type=f["type"], location=location,
                                     qualifiers=qual))
    return r
def check_simple_tRNA(self, filename, use_seqfeatures=False):
    """Draw five Arabidopsis chromosomes with tRNA markers to *filename*.

    Each chromosome gets the hard-coded tRNA positions below plus a few
    cytobands (three at random positions, two at the chromosome ends).
    With ``use_seqfeatures`` true the tRNA markers are handed to the
    drawing code as ``SeqFeature`` objects, otherwise as 5-tuples
    ``(start, end, strand, label, color)``.
    """
    # tRNA genes as (start, end, strand, locus tag without the "AT#" prefix)
    f1 = [
        (111889, 111961, -1, 'G01270'), (306383, 306456, 1, 'G01870'), (309274, 309347, -1, 'G01890'),
        (515493, 515566, 1, 'G02480'), (552639, 552711, 1, 'G02600'), (604401, 604474, 1, 'G02760'),
        (877648, 877720, 1, 'G03515'), (892513, 892585, 1, 'G03570'), (909809, 909882, -1, 'G03640'),
        (1159021, 1159092, 1, 'G04320'), (1324921, 1324959, 1, 'G04720'), (1583770, 1583844, -1, 'G05390'),
        (1817398, 1817470, 1, 'G05980'), (1978082, 1978156, 1, 'G06480'), (2025354, 2025427, 1, 'G06610'),
        (2107396, 2107467, -1, 'G06860'), (2111146, 2111217, -1, 'G06880'), (2177883, 2177957, 1, 'G07100'),
        (2334818, 2334891, 1, 'G07580'), (2406830, 2406902, -1, 'G07760'), (2588521, 2588593, 1, 'G08240'),
        (2846538, 2846611, -1, 'G08870'), (2879305, 2879377, 1, 'G08950'), (2939418, 2939490, 1, 'G09110'),
        (3431185, 3431257, -1, 'G10440'), (3676606, 3676644, 1, 'G11010'), (3678774, 3678848, -1, 'G11030'),
        (3881528, 3881608, 1, 'G11550'), (3914628, 3914700, -1, 'G11640'), (4266985, 4267059, -1, 'G12510'),
        (4285884, 4285956, -1, 'G12590'), (4440211, 4440284, 1, 'G13010'), (4522705, 4522779, -1, 'G13240'),
        (4709631, 4709703, 1, 'G13720'), (4741995, 4742068, 1, 'G13840'), (4743091, 4743164, 1, 'G13850'),
        (5189681, 5189755, -1, 'G15090'), (5309641, 5309713, -1, 'G15450'), (5380901, 5380983, 1, 'G15650'),
        (5518055, 5518128, -1, 'G16100'), (5619464, 5619537, -1, 'G16450'), (6038749, 6038831, 1, 'G17570'),
        (6075812, 6075884, 1, 'G17660'), (6075937, 6076011, -1, 'G17670'), (6345756, 6345828, 1, 'G18430'),
        (6488645, 6488726, 1, 'G18820'), (6948850, 6948934, -1, 'G20040'), (6995272, 6995344, -1, 'G20170'),
        (7004504, 7004576, 1, 'G20210'), (7016506, 7016579, 1, 'G20250'), (7082657, 7082729, 1, 'G20420'),
        (7242749, 7242821, -1, 'G20820'), (7499721, 7499793, -1, 'G21420'), (7656108, 7656180, -1, 'G21800'),
        (7884405, 7884443, -1, 'G22320'), (8520278, 8520352, -1, 'G24080'), (9143796, 9143870, 1, 'G26430'),
        (9158169, 9158242, 1, 'G26490'), (10089422, 10089494, 1, 'G28720'), (10089883, 10089955, 1, 'G28730'),
        (10090353, 10090425, 1, 'G28740'), (10090754, 10090826, 1, 'G28750'), (10092310, 10092382, 1, 'G28770'),
        (10092786, 10092858, 1, 'G28780'), (10093294, 10093366, 1, 'G28790'), (10093731, 10093803, 1, 'G28800'),
        (10094158, 10094230, 1, 'G28810'), (10096936, 10097008, 1, 'G28820'), (10097099, 10097171, 1, 'G28830'),
        (10097703, 10097775, 1, 'G28840'), (10098638, 10098710, 1, 'G28850'), (10099064, 10099136, 1, 'G28860'),
        (10099410, 10099482, 1, 'G28870'), (10099812, 10099884, 1, 'G28880'), (10100258, 10100330, 1, 'G28890'),
        (10101013, 10101085, 1, 'G28900'), (10101585, 10101657, 1, 'G28910'), (10101978, 10102050, 1, 'G28920'),
        (10106075, 10106147, 1, 'G28930'), (10106513, 10106585, 1, 'G28940'), (10106883, 10106955, 1, 'G28950'),
        (10107634, 10107706, 1, 'G28970'), (10108374, 10108446, 1, 'G28980'), (10108695, 10108767, 1, 'G28990'),
        (10207291, 10207364, -1, 'G29210'), (10756703, 10756776, 1, 'G30430'), (10963553, 10963627, -1, 'G30830'),
        (11104093, 11104167, 1, 'G31110'), (11797227, 11797265, -1, 'G32620'), (12097258, 12097327, -1, 'G33370'),
        (13687637, 13687710, 1, 'G36350'), (15733055, 15733127, -1, 'G42120'), (16588144, 16588216, -1, 'G43820'),
        (17159046, 17159118, 1, 'G45234'), (17159799, 17159871, 1, 'G45236'), (17160970, 17161042, 1, 'G45238'),
        (17161418, 17161490, 1, 'G45240'), (17162967, 17163039, 1, 'G45242'), (17163408, 17163480, 1, 'G45244'),
        (17164461, 17164533, 1, 'G45246'), (17735509, 17735582, 1, 'G48080'), (18139265, 18139337, -1, 'G49020'),
        (18234146, 18234220, -1, 'G49280'), (18312570, 18312607, 1, 'G49460'), (18391469, 18391542, 1, 'G49690'),
        (18556666, 18556746, 1, 'G50070'), (18561567, 18561647, 1, 'G50100'), (19428223, 19428297, 1, 'G52170'),
        (19502087, 19502161, -1, 'G52350'), (19688850, 19688887, -1, 'G52860'), (19851640, 19851714, 1, 'G53220'),
        (19929506, 19929578, -1, 'G53410'), (20416594, 20416667, -1, 'G54670'), (20794976, 20795058, 1, 'G55625'),
        (21272451, 21272533, 1, 'G56730'), (21272786, 21272823, 1, 'G56740'), (21273216, 21273253, 1, 'G56750'),
        (21273960, 21274042, 1, 'G56760'), (21274295, 21274332, 1, 'G56770'), (21274725, 21274762, 1, 'G56780'),
        (21275469, 21275551, 1, 'G56790'), (21275804, 21275841, 1, 'G56800'), (21276234, 21276271, 1, 'G56810'),
        (21276978, 21277060, 1, 'G56820'), (21277313, 21277350, 1, 'G56830'), (21277743, 21277780, 1, 'G56840'),
        (21278487, 21278569, 1, 'G56850'), (21278822, 21278859, 1, 'G56860'), (21279273, 21279310, 1, 'G56870'),
        (21280016, 21280098, 1, 'G56880'), (21280351, 21280388, 1, 'G56890'), (21280781, 21280818, 1, 'G56900'),
        (21281525, 21281607, 1, 'G56910'), (21281860, 21281897, 1, 'G56920'), (21282311, 21282348, 1, 'G56930'),
        (21283054, 21283136, 1, 'G56940'), (21283384, 21283421, 1, 'G56950'), (21283842, 21283879, 1, 'G56960'),
        (21284586, 21284668, 1, 'G56970'), (21284916, 21284953, 1, 'G56980'), (21285374, 21285411, 1, 'G56990'),
        (21286118, 21286200, 1, 'G57000'), (21286448, 21286485, 1, 'G57010'), (21286906, 21286943, 1, 'G57020'),
        (21287650, 21287732, 1, 'G57030'), (21287980, 21288017, 1, 'G57040'), (21288438, 21288475, 1, 'G57050'),
        (21289183, 21289265, 1, 'G57060'), (21289513, 21289550, 1, 'G57070'), (21289970, 21290007, 1, 'G57080'),
        (21290714, 21290796, 1, 'G57090'), (21291044, 21291081, 1, 'G57100'), (21291501, 21291538, 1, 'G57110'),
        (21292245, 21292327, 1, 'G57120'), (21292574, 21292611, 1, 'G57130'), (21293032, 21293069, 1, 'G57140'),
        (21293776, 21293858, 1, 'G57150'), (21294109, 21294146, 1, 'G57160'), (21294567, 21294604, 1, 'G57170'),
        (21295125, 21295207, 1, 'G57180'), (21295455, 21295492, 1, 'G57190'), (21295912, 21295949, 1, 'G57200'),
        (21296656, 21296738, 1, 'G57210'), (21296989, 21297026, 1, 'G57220'), (21297447, 21297484, 1, 'G57230'),
        (21298005, 21298087, 1, 'G57240'), (21298335, 21298372, 1, 'G57250'), (21298792, 21298829, 1, 'G57260'),
        (21299536, 21299618, 1, 'G57270'), (21299869, 21299906, 1, 'G57280'), (21300327, 21300364, 1, 'G57290'),
        (21300885, 21300967, 1, 'G57300'), (21301215, 21301252, 1, 'G57310'), (21301673, 21301710, 1, 'G57320'),
        (21302417, 21302499, 1, 'G57330'), (21302750, 21302787, 1, 'G57340'), (21303208, 21303245, 1, 'G57350'),
        (21303766, 21303848, 1, 'G57360'), (21304096, 21304133, 1, 'G57370'), (21304554, 21304591, 1, 'G57380'),
        (21305298, 21305380, 1, 'G57390'), (21305631, 21305668, 1, 'G57400'), (21306089, 21306126, 1, 'G57410'),
        (21306647, 21306729, 1, 'G57420'), (21306981, 21307018, 1, 'G57430'), (21307441, 21307478, 1, 'G57440'),
        (21308184, 21308268, 1, 'G57450'), (21308520, 21308557, 1, 'G57460'), (21308975, 21309012, 1, 'G57470'),
        (21309719, 21309801, 1, 'G57480'), (21310053, 21310090, 1, 'G57490'), (21310513, 21310550, 1, 'G57500'),
        (21311256, 21311340, 1, 'G57510'), (21311592, 21311629, 1, 'G57520'), (21312051, 21312088, 1, 'G57530'),
        (21377983, 21378054, -1, 'G57710'), (21887507, 21887589, -1, 'G59570'), (22044276, 22044348, -1, 'G59880'),
        (22317078, 22317149, -1, 'G60580'), (22398301, 22398372, -1, 'G60820'), (22401256, 22401327, -1, 'G60840'),
        (22431831, 22431902, 1, 'G60910'), (22481437, 22481511, -1, 'G61020'), (22870422, 22870494, -1, 'G61880'),
        (22890754, 22890834, 1, 'G61910'), (23562849, 23562921, -1, 'G63510'), (23671147, 23671219, -1, 'G63790'),
        (23806215, 23806299, 1, 'G64120'), (23936799, 23936872, 1, 'G64420'), (24490654, 24490736, -1, 'G65830'),
        (25833316, 25833388, 1, 'G68770'), (25890198, 25890272, 1, 'G68860'), (25931858, 25931931, 1, 'G68950'),
        (25935739, 25935812, -1, 'G68970'), (25944826, 25944898, 1, 'G69000'), (25993392, 25993466, 1, 'G69130'),
        (26053140, 26053214, 1, 'G69300'), (26385816, 26385888, -1, 'G70050'), (26977050, 26977121, 1, 'G71700'),
        (27397046, 27397128, 1, 'G72780'), (27792643, 27792715, 1, 'G73900'), (28024043, 28024124, -1, 'G74570'),
        (28031620, 28031701, 1, 'G74610'), (28188192, 28188264, 1, 'G75070'), (28377149, 28377222, -1, 'G75570'),
        (28411644, 28411717, 1, 'G75650'), (28444549, 28444621, 1, 'G75740'), (28523645, 28523717, -1, 'G75970'),
        (28531427, 28531499, 1, 'G76000'), (28639585, 28639667, 1, 'G76330'), (28952447, 28952519, -1, 'G77040'),
        (29007098, 29007180, -1, 'G77190'), (29147983, 29148055, -1, 'G77560'), (29448865, 29448903, -1, 'G78250'),
        (29809015, 29809088, 1, 'G79240'), (29838009, 29838081, 1, 'G79290'), (29838610, 29838682, 1, 'G79300'),
        (30088888, 30088962, -1, 'G79980'), (30178905, 30178977, -1, 'G80250'), (30242675, 30242757, 1, 'G80430'),
    ]
    f2 = [
        (102063, 102137, 1, 'G01160'), (706794, 706867, 1, 'G02600'), (846853, 846926, -1, 'G02900'),
        (1054714, 1054787, -1, 'G03490'), (1113980, 1114052, -1, 'G03660'), (1123386, 1123458, -1, 'G03700'),
        (1154381, 1154454, 1, 'G03790'), (3239653, 3239725, -1, 'G07742'), (3255828, 3255902, -1, 'G07743'),
        (3268803, 3268883, 1, 'G07745'), (3276436, 3276508, 1, 'G07746'), (3280859, 3280933, 1, 'G07748'),
        (3290962, 3291034, 1, 'G07778'), (3303240, 3303312, -1, 'G07752'), (3303350, 3303425, -1, 'G07753'),
        (3303781, 3303819, -1, 'G07754'), (3328666, 3328739, -1, 'G07755'), (3332674, 3332756, 1, 'G07792'),
        (3369350, 3369437, 1, 'G07793'), (3383400, 3383474, -1, 'G07794'), (3444359, 3444431, -1, 'G07756'),
        (3452973, 3453060, 1, 'G07757'), (3462074, 3462148, 1, 'G07758'), (3494378, 3494416, 1, 'G07759'),
        (3494772, 3494847, 1, 'G07761'), (3495008, 3495083, 1, 'G07762'), (3495438, 3495509, 1, 'G07763'),
        (3496436, 3496508, 1, 'G07764'), (3497354, 3497437, 1, 'G07765'), (3503518, 3503605, 1, 'G07766'),
        (6953924, 6953961, -1, 'G15950'), (7046175, 7046247, 1, 'G16240'), (7749793, 7749867, 1, 'G17810'),
        (7962758, 7962832, -1, 'G18310'), (9144435, 9144507, 1, 'G21360'), (9241319, 9241356, -1, 'G21570'),
        (9273888, 9273969, -1, 'G21670'), (9277742, 9277814, -1, 'G21700'), (9291113, 9291185, 1, 'G21760'),
        (9400749, 9400823, 1, 'G22110'), (9456888, 9456962, -1, 'G22220'), (9472660, 9472733, -1, 'G22280'),
        (9509359, 9509433, 1, 'G22380'), (9598106, 9598179, 1, 'G22580'), (9810296, 9810368, -1, 'G23020'),
        (10066525, 10066597, -1, 'G23650'), (10380655, 10380728, 1, 'G24380'), (10820917, 10820990, 1, 'G25400'),
        (11122756, 11122837, -1, 'G26090'), (11781928, 11782000, -1, 'G27560'), (11871230, 11871302, -1, 'G27850'),
        (12336079, 12336151, 1, 'G28730'), (12346827, 12346899, 1, 'G28770'), (12478849, 12478921, -1, 'G29030'),
        (12645232, 12645305, -1, 'G29520'), (12888667, 12888738, 1, 'G30180'), (12889810, 12889881, 1, 'G30190'),
        (12983024, 12983095, -1, 'G30450'), (13144312, 13144385, -1, 'G30850'), (13658350, 13658425, 1, 'G32110'),
        (14054465, 14054503, -1, 'G33140'), (14250206, 14250278, 1, 'G33650'), (14251774, 14251846, 1, 'G33660'),
        (14357464, 14357536, 1, 'G33890'), (14358437, 14358509, 1, 'G33900'), (14359269, 14359341, 1, 'G33910'),
        (14360221, 14360293, 1, 'G33920'), (14360734, 14360806, 1, 'G33930'), (14361176, 14361248, 1, 'G33940'),
        (14362215, 14362287, 1, 'G33950'), (14363133, 14363205, 1, 'G33960'), (14363599, 14363671, 1, 'G33970'),
        (14750553, 14750627, -1, 'G34950'), (14757142, 14757213, 1, 'G34985'), (14847685, 14847723, 1, 'G35220'),
        (15175940, 15176014, 1, 'G36140'), (15176656, 15176736, 1, 'G36150'), (15215480, 15215517, -1, 'G36280'),
        (15327312, 15327395, 1, 'G36510'), (15327463, 15327546, -1, 'G36520'), (15353238, 15353311, 1, 'G36600'),
        (15477287, 15477324, -1, 'G36860'), (15923894, 15923967, 1, 'G38030'), (16525641, 16525713, -1, 'G39600'),
        (16525846, 16525918, 1, 'G39610'), (16646857, 16646929, -1, 'G39860'), (17545780, 17545862, -1, 'G42020'),
        (17667855, 17667926, 1, 'G42420'), (17880766, 17880839, 1, 'G42970'), (18002649, 18002721, -1, 'G43300'),
        (18317052, 18317134, -1, 'G44320'), (18576985, 18577058, 1, 'G45020'), (18710751, 18710824, 1, 'G45390'),
        (18963713, 18963786, 1, 'G46120'), (19351496, 19351569, 1, 'G47100'), (19566924, 19566995, -1, 'G47740'),
    ]
    f3 = [
        (259640, 259712, 1, 'G01705'), (469666, 469740, 1, 'G02315'), (476808, 476880, 1, 'G02335'),
        (586092, 586174, 1, 'G02715'), (981975, 982047, 1, 'G03845'), (984105, 984177, 1, 'G03852'),
        (1220234, 1220307, 1, 'G04525'), (1601343, 1601415, -1, 'G05525'), (1707743, 1707815, -1, 'G05755'),
        (1738796, 1738870, 1, 'G05835'), (1843329, 1843400, -1, 'G06105'), (1920038, 1920110, -1, 'G06335'),
        (2104961, 2105033, -1, 'G06665'), (2222251, 2222324, 1, 'G07025'), (2232470, 2232506, -1, 'G07055'),
        (2253680, 2253762, -1, 'G07115'), (2285607, 2285679, 1, 'G07185'), (2918418, 2918492, -1, 'G09505'),
        (2944616, 2944698, 1, 'G09585'), (2945700, 2945782, -1, 'G09595'), (3090548, 3090631, 1, 'G10015'),
        (3096220, 3096293, 1, 'G10035'), (3238371, 3238407, -1, 'G10415'), (3535151, 3535224, 1, 'G11285'),
        (3575849, 3575923, 1, 'G11395'), (3622697, 3622769, -1, 'G11505'), (3942012, 3942084, 1, 'G12385'),
        (3995103, 3995176, -1, 'G12585'), (4254534, 4254615, 1, 'G13223'), (4330778, 4330850, 1, 'G13335'),
        (4998147, 4998219, 1, 'G14855'), (5068300, 5068374, -1, 'G15055'), (5275155, 5275228, 1, 'G15585'),
        (5632857, 5632930, 1, 'G16552'), (6483945, 6484019, -1, 'G18815'), (6540636, 6540673, 1, 'G18952'),
        (6663713, 6663786, 1, 'G19235'), (7104314, 7104398, 1, 'G20365'), (7224223, 7224296, -1, 'G20655'),
        (7319582, 7319664, -1, 'G20885'), (7567399, 7567471, -1, 'G21475'), (9373610, 9373684, -1, 'G25715'),
        (9840420, 9840494, 1, 'G26747'), (10211564, 10211636, 1, 'G27555'), (10319498, 10319570, 1, 'G27825'),
        (10325875, 10325947, 1, 'G27845'), (10753667, 10753740, 1, 'G28685'), (10760629, 10760702, -1, 'G28695'),
        (11076814, 11076886, 1, 'G29095'), (11961645, 11961718, 1, 'G30345'), (16438025, 16438097, -1, 'G44955'),
        (16896875, 16896949, 1, 'G45935'), (16902623, 16902697, 1, 'G45955'), (16905147, 16905221, 1, 'G45965'),
        (17160736, 17160808, 1, 'G46585'), (17275564, 17275646, 1, 'G46875'), (17905395, 17905467, 1, 'G48275'),
        (17985575, 17985611, -1, 'G48515'), (18080062, 18080134, 1, 'G48745'), (18518796, 18518870, 1, 'G49925'),
        (18755788, 18755860, -1, 'G50505'), (18837020, 18837092, 1, 'G50665'), (18907851, 18907924, 1, 'G50835'),
        (18928413, 18928487, 1, 'G50895'), (19008621, 19008694, -1, 'G51135'), (19044371, 19044443, -1, 'G51265'),
        (19403651, 19403723, -1, 'G52285'), (19420345, 19420417, -1, 'G52345'), (19511965, 19512045, 1, 'G52565'),
        (19566013, 19566085, 1, 'G52765'), (19648105, 19648188, 1, 'G52955'), (19935354, 19935426, 1, 'G53775'),
        (19995918, 19995989, 1, 'G53965'), (20704664, 20704736, 1, 'G55735'), (20720151, 20720223, 1, 'G55795'),
        (20824495, 20824568, -1, 'G56085'), (21498293, 21498375, 1, 'G58035'), (21553258, 21553329, 1, 'G58165'),
        (21970486, 21970557, 1, 'G59415'), (22149699, 22149773, 1, 'G59923'), (22149823, 22149895, -1, 'G59926'),
        (22197810, 22197892, -1, 'G60075'), (22481215, 22481288, -1, 'G60805'), (22622384, 22622465, 1, 'G61105'),
        (22786896, 22786969, 1, 'G61545'), (22853496, 22853567, 1, 'G61715'), (22871101, 22871174, 1, 'G61755'),
        (22892781, 22892853, 1, 'G61825'), (23047854, 23047927, 1, 'G62245'), (23062444, 23062517, -1, 'G62285'),
        (23221682, 23221753, 1, 'G62735'), (23296567, 23296640, -1, 'G63003'), (23296728, 23296801, -1, 'G63006'),
    ]
    f4 = [
        (33799, 33872, 1, 'G00085'), (424716, 424788, -1, 'G00985'), (562560, 562634, -1, 'G01355'),
        (611865, 611932, -1, 'G01455'), (808269, 808342, -1, 'G01865'), (901175, 901247, 1, 'G02055'),
        (1390894, 1390966, 1, 'G03135'), (1442004, 1442076, 1, 'G03285'), (1501605, 1501677, 1, 'G03405'),
        (1520781, 1520854, -1, 'G03435'), (5268124, 5268210, -1, 'G08345'), (6646425, 6646496, 1, 'G10815'),
        (6819287, 6819324, 1, 'G11177'), (6837555, 6837639, -1, 'G11213'), (6837769, 6837853, -1, 'G11216'),
        (6905479, 6905552, -1, 'G11355'), (6944721, 6944793, 1, 'G11405'), (7185697, 7185771, 1, 'G11985'),
        (7232792, 7232865, -1, 'G12065'), (7256408, 7256481, 1, 'G12115'), (7341420, 7341494, -1, 'G12405'),
        (7730956, 7731037, 1, 'G13265'), (7814197, 7814270, 1, 'G13445'), (8255695, 8255767, 1, 'G14345'),
        (8301720, 8301794, -1, 'G14415'), (8979656, 8979729, 1, 'G15775'), (9108317, 9108391, 1, 'G16105'),
        (9191590, 9191663, 1, 'G16235'), (9287230, 9287304, 1, 'G16465'), (9289706, 9289787, 1, 'G16475'),
        (9815215, 9815287, -1, 'G17612'), (9873524, 9873596, -1, 'G17765'), (9978117, 9978189, -1, 'G17975'),
        (10093077, 10093157, -1, 'G18255'), (10302011, 10302084, 1, 'G18725'), (10325975, 10326047, -1, 'G18815'),
        (10878733, 10878807, -1, 'G20115'), (11774472, 11774508, -1, 'G22265'), (11910299, 11910373, 1, 'G22635'),
        (11954751, 11954824, -1, 'G22754'), (11974951, 11975032, 1, 'G22785'), (12320119, 12320203, 1, 'G23635'),
        (12429608, 12429681, 1, 'G23915'), (12486211, 12486282, -1, 'G24025'), (12686148, 12686230, 1, 'G24565'),
        (13006243, 13006316, -1, 'G25435'), (13058840, 13058922, -1, 'G25585'), (13076582, 13076666, -1, 'G25635'),
        (13285431, 13285503, -1, 'G26225'), (13336345, 13336419, -1, 'G26375'), (13341501, 13341575, -1, 'G26385'),
        (13454562, 13454635, 1, 'G26675'), (13704787, 13704860, 1, 'G27395'), (13882922, 13882994, -1, 'G27875'),
        (13885196, 13885269, -1, 'G27885'), (14032495, 14032567, 1, 'G28362'), (14267286, 14267368, 1, 'G28915'),
        (14470283, 14470355, 1, 'G29415'), (15120655, 15120728, 1, 'G31075'), (15183089, 15183162, 1, 'G31265'),
        (15345717, 15345753, -1, 'G31695'), (15430229, 15430303, -1, 'G31895'), (15576655, 15576728, 1, 'G32265'),
        (15671398, 15671469, 1, 'G32475'), (15804553, 15804635, 1, 'G32765'), (16304128, 16304201, 1, 'G34035'),
        (16454700, 16454773, -1, 'G34415'), (16556627, 16556700, 1, 'G34695'), (16655290, 16655364, 1, 'G34975'),
        (17130054, 17130127, 1, 'G36197'), (17149473, 17149545, 1, 'G36245'), (17276705, 17276779, -1, 'G36635'),
        (17500800, 17500872, -1, 'G37175'), (18254982, 18255018, -1, 'G39195'), (18293773, 18293845, 1, 'G39345'),
        (18395021, 18395093, 1, 'G39615'), (18411258, 18411332, 1, 'G39672'), (18501705, 18501778, -1, 'G39865'),
        (18542164, 18542238, 1, 'G39985'),
    ]
    f5 = [
        (150353, 150426, -1, 'G01365'), (389889, 389960, -1, 'G02025'), (508427, 508500, -1, 'G02385'),
        (530819, 530893, 1, 'G02435'), (559327, 559399, -1, 'G02505'), (588890, 588964, -1, 'G02615'),
        (614641, 614723, 1, 'G02725'), (642397, 642479, -1, 'G02815'), (858534, 858571, 1, 'G03445'),
        (862395, 862468, -1, 'G03452'), (970797, 970878, -1, 'G03705'), (984365, 984448, 1, 'G03745'),
        (998940, 999013, 1, 'G03775'), (1742692, 1742765, 1, 'G05795'), (1788651, 1788723, 1, 'G05945'),
        (1804616, 1804690, 1, 'G05985'), (1853302, 1853382, -1, 'G06125'), (2060153, 2060235, -1, 'G06685'),
        (2212678, 2212749, -1, 'G07135'), (2309512, 2309549, -1, 'G07315'), (2411148, 2411232, 1, 'G07625'),
        (2432263, 2432336, -1, 'G07675'), (2587826, 2587899, -1, 'G08075'), (2898867, 2898951, -1, 'G09345'),
        (2993327, 2993401, 1, 'G09655'), (3030817, 3030890, -1, 'G09755'), (3118377, 3118458, 1, 'G09975'),
        (3212351, 3212424, -1, 'G10235'), (3287553, 3287635, -1, 'G10455'), (3324702, 3324775, 1, 'G10525'),
        (3578295, 3578367, -1, 'G11225'), (3617058, 3617130, 1, 'G11325'), (3669000, 3669073, -1, 'G11475'),
        (4471050, 4471122, 1, 'G13845'), (4530475, 4530548, 1, 'G14035'), (4673902, 4673974, 1, 'G14495'),
        (4929562, 4929636, 1, 'G15175'), (5157641, 5157715, 1, 'G15805'), (5161514, 5161586, 1, 'G15815'),
        (5358918, 5359000, 1, 'G16375'), (5962699, 5962771, -1, 'G18005'), (5965972, 5966044, -1, 'G18015'),
        (5984378, 5984450, 1, 'G18085'), (6258146, 6258218, 1, 'G18755'), (6401240, 6401311, 1, 'G19095'),
        (7073531, 7073603, -1, 'G20852'), (7073944, 7074016, -1, 'G20854'), (7074357, 7074429, -1, 'G20856'),
        (7074773, 7074845, -1, 'G20858'), (7222059, 7222131, -1, 'G21378'), (7387890, 7387962, 1, 'G22315'),
        (7981400, 7981472, 1, 'G23665'), (8906418, 8906502, 1, 'G25585'), (8946826, 8946899, -1, 'G25625'),
        (9815405, 9815477, -1, 'G27715'), (11802284, 11802356, 1, 'G32017'), (13823211, 13823284, -1, 'G35605'),
        (15049737, 15049811, -1, 'G37795'), (15242547, 15242621, 1, 'G38155'), (15593086, 15593160, 1, 'G38905'),
        (15844253, 15844325, -1, 'G39535'), (15993514, 15993587, 1, 'G39895'), (16256865, 16256937, -1, 'G40545'),
        (16427812, 16427893, 1, 'G40945'), (16524760, 16524832, -1, 'G41265'), (16655393, 16655477, 1, 'G41605'),
        (16684663, 16684735, -1, 'G41675'), (17476402, 17476475, -1, 'G43455'), (17512768, 17512839, -1, 'G43535'),
        (17856811, 17856883, -1, 'G44283'), (17894906, 17894979, -1, 'G44375'), (18058014, 18058088, 1, 'G44705'),
        (18560206, 18560278, -1, 'G45715'), (18576071, 18576143, 1, 'G45745'), (18715888, 18715960, -1, 'G46105'),
        (18807534, 18807614, 1, 'G46325'), (18924749, 18924821, 1, 'G46595'), (19658828, 19658900, 1, 'G48465'),
        (19761400, 19761472, -1, 'G48675'), (19820360, 19820398, 1, 'G48835'), (20064048, 20064120, 1, 'G49435'),
        (20692447, 20692519, 1, 'G50805'), (20758903, 20758940, -1, 'G50995'), (20773555, 20773637, 1, 'G51055'),
        (21275059, 21275141, -1, 'G52355'), (21318105, 21318189, -1, 'G52495'), (21418369, 21418441, 1, 'G52815'),
        (21740339, 21740410, -1, 'G53487'), (22091631, 22091704, 1, 'G54365'), (22094087, 22094160, 1, 'G54375'),
        (22304851, 22304923, -1, 'G54865'), (22355897, 22355970, -1, 'G55045'), (22357726, 22357799, -1, 'G55055'),
        (22501995, 22502068, -1, 'G55505'), (22845356, 22845430, 1, 'G56365'), (22973066, 22973138, 1, 'G56745'),
        (23071996, 23072070, -1, 'G56975'), (23463219, 23463291, 1, 'G57885'), (23661936, 23662018, 1, 'G58495'),
        (23861431, 23861503, 1, 'G59055'), (23971167, 23971239, 1, 'G59385'), (23974655, 23974727, 1, 'G59395'),
        (24157171, 24157245, -1, 'G59945'), (24279805, 24279886, 1, 'G60285'), (24547401, 24547474, 1, 'G60963'),
        (24548892, 24548964, 1, 'G60966'), (24684507, 24684579, 1, 'G61345'), (24726891, 24726964, 1, 'G61445'),
        (24856205, 24856242, 1, 'G61835'), (25347261, 25347333, 1, 'G63145'), (25801340, 25801414, 1, 'G64505'),
        (25892619, 25892691, -1, 'G64735'), (25942291, 25942372, 1, 'G64855'), (25989903, 25989976, 1, 'G65015'),
        (26114755, 26114793, -1, 'G65305'), (26174414, 26174496, -1, 'G65445'), (26212684, 26212757, 1, 'G65535'),
        (26238859, 26238933, -1, 'G65615'), (26573248, 26573322, -1, 'G66535'), (26585622, 26585696, 1, 'G66568'),
        (26670495, 26670567, -1, 'G66755'), (26699933, 26700004, -1, 'G66817'), (26938897, 26938969, 1, 'G67455'),
    ]

    # (display name, accession, length in bp, tRNA table, marker colour)
    entries = [
        ("Chr I", "NC_003070", 30432563, f1, colors.red),
        ("Chr II", "NC_003071", 19705359, f2, colors.green),
        ("Chr III", "NC_003074", 23470805, f3, colors.blue),
        ("Chr IV", "NC_003075", 18585042, f4, colors.orange),
        ("Chr V", "NC_003076", 26992728, f5, colors.purple),
    ]
    max_length = max(entry[2] for entry in entries)

    def _spacer():
        # Dummy segment allocating vertical space, which can be used for
        # feature label placement.
        segment = BasicChromosome.SpacerSegment()
        segment.scale = 0.03 * max_length
        return segment

    def _telomere(**kwargs):
        # Light grey chromosome cap; pass inverted=True for the closing end.
        segment = BasicChromosome.TelomereSegment(**kwargs)
        segment.scale = 0.02 * max_length
        segment.fill_color = colors.lightgrey
        return segment

    chr_diagram = BasicChromosome.Organism()

    for name, acc, length, trna_rows, trna_color in entries:
        if False:
            # How I generated the values above... and tested passing in SeqFeatures
            filename = "/Users/pjcock/Documents/comp_genomics/seed/%s.gbk" % acc
            import os
            if not os.path.isfile(filename):
                continue
            from Bio import SeqIO
            record = SeqIO.read(filename, "gb")
            assert length == len(record)
            markers = [f for f in record.features if f.type == "tRNA"]
            print(name)
            # Strip off the first three chars, AT# where # is the chr
            print([(int(f.location.start), int(f.location.end),
                    f.strand, f.qualifiers['locus_tag'][0][3:])
                   for f in markers])
            # Output was copy and pasted to the script, see above.
            # Continue test using SeqFeature objects!
            # To test colours from the qualifiers,
            for i, f in enumerate(markers):
                f.qualifiers['color'] = [str(i % 16)]
        elif use_seqfeatures:
            # Features as SeqFeatures
            markers = []
            for (start, end, strand, label) in trna_rows:
                markers.append(
                    SeqFeature(FeatureLocation(start, end, strand),
                               qualifiers={"name": [label],
                                           "color": [trna_color]}))
        else:
            # Features as 5-tuples
            markers = [row + (trna_color,) for row in trna_rows]

        # I haven't found a nice source of data for real Arabidopsis
        # cytobands, so these three are made up at random!
        # They are drawn with black borders and a grey fill.
        cytobands = []
        for fill in (colors.gray, colors.darkgray, colors.slategray):
            band_start = (length - 1000000) * random.random()
            band_end = min(length, band_start + 1000000)
            cytobands.append((band_start, band_end, 0, None, colors.black, fill))
        # These two get black borders and a brown fill:
        cytobands += [
            (0, 1000000, 0, "First 1 Mbp", colors.black, colors.brown),
            (length-1000000, length, 0, "Last 1 Mbp", colors.black, colors.brown),
        ]
        # Additional dummy entry to check fill colour on both strands,
        if name == "Chr III":
            cytobands.append((11000000, 13000000, -1, "Reverse", "red", "yellow"))
        elif name == "Chr V":
            cytobands.append((9500000, 11000000, +1, "Forward", colors.red, colors.yellow))

        # Create the drawing object for the chromosome
        cur_chromosome = BasicChromosome.Chromosome(name)
        # Set the length, adding an extra 20 percent for the telomeres etc:
        cur_chromosome.scale_num = max_length * 1.2
        cur_chromosome.label_sep_percent = 0.15

        # Leading spacer, then the opening telomere
        cur_chromosome.add(_spacer())
        cur_chromosome.add(_telomere())

        # Add a body - using bp as the scale length here.
        # Note we put the cytobands at the start of the combined list,
        # as we want them drawn underneath the tRNA markers.
        body = BasicChromosome.AnnotatedChromosomeSegment(length, cytobands + markers)
        body.scale = length
        cur_chromosome.add(body)

        # Closing telomere, then another spacer
        cur_chromosome.add(_telomere(inverted=True))
        cur_chromosome.add(_spacer())

        # This chromosome is done
        chr_diagram.add(cur_chromosome)

    with warnings.catch_warnings():
        # BiopythonWarning: Too many labels to avoid overlap
        warnings.simplefilter("ignore", BiopythonWarning)
        chr_diagram.draw(filename, "Arabidopsis thaliana tRNA")
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Turn Prodigal's ``.sco`` gene coordinates into ffn/faa/gbk files.

    Reads the contigs from ``seq_file`` (FASTA) and the gene calls from
    ``sco_file``, then writes ``<prefix>.ffn`` (gene nucleotide sequences),
    ``<prefix>.faa`` (translations) and ``<prefix>.gbk`` (one GenBank record
    per contig with a CDS feature per gene) into ``output_folder``.  Genes
    are numbered consecutively across all contigs as ``<prefix>_00001``, ...

    :param seq_file: FASTA file with the sequences Prodigal was run on.
    :param sco_file: Prodigal output in "sco" (simple coordinate) format.
    :param prefix: basename of the output files and of the locus tags.
    :param output_folder: existing directory that receives the three files.
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)
    pwd_bin_gbk_file = '%s/%s.gbk' % (output_folder, prefix)

    # get sequence id list (file order) and the sequences themselves
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    current_seq_id = ''
    with open(sco_file) as sco_handle:
        for each_line in sco_handle:
            each_line = each_line.strip()
            if not each_line:
                # a blank line used to end up as an empty CDS entry and
                # crash int() further down
                continue
            if each_line.startswith('# Sequence Data'):
                # the id is the first word of the quoted value that ends the
                # line, e.g. ...;seqhdr="contig_1 some description"
                current_seq_id = each_line.split('=')[-1][1:-1].split(' ')[0]
                seq_to_cds_dict[current_seq_id] = []
                seq_to_transl_table_dict[current_seq_id] = ''
            elif each_line.startswith('# Model Data'):
                # transl_table is the second to last ';'-separated field
                seq_to_transl_table_dict[current_seq_id] = \
                    each_line.split(';')[-2].split('=')[-1]
            elif each_line.startswith('#'):
                # any other comment line carries no gene coordinates
                continue
            else:
                # ">index_start_end_strand" -> "start_end_strand"
                seq_to_cds_dict.setdefault(current_seq_id, []).append(
                    '_'.join(each_line.split('_')[1:]))

    gene_index = 1
    # context managers close all three outputs even if a record fails
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:

        for seq_id in sequence_id_list:

            # create SeqRecord
            contig_sequence = id_to_sequence_dict[seq_id]
            current_SeqRecord = SeqRecord(Seq(contig_sequence), id=seq_id)
            current_SeqRecord.seq.alphabet = generic_dna
            # contigs that are missing from the sco file simply get no features
            transl_table = seq_to_transl_table_dict.get(seq_id, '')

            # add SeqFeature to SeqRecord
            for cds in seq_to_cds_dict.get(seq_id, []):

                # define locus_tag id
                locus_tag_id = '%s_%s' % (prefix, "{:0>5}".format(gene_index))

                # sco coordinates are 1-based with an inclusive end
                cds_split = cds.split('_')
                cds_start = int(cds_split[0])
                cds_end = int(cds_split[1])
                cds_strand = cds_split[2]
                current_strand = {'+': 1, '-': -1}.get(cds_strand)

                # define FeatureLocation: Biopython locations are 0-based and
                # half-open, so the start is shifted by one.  Passing the
                # 1-based start unchanged made every CDS in the GenBank file
                # begin one base too late.
                current_feature_location = FeatureLocation(
                    SF.ExactPosition(cds_start - 1),
                    SF.ExactPosition(cds_end),
                    strand=current_strand)

                # get nc sequence (reverse-complemented for minus-strand genes)
                sequence_nc = ''
                if cds_strand == '+':
                    sequence_nc = contig_sequence[cds_start - 1:cds_end]
                elif cds_strand == '-':
                    sequence_nc = str(
                        Seq(contig_sequence[cds_start - 1:cds_end],
                            generic_dna).reverse_complement())

                # translate to aa sequence
                sequence_aa = str(Seq(sequence_nc).translate(table=transl_table))

                # remove * at the end; genes without a stop codon (e.g. ones
                # running off the contig edge) keep their last residue
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '', bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '', bin_faa_file_handle)

                # Define feature qualifiers
                current_qualifiers_dict = {
                    'locus_tag': locus_tag_id,
                    'transl_table': transl_table,
                    'translation': sequence_aa,
                }

                # Create a SeqFeature and append it to the SeqRecord
                current_feature = SeqFeature(current_feature_location,
                                             type='CDS',
                                             qualifiers=current_qualifiers_dict)
                current_SeqRecord.features.append(current_feature)

                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
def merge_overlapping_feature_in_simple_format(
        self, input_file_file_list, scaffold_id_column, feature_start_column,
        feature_end_column, output_file=None, output_separator="\t",
        comments_prefix="#", input_separator="\t", coordinates_type="1-based",
        return_seqfeature_dict=False, feature_type=None):
    """Merge overlapping or touching intervals read from tabular files.

    Intervals of all input files are pooled per scaffold, converted to
    half-open 0-based ``[start, end)`` form, sorted and merged. Two intervals
    are merged whenever the second starts at or before the end of the first.

    Args:
        input_file_file_list: a single filename or an iterable of filenames.
        scaffold_id_column: index of the column holding the scaffold id.
        feature_start_column: index of the column holding the start.
        feature_end_column: index of the column holding the end.
        output_file: optional file to write the merged intervals to.
        output_separator: column separator used in ``output_file``.
        comments_prefix: forwarded to ``self.file_line_as_list_generator``.
        input_separator: forwarded to ``self.file_line_as_list_generator``.
        coordinates_type: ``"1-based"`` subtracts one from every start on
            input and adds it back on output; any other value keeps the
            starts unchanged.
        return_seqfeature_dict: return SeqFeature objects instead of
            ``[start, end]`` lists.
        feature_type: type assigned to the returned SeqFeature objects.

    Returns:
        OrderedDict mapping scaffold id to a list of merged ``[start, end]``
        lists (0-based starts), or to a list of SeqFeature objects when
        ``return_seqfeature_dict`` and ``feature_type`` are both set.
        Scaffolds appear in the order they are first met in the input.

    Raises:
        ValueError: if ``return_seqfeature_dict`` is set without
            ``feature_type`` (raised after ``output_file`` was written).
    """
    file_list = [input_file_file_list] if isinstance(
        input_file_file_list, str) else input_file_file_list

    # Pool intervals of all files per scaffold. An ordered mapping keeps the
    # first-seen scaffold order, so the output is reproducible between runs.
    unified_dict = OrderedDict()
    for filename in file_list:
        for line_list in self.file_line_as_list_generator(
                filename, comments_prefix=comments_prefix,
                separator=input_separator):
            start = int(line_list[feature_start_column])
            if coordinates_type == "1-based":
                start -= 1
            unified_dict.setdefault(line_list[scaffold_id_column], []).append(
                [start, int(line_list[feature_end_column])])

    merged_dict = OrderedDict()
    for scaffold, intervals in unified_dict.items():
        intervals.sort()
        merged = []
        # [a, b) is the interval being grown, [c, d) the next one; after
        # sorting c >= a always holds.
        current = list(intervals[0])
        for start, end in intervals[1:]:
            if start > current[1]:      # c > b: gap, close current interval
                merged.append(current)
                current = [start, end]
            elif end > current[1]:      # c <= b < d: extend to the right
                current[1] = end
            # otherwise d <= b: fully contained, nothing to do
        merged.append(current)
        merged_dict[scaffold] = merged

    if output_file:
        with self.metaopen(output_file, "w") as out_fd:
            for scaffold in merged_dict:
                for feature in merged_dict[scaffold]:
                    out_start = feature[0] + 1 if coordinates_type == "1-based" else feature[0]
                    out_fd.write(
                        output_separator.join(
                            map(str, [scaffold, out_start, feature[1]])) + "\n")

    if return_seqfeature_dict and feature_type:
        feature_dict = OrderedDict()
        for region in merged_dict:
            # The strand lives on the location; None means "unstranded".
            feature_dict[region] = [
                SeqFeature(FeatureLocation(start, stop, strand=None),
                           type=feature_type)
                for (start, stop) in merged_dict[region]
            ]
        return feature_dict
    elif return_seqfeature_dict and (not feature_type):
        raise ValueError(
            "ERROR!!! Feature type for seqfeature records was not set!")
    else:
        return merged_dict
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Protein sequence used for the demo (the original author remarks on how
# long it is).
at5g40780 = Seq(
    "MVAQAPHDDHQDDEKLAAARQKEIEDWLPITSSRNAKWWYSAFHNVTAMVGAGVLGLPYAMSQLGWGPGIAVLVLSWVITLYTLWQMVEMHEMVPGKRFDRYHELGQHAFGEKLGLYIVVPQQLIVEIGVCIVYMVTGGKSLKKFHELVCDDCKPIKLTYFIMIFASVHFVLSHLPNFNSISGVSLAAAVMSLSYSTIAWASSASKGVQEDVQYGYKAKTTAGTVFNFFSGLGDVAFAYAGHNVVLEIQATIPSTPEKPSKGPMWRGVIVAYIVVALCYFPVALVGYYIFGNGVEDNILMSLKKPAWLIATANIFVVIHVIGSYQIYAMPVFDMMETLLVKKLNFRPTTTLRFFVRNFYVAATMFVGMTFPFFGGLLAFFGGFAFAPTTYFLPCVIWLAIYKPKKYSLSWWANWVCIVFGLFLMVLSPIGGLRTIVIQAKGYKFYS"
)

# A feature covering positions [0, 30) of the sequence.
feature = SeqFeature(FeatureLocation(0, 30), type="protein", strand=1)

# Variant 1: slice the sequence by hand with the feature's coordinates.
location = feature.location
feature_seq = at5g40780[location.start:location.end]
print(feature_seq)

# Variant 2: let the feature extract its own region. Both approaches appear
# to give the same result.
feature_seq2 = feature.extract(at5g40780)
print(feature_seq2)
# Fragment of a larger script (enclosing loop is not visible here and the original indentation was lost). For a row with a positive amplicon length it builds an "Amplicon" SeqFeature between the two primer targets (strand 1 when end > start, otherwise strand -1 with the coordinates swapped) and appends it to genome.features; the annotated genome is then written in GenBank format and the transcription-factor binding table is read with readcsv. NOTE(review): confirm which statements belong inside the `if` before re-indenting.
if int(rw[ti['Amplicon_length']]) > 0: start = int(rw[ti['Primer1_target_start']]) end = int(rw[ti['Primer2_target_end']]) if end > start: strd = 1 truestart = start trueend = end else: strd = -1 truestart = end trueend = start #print '{} Start:{}, End:{}'.format(indexa,start,end) seq_feature=SeqFeature(FeatureLocation(truestart,trueend, strand=strd), type="Amplicon", \ id=indexa,\ qualifiers={'Plate':plate,'Well':well,'Gene':gene_name,\ 'Amplicon':amplicon,\ 'Amplicons':amplicons,\ 'Strand':strd,'colour':col,\ #'note':'{} {}'.format(indexa,data[key]['Description']),\ 'Strand_annotation':data[key]['Strand_annotation']}) genome.features.append(seq_feature) faf = open(amplicons_features, 'w') SeqIO.write(genome, faf, "gb") faf.close() #Map TF binding sites TF_file = "../Transcription_Factors/TF_binding_clean.csv" TF_data = readcsv(TF_file, delim=',')
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    ClusterFinder clusters found by ``find_cf_clusters`` are integrated with
    the cluster features already present on ``seq_record``:

    * A ClusterFinder cluster overlapping an existing cluster either yields a
      ``cluster_border`` feature (``options.borderpredict`` set and the
      midpoint of the existing cluster inside the ClusterFinder cluster) or,
      without border prediction, widens the existing cluster so that it
      covers the ClusterFinder cluster. The existing cluster always receives
      a ``probability`` qualifier.
    * A ClusterFinder cluster overlapping nothing becomes a new ``cluster``
      feature, unless ``options.borderpredict_only`` is set.

    ``seq_record`` is modified in place; clusters are renumbered when new
    clusters were added. Nothing is returned.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)

    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True

            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(
                cluster, seq_record)
            cluster_sig_genes = [
                gene for gene in secmet_cds_features
                if gene in features_in_cluster
            ]

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer: under
                # Python 3 "/" yields a float, which FeatureLocation's
                # membership test does not accept.
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still
                    # included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min(
                            [sig_gene.location.start, sig_gene.location.end])
                        endpoint = max(
                            [sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(
                                startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(
                                cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location,
                                                type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            elif (cf_cluster.location.start < cluster.location.start
                  and cf_cluster.location.end > cluster.location.end):
                # ClusterFinder cluster spans the whole existing cluster
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                # extend the existing cluster to the left
                cluster.location = FeatureLocation(cf_cluster.location.start,
                                                   cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                # extend the existing cluster to the right
                cluster.location = FeatureLocation(cluster.location.start,
                                                   cf_cluster.location.end)
            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]

        if not overlaps and not ('borderpredict_only' in options
                                 and options.borderpredict_only):
            # Derive the product type from the "Type: " sec_met qualifiers
            # of the CDS features inside the ClusterFinder cluster.
            cf_cluster_CDSs = utils.get_cluster_cds_features(
                cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [
                        feat for feat in CDS.qualifiers['sec_met']
                        if "Type: " in feat
                    ]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = [
                "%01.4f" % cf_cluster.probability
            ]
            newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)
], "EC_number": "3.5.2.6", } ampr_prom = next(get_features("AmpR promoter"), None) ampr_prom = ampr_prom or next(get_features("AmpR Promoter"), None) if ampr_prom is not None: ampr_prom.qualifiers["label"] = ["AmpR Promoter"] ampr_prom.qualifiers["note"] = ["color: #ff6666"] ampr_term_start = gb.seq.find(AMPR_TERM) if ampr is not None and ampr_term_start >= 0: ampr_term = SeqFeature( location=FeatureLocation(ampr_term_start, ampr_term_start + 94, -1), type="terminator", qualifiers={ "label": "AmpR Terminator", "note": ["color: #ff6666"] }, ) gb.features.append(ampr_term) # KanR recolor and annotations kanr = next(get_features("KanR"), None) if kanr is not None: kanr.qualifiers.update({ "gene": "aphA1", "product": "aminoglycoside phosphotransferase", "EC_number":
# Fragment of a larger script (enclosing loop/function is not visible here and the original indentation was lost). It reads start, end and strand from the protein description with the "# start # end # strand # ID" pattern, appends a CDS SeqFeature (start shifted to 0-based) to record.features, then fills in the record metadata and writes it with write_genbank. NOTE(review): the regex match is used without a None check, and `print "OK"` is Python 2 syntax.
defline = protein.description pattern = re.compile('.+#\s(\d+)\s#\s(\d+)\s#\s(\S*1)\s#\sID.+') match = pattern.match(defline) start_pos = int(match.group(1)) end_pos = int(match.group(2)) strand_pos = int(match.group(3)) feat_loc = FeatureLocation(start_pos - 1, end_pos) # adjust for 0-index l_tag = protein.id # consolidate feature annotations quals = { 'note': defline, 'locus_tag': l_tag, 'translation': protein.seq } feature = SeqFeature(location=feat_loc, strand=strand_pos, id=protein.id, type='CDS', qualifiers=quals) record.features.append(feature) # save record with annotations record.description = rec_name + "_with_ORFs" record.name = rec_name record.dbxrefs = ["Project: " + argv[1] + "/" + rec_name] record.seq.alphabet = generic_dna write_genbank(annot_gbk, record) print "OK"
# Build a sequence made of ten-base segments, each carrying a feature
# labelled and coloured with one entry of a pastel palette, then open the
# result in ApE.
seq = Dseqrecord("")

# Palette: ten hues (every 36 degrees) at three saturation levels, as
# "#rrggbb" strings.
colors = []
for hue in range(0, 360, 36):
    for saturation in [0.2, 0.4, 0.6]:
        # With a value of 255 the components come back on a 0-255 scale.
        r, g, b = colorsys.hsv_to_rgb(hue / 360, saturation, 255)
        colors.append("#{0:02x}{1:02x}{2:02x}".format(int(r), int(g), int(b)))

# Group the palette by saturation level, then reverse the order.
colors = colors[::3] + colors[1::3] + colors[2::3]
colors = colors[::-1]

# "color_hex" instead of "hex" so the builtin hex() is not shadowed.
for color_hex in colors:
    sf = SeqFeature(FeatureLocation(1, 9, strand=1), type="misc_feature")
    sf.qualifiers["label"] = [color_hex]
    sf.qualifiers["ApEinfo_fwdcolor"] = [color_hex]
    se = Dseqrecord("agtagtcgta")
    se.features.append(sf)
    seq += se

from pydna.editor import ape

ape(seq)
print(colors)
def printClusterPics(cluNet, data_box, annot_def_dict, output_dir, ncutoff=8):
    """This abomination takes in a network of gene clusters, a dict of
    cluster_member_ID:pfam domains, and a place to dump output. It goes thru
    and prints pictures of each cluster and its best buddies.

    One PDF named ``<species>-<wpID>.pdf`` is drawn into ``output_dir`` for
    every node of ``cluNet``. Node and cluster-member ids are expected to be
    five '|'-separated fields: species, accession, wpID, annotation and a
    ``start-stop`` pair. ``ncutoff`` is accepted but not used here.

    Possible defects: Proteins can be related, but won't be colored if there
    is no pfam domain annotation. Handling of multidomain proteins is very
    crude. Also, much much too long.
    """
    for node in cluNet.nodes():
        # record the species for file printing purposes
        parentSpecies, parentAcession, parentWpID, parentAnnotation, parentStartstop = node.split("|")

        # Make a new diagram. TODO add the path for the output and whatnot.
        gd_diagram = GenomeDiagram.Diagram()

        # Return the top most related clusters.
        sortedClusters = getTopClusters(cluNet, node)[0]

        # the largest nucleotide span in all the clusters under consideration.
        maxSpan = getMaxSpan(sortedClusters, data_box)

        # Get info about pfam domains in all the clusters under consideration
        id_list = makeIDList(sortedClusters, data_box)
        domainCounts = getPfamCounts(annot_def_dict, id_list)
        colorDict = getColorDict(domainCounts)

        # for each of the best clusters:
        for e in sortedClusters:
            # Make the track and add a feature set to the track. scale = 0
            # turns off the scale, greytrack adds a background.
            gd_track_for_features = gd_diagram.new_track(1,
                                                         name="",
                                                         scale=0,
                                                         greytrack=True)
            gd_feature_set = gd_track_for_features.new_set()

            # get the cluster_member_IDs from data_box using the cluster_ID
            # TODO this will be replaced with a clusterID:clusterMemberID
            # dict based on the BLAST2 db? maybe.
            clusterMembers = data_box[e].keys()

            # start stop span
            sss = getClustSS(clusterMembers)
            # shift so the cluster is roughly centered within maxSpan
            offset = int((maxSpan - sss[2]) / 2 - sss[0])

            for cm in clusterMembers:
                organism, acession, wpID, annotation, startstop = cm.split("|")
                # strip partial-location markers and other stray symbols
                startstop = re.sub(r'[<>$%^&*#|\[\]]', "", startstop)
                q_start = int(startstop.split("-")[0])
                q_stop = int(startstop.split("-")[1])

                # encode the directional info.
                if q_start < q_stop:
                    q_strand = 1
                    label_angle = 15
                elif q_start > q_stop:
                    q_strand = -1
                    label_angle = 179.9
                else:
                    # Zero-length member: no direction. Without this branch
                    # the values of the previous member would be reused (or
                    # a NameError raised for the first member).
                    q_strand = None
                    label_angle = 15

                # NOTE(review): for reverse-strand members start > stop is
                # passed to FeatureLocation as-is; confirm the installed
                # Biopython accepts that.
                feature = SeqFeature(FeatureLocation(q_start + offset,
                                                     q_stop + offset),
                                     ref=wpID,
                                     strand=q_strand)

                # What color should it be?
                # pfam is a tuple of domain names, usually just one
                pfam = domainHash(annot_def_dict[cm])
                if len(pfam) > 0 and pfam in colorDict:
                    color = colorDict[pfam]
                else:
                    color = "0x000000"  # paint it black

                gd_feature_set.add_feature(feature,
                                           sigil="ARROW",
                                           color=color,
                                           label=True,
                                           label_position="start",
                                           name=feature.ref,
                                           label_size=7,
                                           label_angle=label_angle)

        # Legend track at the bottom: each pfam domain is a colored box.
        gd_track_for_features = gd_diagram.new_track(0,
                                                     name="",
                                                     scale=0,
                                                     greytrack=False)
        gd_feature_set = gd_track_for_features.new_set()
        counter = 0
        # Guard against an empty color table (no annotated domains at all);
        # the legend loop below is then simply skipped.
        increment = maxSpan / len(colorDict) if colorDict else 0
        for domain_tup in colorDict:
            q_start = round(counter * increment)
            q_stop = round((counter + .33) * increment)
            counter += 1
            feature = SeqFeature(FeatureLocation(q_start, q_stop),
                                 ref=domain_tup[0],
                                 strand=None)
            gd_feature_set.add_feature(feature,
                                       sigil="BOX",
                                       color=colorDict[domain_tup],
                                       label=True,
                                       name=feature.ref,
                                       label_size=7,
                                       label_angle=15)

        # Done reading in all the info! Now write the plot
        name = abspath(output_dir + "/" + parentSpecies + "-" + parentWpID +
                       ".pdf")
        drawMap(gd_diagram, name, plotWidth=maxSpan)
def __rshift__(self, index):
    """Rotate the sequence clockwise, preserving annotations.

    The last ``index`` letters are moved to the front. Per-letter
    annotations are rotated the same way as the sequence and feature
    locations are shifted by ``index``, wrapping around the end of the
    sequence.

    Args:
        index: number of positions to rotate by; taken modulo the sequence
            length, so negative values rotate counter-clockwise.

    Returns:
        ``self`` when the rotation is a no-op (empty sequence or ``index``
        a multiple of the length), otherwise a new record of the same type.
        Features of the new record are new objects sharing their qualifiers
        with the originals.
    """
    seq_len = len(self.seq)
    if seq_len == 0:
        # nothing to rotate (and no modulo by zero)
        return self

    # avoid unnecessary cycles; the result is always within [0, seq_len)
    index %= seq_len
    if index == 0:
        return self

    newseq = self.seq[-index:] + self.seq[:-index]
    # Per-letter annotations must follow their letters, so they are rotated
    # exactly like the sequence.
    newletan = {
        k: v[-index:] + v[:-index]
        for k, v in self.letter_annotations.items()
    }

    newfeats = []
    for feature in self.features:
        loc = feature.location
        if loc is None:
            newloc = None
        elif feature.type == "source" and loc.start == 0 and loc.end == len(self):
            # a source feature covering the whole record is left untouched
            newloc = loc
        else:
            _newloc = []
            for part in (loc + index).parts:
                if part.end >= len(newseq) and part.start >= len(newseq):
                    # The part lies entirely past the end: wrap it around.
                    # The same multiple of the length is removed from both
                    # ends so that part.end stays after part.start even when
                    # the end overlaps the origin again.
                    r = part.start // len(newseq)
                    _newloc.append(
                        FeatureLocation(
                            start=part.start - r * len(newseq),
                            end=part.end - r * len(newseq),
                            strand=part.strand,
                            ref=part.ref,
                            ref_db=part.ref_db,
                        ))
                else:
                    _newloc.append(part)
            newloc = _newloc[0] if len(_newloc) == 1 else CompoundLocation(
                _newloc)
        newfeats.append(
            SeqFeature(
                location=newloc,
                type=feature.type,
                id=feature.id,
                qualifiers=feature.qualifiers,
            ))

    return type(self)(
        seq=newseq,
        id=self.id,
        name=self.name,
        description=self.description,
        dbxrefs=self.dbxrefs,
        features=newfeats,
        annotations=self.annotations,
        letter_annotations=newletan,
    )
def merge_gbk(gbk_records, filter_size=0, gi=False):
    '''
    Merge multiple contigs into a single DNA molecule with 200*N between
    contigs.

    Keep the source description from the first record and remove contigs
    not longer than <filter_size>. Every spacer is annotated with an
    "assembly_gap" feature. When a single record is given it is returned
    as is (no size filter), with the same metadata updates as below.

    Side effects: the leading "source" feature is removed from every input
    record except the first one, and when a single contig is returned it is
    the input record itself, updated in place.

    :param gbk_records: list of SeqRecord objects (GenBank records)
    :param filter_size: contigs of this length or shorter are skipped
    :param gi: optional GI number stored under the "gi" annotation
    :return: the merged SeqRecord; id and name are the last accession of the
        first record (or its id), the description is its organism
    :raises ValueError: if no contig is longer than <filter_size>
    '''
    from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
    from Bio.SeqRecord import SeqRecord

    spacer_length = 200

    if len(gbk_records) == 1:
        merged_rec = gbk_records[0]
    else:
        merged_rec = None
        for i, rec in enumerate(gbk_records):
            # remove source feature of all records except the first one
            if i != 0 and rec.features and rec.features[0].type == 'source':
                rec.features.pop(0)

            # filter small contigs
            if len(rec) <= filter_size:
                continue

            if merged_rec is None:
                merged_rec = rec
                continue

            # The spacer is inserted in front of every contig but the first
            # kept one, so no spacer can trail the last contig even when the
            # final records are filtered out.
            merged_rec += "N" * spacer_length
            my_start_pos = ExactPosition(len(merged_rec) - spacer_length)
            my_end_pos = ExactPosition(len(merged_rec))
            my_feature_location = FeatureLocation(my_start_pos, my_end_pos)
            my_feature = SeqFeature(my_feature_location, type="assembly_gap")
            merged_rec.features.append(my_feature)

            merged_rec += rec

        if merged_rec is None:
            raise ValueError(
                "no contig longer than filter_size=%s to merge" % filter_size)

    first_rec = gbk_records[0]
    try:
        accession = first_rec.annotations["accessions"][-1]
    except KeyError:
        accession = first_rec.id
    merged_rec.id = accession
    merged_rec.name = accession
    merged_rec.description = "%s" % first_rec.annotations["organism"]

    # Take over the annotations of the first record (as a copy, so adding
    # "gi" does not leak into a separate input record) and only then store
    # the gi; the other way round the gi would be overwritten.
    merged_rec.annotations = dict(first_rec.annotations)
    if gi:
        merged_rec.annotations["gi"] = gi

    # Stretch the source feature over the whole merged molecule. Only a real
    # source feature is touched, so an unrelated first feature keeps its
    # location.
    if merged_rec.features and merged_rec.features[0].type == 'source':
        my_start_pos = ExactPosition(0)
        my_end_pos = ExactPosition(len(merged_rec))
        merged_rec.features[0].location = FeatureLocation(my_start_pos,
                                                          my_end_pos)
    return merged_rec