def plot_GC(handle, outname):
    """Plot the GC skew of every contig of a FASTA file into a multi-page PDF.

    One page is written per contig, followed by a final page for all contigs
    concatenated in file order. Every page shows the cumulative GC skew and
    the per-window GC skew (``GC_skew`` with its default window).

    Args:
        handle: FASTA file name or handle accepted by ``SeqIO.parse``.
        outname: Path of the PDF file to create.
    """
    records = list(SeqIO.parse(handle, "fasta"))

    pages = [("contig %s" % (str(record.id)), record.seq) for record in records]
    # Join once instead of repeated ``+=``, which is quadratic on many contigs.
    pages.append(("concatenated contigs",
                  "".join(str(record.seq) for record in records)))

    pp = PdfPages(outname)
    try:
        for title, sequence in pages:
            gc_skew = GC_skew(sequence)
            pylab.plot(np.cumsum(gc_skew))
            pylab.plot(gc_skew)
            pylab.title(title)
            pylab.savefig(pp, format='pdf')
            # Close after *every* page (the last one used to stay open) so the
            # curves do not leak into later pylab calls.
            pylab.close()
    finally:
        # Finalise the PDF even if plotting fails part-way through.
        pp.close()
def gc_skew_window(genome, window=1000):
    """Tabulate the GC skew of a record in consecutive fixed-size windows.

    Args:
        genome: Record exposing ``.seq`` and ``.id`` (e.g. a ``SeqRecord``).
        window: Window length in bases.

    Returns:
        DataFrame indexed by window number with the columns ``chr``,
        ``start`` (1-based), ``end`` (inclusive) and ``GC_skew``. A trailing
        partial window gets its own row; no row is emitted for an empty
        remainder, so a length that is an exact multiple of ``window`` no
        longer raises ``IndexError``.
    """
    seq_len = len(genome.seq)
    result_dict = dict()
    for index, offset in enumerate(range(0, seq_len, window)):
        stop = min(offset + window, seq_len)
        result_dict[index] = {
            "chr": genome.id,
            # The last row used to compute this with a hard-coded 100.
            "start": offset + 1,
            "end": stop,
            # Pass ``window`` explicitly: with GC_skew's default of 100 only
            # the first 100 bases of each window were measured.
            "GC_skew": GC_skew(genome.seq[offset:stop], window)[0],
        }
    return DataFrame.from_dict(result_dict, "index")
def calculate_GCskew(entries):
    """Return one GC-skew value per entry as a pandas Series.

    The sequences are concatenated and the skew is computed in windows whose
    size is the mean sequence length, so the values line up with the entries
    only when all sequences have the same length.

    Args:
        entries: Iterable of ``(name, sequence)`` pairs; sequences are strings.

    Returns:
        pandas.Series named ``gc_skew`` with at most one value per entry
        (empty when ``entries`` is empty).
    """
    entries = list(entries)
    if not entries:
        # zip(*[]) yields nothing to unpack; return an empty result instead.
        return pd.Series([], name='gc_skew', dtype=float)

    _names, sequences = zip(*entries)
    total_seq = ''.join(sequences)
    # A window of 0 (fewer bases than sequences) would make range() raise.
    window = max(1, len(total_seq) // len(sequences))
    gc_skew_values = GC_skew(total_seq, window=window)
    return pd.Series(gc_skew_values[:len(sequences)], name='gc_skew',
                     dtype=float)
def skew(file_name, file_type):
    """Return positions and the cumulative single-base GC skew of a file.

    The file is looked up as ``Files/<file_name>.<file_type>`` next to this
    module and parsed by ``SeqIO`` with ``file_type`` as the format name.

    Returns:
        tuple: ``(positions, cumulative)``, two lists that both start with a
        leading 0. Positions restart at 0 for every record, while the
        cumulative total carries on across records.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    file_path = os.path.join(module_dir, "Files", file_name + "." + file_type)

    positions = [0]
    cumulative = [0]
    for record in SeqIO.parse(file_path, file_type):
        # Window of 1 -> one skew value per base.
        per_base = GC_skew(record.seq, 1)
        for index in range(len(record.seq)):
            positions.append(index)
            cumulative.append(cumulative[-1] + int(per_base[index]))
    return positions, cumulative
def mk_gc_skewset(data, outpath, window):
    """Write windowed GC skew values to ``<outpath>/gc_skew.txt`` for Circos.

    Every output line is ``cg<N> <start> <end> <skew>``, where ``N`` is the
    1-based position of the contig while iterating ``data``.

    Args:
        data: Mapping of contig name to sequence.
        outpath: Directory that receives ``gc_skew.txt``.
        window: Window size in bases passed to ``GC_skew``.

    Returns:
        str: Path of the written file.
    """
    skew_path = outpath + '/gc_skew.txt'
    # Context manager: the handle was previously never closed, so buffered
    # lines could be missing when the caller read the file back.
    with open(skew_path, 'w') as skew_file:
        for cg, contig in enumerate(data, start=1):
            pos = 0
            interval = window
            for point in GC_skew(data[contig], window=window):
                skew_file.write(
                    'cg{} {} {} {}\n'.format(cg, pos, interval, point))
                pos = interval + 1
                interval += window
    return skew_path
def find_ori_ter(gb, window=100):
    """Find positions of origin and terminus of replication in a genome,
    using the cumulative GC skew method.

    The origin is reported at the minimum and the terminus at the maximum of
    the cumulative GC skew; both are resolved to the start of a window.

    Args:
        gb (Bio.SeqRecord.SeqRecord): GenBank annotation.
        window (int): Window size in bases for the GC skew (default 100,
            the value that used to be hard-coded).

    Returns:
        ori, ter (int): origin and terminus positions.
    """
    cum_gcskew = np.cumsum(GC_skew(gb.seq, window=window))
    # argmin/argmax return window indices; scale back to base coordinates.
    ori = np.argmin(cum_gcskew) * window
    ter = np.argmax(cum_gcskew) * window
    print("Found ori and ter positions: %d, %d" % (ori, ter))
    return ori, ter
def fa2bed(fa, window, step):
    """Write genome, GC% and GC-skew BED tracks for a FASTA file.

    Three header-less, tab-separated files are created in the current
    working directory:

    * ``genome.bed``  - ``chrom  0  length`` for every record
    * ``gc.bed``      - ``chrom  start  end  gc`` for every bin
    * ``gcSkew.bed``  - ``chrom  start  end  gcSkew`` for every bin

    Bins are ``step`` bases wide. The value of a bin is computed on a window
    of ``window`` bases centred on the bin start and clipped to the sequence.

    Args:
        fa: FASTA file name or handle accepted by ``SeqIO.parse``.
        window: Window length in bases (anything ``int()`` accepts).
        step: Bin width in bases (anything ``int()`` accepts).
    """
    window_len = int(window)
    step_len = int(step)
    # Floor division: ``window_len / 2`` is a float on Python 3 and cannot be
    # used as a slice index.
    half_window = window_len // 2

    record_dict = SeqIO.to_dict(SeqIO.parse(fa, "fasta"))

    # ``with`` closes all three files even when a record fails half-way.
    with open("genome.bed", "w") as genomeBed, \
            open("gc.bed", "w") as gcBed, \
            open("gcSkew.bed", "w") as gcSkewBed:
        for record in record_dict.values():
            chrom = record.id
            chrom_seq = record.seq
            length = len(chrom_seq)
            genomeBed.write(chrom + "\t0\t%d\n" % length)
            for i in range(0, length, step_len):
                start0 = i
                end0 = min(i + step_len, length)
                start = max(i - half_window, 0)
                end = min(i + half_window, length)
                s = chrom_seq[start:end]
                gc = GC(s)
                # The slice is never longer than the window, so GC_skew
                # returns one value - or none for an empty slice (window < 2).
                skew_values = GC_skew(s, window_len)
                skew = skew_values[0] if skew_values else 0.0
                gcBed.write(chrom + "\t%d\t%d\t%.3f\n" % (start0, end0, gc))
                gcSkewBed.write(
                    chrom + "\t%d\t%d\t%.3f\n" % (start0, end0, skew))
file_list = sorted(file_list) # get data data = [] names = [] for file_path in file_list: for record in SeqIO.parse(file_path, "fasta"): names.append(record.id) data.append(record.seq) print(names) number_strains = len(data) GC_skews = [] for i in data: GC_skews.append(GC_skew(i, window=200)) #comparing specific strains if True: fig = plt.figure(figsize=(10, 10)) pos_jvci1 = names.index("JCVI1") pos_jvci2 = names.index("JCVI2") pos_jvci3 = names.index("JCVI3") pos_gm12wt = names.index("GM12WT") pos_gm12d86 = names.index("GM12delta68") pos_mg37 = names.index("MG37") pos_Mf5583 = names.index("Mf5583") compared = [
#!/usr/bin/python
# testing python script
# Prints GC123 and GC content of a fixed sequence and plots its GC skew in
# 100-base windows.
import Bio as bio
import matplotlib.pyplot as plot
from Bio.Seq import Seq
# NOTE(review): Bio.Alphabet was removed in Biopython 1.78; this (unused)
# import fails on newer releases - confirm the targeted Biopython version.
from Bio.Alphabet import generic_rna,generic_dna
from Bio.SeqUtils import GC123,GC,GC_skew

seq = "ataccaggctgaggcccattaatgatgcaatttgctgggcttctctattttctccgtgcttccatcctcttctccgtcggcggggagaagtgaaatgccgtggagatgggcggcggcggcggcgacggcggcgacgagaaagctcaccgggatctctcagtcgcgagtttcagtagcctttaccggccgtcttctctaccgctcgttcggaagcgactccagtgaaagccgcaagaggtcactgccacggggggtcgtatcgatcggggccatcagccttgctggaggtctcgtgctcagcgccgtcaacgacctcgccatcttcaatggatgcacaacgaaggcaattgagcatgctgctgacaaccctgctgttgtggaagcaattggagtgcctatagtcagaggaccgtggtatgatgcttctcttgaggtgggccatcgacggcggtctgtgtcatgcacattccctgtatctgggccacatgggtcaggatttctccagattaaggcaacccgagatggagaggatggtctgctttcgtttctgcggcatcacgactggaagatcctattgctggaggctcatcttgaagcaccatcagatgatgaggaccagagaaagctggttaaggtgaatcttgcaagcagtggccgtggggaagatggggatccagagagtggttaatcttttgtactgaattccatggtgagtggaagatcgtgtcatctgaatggactccaaatattaaatgacatggagatctagggaagcaaaaaaaaaaaaaaaa"

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(GC123(seq))
print(GC(seq))

# ``plot`` is the pyplot *module*: the plotting functions must be reached
# through it (calling the module itself raises TypeError, and the bare
# xlabel/ylabel/title names were never defined).
plot.plot(GC_skew(seq,window=100),c="r")
plot.xlabel("Window")
plot.ylabel("(G-C)/(G+C)")
plot.title("GC-skew")
# Without show() the figure is built but never displayed.
plot.show()
def circos_cumul_gc_skew(record, windows=1000, shift=0, initial=0):
    '''
    Build a Circos data string with the windowed GC skew of ``record``.

    Record without ``assembly_gap`` features: one line per window (the last
    window is skipped) holding the running sum of the window skews, starting
    from ``initial``; the chromosome name is ``record.id`` up to the first dot.

    Record with ``assembly_gap`` features: every gap-delimited contig is
    reported as ``<record.name>_<n>`` with its raw window values.
    NOTE(review): values are not accumulated in this branch despite the
    function name - confirm that this is intended.

    :param record: SeqRecord-like object (``seq``, ``features``, ``name``, ``id``)
    :param windows: window size in bases
    :param shift: offset added to every start/end coordinate
    :param initial: starting value of the accumulator
    :return: tuple ``(circos_string, acc)``; ``acc`` is the final accumulated
             skew and stays equal to ``initial`` for records with gaps
    '''
    from Bio.SeqFeature import FeatureLocation

    # Bound up front: the gapped branch never assigned ``acc``, so the final
    # ``return`` raised UnboundLocalError for every record with gaps.
    acc = initial
    lines = []

    gap_locations = [feature.location for feature in record.features
                     if feature.type == "assembly_gap"]
    if len(gap_locations) == 0:
        gap_locations.append(FeatureLocation(0, len(record.seq)))
    else:
        # Zero-length sentinel so the stretch after the last gap is emitted.
        gap_locations.append(FeatureLocation(len(record.seq), len(record.seq)))

    if len(gap_locations) > 1:
        previous_end = 0
        for contig_index, gap in enumerate(gap_locations):
            chr_start = previous_end
            previous_end = gap.end
            seq = record.seq[chr_start:gap.start]
            try:
                values = GC_skew(seq, windows)
            except Exception:
                # Best effort: report the offending stretch and skip it
                # instead of re-using the previous contig's values.
                print(len(seq), seq)
                continue
            contig_name = record.name + "_%s" % (contig_index + 1)
            for n_window, value in enumerate(values):
                section_start = chr_start + n_window * windows
                section_end = section_start + windows
                lines.append("%s %s %s %s\n" % (
                    contig_name, section_start + shift, section_end + shift,
                    value))
    else:
        window_used = windows
        try:
            values = GC_skew(record.seq, window_used)
        except Exception:
            # Retry with a fixed window; coordinates below must follow the
            # window that actually produced the values.
            window_used = 2000
            values = GC_skew(record.seq, window_used)
        chrom_name = record.id.split(".")[0]
        # skip last value
        for n_window in range(0, len(values) - 1):
            acc += values[n_window]
            start = n_window * window_used
            stop = start + window_used
            lines.append("%s %s %s %s\n" % (
                chrom_name, start + shift, stop + shift, acc))

    circos_string = "".join(lines)
    return circos_string, acc
def test_GC_skew(self):
    """A sequence without any G or C gives a skew of 0 in its first window."""
    no_gc_sequence = 50 * "A"
    first_window = GC_skew(no_gc_sequence)[0]
    self.assertEqual(first_window, 0)
def circos_gc_skew(record, windows=1000, shift=0):
    '''
    Build a Circos data string with the windowed GC skew of ``record``.

    NEW 12.06.2017: calculate GC based on whole sequence

    Record without ``assembly_gap`` features: one line per window (the last
    window is skipped), named after ``record.id`` up to the first dot.

    Record with ``assembly_gap`` features: the skew is computed on the whole
    sequence and the windows are then mapped onto the gap-delimited contigs
    (``<record.name>_<n>``) by ``circos_convert_contigs_coords.rename_karyotype``.
    NOTE(review): ``shift`` is not applied in this branch - confirm intended.

    :param record: SeqRecord-like object (``seq``, ``features``, ``name``, ``id``)
    :param windows: window size in bases
    :param shift: offset added to start/end coordinates (gap-free records only)
    :return: circos string, one ``name start end value`` line per window
    '''
    from Bio.SeqFeature import FeatureLocation

    lines = []

    gap_locations = [feature.location for feature in record.features
                     if feature.type == "assembly_gap"]
    if len(gap_locations) == 0:
        gap_locations.append(FeatureLocation(0, len(record.seq)))
    else:
        # Zero-length sentinel so the stretch after the last gap is covered.
        gap_locations.append(FeatureLocation(len(record.seq), len(record.seq)))

    if len(gap_locations) > 1:
        from chlamdb.plots import circos_convert_contigs_coords

        contig_coords = []
        contig_start = 0
        for i, coord in enumerate(gap_locations):
            contig_name = record.name + "_%s" % (i + 1)
            contig_coords.append([contig_name, contig_start, int(coord.start)])
            contig_start = coord.end + 1

        values = GC_skew(record.seq, windows)
        data_list = []
        for n_value, value in enumerate(values):
            start = (windows * n_value) + 1
            end = start + windows
            data_list.append([record.name, start, end, value])

        renamed_data = circos_convert_contigs_coords.rename_karyotype(
            contig_coords, data_list)
        for row in renamed_data:
            contig_name = row[0]
            start = row[1]
            end = row[2]
            try:
                value = row[3]
            except LookupError:
                # Rows without a value are skipped, as before.
                continue
            lines.append("%s %s %s %s\n" % (contig_name, start, end, value))
    else:
        window_used = windows
        try:
            values = GC_skew(record.seq, window_used)
        except Exception:
            # Retry with a fixed window; coordinates below must follow the
            # window that actually produced the values.
            window_used = 2000
            values = GC_skew(record.seq, window_used)
        chrom_name = record.id.split(".")[0]
        # skip last value
        for n_window in range(0, len(values) - 1):
            start = n_window * window_used
            stop = start + window_used
            lines.append("%s %s %s %s\n" % (
                chrom_name, start + shift, stop + shift, values[n_window]))

    return "".join(lines)
def gc_skew(sequence, **kwargs):
    """Return the mean GC skew over the default ``GC_skew`` windows.

    Args:
        sequence: Sequence passed unchanged to ``GC_skew``.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        The average of the per-window skews, or 0 when ``GC_skew`` yields no
        windows (e.g. for an empty sequence).
    """
    values = GC_skew(sequence)
    if not values:
        return 0
    # ``sum`` instead of ``reduce`` + lambda: same result, and no dependency
    # on ``reduce``, which is not a builtin on Python 3.
    return sum(values) / float(len(values))
def xGC_skew_mod(name, seq, outpath, window=None, zoom=100, r=300, px=100, py=100):
    """Copy of from Bio.SeqUtils import xGC_skew. to make it print

    Draws the GC skew (blue) and the accumulated GC skew (magenta) of ``seq``
    around a circle on a Tk canvas and writes the canvas as PostScript.

    Args:
        name: Title drawn in the centre of the plot.
        seq: Sequence to analyse.
        outpath: File that receives the PostScript output.
        window: Window size in bases; when falsy it defaults to about
            ``len(seq) / 720`` (at least 1).
        zoom: Scale factor for the length of the GC-skew lines.
        r: Radius of the circle in canvas units.
        px, py: Horizontal / vertical padding around the circle.
    """
    from Bio.SeqUtils import GC, GC_skew
    from math import pi, sin, cos
    try:
        import Tkinter as tkinter  # Python 2
    except ImportError:
        import tkinter  # Python 3

    if not window:
        # GC_skew needs a positive integer step: None, a float (Python 2
        # ``round``) or 0 (short sequences) would make ``range`` raise.
        window = max(1, int(round(len(seq) / 720.0)))

    yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL)
    xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL)
    canvas = tkinter.Canvas(yscrollcommand=yscroll.set,
                            xscrollcommand=xscroll.set, background='white')
    win = canvas.winfo_toplevel()
    win.geometry('900x900')

    yscroll.config(command=canvas.yview)
    xscroll.config(command=canvas.xview)
    yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y)
    xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X)
    canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1)
    canvas.update()

    # Centre and bounding box of the circle.
    X0, Y0 = r + px, r + py
    x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r

    # Legend, stacked downwards from the centre.
    ty = Y0
    canvas.create_text(X0, ty, text=name)
    ty += 20
    canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq)))
    ty += 20
    canvas.create_text(X0, ty, text='GC Skew', fill='blue')
    ty += 20
    canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta')
    ty += 20
    canvas.create_oval(x1, y1, x2, y2)

    acc = 0
    start = 0
    for gc in GC_skew(seq, window):
        r1 = r
        acc += gc
        # GC skew
        alpha = pi - (2 * pi * start) / len(seq)
        r2 = r1 - gc * zoom
        x1 = X0 + r1 * sin(alpha)
        y1 = Y0 + r1 * cos(alpha)
        x2 = X0 + r2 * sin(alpha)
        y2 = Y0 + r2 * cos(alpha)
        canvas.create_line(x1, y1, x2, y2, fill='blue')
        # accumulated GC skew
        r1 = r - 50
        r2 = r1 - acc
        x1 = X0 + r1 * sin(alpha)
        y1 = Y0 + r1 * cos(alpha)
        x2 = X0 + r2 * sin(alpha)
        y2 = Y0 + r2 * cos(alpha)
        canvas.create_line(x1, y1, x2, y2, fill='magenta')
        canvas.update()
        start += window

    canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
    canvas.postscript(file=outpath)
    canvas.destroy()
#!/usr/bin/env python3
# Print the GC skew of every record of a FASTA file, one 100-base window
# value per line.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('fasta')
args = parser.parse_args()

# Imported after argument parsing, as in the original script.
from Bio import SeqIO
from Bio.SeqUtils import GC_skew

for fasta_record in SeqIO.parse(args.fasta, 'fasta'):
    window_values = GC_skew(fasta_record.seq, window=100)
    for value in window_values:
        print(value)
def complete_tasks(full_seq, des, unique_key):
    """Render the Streamlit DNA-analysis widgets for one sequence.

    Sections, in order: description/sequence display, nucleotide frequency
    plot, sequence properties (GC content, melting temperature, GC skew,
    complement) and protein synthesis (transcription, translation, amino
    acid frequency plot).

    Args:
        full_seq: Sequence object; must provide ``complement``,
            ``reverse_complement``, ``transcribe`` and ``translate``.
        des: Description shown when "Description" is selected.
        unique_key: Key passed to every Streamlit widget of this section.
    """
    file_details = st.radio("Details", ("Description", "Sequence"),
                            key=unique_key)

    # Show description and sequence in DNA Analysis section
    if file_details == "Description":
        st.write(des)
    elif file_details == "Sequence":
        st.write(full_seq)

    # Nucleotide occurrences plot and colour selector for the bars
    st.subheader("Plot Nucleotide Frequency")
    full_seq_freq = OrderedDict(Counter(full_seq))
    picker_labels = (
        "Pick Colour for Bar 1",
        "Pick Colour for Bar 2",
        "Pick Colour for Bar 3",
        "Pick Colour for Bar 4",
    )
    bar_colours = [st.beta_color_picker(label, key=unique_key)
                   for label in picker_labels]

    if st.button("Plot Frequency", key=unique_key):
        barlist = plt.bar(full_seq_freq.keys(), full_seq_freq.values())
        # zip stops at the shorter side: sequences with fewer than four
        # distinct symbols used to raise IndexError on barlist[3].
        for bar, colour in zip(barlist, bar_colours):
            bar.set_color(colour)
        st.pyplot()

    st.subheader("Properties")

    # GC Content, GC Melting temp, GC_skew, Complement and reverse complement
    gc_count = GC(full_seq)
    st.write("GC Content: {}".format(gc_count))

    mt = MeltingTemp.Tm_GC(full_seq, strict=False)
    st.write("Melting Temperature based on GC Content: {}".format(mt))

    gc_skew_bases = st.number_input("Enter number of bases", key=unique_key)
    try:
        gc_skew = GC_skew(full_seq, int(gc_skew_bases))
        st.write("GC Skew for {} bases: {}".format(gc_skew_bases, gc_skew))
    except ValueError:
        # e.g. a window of 0, which GC_skew cannot step by.
        st.write("Enter a Valid Number for bases")

    if st.checkbox("Complement", key=unique_key):
        st.write(full_seq.complement())
    elif st.checkbox("Reverse Complement", key=unique_key):
        st.write(full_seq.reverse_complement())

    # Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = full_seq.translate()

    # Each later checkbox is only created while the earlier ones are
    # unchecked (elif chain kept as in the original).
    if st.checkbox("Transcription: DNA to mRNA", key=unique_key):
        st.write(full_seq.transcribe())
    elif st.checkbox("Translation: DNA to 1 letter Amino Acid Sequence",
                     key=unique_key):
        st.write(p1)
    elif st.checkbox("Translation: DNA to 3 letter Amino Acid Sequence",
                     key=unique_key):
        full_aa_name = str(p1).replace("*", "")
        st.write(seq3(full_aa_name))
    elif st.checkbox("Plot Amino Acid Frequency", key=unique_key):
        aa_freq = OrderedDict(Counter(str(p1)))
        bar_colour = st.beta_color_picker("Pick Colour for all Bars",
                                          key=unique_key)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=bar_colour)
        st.pyplot()
        st.write("Asterisk (*) - Denotes Stop Codons.")
def draw_gc(sumgene, dwg, gc_color):
    """Add GC-content and GC-skew triangles for 300 sequence units to ``dwg``.

    The sequence is cut into 300 units of ``int(sumgene.len / 300)`` bases.
    For every unit the GC content (as deviation from the whole-sequence GC)
    and the GC skew are computed, rescaled linearly onto a radius range and
    drawn as one triangle per track.

    Args:
        sumgene: Object providing ``len`` and ``seq``; it is also passed on
            to ``position_mapping``.
        dwg: Drawing that receives the polygons through ``dwg.add`` /
            ``dwg.polygon`` (presumably an svgwrite Drawing - confirm).
        gc_color: Indexable with four fill colours: [0]/[1] for GC content
            at-or-above / below its base radius, [2]/[3] likewise for GC skew.

    Returns:
        The same ``dwg`` object.
    """
    # Number of units and unit length in bases.
    # NOTE(review): unit_len is 0 when sumgene.len < 300 - confirm callers
    # never pass such short sequences.
    unit_num = 300
    unit_len = int(sumgene.len / unit_num)
    # Coordinates of the first unit.
    # NOTE(review): slices below are [start_unit:end_unit] with an exclusive
    # end, so starting at unit_len - 1 appears to drop the last base of every
    # unit - confirm whether this is intended.
    start_unit = 0
    end_unit = unit_len - 1
    mid_unit = (end_unit - start_unit) / 2
    # Base radius and allowed radius range of the GC-content (gcc) and
    # GC-skew (gcs) tracks.
    gcc_r = 430
    gcs_r = 230
    gcc_range = (330, 530)
    gcs_range = (130, 330)
    gcc_mean = GC(sumgene.seq)
    gc_contents = []
    gc_skews = []
    # First pass: compute both values for every unit.
    while (unit_num > 0):
        gc_content = GC(sumgene.seq[start_unit:end_unit])
        # Deviation from the whole-sequence GC content.
        gcc_variance = gc_content - gcc_mean
        gc_contents.append(gcc_variance)
        # The window equals the unit length, so the first value covers the
        # whole slice.
        gc_skew = GC_skew(sumgene.seq[start_unit:end_unit], window=unit_len)[0]
        gc_skews.append(gc_skew)
        start_unit += unit_len
        # Clamp the end of the unit to the sequence length.
        if end_unit + unit_len <= sumgene.len:
            end_unit += unit_len
        else:
            end_unit = sumgene.len
        unit_num -= 1
    gcc_max = max(gc_contents)
    gcc_min = min(gc_contents)
    gcs_max = max(gc_skews)
    gcs_min = min(gc_skews)
    # Linear scale factors mapping [min, max] onto the radius ranges.
    # NOTE(review): raises ZeroDivisionError when all units share the same
    # value (max == min).
    gcc_scal = (gcc_range[1] - gcc_range[0]) / (gcc_max - gcc_min)
    gcs_scal = (gcs_range[1] - gcs_range[0]) / (gcs_max - gcs_min)
    # Second pass: reset the unit coordinates and draw.
    gc_count = 0
    unit_num = 300
    start_unit = 0
    end_unit = unit_len - 1
    mid_unit = (end_unit - start_unit) / 2
    while (unit_num > 0):
        # GC content: triangle between the unit's two end points on the base
        # circle and its mid point at the scaled radius.
        # NOTE(review): position_mapping is defined elsewhere; it is assumed
        # to return (x1, y1, x2, y2) for the two given positions - confirm.
        c_pos = position_mapping(sumgene, [start_unit, end_unit], gcc_r)
        gcc_nr = (gc_contents[gc_count] - gcc_min) * gcc_scal + gcc_range[0]
        cm_pos = position_mapping(sumgene, [mid_unit, 0], gcc_nr)
        # 1500 offsets with a flipped y axis - presumably the canvas centre;
        # confirm against the drawing size.
        points = [(1500 + c_pos[0], 1500 - c_pos[1]),
                  (1500 + c_pos[2], 1500 - c_pos[3]),
                  (1500 + cm_pos[0], 1500 - cm_pos[1])]
        if gcc_nr >= gcc_r:
            dwg.add(dwg.polygon(points, fill=gc_color[0], stroke_width=0))
        else:
            dwg.add(dwg.polygon(points, fill=gc_color[1], stroke_width=0))
        # GC skew: same construction on the skew track.
        s_pos = position_mapping(sumgene, [start_unit, end_unit], gcs_r)
        gcs_nr = (gc_skews[gc_count] - gcs_min) * gcs_scal + gcs_range[0]
        sm_pos = position_mapping(sumgene, [mid_unit, 0], gcs_nr)
        points = [(1500 + s_pos[0], 1500 - s_pos[1]),
                  (1500 + s_pos[2], 1500 - s_pos[3]),
                  (1500 + sm_pos[0], 1500 - sm_pos[1])]
        if gcs_nr >= gcs_r:
            dwg.add(dwg.polygon(points, fill=gc_color[2], stroke_width=0))
        else:
            dwg.add(dwg.polygon(points, fill=gc_color[3], stroke_width=0))
        # Advance to the next unit, clamping its end to the sequence length.
        start_unit += unit_len
        if end_unit + unit_len <= sumgene.len:
            end_unit += unit_len
        else:
            end_unit = sumgene.len
        mid_unit += unit_len
        unit_num -= 1
        gc_count += 1
    return dwg