Esempio n. 1
0
def _save_skew_page(pp, sequence, page_title):
    """Plot the GC skew of ``sequence`` and append it as one page to ``pp``."""
    gc_skew = GC_skew(sequence)
    # Cumulative curve first, raw skew second, so both keep their usual colours.
    pylab.plot(np.cumsum(gc_skew))
    pylab.plot(gc_skew)
    pylab.title(page_title)
    pylab.savefig(pp, format='pdf')
    # Always release the figure; otherwise the curves leak into the next plot.
    pylab.close()


def plot_GC(handle, outname):
    """Write GC-skew plots for every contig of a FASTA file to a PDF.

    One page is produced per FASTA record, followed by a final page for all
    records concatenated in file order.  Each page shows the windowed GC skew
    and its cumulative sum.

    Args:
        handle: File name or handle accepted by ``SeqIO.parse``.
        outname: Path of the multi-page PDF to create.
    """
    records = list(SeqIO.parse(handle, "fasta"))

    # The context manager finalises the PDF even if plotting raises.
    with PdfPages(outname) as pp:
        for record in records:
            _save_skew_page(pp, record.seq, "contig %s" % (str(record.id)))

        # join() is linear, unlike repeated "+=" on a growing sequence.
        all_seq = ''.join(str(record.seq) for record in records)
        _save_skew_page(pp, all_seq, "concatenated contigs")
def gc_skew_window(genome, window=1000):
    """Tabulate the GC skew of consecutive, non-overlapping windows.

    Args:
        genome: Record exposing ``.seq`` and ``.id``.
        window: Window length in bases; must be a positive integer.

    Returns:
        A DataFrame indexed by window number with the columns ``chr``,
        ``start`` (1-based), ``end`` (inclusive) and ``GC_skew``.  The last
        window is shorter when the sequence length is not a multiple of
        ``window``; an empty sequence yields an empty frame.

    Raises:
        ValueError: If ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")

    seq_len = len(genome.seq)
    result_dict = {}
    for index, offset in enumerate(range(0, seq_len, window)):
        stop = min(offset + window, seq_len)
        segment = genome.seq[offset:stop]
        result_dict[index] = {
            "chr": genome.id,
            # Derived from the real window size (the trailing row used to
            # multiply by a hard-coded 100).
            "start": offset + 1,
            "end": stop,
            # Pass the segment length so the skew covers the whole interval
            # rather than only GC_skew's default-sized first sub-window.
            "GC_skew": GC_skew(segment, window=len(segment))[0],
        }
    return DataFrame.from_dict(result_dict, "index")
Esempio n. 3
0
def calculate_GCskew(entries):
    """Return one GC-skew value per entry as a Series named ``gc_skew``.

    ``entries`` is an iterable of ``(name, sequence)`` pairs.  The sequences
    are joined into a single string which is then cut into windows of
    ``total_length // number_of_sequences`` bases; the first
    ``number_of_sequences`` window values are returned.
    """
    _names, seqs = zip(*entries)
    n_seqs = len(seqs)
    joined = ''.join(seqs)
    skews = GC_skew(joined, window=len(joined) // n_seqs)
    return pd.Series(skews[:n_seqs], name='gc_skew')
Esempio n. 4
0
def skew(file_name, file_type):
    """Return positions and the running per-base GC skew of a sequence file.

    The file ``Files/<file_name>.<file_type>`` next to this module is parsed
    with ``file_type`` as the SeqIO format.  Both returned lists start with
    a leading 0; positions restart at 0 for every record, while the running
    total carries on across records.
    """
    base_dir = os.path.abspath(os.path.dirname(__file__))
    file_path = os.path.join(base_dir, "Files", file_name + "." + file_type)

    x = [0]
    y = [0]
    for rec in SeqIO.parse(file_path, file_type):
        # Window of 1: one skew value per base.
        per_base = GC_skew(rec.seq, 1)
        n_bases = len(rec.seq)
        x.extend(range(n_bases))
        for idx in range(n_bases):
            y.append(y[-1] + int(per_base[idx]))

    return x, y
Esempio n. 5
0
def mk_gc_skewset(data, outpath, window):
    """Calculate the GC skew in windows and write it as a Circos data file.

    Args:
        data: Mapping of contig key to sequence.
        outpath: Directory in which ``gc_skew.txt`` is written.
        window: Window length passed to ``GC_skew``.

    Returns:
        The path of the written file (``outpath + '/gc_skew.txt'``).

    Each output line is ``cg<N> <start> <end> <skew>`` where ``N`` counts the
    contigs from 1 in iteration order.
    """
    skew_path = outpath + '/gc_skew.txt'
    # "with" guarantees the file is flushed and closed; it used to be left open.
    with open(skew_path, 'w') as skew_file:
        for cg, contig in enumerate(data, 1):
            pos = 0
            interval = window
            for point in GC_skew(data[contig], window=window):
                skew_file.write(
                    'cg{} {} {} {}\n'.format(cg, pos, interval, point))
                pos = interval + 1
                interval += window

    return skew_path
Esempio n. 6
0
def find_ori_ter(gb):
    """Locate the replication origin and terminus via cumulative GC skew.

    The skew is computed in 100-base windows; the origin is reported at the
    window where the cumulative skew is lowest and the terminus where it is
    highest.  The result is also printed.

    Args:
        gb (Bio.SeqRecord.SeqRecord): GenBank annotation.
    Returns:
        ori, ter (int): origin and terminus positions.
    """
    window = 100
    cumulative = np.cumsum(GC_skew(gb.seq, window=window))
    # Window index -> base coordinate of the window start.
    ori = np.argmin(cumulative) * window
    ter = np.argmax(cumulative) * window
    print("Found ori and ter positions: %d, %d" % (ori, ter))
    return ori, ter
Esempio n. 7
0
def fa2bed(fa, window, step):
    """Write genome, GC-content and GC-skew BED files for a FASTA file.

    Three files are created in the current directory:

    * ``genome.bed``  -- ``chrom 0 length`` for every record;
    * ``gc.bed``      -- ``chrom start end gc`` per step;
    * ``gcSkew.bed``  -- ``chrom start end skew`` per step.

    For each step interval the statistic is computed on a region of
    ``window`` bases centred on the interval start and clipped to the
    sequence ends.

    Args:
        fa: FASTA file name or handle accepted by ``SeqIO.parse``.
        window: Window length in bases (converted with ``int``).
        step: Step length in bases (converted with ``int``).
    """
    window_len = int(window)
    step_len = int(step)
    # Integer half-width: "/" yields a float on Python 3, and floats are not
    # valid slice bounds.
    half_window = window_len // 2

    record_dict = SeqIO.to_dict(SeqIO.parse(fa, "fasta"))

    # Context managers close all three outputs even if a record fails.
    with open("genome.bed", "w") as genomeBed, \
            open("gc.bed", "w") as gcBed, \
            open("gcSkew.bed", "w") as gcSkewBed:
        for v in record_dict.values():
            chrom = v.id
            chrom_seq = v.seq
            length = len(chrom_seq)
            genomeBed.write(chrom + "\t0\t%d\n" % length)

            for i in range(0, length, step_len):
                start0 = i
                end0 = min(i + step_len, length)
                start = max(i - half_window, 0)
                end = min(i + half_window, length)
                s = chrom_seq[start:end]
                gc = GC(s)
                # An empty slice (window shorter than 2) has no skew value.
                skew_values = GC_skew(s, window_len)
                skew = skew_values[0] if skew_values else 0.0
                gcBed.write(chrom + "\t%d\t%d\t%.3f\n" % (start0, end0, gc))
                gcSkewBed.write(
                    chrom + "\t%d\t%d\t%.3f\n" % (start0, end0, skew))
Esempio n. 8
0
# Process the input files in a deterministic (sorted) order.
# `file_list` is expected to be defined earlier in the script.
file_list = sorted(file_list)

# Collect every FASTA record of every file: `names[k]` is the record id of
# the sequence stored in `data[k]`.
data = []
names = []
for file_path in file_list:
    for record in SeqIO.parse(file_path, "fasta"):
        names.append(record.id)
        data.append(record.seq)

print(names)
number_strains = len(data)

# One list of GC-skew values (200-base windows) per sequence, in the same
# order as `data` / `names`.
GC_skews = []
for i in data:
    GC_skews.append(GC_skew(i, window=200))

#comparing specific strains
if True:

    fig = plt.figure(figsize=(10, 10))

    pos_jvci1 = names.index("JCVI1")
    pos_jvci2 = names.index("JCVI2")
    pos_jvci3 = names.index("JCVI3")
    pos_gm12wt = names.index("GM12WT")
    pos_gm12d86 = names.index("GM12delta68")
    pos_mg37 = names.index("MG37")
    pos_Mf5583 = names.index("Mf5583")

    compared = [
Esempio n. 9
0
#!/usr/bin/python

# testing python script
import Bio as bio
import matplotlib.pyplot as plot

from Bio.Seq import Seq
from Bio.Alphabet import generic_rna,generic_dna
from Bio.SeqUtils import GC123,GC,GC_skew

seq = "ataccaggctgaggcccattaatgatgcaatttgctgggcttctctattttctccgtgcttccatcctcttctccgtcggcggggagaagtgaaatgccgtggagatgggcggcggcggcggcgacggcggcgacgagaaagctcaccgggatctctcagtcgcgagtttcagtagcctttaccggccgtcttctctaccgctcgttcggaagcgactccagtgaaagccgcaagaggtcactgccacggggggtcgtatcgatcggggccatcagccttgctggaggtctcgtgctcagcgccgtcaacgacctcgccatcttcaatggatgcacaacgaaggcaattgagcatgctgctgacaaccctgctgttgtggaagcaattggagtgcctatagtcagaggaccgtggtatgatgcttctcttgaggtgggccatcgacggcggtctgtgtcatgcacattccctgtatctgggccacatgggtcaggatttctccagattaaggcaacccgagatggagaggatggtctgctttcgtttctgcggcatcacgactggaagatcctattgctggaggctcatcttgaagcaccatcagatgatgaggaccagagaaagctggttaaggtgaatcttgcaagcagtggccgtggggaagatggggatccagagagtggttaatcttttgtactgaattccatggtgagtggaagatcgtgtcatctgaatggactccaaatattaaatgacatggagatctagggaagcaaaaaaaaaaaaaaaa"

# Report GC content: GC123 output first, then the overall GC value.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(GC123(seq))
print(GC(seq))

# `plot` is the matplotlib.pyplot module alias, so every plotting helper has
# to be reached through it (the module itself is not callable).
plot.plot(GC_skew(seq, window=100), c="r")
plot.xlabel("Window")
plot.ylabel("(G-C)/(G+C)")
plot.title("GC-skew")
Esempio n. 10
0
def circos_cumul_gc_skew(record, windows=1000, shift=0, initial=0):
    '''
    Build Circos data lines of windowed GC skew for a record.

    Without "assembly_gap" features the *cumulative* skew is emitted for the
    whole sequence (the last, usually partial, window is skipped) and lines
    are labelled with ``record.id`` up to its first ".".

    With "assembly_gap" features the sequence is cut at the gaps and the raw
    (non-cumulative) skew of each piece is emitted, labelled
    ``<record.name>_<n>`` with n counting from 1; ``initial`` is returned
    unchanged in that case.

    :param record: sequence record with ``.seq``, ``.features``, ``.id``
        and ``.name``
    :param windows: window length in bases
    :param shift: offset added to every emitted start/end coordinate
    :param initial: starting value of the cumulative skew
    :return: tuple ``(circos_string, acc)`` where ``acc`` is the final
        cumulative skew
    '''
    circos_string = ''
    # Bound up front so the final return is valid on every path (the gap
    # branch never accumulates and used to leave ``acc`` undefined).
    acc = initial

    gap_locations = [
        feature.location for feature in record.features
        if feature.type == "assembly_gap"
    ]

    if gap_locations:
        seq_len = len(record.seq)
        # Each contig ends where a gap starts; the trailing pair closes the
        # last contig at the end of the sequence.
        boundaries = [(gap.start, gap.end) for gap in gap_locations]
        boundaries.append((seq_len, seq_len))

        chr_start = 0
        for n_contig, (gap_start, gap_end) in enumerate(boundaries, 1):
            contig_start = chr_start
            seq = record.seq[contig_start:gap_start]
            # The next contig begins right after this gap.
            chr_start = gap_end

            try:
                values = GC_skew(seq, windows)
            except Exception:
                # Report the offending piece and skip it instead of reusing
                # the values of the previous contig.
                print(len(seq), seq)
                continue

            contig_name = record.name + "_%s" % n_contig
            for n_window, value in enumerate(values):
                section_start = contig_start + n_window * windows
                section_end = section_start + windows
                circos_string += "%s %s %s %s\n" % (
                    contig_name, section_start + shift, section_end + shift,
                    value)
    else:
        step = windows
        try:
            values = GC_skew(record.seq, step)
        except Exception:
            # Best-effort retry with a larger window; the coordinates below
            # follow the window that was actually used.
            step = 2000
            values = GC_skew(record.seq, step)

        label = record.id.split(".")[0]
        # skip last value
        for n_window, gc in enumerate(values[:-1]):
            acc += gc
            start = n_window * step
            stop = start + step
            circos_string += "%s %s %s %s\n" % (
                label, start + shift, stop + shift, acc)
    return circos_string, acc
Esempio n. 11
0
 def test_GC_skew(self):
     """The first skew window of a sequence without G or C is expected to be 0."""
     poly_a = "A" * 50
     first_window = GC_skew(poly_a)[0]
     self.assertEqual(first_window, 0)
Esempio n. 12
0
def circos_gc_skew(record, windows=1000, shift=0):
    '''
    Build Circos data lines of windowed GC skew for a record.

    Without "assembly_gap" features one line per window is emitted (the
    last, usually partial, window is skipped), labelled with ``record.id``
    up to its first "." and with ``shift`` added to the coordinates.

    With "assembly_gap" features the skew is still computed on the whole
    sequence (change of 12.06.2017); the window coordinates are then mapped
    onto contigs named ``<record.name>_<n>`` by
    ``circos_convert_contigs_coords.rename_karyotype``.
    NOTE(review): ``shift`` is not applied on this path -- confirm intended.

    :param record: sequence record with ``.seq``, ``.features``, ``.id``
        and ``.name``
    :param windows: window length in bases
    :param shift: offset added to start/end coordinates (no-gap path only)
    :return: circos string, one ``label start end skew`` line per window
    '''
    circos_string = ''

    gap_locations = [
        feature.location for feature in record.features
        if feature.type == "assembly_gap"
    ]

    if gap_locations:
        from chlamdb.plots import circos_convert_contigs_coords

        # Contig n runs from the end of the previous gap to the start of the
        # next one; the last contig ends at the end of the sequence.
        contig_coords = []
        start = 0
        for n_contig, gap in enumerate(gap_locations, 1):
            contig_coords.append(
                [record.name + "_%s" % n_contig, start, int(gap.start)])
            start = gap.end + 1
        contig_coords.append([
            record.name + "_%s" % (len(gap_locations) + 1), start,
            len(record.seq)
        ])

        data_list = []
        for n_value, value in enumerate(GC_skew(record.seq, windows)):
            start = (windows * n_value) + 1
            end = start + windows
            data_list.append([record.name, start, end, value])

        renamed_data = circos_convert_contigs_coords.rename_karyotype(
            contig_coords, data_list)
        for row in renamed_data:
            contig_name = row[0]
            start = row[1]
            end = row[2]
            try:
                value = row[3]
            except (IndexError, KeyError):
                # Rows without a value column are skipped.
                continue
            circos_string += "%s %s %s %s\n" % (contig_name, start, end, value)

    else:
        step = windows
        try:
            values = GC_skew(record.seq, step)
        except Exception:
            # Best-effort retry with a larger window; the coordinates below
            # follow the window that was actually used.
            step = 2000
            values = GC_skew(record.seq, step)

        label = record.id.split(".")[0]
        # skip last value
        for n_window, value in enumerate(values[:-1]):
            start = n_window * step
            stop = start + step
            circos_string += "%s %s %s %s\n" % (label, start + shift,
                                                stop + shift, value)
    return circos_string
Esempio n. 13
0
def gc_skew(sequence, **kwargs):
    """Return the average GC skew over the windows of ``sequence``.

    The windows are those produced by ``GC_skew`` with its default window
    size.  Extra keyword arguments are accepted and ignored.  Returns 0 when
    ``GC_skew`` yields no windows.
    """
    values = GC_skew(sequence)
    if not values:
        return 0
    # sum() instead of reduce(): reduce is not a builtin on Python 3.
    return sum(values) / float(len(values))
Esempio n. 14
0
def xGC_skew_mod(name,
                 seq,
                 outpath,
                 window=None,
                 zoom=100,
                 r=300,
                 px=100,
                 py=100):
    """Draw a circular GC-skew plot with Tkinter and save it as PostScript.

    Copy of ``Bio.SeqUtils.xGC_skew`` modified to write the canvas to a file.

    Args:
        name: Title drawn in the centre of the circle.
        seq: Sequence to analyse.
        outpath: Destination PostScript file.
        window: Window length in bases; when falsy, about 1/720 of the
            sequence length is used (at least 1).
        zoom: Scale factor applied to the per-window skew bars.
        r: Circle radius in canvas units.
        px: Horizontal padding of the circle.
        py: Vertical padding of the circle.
    """
    if not window:
        # Never let the default round down to 0: GC_skew cannot step by 0.
        window = max(1, int(round(len(seq) / 720)))
    from Bio.SeqUtils import GC, GC_skew
    from math import pi, sin, cos

    try:
        import Tkinter as tkinter  # Python 2
    except ImportError:
        import tkinter  # Python 3

    yscroll = tkinter.Scrollbar(orient=tkinter.VERTICAL)
    xscroll = tkinter.Scrollbar(orient=tkinter.HORIZONTAL)
    canvas = tkinter.Canvas(yscrollcommand=yscroll.set,
                            xscrollcommand=xscroll.set,
                            background='white')
    win = canvas.winfo_toplevel()
    win.geometry('900x900')

    yscroll.config(command=canvas.yview)
    xscroll.config(command=canvas.xview)
    yscroll.pack(side=tkinter.RIGHT, fill=tkinter.Y)
    xscroll.pack(side=tkinter.BOTTOM, fill=tkinter.X)
    canvas.pack(fill=tkinter.BOTH, side=tkinter.LEFT, expand=1)
    canvas.update()

    # Centre of the circle and its bounding box.
    X0, Y0 = r + px, r + py
    x1, x2, y1, y2 = X0 - r, X0 + r, Y0 - r, Y0 + r

    # Legend, stacked downwards from the centre.
    ty = Y0
    canvas.create_text(X0, ty, text=name)
    ty += 20
    canvas.create_text(X0, ty, text='GC %3.2f%%' % (GC(seq)))
    ty += 20
    canvas.create_text(X0, ty, text='GC Skew', fill='blue')
    ty += 20
    canvas.create_text(X0, ty, text='Accumulated GC Skew', fill='magenta')
    ty += 20
    canvas.create_oval(x1, y1, x2, y2)

    acc = 0
    start = 0
    for gc in GC_skew(seq, window):
        r1 = r
        acc += gc
        # GC skew: radial bar whose angle encodes the window position.
        alpha = pi - (2 * pi * start) / len(seq)
        r2 = r1 - gc * zoom
        x1 = X0 + r1 * sin(alpha)
        y1 = Y0 + r1 * cos(alpha)
        x2 = X0 + r2 * sin(alpha)
        y2 = Y0 + r2 * cos(alpha)
        canvas.create_line(x1, y1, x2, y2, fill='blue')
        # accumulated GC skew, drawn on an inner ring
        r1 = r - 50
        r2 = r1 - acc
        x1 = X0 + r1 * sin(alpha)
        y1 = Y0 + r1 * cos(alpha)
        x2 = X0 + r2 * sin(alpha)
        y2 = Y0 + r2 * cos(alpha)
        canvas.create_line(x1, y1, x2, y2, fill='magenta')

        canvas.update()
        start += window

    canvas.configure(scrollregion=canvas.bbox(tkinter.ALL))
    canvas.postscript(file=outpath)
    canvas.destroy()
Esempio n. 15
0
#!/usr/bin/env python3

import argparse

from Bio import SeqIO
from Bio.SeqUtils import GC_skew

# Command-line interface: a single positional argument naming the FASTA file.
parser = argparse.ArgumentParser(
    description='Print the GC skew of every 100-base window of each FASTA '
                'record, one value per line.')
parser.add_argument('fasta', help='path to the FASTA file to analyse')
args = parser.parse_args()

for record in SeqIO.parse(args.fasta, 'fasta'):
    for skew in GC_skew(record.seq, window=100):
        print(skew)
Esempio n. 16
0
def complete_tasks(full_seq, des, unique_key):
    """Render the Streamlit DNA-analysis widgets for one sequence.

    Shows the description or the sequence, a nucleotide-frequency bar plot,
    GC content, GC-based melting temperature, windowed GC skew,
    complement / reverse complement and the protein-synthesis views.

    Args:
        full_seq: Sequence object supporting ``complement``,
            ``reverse_complement``, ``transcribe`` and ``translate``.
        des: Description text shown for the "Description" choice.
        unique_key: Key passed to every Streamlit widget created here.
            NOTE(review): all widgets share this one key -- confirm that the
            Streamlit version in use accepts duplicate keys.
    """
    file_details = st.radio("Details", ("Description", "Sequence"),
                            key=unique_key)

    #Show description and sequence in DNA Analysis section
    if file_details == "Description":
        st.write(des)
    elif file_details == "Sequence":
        st.write(full_seq)

    #Nucleotide occurances plot and color selector for the bars
    st.subheader("Plot Nucleotide Frequency")
    full_seq_freq = OrderedDict(Counter(full_seq))

    bar_colours = [
        st.beta_color_picker("Pick Colour for Bar 1", key=unique_key),
        st.beta_color_picker("Pick Colour for Bar 2", key=unique_key),
        st.beta_color_picker("Pick Colour for Bar 3", key=unique_key),
        st.beta_color_picker("Pick Colour for Bar 4", key=unique_key),
    ]

    if st.button("Plot Frequency", key=unique_key):
        barlist = plt.bar(full_seq_freq.keys(), full_seq_freq.values())
        # zip() colours only the bars that exist, so a sequence with fewer
        # than four distinct symbols no longer raises IndexError.
        for bar, colour in zip(barlist, bar_colours):
            bar.set_color(colour)
        st.pyplot()

    st.subheader("Properties")

    #GC Content, GC Melting temp, GC_skew, Complement and reverse complement
    gc_count = GC(full_seq)
    st.write("GC Content: {}".format(gc_count))

    mt = MeltingTemp.Tm_GC(full_seq, strict=False)
    st.write("Melting Temperature based on GC Content: {}".format(mt))

    gc_skew_bases = st.number_input("Enter number of bases", key=unique_key)
    try:
        gc_skew = GC_skew(full_seq, int(gc_skew_bases))
        st.write("GC Skew for {} bases: {}".format(gc_skew_bases, gc_skew))
    except ValueError:
        st.write("Enter a Valid Number for bases")

    if st.checkbox("Complement", key=unique_key):
        st.write(full_seq.complement())

    elif st.checkbox("Reverse Complement", key=unique_key):
        st.write(full_seq.reverse_complement())

    #Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = full_seq.translate()
    if st.checkbox("Transcription: DNA to mRNA", key=unique_key):
        st.write(full_seq.transcribe())

    elif st.checkbox("Translation: DNA to 1 letter Amino Acid Sequence",
                     key=unique_key):
        st.write(p1)

    elif st.checkbox("Translation: DNA to 3 letter Amino Acid Sequence",
                     key=unique_key):
        # Stop codons ("*") are dropped before the 3-letter conversion.
        full_aa_name = str(p1).replace("*", "")
        st.write(seq3(full_aa_name))

    elif st.checkbox("Plot Amino Acid Frequency", key=unique_key):
        aa_freq = OrderedDict(Counter(str(p1)))
        bar_colour = st.beta_color_picker("Pick Colour for all Bars",
                                          key=unique_key)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=bar_colour)
        st.pyplot()
        st.write("Asterisk (*) - Denotes Stop Codons.")
Esempio n. 17
0
def draw_gc(sumgene, dwg, gc_color):
    """Add GC-content and GC-skew tracks to ``dwg`` and return it.

    The sequence is cut into 300 units.  For every unit two triangles are
    added: one for the unit's GC content relative to the whole-sequence GC
    value and one for its GC skew.  Each value is rescaled linearly into the
    radial band of its track, and the triangle is filled with
    ``gc_color[0]``/``gc_color[1]`` (content) or ``gc_color[2]``/
    ``gc_color[3]`` (skew) depending on whether the tip lies at or beyond the
    track's base radius.
    """
    n_units = 300
    unit_len = int(sumgene.len / n_units)

    # Base radius and radial band (inner, outer) of each track.
    gcc_r, gcs_r = 430, 230
    gcc_lo, gcc_hi = 330, 530
    gcs_lo, gcs_hi = 130, 330

    # (start, end) of every unit; the end is capped at the sequence length.
    bounds = []
    lo, hi = 0, unit_len - 1
    for _ in range(n_units):
        bounds.append((lo, hi))
        lo += unit_len
        hi = hi + unit_len if hi + unit_len <= sumgene.len else sumgene.len

    # Per-unit statistics.
    gcc_mean = GC(sumgene.seq)
    gc_contents = []
    gc_skews = []
    for lo, hi in bounds:
        gc_contents.append(GC(sumgene.seq[lo:hi]) - gcc_mean)
        gc_skews.append(GC_skew(sumgene.seq[lo:hi], window=unit_len)[0])

    # Linear rescaling of both value ranges onto their radial bands.
    gcc_min = min(gc_contents)
    gcs_min = min(gc_skews)
    gcc_scal = (gcc_hi - gcc_lo) / (max(gc_contents) - gcc_min)
    gcs_scal = (gcs_hi - gcs_lo) / (max(gc_skews) - gcs_min)

    mid = (unit_len - 1) / 2
    for (lo, hi), content, skew_value in zip(bounds, gc_contents, gc_skews):
        # GC-content triangle: base on the track radius, tip at the value.
        base = position_mapping(sumgene, [lo, hi], gcc_r)
        gcc_nr = (content - gcc_min) * gcc_scal + gcc_lo
        tip = position_mapping(sumgene, [mid, 0], gcc_nr)
        triangle = [(1500 + base[0], 1500 - base[1]),
                    (1500 + base[2], 1500 - base[3]),
                    (1500 + tip[0], 1500 - tip[1])]
        fill = gc_color[0] if gcc_nr >= gcc_r else gc_color[1]
        dwg.add(dwg.polygon(triangle, fill=fill, stroke_width=0))

        # GC-skew triangle, built the same way on the inner track.
        base = position_mapping(sumgene, [lo, hi], gcs_r)
        gcs_nr = (skew_value - gcs_min) * gcs_scal + gcs_lo
        tip = position_mapping(sumgene, [mid, 0], gcs_nr)
        triangle = [(1500 + base[0], 1500 - base[1]),
                    (1500 + base[2], 1500 - base[3]),
                    (1500 + tip[0], 1500 - tip[1])]
        fill = gc_color[2] if gcs_nr >= gcs_r else gc_color[3]
        dwg.add(dwg.polygon(triangle, fill=fill, stroke_width=0))

        mid += unit_len
    return dwg