Esempio n. 1
0
def calculateSkew(fullPath):
    """Compute windowed and cumulative GC skew for every record of a FASTA file.

    Each record is parsed with ``SeqIO`` and its GC skew is computed with
    ``SeqUtils.GC_skew`` over windows of 10000 nucleotides. Using one value
    per window is a compromise between computational load and resolution.

    Args:
        fullPath: path of the FASTA file to read.

    Returns:
        A pandas DataFrame with one row per window and the columns

        * ``id`` -- the record id,
        * ``x``  -- 1-based window number within the record,
        * ``y``  -- running sum of the window skews of the record so far,
        * ``z``  -- GC skew of the window itself.

        Rows are indexed 0..n-1 in the order they were produced.
    """
    window = 10000
    rows = []

    with open(fullPath, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            cumulative = 0
            skews = SeqUtils.GC_skew(record.seq, window)
            for count, nc in enumerate(skews, 1):
                cumulative = cumulative + nc
                rows.append((record.id, count, cumulative, nc))

    # Build the frame once: growing a DataFrame row by row copies it on every
    # insert, and the earlier ``df.index += 1`` renumbered all existing rows
    # instead of advancing the insert position.
    return pd.DataFrame(rows, columns=["id", "x", "y", "z"])
Esempio n. 2
0
from Bio import SeqIO
from Bio import SeqUtils
if __name__ == "__main__":
    # For every record of the sample FASTA file print, in this order: its id,
    # its sequence, the reverse complement, the GC value and the GC skew of
    # the first window (default window size of SeqUtils.GC_skew).
    fasta_path = "../sample_genome/sample1.fasta"
    for entry in SeqIO.parse(fasta_path, "fasta"):
        sequence = entry.seq
        print(entry.id)
        print(sequence)
        print(sequence.reverse_complement())
        print(SeqUtils.GC(sequence))
        first_window_skew = SeqUtils.GC_skew(sequence)[0]
        print(first_window_skew)
Esempio n. 3
0
from Bio import SeqIO
from Bio import SeqUtils
if __name__ == "__main__":
    # Write one "<record id>_GCskew.txt" file per FASTA record.
    # First line: "<record id>,<window size>"; every following line:
    # "<1-based start of the window>,<GC skew of the window>".
    window_size = 10000
    for record in SeqIO.parse("../sample_genome/sample2.fasta", "fasta"):
        skews = SeqUtils.GC_skew(record.seq, window_size)
        with open("{}_GCskew.txt".format(record.id), "w") as out:
            out.write(record.id + "," + str(window_size) + "\n")
            for window_no, value in enumerate(skews):
                window_start = window_no * window_size + 1
                out.write(",".join((str(window_start), str(value))) + "\n")
Esempio n. 4
0
import sys

import pyBigWig
from Bio import SeqIO
from Bio import SeqUtils

# Command line: <input fasta> <window span> <output bigWig>
span = int(sys.argv[2])
bw = pyBigWig.open(sys.argv[3], "w")

# First pass over the FASTA file: collect (record id, record length) pairs,
# because the header is added before any entries are written.
bw.addHeader([(rec.id, len(rec)) for rec in SeqIO.parse(sys.argv[1], "fasta")])

# Second pass: one GC-skew value per window of `span` nucleotides, written as
# fixed-step entries starting at position 0 of each record.
for rec in SeqIO.parse(sys.argv[1], "fasta"):
    skew_values = list(SeqUtils.GC_skew(rec.seq, span))
    bw.addEntries(rec.id, 0, values=skew_values, span=span, step=span)

bw.close()
Esempio n. 5
0
#!/usr/bin/python

# coding=utf-8

'''
gc_skew.py [input-fasta][output-txt]

'''


import sys,re

input_file_name = sys.argv[1]  # e.g. "polish_assembly.fasta"
output_file_name = sys.argv[2]  # e.g. "GC_skew.txt"

from Bio import SeqIO, SeqUtils

# Window size, in nucleotides, used for the GC-skew calculation.
window = 1000

# Read the input before creating the output, so that a bad input file does
# not leave an empty output file behind.
rec = SeqIO.read(input_file_name, 'fasta')
seq_len = len(rec)

# One line per window: "<start>\t<end>\t<GC skew>", with 1-based inclusive
# coordinates. The end of a window is start + window - 1 (the previous
# start + window made consecutive windows overlap by one position); the last
# window is clamped to the sequence length.
with open(output_file_name, "w") as output_file:
    for window_no, gc_value in enumerate(SeqUtils.GC_skew(rec.seq, window)):
        start = window_no * window + 1
        end = min(start + window - 1, seq_len)
        output_file.write(
            str(start) + "\t" + str(end) + "\t" + str(gc_value) + "\n")
Esempio n. 6
0
def get_lstm_xx(j, seq_dict, kmer=2, dim=128, mode='train'):
    """Build the k-mer code list and the scalar features for a pair of genes.

    Args:
        j: sequence whose first ten items are
            ``(loc1, scf1, std1, st1, ed1, loc2, scf2, std2, st2, ed2)``,
            i.e. for each gene a label, scaffold id, strand, start and end.
            Starts are decremented by one before slicing.
        seq_dict: mapping from scaffold id to a sliceable record that exposes
            ``.seq`` and ``.reverse_complement()`` (presumably Biopython
            ``SeqRecord`` objects -- confirm against the caller).
        kmer: length of the k-mers encoded from the intergenic sequence.
        dim: length of the returned ``X0`` list when it is padded/truncated.
        mode: not used by this function; kept for interface compatibility.

    Returns:
        ``(X0, X1)``.

        * ``X0`` -- codes (``s2n(..., code5)``) of the overlapping k-mers of
          the sequence between the two genes. If there are exactly ``dim``
          codes they are returned as is; otherwise the leading and trailing
          halves are copied into a list of ``dim`` entries pre-filled with -1.
        * ``X1`` -- ``[cai, cai, bias, distn, ratio, gc, skew]`` followed by
          ``up10[1:]`` and ``up35[1:]`` from ``find_motif``.

        When the genes are on different scaffolds or strands the placeholders
        ``[127] * dim`` and ``[10**4] * 11`` are returned instead.
    """
    _, scf1, std1, st1, ed1, _, scf2, std2, st2, ed2 = j[:10]
    if scf1 != scf2 or std1 != std2:
        return [127] * dim, [10**4] * 11

    # Coordinates arrive as text/numbers; starts become 0-based slice bounds.
    st1, ed1, st2, ed2 = [int(v) for v in (st1, ed1, st2, ed2)]
    st1 -= 1
    st2 -= 1

    if st1 > st2:
        # Order the genes by start. Scaffold and strand are already known to
        # be equal, so only the coordinates need swapping.
        st1, ed1, st2, ed2 = st2, ed2, st1, ed1

    forward = std1 == '+'
    scaffold = seq_dict[scf1]

    def _oriented_upper(start, end):
        # Slice the scaffold, orient it along the genes' strand, upper-case.
        piece = scaffold[start:end]
        if not forward:
            piece = piece.reverse_complement()
        return str(piece.seq).upper()

    seq1 = _oriented_upper(st1, ed1)
    seq2 = _oriented_upper(st2, ed2)
    seq12 = _oriented_upper(ed1, st2)  # region between the two genes

    # 1D features such as gc, dist
    cai1, cai2 = cai(seq1), cai(seq2)
    distn = (st2 - ed1) * 1. / (ed2 - st1)
    ratio = math.log((ed1 - st1) * 1. / (ed2 - st2))
    if not forward:
        ratio = -ratio

    # Motif and composition features use the last 100 bases of the gap only.
    tail = seq12[-100:]
    bgs = Counter(tail)
    up10 = find_motif(tail, box_up10, bgs)
    up35 = find_motif(tail, box_up35, bgs)
    if tail:
        gc = SeqUtils.GC(tail)
        try:
            skew = SeqUtils.GC_skew(tail)[0]
        except Exception:
            # Best effort: fall back to a neutral skew, but no longer swallow
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            skew = 0.
    else:
        gc = skew = 0.

    bias = sharekmer(seq1, seq2)
    # The feature order follows the strand. The previous test compared the
    # integer starts with '+' (``st1 == st2 == '+'``), which is never true, so
    # the swapped order was always used. NOTE(review): this changes the CAI
    # order for '+' strand pairs; models fitted on the old order need checking.
    if forward:
        X1 = [cai1, cai2, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]
    else:
        X1 = [cai2, cai1, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]

    # 1D features of lstm: one code per overlapping k-mer of the gap.
    n12 = len(seq12)
    lstm_seq = [
        s2n(seq12[pos:pos + kmer], code5) for pos in range(n12 - kmer + 1)
    ]

    X0 = [-1] * dim
    ndim = len(lstm_seq)
    if ndim == dim:
        X0 = lstm_seq
    elif 2 <= ndim < dim:
        # Short gap: first half at the front, last half at the back, -1 between.
        half = ndim // 2
        X0[:half] = lstm_seq[:half]
        X0[-half:] = lstm_seq[-half:]
    elif ndim > dim:
        # Long gap: keep only the leading and trailing dim // 2 codes.
        half = dim // 2
        X0[:half] = lstm_seq[:half]
        X0[-half:] = lstm_seq[-half:]

    return X0, X1
Esempio n. 7
0
def get_xx0(j, seq_dict, kmer=2, dim=128, mode='train', context=False):
    """Build the k-mer matrix, scalar features and k-mer codes for a gene pair.

    Args:
        j: sequence whose first ten items are
            ``(loc1, scf1, std1, st1, ed1, loc2, scf2, std2, st2, ed2)``,
            i.e. for each gene a label, scaffold id, strand, start and end.
            Starts are decremented by one before slicing.
        seq_dict: mapping from scaffold id to a sliceable record that exposes
            ``.seq`` and ``.reverse_complement()`` (presumably Biopython
            ``SeqRecord`` objects -- confirm against the caller).
        kmer: k-mer length passed to ``kpm`` and used for the code list.
        dim: ``d`` argument passed to ``kpm``.
        mode: not used by this function; kept for interface compatibility.
        context: when true, the ``kpm`` matrices of gene 1, the gap and gene 2
            are concatenated along axis 1; otherwise only the gap is used.

    Returns:
        ``(X0, X1, X2)``.

        * ``X0`` -- one-element list holding the ``kpm`` matrix; it is zeroed
          when the first gene ends after the second one starts.
        * ``X1`` -- ``[cai, cai, bias, distn, ratio, gc, skew]`` followed by
          ``up10[1:]`` and ``up35[1:]`` from ``find_motif``.
        * ``X2`` -- codes (``s2n(..., code5)``) of the overlapping k-mers of
          the sequence between the two genes.

        When the genes are on different scaffolds or strands, placeholders are
        returned: a matrix of ones, ``[10**4] * 11`` and ``[127] * dim``.
    """
    _, scf1, std1, st1, ed1, _, scf2, std2, st2, ed2 = j[:10]
    if scf1 != scf2 or std1 != std2:
        n_cols = dim // 3 * 3 if context else dim // 3
        X0 = np.ones((4**kmer * 3, n_cols))
        return [X0], [10**4] * 11, [127] * dim

    # Coordinates arrive as text/numbers; starts become 0-based slice bounds.
    st1, ed1, st2, ed2 = [int(v) for v in (st1, ed1, st2, ed2)]
    st1 -= 1
    st2 -= 1

    if st1 > st2:
        # Order the genes by start. Scaffold and strand are already known to
        # be equal, so only the coordinates need swapping.
        st1, ed1, st2, ed2 = st2, ed2, st1, ed1

    forward = std1 == '+'
    scaffold = seq_dict[scf1]

    def _oriented_upper(start, end):
        # Slice the scaffold, orient it along the genes' strand, upper-case.
        piece = scaffold[start:end]
        if not forward:
            piece = piece.reverse_complement()
        return str(piece.seq).upper()

    seq1 = _oriented_upper(st1, ed1)
    seq2 = _oriented_upper(st2, ed2)
    seq12 = _oriented_upper(ed1, st2)  # region between the two genes

    # 1D features such as gc, dist
    cai1, cai2 = cai(seq1), cai(seq2)
    distn = (st2 - ed1) * 1. / (ed2 - st1)
    ratio = math.log((ed1 - st1) * 1. / (ed2 - st2))
    if not forward:
        ratio = -ratio

    # Motif and composition features use the last 100 bases of the gap only.
    tail = seq12[-100:]
    bgs = Counter(tail)
    up10 = find_motif(tail, box_up10, bgs)
    up35 = find_motif(tail, box_up35, bgs)
    if tail:
        gc = SeqUtils.GC(tail)
        try:
            skew = SeqUtils.GC_skew(tail)[0]
        except Exception:
            # Best effort: fall back to a neutral skew, but no longer swallow
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            skew = 0.
    else:
        gc = skew = 0.

    bias = sharekmer(seq1, seq2)
    # The feature order follows the strand. The previous test compared the
    # integer starts with '+' (``st1 == st2 == '+'``), which is never true, so
    # the swapped order was always used. NOTE(review): this changes the CAI
    # order for '+' strand pairs; models fitted on the old order need checking.
    if forward:
        X1 = [cai1, cai2, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]
    else:
        X1 = [cai2, cai1, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]

    # 2D features of kmer matrix
    if context:
        seqmat12 = kpm(seq12, d=dim, k=kmer, scale=4)
        seqmat1 = kpm(seq1, d=dim, k=kmer, scale=4)
        seqmat2 = kpm(seq2, d=dim, k=kmer, scale=4)
        seqmat = np.concatenate((seqmat1, seqmat12, seqmat2), 1)
    else:
        seqmat = kpm(seq12, d=dim, k=kmer, scale=4)

    if ed1 > st2:
        # The genes overlap, so there is no gap between them: blank the matrix.
        seqmat[:] = 0
    X0 = [seqmat]

    n12 = len(seq12)
    X2 = [
        s2n(seq12[pos:pos + kmer], code5) for pos in range(n12 - kmer + 1)
    ]

    return X0, X1, X2
Esempio n. 8
0
# Track named 'GC content': attach the graph set `gdgs` (created earlier,
# outside this excerpt) and queue the track in `track_list`.
gdt2 = GenomeDiagram.Track('GC content', greytrack=1, greytrack_labels=4)
gdt2.add_set(gdgs)
track_list.append(gdt2)


def gcSkewData(lst, window=100):
    """Pair each windowed value with the midpoint position of its window.

    Args:
        lst: iterable of per-window values, in window order.
        window: window size the values were computed with. The default of 100
            reproduces the previous fixed positions 50, 150, 250, ...

    Returns:
        A list of ``(position, value)`` tuples, where the position of the
        k-th value (k starting at 0) is ``window // 2 + k * window``.
    """
    half = window // 2
    return [(half + k * window, value) for k, value in enumerate(lst)]


# GC skew per window (default window of SeqUtils.GC_skew), paired with the
# x position computed by gcSkewData.
graphdata2 = gcSkewData(SeqUtils.GC_skew(seq_string))

gdgs2 = GenomeDiagram.GraphSet('GC Skew')
# Plot the skew series computed above. `graphdata` was passed here before,
# which left `graphdata2` unused and drew another series on the skew track.
gdgs2.new_graph(graphdata2, 'GC Skew', style='line', linewidth=2)

gdt3 = GenomeDiagram.Track('GC Skew', greytrack=1, greytrack_labels=4)
gdt3.add_set(gdgs2)

track_list.append(gdt3)

gd_diagram = GenomeDiagram.Diagram("Tomato Curly Stunt Virus, complete genome",
                                   track_size=0.7)

i = 1

for track in track_list: