def calculateSkew(fullPath):
    """Compute windowed and cumulative GC skew for every record of a FASTA file.

    Each record is split into windows of 10000 nucleotides with
    ``SeqUtils.GC_skew``; one output row is produced per window. Using
    per-window values instead of per-nucleotide values is a compromise
    between computational load and accuracy.

    Args:
        fullPath: path of the FASTA file to read.

    Returns:
        A pandas DataFrame with one row per window and the columns

        * ``id`` -- the record id,
        * ``x``  -- 1-based number of the window within its record,
        * ``y``  -- running sum of the window skews of that record so far,
        * ``z``  -- GC skew of the window itself.

        Rows are labelled 0..n-1 in file order. The frame is empty (columns
        only) when the file holds no records.
    """
    window = 10000
    # Rows are collected in a plain list and the frame is built once at the
    # end: growing a DataFrame row by row with ``df.loc`` copies the data on
    # every insert.
    rows = []
    with open(fullPath, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            cumulative = 0  # restarts for every record
            window_skews = SeqUtils.GC_skew(record.seq, window)
            for count, window_skew in enumerate(window_skews, start=1):
                cumulative = cumulative + window_skew
                rows.append([record.id, count, cumulative, window_skew])
    return pd.DataFrame(rows, columns=["id", "x", "y", "z"])
from Bio import SeqIO
from Bio import SeqUtils

if __name__ == "__main__":
    # Demo: for every record of the sample FASTA print its id, sequence,
    # reverse complement, GC content and the GC skew of the first window.
    fasta_path = "../sample_genome/sample1.fasta"
    for record in SeqIO.parse(fasta_path, "fasta"):
        sequence = record.seq
        print(record.id)
        print(sequence)
        print(sequence.reverse_complement())
        print(SeqUtils.GC(sequence))
        first_window_skew = SeqUtils.GC_skew(sequence)[0]
        print(first_window_skew)
from Bio import SeqIO
from Bio import SeqUtils

if __name__ == "__main__":
    # For every record of the sample FASTA write "<record id>_GCskew.txt":
    # a header line "<id>,<window size>" followed by one
    # "<1-based window start>,<skew>" line per window.
    size = 10000
    for record in SeqIO.parse("../sample_genome/sample2.fasta", "fasta"):
        window_skews = SeqUtils.GC_skew(record.seq, size)
        with open("{}_GCskew.txt".format(record.id), "w") as out:
            out.write(record.id + "," + str(size) + "\n")
            for window_no, value in enumerate(window_skews):
                window_start = window_no * size + 1
                out.write(str(window_start) + "," + str(value) + "\n")
import sys

import pyBigWig
from Bio import SeqIO
from Bio import SeqUtils

# Usage: <script> <input-fasta> <span> <output-bigwig>
span = int(sys.argv[2])
bw = pyBigWig.open(sys.argv[3], "w")

# First pass over the FASTA: the header is added in a single call listing
# every (sequence id, length) pair, before any values are written.
chrom_sizes = [(rec.id, len(rec)) for rec in SeqIO.parse(sys.argv[1], "fasta")]
bw.addHeader(chrom_sizes)

# Second pass: one fixed-step run of GC-skew values per sequence, starting at
# position 0, with window size == step == span.
for rec in SeqIO.parse(sys.argv[1], "fasta"):
    window_skews = list(SeqUtils.GC_skew(rec.seq, span))
    bw.addEntries(rec.id, 0, values=window_skews, span=span, step=span)

bw.close()
#!/usr/bin/python
# coding=utf-8
'''
gc_skew.py [input-fasta][output-txt]

Write the GC skew of a single-record FASTA file as tab-separated lines
"start<TAB>end<TAB>skew", one line per window of 1000 nucleotides.
Coordinates are 1-based and inclusive; the last window ends at the
sequence length.
'''
import re
import sys

from Bio import SeqIO, SeqUtils

WINDOW = 1000  # window size in nucleotides

input_file_name = sys.argv[1]   # e.g. "polish_assembly.fasta"
output_file_name = sys.argv[2]  # e.g. "GC_skew.txt"

# Parse the input and compute the skew BEFORE opening the output file, so a
# missing or malformed input does not leave a freshly truncated output behind.
rec = SeqIO.read(input_file_name, 'fasta')
gc = SeqUtils.GC_skew(rec.seq, WINDOW)
seq_len = len(rec)

with open(output_file_name, "w") as output_file:
    for window_no, gc_value in enumerate(gc):
        start = window_no * WINDOW + 1
        # Inclusive end of the window: start + WINDOW - 1. The previous
        # "start + 1000" reported the first base of the NEXT window, which was
        # inconsistent with the last window being clamped to the sequence
        # length.
        end = min(start + WINDOW - 1, seq_len)
        output_file.write(str(start) + "\t" + str(end) + "\t" + str(gc_value) + "\n")
def get_lstm_xx(j, seq_dict, kmer=2, dim=128, mode='train'):
    """Build the LSTM input and the scalar feature vector for a pair of genes.

    Args:
        j: sequence whose first ten items are
            ``loc1, scf1, std1, st1, ed1, loc2, scf2, std2, st2, ed2``.
            ``scf*`` are keys into ``seq_dict``, ``std*`` are strand symbols
            (compared against ``'+'``), ``st*``/``ed*`` are 1-based start and
            end coordinates (anything ``int()`` accepts). ``loc*`` are not
            used here (presumably locus names -- confirm with the caller).
        seq_dict: mapping from scaffold key to an object that supports
            slicing, ``.reverse_complement()`` and ``.seq`` (presumably
            Biopython ``SeqRecord`` objects -- confirm).
        kmer: k-mer length used to encode the intergenic sequence.
        dim: length of the returned ``X0`` list.
        mode: accepted for interface compatibility; not used.

    Returns:
        ``(X0, X1)``.

        ``X1`` is ``[cai_a, cai_b, bias, distn, ratio, gc, skew] + up10[1:]
        + up35[1:]`` where ``(cai_a, cai_b)`` is ``(cai1, cai2)`` on the
        ``'+'`` strand and ``(cai2, cai1)`` otherwise.

        ``X0`` holds the k-mer codes of the intergenic sequence. With ``n``
        codes: ``n == dim`` returns them as they are; ``2 <= n < dim`` puts
        the first and last ``n // 2`` codes at the two ends of a list padded
        with ``-1``; ``n > dim`` keeps the first and last ``dim // 2`` codes;
        ``n < 2`` leaves all ``dim`` entries at ``-1``.

        When the two genes are on different scaffolds or strands the
        placeholders ``([127] * dim, [10**4] * 11)`` are returned.

    Note:
        The feature order used to be chosen by ``st1 == st2 == '+'``, which
        compares integer coordinates with a string and was therefore always
        false. It now tests the strand, so ``'+'`` pairs get
        ``[cai1, cai2, ...]``. Models fitted on the old ordering should be
        re-evaluated.
    """
    loc1, scf1, std1, st1, ed1, loc2, scf2, std2, st2, ed2 = j[:10]
    if scf1 != scf2 or std1 != std2:
        return [127] * dim, [10**4] * 11

    # From here on both genes share scaffold and strand.
    scaffold = scf1
    forward = std1 == '+'

    # 1-based inclusive coordinates -> 0-based half-open slice bounds.
    st1, ed1, st2, ed2 = (int(v) for v in (st1, ed1, st2, ed2))
    st1 -= 1
    st2 -= 1
    if st1 > st2:
        # Make gene 1 the one that starts first on the scaffold.
        st1, ed1, st2, ed2 = st2, ed2, st1, ed1

    def oriented(record):
        # Sequence as read along the genes' strand.
        return record if forward else record.reverse_complement()

    # Upper-cased gene sequences and the region between them (empty when the
    # genes overlap, because the slice bounds are then reversed).
    seq1 = str(oriented(seq_dict[scaffold][st1:ed1]).seq).upper()
    seq2 = str(oriented(seq_dict[scaffold][st2:ed2]).seq).upper()
    seq12 = str(oriented(seq_dict[scaffold][ed1:st2]).seq).upper()

    # 1D features such as gc, dist
    cai1, cai2 = cai(seq1), cai(seq2)
    # Intergenic distance normalised by the span of both genes.
    distn = float(st2 - ed1) / (ed2 - st1)
    # Log length ratio of the two genes, sign-flipped on the reverse strand.
    ratio = math.log(float(ed1 - st1) / (ed2 - st2))
    if not forward:
        ratio = -ratio

    # Motif, GC and skew features use only the last 100 nt of the region.
    tail = seq12[-100:]
    bgs = Counter(tail)
    up10 = find_motif(tail, box_up10, bgs)
    up35 = find_motif(tail, box_up35, bgs)
    if tail:
        gc = SeqUtils.GC(tail)
        try:
            skew = SeqUtils.GC_skew(tail)[0]
        except Exception:
            # Best effort: fall back to a neutral skew.
            skew = 0.
    else:
        gc = skew = 0.

    bias = sharekmer(seq1, seq2)
    if forward:
        X1 = [cai1, cai2, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]
    else:
        X1 = [cai2, cai1, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]

    # 1D features of lstm: one code per overlapping k-mer of the region.
    lstm_seq = [
        s2n(seq12[pos:pos + kmer], code5)
        for pos in range(len(seq12) - kmer + 1)
    ]

    X0 = [-1] * dim
    ndim = len(lstm_seq)
    if ndim == dim:
        X0 = lstm_seq
    elif 2 <= ndim < dim:
        # Short region: both halves go to the ends, padding stays in the
        # middle (for odd ndim the middle code is dropped).
        half = ndim // 2
        X0[:half] = lstm_seq[:half]
        X0[-half:] = lstm_seq[-half:]
    elif ndim > dim:
        # Long region: keep only its two ends.
        half = dim // 2
        X0[:half] = lstm_seq[:half]
        X0[-half:] = lstm_seq[-half:]
    # ndim < 2: X0 stays fully padded.

    return X0, X1
def get_xx0(j, seq_dict, kmer=2, dim=128, mode='train', context=False):
    """Build the k-mer matrix, scalar features and k-mer codes for a gene pair.

    Args:
        j: sequence whose first ten items are
            ``loc1, scf1, std1, st1, ed1, loc2, scf2, std2, st2, ed2``.
            ``scf*`` are keys into ``seq_dict``, ``std*`` are strand symbols
            (compared against ``'+'``), ``st*``/``ed*`` are 1-based start and
            end coordinates (anything ``int()`` accepts). ``loc*`` are not
            used here (presumably locus names -- confirm with the caller).
        seq_dict: mapping from scaffold key to an object that supports
            slicing, ``.reverse_complement()`` and ``.seq`` (presumably
            Biopython ``SeqRecord`` objects -- confirm).
        kmer: k-mer length passed to ``kpm`` and used for the code list.
        dim: size parameter passed to ``kpm`` as ``d``.
        mode: accepted for interface compatibility; not used.
        context: when true the matrices of gene 1, the intergenic region and
            gene 2 are concatenated along axis 1; otherwise only the
            intergenic matrix is used.

    Returns:
        ``(X0, X1, X2)``.

        ``X0`` is a one-element list holding the k-mer matrix (set to zero
        when the genes overlap, i.e. ``ed1 > st2``).

        ``X1`` is ``[cai_a, cai_b, bias, distn, ratio, gc, skew] + up10[1:]
        + up35[1:]`` where ``(cai_a, cai_b)`` is ``(cai1, cai2)`` on the
        ``'+'`` strand and ``(cai2, cai1)`` otherwise.

        ``X2`` is the list of codes of all overlapping k-mers of the
        intergenic sequence.

        When the genes are on different scaffolds or strands the placeholders
        ``([ones], [10**4] * 11, [127] * dim)`` are returned, where ``ones``
        is an all-ones array of shape ``(4**kmer * 3, dim // 3 * 3)`` with
        ``context`` and ``(4**kmer * 3, dim // 3)`` without.

    Note:
        The feature order used to be chosen by ``st1 == st2 == '+'``, which
        compares integer coordinates with a string and was therefore always
        false. It now tests the strand, so ``'+'`` pairs get
        ``[cai1, cai2, ...]``. Models fitted on the old ordering should be
        re-evaluated.
    """
    loc1, scf1, std1, st1, ed1, loc2, scf2, std2, st2, ed2 = j[:10]
    if scf1 != scf2 or std1 != std2:
        if context:
            X0 = np.ones((4**kmer * 3, dim // 3 * 3))
        else:
            X0 = np.ones((4**kmer * 3, dim // 3))
        return [X0], [10**4] * 11, [127] * dim

    # From here on both genes share scaffold and strand.
    scaffold = scf1
    forward = std1 == '+'

    # 1-based inclusive coordinates -> 0-based half-open slice bounds.
    st1, ed1, st2, ed2 = (int(v) for v in (st1, ed1, st2, ed2))
    st1 -= 1
    st2 -= 1
    if st1 > st2:
        # Make gene 1 the one that starts first on the scaffold.
        st1, ed1, st2, ed2 = st2, ed2, st1, ed1

    def oriented(record):
        # Sequence as read along the genes' strand.
        return record if forward else record.reverse_complement()

    # Upper-cased gene sequences and the region between them (empty when the
    # genes overlap, because the slice bounds are then reversed).
    seq1 = str(oriented(seq_dict[scaffold][st1:ed1]).seq).upper()
    seq2 = str(oriented(seq_dict[scaffold][st2:ed2]).seq).upper()
    seq12 = str(oriented(seq_dict[scaffold][ed1:st2]).seq).upper()

    # 1D features such as gc, dist
    cai1, cai2 = cai(seq1), cai(seq2)
    # Intergenic distance normalised by the span of both genes.
    distn = float(st2 - ed1) / (ed2 - st1)
    # Log length ratio of the two genes, sign-flipped on the reverse strand.
    ratio = math.log(float(ed1 - st1) / (ed2 - st2))
    if not forward:
        ratio = -ratio

    # Motif, GC and skew features use only the last 100 nt of the region.
    tail = seq12[-100:]
    bgs = Counter(tail)
    up10 = find_motif(tail, box_up10, bgs)
    up35 = find_motif(tail, box_up35, bgs)
    if tail:
        gc = SeqUtils.GC(tail)
        try:
            skew = SeqUtils.GC_skew(tail)[0]
        except Exception:
            # Best effort: fall back to a neutral skew.
            skew = 0.
    else:
        gc = skew = 0.

    bias = sharekmer(seq1, seq2)
    if forward:
        X1 = [cai1, cai2, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]
    else:
        X1 = [cai2, cai1, bias, distn, ratio, gc, skew] + up10[1:] + up35[1:]

    # 2D features of kmer matrix
    if context:
        seqmat12 = kpm(seq12, d=dim, k=kmer, scale=4)
        seqmat1 = kpm(seq1, d=dim, k=kmer, scale=4)
        seqmat2 = kpm(seq2, d=dim, k=kmer, scale=4)
        seqmat = np.concatenate((seqmat1, seqmat12, seqmat2), 1)
    else:
        seqmat = kpm(seq12, d=dim, k=kmer, scale=4)

    # Overlapping genes have no intergenic region: blank the matrix.
    # NOTE(review): applied to both the context and the non-context matrix --
    # confirm this check was not meant for the non-context branch only.
    if ed1 > st2:
        seqmat[:] = 0

    X0 = [seqmat]

    # Codes of all overlapping k-mers of the intergenic sequence.
    X2 = [
        s2n(seq12[pos:pos + kmer], code5)
        for pos in range(len(seq12) - kmer + 1)
    ]
    return X0, X1, X2
gdt2 = GenomeDiagram.Track('GC content', greytrack=1, greytrack_labels=4) gdt2.add_set(gdgs) track_list.append(gdt2) def gcSkewData(lst): ret_list = [] i = 50 for x in lst: ret_list.append((i, x)) i = i + 100 return ret_list graphdata2 = gcSkewData(SeqUtils.GC_skew(seq_string)) gdgs2 = GenomeDiagram.GraphSet('GC Skew') gdgs2.new_graph(graphdata, 'GC Skew', style='line', linewidth=2) gdt3 = GenomeDiagram.Track('GC Skew', greytrack=1, greytrack_labels=4) gdt3.add_set(gdgs2) track_list.append(gdt3) gd_diagram = GenomeDiagram.Diagram("Tomato Curly Stunt Virus, complete genome", track_size=0.7) i = 1 for track in track_list: