def get_genome_cov(bam_file, ref_file, min_dp):
    """Compute per-base depth and genome-wide coverage fractions for a BAM.

    Runs ``samtools depth -aa`` on ``bam_file`` and stores the reported depth
    of every position of every sequence found in ``ref_file``.

    Args:
        bam_file: Path of the alignment file passed to ``samtools depth``.
        ref_file: Path of the reference FASTA; it is loaded with ``fasta``
            and also passed to samtools as ``--reference``.
        min_dp: Iterable of depth thresholds.

    Returns:
        A tuple ``(genome_cov, med, ref_cov)``:
        ``genome_cov`` maps each threshold to the fraction (0.0-1.0) of all
        reference positions whose depth is >= that threshold, ``med`` is the
        median depth over all positions truncated to an int, and ``ref_cov``
        maps each sequence name to its list of per-position depths.
    """
    fdict = fasta(ref_file).fa_dict
    ref_cov = {}
    for s in fdict:
        ref_cov[s] = [0] * len(fdict[s])

    samtools_cmd = "samtools depth -aa --reference %s %s" % (ref_file, bam_file)
    log(samtools_cmd)
    # universal_newlines=True yields text lines on both Python 2 and 3.
    proc = subprocess.Popen(samtools_cmd, shell=True, stdout=subprocess.PIPE,
                            universal_newlines=True)
    for l in proc.stdout:
        arr = l.rstrip().split()
        if not arr:
            continue
        if arr[0] not in ref_cov:
            print("Can't find %s in FASTA...Have you used the correct reference sequence?" % arr[0])
            quit()
        # samtools positions are 1-based; the depth lists are 0-based.
        ref_cov[arr[0]][int(arr[1]) - 1] = int(arr[2])
    proc.stdout.close()
    proc.wait()

    all_genome = []
    for s in fdict:
        all_genome += ref_cov[s]

    # float() forces true division; plain "/" truncates to 0 or 1 on Python 2.
    total = float(len(all_genome))
    genome_cov = {}
    for dp in min_dp:
        genome_cov[dp] = sum(1 for d in all_genome if d >= dp) / total
    med = int(np.median(all_genome))
    return genome_cov, med, ref_cov
def __init__(self, fa_file, prefix, threads=4):
    """Record the run parameters and load the FASTA file.

    ``threads`` and ``prefix`` are always stored in ``self.params``.
    ``fa_file`` is stored there, and wrapped in a ``fasta`` object kept as
    ``self.fasta``, only when ``filecheck(fa_file)`` is truthy.
    """
    settings = {}
    self.params = settings
    # NOTE(review): the source formatting was flattened; the extent of this
    # ``if`` body (two statements) is assumed - confirm against the original.
    if filecheck(fa_file):
        settings["fa_file"] = fa_file
        self.fasta = fasta(fa_file)
    settings["threads"] = threads
    settings["prefix"] = prefix
def ancestral_reconstruct(self):
    """Report sites whose state changes more than once along the tree.

    Steps performed:
      1. Write ``<prefix>.reduced.bcf`` with ``bcftools view -c 3`` and read
         back the CHROM/POS of every record in it.
      2. Export the reduced BCF to ``<prefix>.reduced.snps.fa``.
      3. Load ``<prefix>.reconstructed.fasta`` and ``<prefix>.newick.txt``
         and, for every alignment column, count parent->child edges whose
         two nucleotides differ (edges touching "?" or "N" are ignored).
         Each differing edge is printed; columns with more than one such
         edge are printed together with an ASCII rendering of the tree.

    Sets ``self.reduced_bcf``, ``self.fasta_file``, ``self.tree_file`` and
    ``self.reconstructed_fasta``. Returns ``None``.
    """
    def read_positions(bcf_path):
        # Map record index -> (CHROM, POS) for every record in bcf_path.
        query = "bcftools query -f '%%CHROM\\t%%POS\n' %s" % bcf_path
        proc = subprocess.Popen(query, shell=True, stdout=subprocess.PIPE,
                                universal_newlines=True)
        positions = {}
        for idx, line in enumerate(proc.stdout):
            fields = line.rstrip().split()
            positions[idx] = (fields[0], fields[1])
        proc.stdout.close()
        proc.wait()
        return positions

    self.reduced_bcf = "%(prefix)s.reduced.bcf" % vars(self)
    run_cmd("bcftools view -c 3 %(bcf)s -Ob -o %(reduced_bcf)s" % vars(self))
    reduced = read_positions(self.reduced_bcf)

    new_bcf = bcf(self.reduced_bcf)
    self.fasta_file = "%(prefix)s.reduced.snps.fa" % vars(self)
    new_bcf.vcf_to_fasta(self.fasta_file)

    self.tree_file = "%s.newick.txt" % self.prefix
    self.reconstructed_fasta = "%s.reconstructed.fasta" % self.prefix
    # NOTE: the fastml step is not executed here (it was already disabled),
    # so tree_file and reconstructed_fasta must exist beforehand. Command:
    #   fastml -s <fasta_file> -x <tree_file> -j <reconstructed_fasta> -qf -mn

    fdict = fasta(self.reconstructed_fasta).fa_dict
    if not fdict:
        return
    t = Tree(self.tree_file, format=1)
    missing = ("?", "N")
    # next(iter(...)) works on both Python 2 and 3 dict views.
    num_sites = len(next(iter(fdict.values())))
    for i in range(num_sites):
        num_transitions = 0
        for node in t.traverse("postorder"):
            ancestors = node.get_ancestors()
            if not ancestors:
                # Root node: no parent edge to compare.
                continue
            nuc1 = fdict[ancestors[0].name][i]
            nuc2 = fdict[node.name][i]
            if nuc1 in missing or nuc2 in missing:
                continue
            if nuc1 != nuc2:
                num_transitions += 1
                print("%s>%s" % (nuc1, nuc2))
        if num_transitions > 1:
            print("Site: %s" % i)
            print("Number of transitions: %s" % num_transitions)
            print("Location: %s" % (reduced[i][1]))
            for node in t.traverse("postorder"):
                node.add_features(nuc=fdict[node.name][i])
            print(t.get_ascii(attributes=["name", "nuc"], show_internal=True))
def extract_gc_skew(self, filename, window=1000, step=500):
    """Write median depth grouped by GC percentage to a JSON file.

    Slides a window along every sequence of ``self.ref``. For each window
    the number of "G" and "C" characters is turned into an integer
    percentage of ``window``, and the median of ``self.ref_dp`` over the
    same span (truncated to int) is appended to the list for that
    percentage.

    Args:
        filename: Output path; receives a JSON object mapping GC percentage
            to a list of median depths.
        window: Window size in bases.
        step: Distance between consecutive window centres.
    """
    fa_dict = fasta(self.ref).fa_dict
    hw = int(window / 2)
    results = defaultdict(list)
    for s in fa_dict:
        sequence = fa_dict[s]
        depth = self.ref_dp[s]
        for i in range(hw, len(sequence) - hw, step):
            seq = sequence[i - hw:i + hw]
            gc_count = seq.count("G") + seq.count("C")
            # Multiply before the floor division: "/" truncates to 0 on
            # Python 2, and float maths can round 29% down to 28.
            gc_percent = int(gc_count * 100 // window)
            results[gc_percent].append(int(np.median(depth[i - hw:i + hw])))
    with open(filename, "w") as out:
        json.dump(results, out)
def generate_consensus(self, ref):
    """Write one consensus FASTA per sample in ``self.samples``.

    For each sample:
      1. Positions where the sample's genotype is "./." are listed with
         bcftools and written as BED intervals to
         ``<prefix>.<sample>.missing.bed``.
      2. ``bcftools consensus`` is run against ``ref`` with that BED file
         as the ``-m`` mask, writing ``<prefix>.<sample>.tmp.fasta``.
      3. The sequences are rewritten to ``<prefix>.<sample>.fasta`` with
         headers of the form ``><sample>_<sequence name>``.
      4. The two temporary files are removed.

    Args:
        ref: Reference FASTA path; stored on ``self`` by
            ``add_arguments_to_self`` and used in the consensus command.
    """
    add_arguments_to_self(self, locals())
    for s in self.samples:
        self.tmp_sample = s
        cmd = "bcftools view -s %(tmp_sample)s -i 'GT==\"./.\"' %(filename)s | bcftools query -f '%%CHROM\\t%%POS\\n'" % vars(self)
        self.tmp_file = "%(prefix)s.%(tmp_sample)s.missing.bed" % vars(self)
        with open(self.tmp_file, "w") as bed:
            for l in cmd_out(cmd):
                row = l.rstrip().split()
                # BED is 0-based half-open: POS p becomes [p-1, p).
                bed.write("%s\t%s\t%s\n" % (row[0], int(row[1]) - 1, row[1]))

        self.tmp_fa = "%(prefix)s.%(tmp_sample)s.tmp.fasta" % vars(self)
        cmd = "bcftools consensus -f %(ref)s %(filename)s -o %(tmp_fa)s -m %(tmp_file)s -s %(tmp_sample)s" % vars(self)
        run_cmd(cmd)

        fa_dict = fasta(self.tmp_fa).fa_dict
        self.final_fa = "%(prefix)s.%(tmp_sample)s.fasta" % vars(self)
        with open(self.final_fa, "w") as out:
            for seq in fa_dict:
                # The trailing newline keeps the next ">" header on its own
                # line when the reference has more than one sequence.
                out.write(">%s_%s\n%s\n" % (self.tmp_sample, seq, fa_dict[seq]))
        rm_files([self.tmp_file, self.tmp_fa])
def variants2vcf(var_file, seq1_file, seq2_file, prefix, vcf_file):
    """Convert a whitespace-separated variant table into a gVCF-style file.

    Each row of ``var_file`` is read as
    ``seq1_chrom  seq1_pos  seq1_base  seq2_base ...``. Rows where either
    base is "-" (indels) are skipped; every other row becomes a homozygous
    alternate call (``1/1``, DP 20). Positions between calls are emitted as
    reference blocks (``0/0`` with ``END=``). A call whose ref or alt base
    is "N"/"n" is written as a missing genotype (``./.``).

    Args:
        var_file: Path of the variant table.
        seq1_file: FASTA of the first (reference) sequence; used to look up
            the REF base of reference blocks.
        seq2_file: Accepted for interface compatibility; not used, because
            the indel realignment that needed it is disabled.
        prefix: Sample name written as the last column of the #CHROM line.
        vcf_file: Output path.

    NOTE: indel rows are not converted; their positions end up inside the
    surrounding reference blocks. No block is written after the last call
    of a chromosome.
    NOTE(review): the ##reference and ##contig header lines are hard-coded
    and will not match other references - confirm whether that matters.
    """
    seq1_dict = fasta(seq1_file)
    good_dp = 20
    seq1_chrom_i, seq1_pos_i, seq1_i, seq2_i = 0, 1, 2, 3
    gap_char = "-"

    variants = defaultdict(dict)
    with open(var_file) as var_handle:
        for line in var_handle:
            # e.g. "seq1_chromosome 4215484 G C" (SNP) or
            #      "Chromosome 212835 C -"       (indel, skipped)
            row = line.rstrip().split()
            if not row:
                continue
            ref_seq, alt_seq = row[seq1_i], row[seq2_i]
            if ref_seq == gap_char or alt_seq == gap_char:
                continue
            variants[row[seq1_chrom_i]][int(row[seq1_pos_i])] = (
                ref_seq, alt_seq, "1/1", good_dp)

    header = (
        "##fileformat=VCFv4.1\n"
        "##reference=/home/jody/refgenome/MTB-h37rv_asm19595v2-eg18.fa\n"
        "##contig=<ID=seq1_chromosome,length=4411532>\n"
        "##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">\n"
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Raw Depth\">\n"
        "##INFO=<ID=MinDP,Number=1,Type=Integer,Description=\"Minimum per-sample depth in this gVCF block\">\n"
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n"
    )
    block_fmt = "%s\t%s\t.\t%s\t.\t.\t.\tEND=%s;MinDP=20\tGT:DP\t0/0:%s\n"
    call_fmt = "%s\t%s\t.\t%s\t%s\t255\t.\t.\tGT:DP\t%s:%s\n"
    unknown = ("N", "n")

    with open(vcf_file, "w") as out:
        out.write(header % prefix)
        for chrom in sorted(variants):
            # Position of the previous call; 0 makes the first block of
            # every chromosome start at position 1.
            prev_pos = 0
            for pos in sorted(variants[chrom]):
                if pos != prev_pos + 1:
                    block_start = prev_pos + 1
                    # REF is the base at the block's own start position.
                    out.write(block_fmt % (
                        chrom, block_start,
                        seq1_dict.get_seq(chrom, block_start),
                        pos - 1, good_dp))
                var = variants[chrom][pos]
                if var[0] in unknown or var[1] in unknown:
                    out.write(call_fmt % (chrom, pos, var[0], ".", "./.", "."))
                else:
                    out.write(call_fmt % (chrom, pos, var[0], var[1], var[2], var[3]))
                prev_pos = pos
def plot_cov(self, chrom, imgfile, start=None, end=None, window=10000, step=5000, optimise=True, plot_median=True, primers=None):
    """
    Plot sliding-window median coverage along a chromosome or region.

    Args:
        chrom(str): Chromosome name (key of ``self.ref_dp``)
        imgfile(str): Name of the output image
        start(int): Region start; when ``start`` and ``end`` are both given
            the region is plotted with a 5% margin on each side and its
            bounds are marked with short vertical lines
        end(int): Region end
        window(int): Window size for the sliding window coverage calculation
        step(int): Step size for the sliding window coverage calculation
        optimise(bool): Override window and step based on the region length
            (only for regions shorter than 1,000,000 bp)
        plot_median(bool): Draw a dashed line at the chromosome-wide median
        primers: Passed to ``find_primer_positions``; each returned location
            is drawn as a red bar on the x axis
    """
    depth = self.ref_dp[chrom]
    chrom_len = len(depth)
    if plot_median:
        chrom_med_dp = np.median(depth)

    # "start is not None" lets a region begin at position 0.
    region_given = start is not None and bool(end)
    if region_given:
        region_size = end - start
        offset = int(region_size * 0.05)
        # Clamp the margin: negative indices would wrap around the depth
        # list and indices past the end would give empty windows.
        new_start = max(0, start - offset)
        new_end = min(chrom_len, end + offset)
    else:
        region_size = chrom_len
        start, end = 0, chrom_len
        new_start, new_end = start, end

    if region_size < 100000:
        n, d = "K", 1000
    elif region_size < 1000000000:
        n, d = "M", 1000000
    else:
        n, d = "G", 1000000000
    d = float(d)  # true division on Python 2 as well

    if optimise:
        if region_size < 10000:
            window, step = 2, 1
        elif region_size < 100000:
            window, step = 100, 50
        elif region_size < 1000000:
            window, step = 1000, 500
    log("Outputting coverage plot for region (%sbp) with window=%s and step=%s" % (region_size, window, step))

    x = []
    y = []
    hw = int(window / 2)
    for i in range(new_start + hw, new_end - hw, step):
        x.append(i / d)
        y.append(int(np.median(depth[i - hw:i + hw + 1])))

    fig = plt.figure()
    plot = fig.add_subplot(111)
    plot.plot(x, y)
    plot.set_ylim(bottom=0)
    # y is empty when the region is shorter than one window.
    if y and max(y) > 200:
        plot.set_yscale('symlog')
    plot.set_xlabel("Genome Position (%sb)" % n)
    plot.set_ylabel("Median Coverage (Window size:%s)" % window)
    if plot_median:
        plot.axhline(xmin=0, xmax=1, y=chrom_med_dp, color="orange", linestyle="dashed")
    if region_given:
        plot.axvline(ymin=0, ymax=0.05, x=start / d, color="orange")
        plot.axvline(ymin=0, ymax=0.05, x=end / d, color="orange")
    if primers:
        locations = fasta(self.ref).find_primer_positions(primers)
        for primer in sorted(locations, key=lambda name: locations[name]["start"]):
            p = locations[primer]
            plot.plot((p["start"] / d, p["end"] / d), (0, 0), 'r-', lw=3)
    fig.savefig(imgfile)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
optdict[k] = line[1] aadict[k] = line[0] f.close() # Create output folders for fasta files and report files for each sequence. for folder in (FOLDER_SEQMUT, FOLDER_RAPMUT): try: mkdir(folder) except OSError: system("rm -r " + folder) mkdir(folder) print "gene\tnr_mut\tproc_mut\tIe_org\tIq_org\tImin_org\tIe_ost\tIq_ost\tImin_ost" for gen in mrnadict.keys(): seq = mrnadict[gen] mutseq = fasta() rapfile = open(FOLDER_RAPMUT + "/" + gen + "_rapmut.tsv", "w") rap_head = [ "seq_name", "nr_codque", "typ_codque", "aa_codque", "nr_codmut", "typ_codmut", "aa_codmut", "Ie", "Iq", "Imin" ] rapfile.write("\t".join(rap_head) + "\n") nr_mut = 0 seq_name = gen + "_mut_" + str(nr_mut) Iq = elongation_time(seq[0:11]) evilcod = translacja(seq, Iq) Ie = count_ie(evilcod) Imin = Iq + Ie nr_codque = codque(evilcod) nr_codmut = codmut(seq, nr_codque)
aadict[k] = line[0] f.close() # Create output folders for fasta files and report files for each sequence. for folder in (FOLDER_SEQMUT, FOLDER_RAPMUT): try: mkdir(folder) except OSError: system("rm -r " + folder) mkdir(folder) print "gene\tnr_mut\tproc_mut\tIe_org\tIq_org\tImin_org\tIe_ost\tIq_ost\tImin_ost" for gen in mrnadict.keys(): seq = mrnadict[gen] mutseq = fasta() rapfile = open(FOLDER_RAPMUT+"/"+gen+"_rapmut.tsv", "w") rap_head = ["seq_name", "nr_codque", "typ_codque", "aa_codque", "nr_codmut", "typ_codmut", "aa_codmut", "Ie", "Iq", "Imin"] rapfile.write("\t".join(rap_head) + "\n") nr_mut = 0 seq_name = gen + "_mut_" + str(nr_mut) Iq = elongation_time(seq[0:11]) evilcod = translacja(seq, Iq) Ie = count_ie(evilcod) Imin = Iq + Ie nr_codque = codque(evilcod) nr_codmut = codmut(seq, nr_codque) Ie_org = Ie Iq_org = Iq