Esempio n. 1
0
def get_genome_cov(bam_file, ref_file, min_dp):
    """Collect per-base depth with ``samtools depth`` and summarise coverage.

    Args:
        bam_file: Path of the alignment file handed to ``samtools depth``.
        ref_file: Path of the reference FASTA. It is loaded through ``fasta``
            (to size the per-sequence depth arrays) and passed to samtools
            as ``--reference``.
        min_dp: Iterable of depth thresholds.

    Returns:
        A tuple ``(genome_cov, med, ref_cov)`` where ``genome_cov`` maps each
        threshold in ``min_dp`` to the fraction of reference positions whose
        depth is at least that threshold, ``med`` is the integer median depth
        over all positions, and ``ref_cov`` maps each sequence name to its
        list of per-position depths.

    The process exits (via ``quit()``) if samtools reports a sequence that is
    not present in the reference FASTA.
    """
    fdict = fasta(ref_file).fa_dict
    # One depth slot per reference base, defaulting to zero coverage.
    ref_cov = {}
    for s in fdict:
        ref_cov[s] = [0] * len(fdict[s])
    samtools_cmd = "samtools depth -aa --reference %s %s" % (ref_file,
                                                             bam_file)
    log(samtools_cmd)
    depth_proc = subprocess.Popen(samtools_cmd, shell=True,
                                  stdout=subprocess.PIPE)
    for l in depth_proc.stdout:
        # Expected columns: sequence name, 1-based position, depth.
        arr = l.rstrip().split()
        if arr[0] not in ref_cov:
            # The original message had a %s placeholder but never supplied
            # the sequence name; fill it in so the user sees what is missing.
            print("Can't find %s in FASTA...Have you used the correct reference sequence?" % arr[0])
            quit()
        ref_cov[arr[0]][int(arr[1]) - 1] = int(arr[2])
    all_genome = []
    for s in fdict:
        all_genome += ref_cov[s]
    # Explicit float denominator: the fraction must not collapse to 0/1 when
    # the module is run without ``from __future__ import division``.
    total_positions = float(len(all_genome))
    genome_cov = {}
    for dp in min_dp:
        genome_cov[dp] = len([1 for d in all_genome if d >= dp
                              ]) / total_positions

    med = int(np.median(all_genome))
    return genome_cov, med, ref_cov
Esempio n. 2
0
	def __init__(self,fa_file,prefix,threads=4):
		"""Record the run parameters and load the FASTA when it passes ``filecheck``.

		``fa_file`` and ``threads`` are stored in ``self.params`` (and
		``self.fasta`` is created) only when ``filecheck(fa_file)`` is truthy;
		``prefix`` is always stored.
		"""
		settings = {}
		self.params = settings
		if filecheck(fa_file):
			settings["fa_file"] = fa_file
			self.fasta = fasta(fa_file)
			settings["threads"] = threads
		settings["prefix"] = prefix
Esempio n. 3
0
    def ancestral_reconstruct(self):
        """Report alignment sites that change state more than once on the tree.

        Steps performed:
          1. List CHROM/POS of every record in ``self.bcf`` (kept in
             ``variants``, which is not used afterwards in this method).
          2. Write ``<prefix>.reduced.bcf`` with ``bcftools view -c 3`` and
             list its CHROM/POS records into ``reduced``.
          3. Convert the reduced BCF to ``<prefix>.reduced.snps.fa``.
          4. Read ``<prefix>.reconstructed.fasta`` and ``<prefix>.newick.txt``
             and, for every alignment column, count parent/child pairs whose
             characters differ; columns with more than one such change are
             printed together with an ASCII rendering of the tree.

        NOTE(review): the ``fastml`` command is only built, never executed
        (the ``run_cmd`` call is commented out), so the reconstructed FASTA
        and the tree file must already exist when this method is called.
        """
        # The doubled %% survive the %-formatting as literal % for bcftools.
        cmd = "bcftools query -f '%%CHROM\\t%%POS\n' %(bcf)s" % vars(self)
        variants = {}
        for i, l in enumerate(
                subprocess.Popen(cmd, shell=True,
                                 stdout=subprocess.PIPE).stdout):
            row = l.rstrip().split()
            variants[i] = (row[0], row[1])
        self.reduced_bcf = "%(prefix)s.reduced.bcf" % vars(self)
        # -c 3 is bcftools view's minimum-allele-count filter.
        cmd = "bcftools view -c 3 %(bcf)s -Ob -o %(reduced_bcf)s" % vars(self)
        run_cmd(cmd)
        # Record index -> (chrom, pos) for the reduced BCF; used further down
        # to translate an alignment column back to a genome position.
        reduced = {}
        cmd = "bcftools query -f '%%CHROM\\t%%POS\n' %(reduced_bcf)s" % vars(
            self)
        for i, l in enumerate(
                subprocess.Popen(cmd, shell=True,
                                 stdout=subprocess.PIPE).stdout):
            row = l.rstrip().split()
            reduced[i] = (row[0], row[1])
        new_bcf = bcf(self.reduced_bcf)
        self.fasta_file = "%(prefix)s.reduced.snps.fa" % vars(self)
        new_bcf.vcf_to_fasta(self.fasta_file)

        self.tree_file = "%s.newick.txt" % self.prefix
        self.reconstructed_fasta = "%s.reconstructed.fasta" % self.prefix
        cmd = "fastml -s %(fasta_file)s -x %(tree_file)s -j %(reconstructed_fasta)s -qf -mn" % vars(
            self)
        # Execution of the fastml command above is currently disabled:
        #		run_cmd(cmd,verbose=2)

        fdict = fasta(self.reconstructed_fasta).fa_dict
        # Tree is presumably ete's Tree class (format=1 newick) -- TODO confirm.
        t = Tree(self.tree_file, format=1)

        # Walk every alignment column (length taken from the first sequence;
        # ``.values()[0]`` relies on Python 2 returning a list).
        for i in range(len(fdict.values()[0])):
            # Counts any parent->child character change at this column, not
            # only transitions in the purine/pyrimidine sense.
            num_transitions = 0
            for node in t.traverse("postorder"):
                # The root has no ancestors, hence no parent to compare with.
                if len(node.get_ancestors()) == 0: continue
                anc = node.get_ancestors()[0]
                nuc1 = fdict[anc.name][i]
                nuc2 = fdict[node.name][i]
                # Skip comparisons where either side is "?" or "N".
                if nuc1 != "?" and nuc2 != "?" and nuc1 != "N" and nuc2 != "N":
                    if nuc1 != nuc2:
                        num_transitions += 1
                        print "%s>%s" % (nuc1, nuc2)
            if num_transitions > 1:
                print "Site: %s" % i
                print "Number of transitions: %s" % num_transitions
                # Assumes alignment column i corresponds to the i-th record of
                # the reduced BCF -- TODO confirm against vcf_to_fasta.
                print "Location: %s" % (reduced[i][1])
                # Attach the character at this column to every node so it is
                # shown in the ASCII tree below.
                for node in t.traverse("postorder"):
                    nuc = fdict[node.name][i]
                    node.add_features(nuc=nuc)
                    #p = probs[node.name][i][nuc] if node.name in probs else 1.0
                    #node.add_features(prob=p)
                print t.get_ascii(attributes=["name", "nuc"],
                                  show_internal=True)
Esempio n. 4
0
    def extract_gc_skew(self, filename, window=1000, step=500):
        """Write median depth, grouped by windowed GC percentage, as JSON.

        For every sequence in the reference (``self.ref``) a window of
        ``2 * int(window / 2)`` bases is slid along the sequence in steps of
        ``step``. For each window the integer GC percentage (count of
        upper-case ``G`` and ``C`` divided by ``window``) is used as the key
        under which the integer median of ``self.ref_dp`` over the same
        window is appended.

        Args:
            filename: Path of the JSON file to write.
            window: Window size in bases.
            step: Distance between consecutive window centres.
        """
        fa_dict = fasta(self.ref).fa_dict
        hw = int(window / 2)
        results = defaultdict(list)
        for s in fa_dict:
            for i in range(hw, len(fa_dict[s]) - hw, step):
                seq = fa_dict[s][i - hw:i + hw]
                gc_count = seq.count("G") + seq.count("C")
                # Explicit float division so the percentage is not truncated
                # to 0 when true division is not enabled for this module.
                gc_pct = int(gc_count / float(window) * 100)
                results[gc_pct].append(
                    int(np.median(self.ref_dp[s][i - hw:i + hw])))
        # Context manager guarantees the JSON is flushed and the handle closed.
        with open(filename, "w") as out_handle:
            json.dump(results, out_handle)
Esempio n. 5
0
 def generate_consensus(self, ref):
     """Write one consensus FASTA per sample, masking missing genotypes.

     For every sample in ``self.samples``:
       1. positions whose genotype is ``./.`` are listed with bcftools and
          written to a temporary BED file (0-based start, 1-based end);
       2. ``bcftools consensus`` is run against ``ref`` with that BED file
          passed through ``-m``;
       3. the resulting sequences are rewritten to
          ``<prefix>.<sample>.fasta`` with headers ``>sample_seqname``;
       4. the temporary BED and FASTA files are removed.

     Args:
         ref: Path of the reference FASTA passed to ``bcftools consensus``.
             It reaches the command template as ``%(ref)s`` via
             ``vars(self)`` -- presumably ``add_arguments_to_self`` copies it
             onto ``self``; verify against that helper.
     """
     add_arguments_to_self(self, locals())
     for s in self.samples:
         # The command templates are filled from vars(self), so per-sample
         # values are stored as instance attributes.
         self.tmp_sample = s
         cmd = "bcftools view -s %(tmp_sample)s -i 'GT==\"./.\"' %(filename)s | bcftools query -f '%%CHROM\\t%%POS\\n'" % vars(
             self)
         self.tmp_file = "%(prefix)s.%(tmp_sample)s.missing.bed" % vars(
             self)
         with open(self.tmp_file, "w") as TMP:
             for l in cmd_out(cmd):
                 row = l.rstrip().split()
                 TMP.write("%s\t%s\t%s\n" % (row[0], int(row[1]) - 1, row[1]))
         self.tmp_fa = "%(prefix)s.%(tmp_sample)s.tmp.fasta" % vars(self)
         cmd = "bcftools consensus -f %(ref)s %(filename)s -o %(tmp_fa)s -m %(tmp_file)s -s %(tmp_sample)s" % vars(
             self)
         run_cmd(cmd)
         fa_dict = fasta(self.tmp_fa).fa_dict
         self.final_fa = "%(prefix)s.%(tmp_sample)s.fasta" % vars(self)
         with open(self.final_fa, "w") as FA:
             for seq in fa_dict:
                 # Terminate each record with a newline; without it the next
                 # header is appended to the previous sequence line when the
                 # reference has more than one sequence.
                 FA.write(">%s_%s\n%s\n" % (self.tmp_sample, seq, fa_dict[seq]))
         rm_files([self.tmp_file, self.tmp_fa])
Esempio n. 6
0
def variants2vcf(var_file, seq1_file, seq2_file, prefix, vcf_file):
    """Convert a whitespace-separated variant table into a gVCF-style file.

    Each input row is expected to hold, in order: seq1 chromosome, seq1
    position, seq1 base, seq2 base (further columns are ignored). Rows where
    either base is the gap character ``-`` (indels) are skipped; an earlier
    realignment-based indel writer was disabled in the original code and is
    not part of this function.

    Every remaining row becomes a homozygous-alt record (``1/1``, depth 20).
    Rows whose ref or alt base is ``N``/``n`` are written as missing calls
    (ALT ``.``, genotype ``./.``, depth ``.``). Stretches between consecutive
    variants (and from position 1 to the first variant of a chromosome) are
    emitted as reference blocks carrying ``END=`` and ``MinDP=20``.

    Args:
        var_file: Path of the variant table to read.
        seq1_file: FASTA of the reference sequence; used to look up the REF
            base at the start of each reference block.
        seq2_file: Accepted for interface compatibility; currently unused.
        prefix: Used as the sample column name in the VCF header.
        vcf_file: Path of the VCF file to write.

    NOTE(review): the header hard-codes a ``##reference`` path and a single
    ``##contig`` line; they are kept unchanged so existing output is stable.
    """
    seq1_dict = fasta(seq1_file)
    good_dp = 20

    # Column indices of the variant table.
    seq1_chrom_i = 0
    seq1_pos_i = 1
    seq1_i = 2
    seq2_i = 3
    gap_char = "-"
    missing_bases = ("N", "n")

    # chromosome -> position -> (ref, alt, genotype, depth)
    variants = defaultdict(dict)
    with open(var_file) as var_handle:
        for line in var_handle:
            row = line.rstrip().split()
            if not row:
                # Tolerate blank lines instead of failing on row[...] below.
                continue
            ref_seq, alt_seq = row[seq1_i], row[seq2_i]
            if ref_seq == gap_char or alt_seq == gap_char:
                # Indel rows are not converted.
                continue
            variants[row[seq1_chrom_i]][int(row[seq1_pos_i])] = (
                ref_seq, alt_seq, "1/1", good_dp)

    header = """##fileformat=VCFv4.1
##reference=/home/jody/refgenome/MTB-h37rv_asm19595v2-eg18.fa
##contig=<ID=seq1_chromosome,length=4411532>
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
##INFO=<ID=MinDP,Number=1,Type=Integer,Description="Minimum per-sample depth in this gVCF block">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s
""" % prefix
    ref_block_fmt = "%s\t%s\t.\t%s\t.\t.\t.\tEND=%s;MinDP=20\tGT:DP\t0/0:%s\n"
    variant_fmt = "%s\t%s\t.\t%s\t%s\t255\t.\t.\tGT:DP\t%s:%s\n"

    with open(vcf_file, "w") as out:
        out.write(header)
        for seq1_chrom in sorted(variants):
            # Position of the previously written record on this chromosome;
            # 0 means "nothing written yet", so the first block starts at 1.
            prev_pos = 0
            for pos in sorted(variants[seq1_chrom]):
                block_start = prev_pos + 1
                if pos != block_start:
                    # Reference block covering block_start..pos-1. The REF
                    # base is taken at block_start itself (the original code
                    # looked it up one position too early for blocks that
                    # follow a variant).
                    out.write(ref_block_fmt %
                              (seq1_chrom, block_start,
                               seq1_dict.get_seq(seq1_chrom, block_start),
                               pos - 1, good_dp))
                var = variants[seq1_chrom][pos]
                if var[0] in missing_bases or var[1] in missing_bases:
                    out.write(variant_fmt %
                              (seq1_chrom, pos, var[0], ".", "./.", "."))
                else:
                    out.write(variant_fmt %
                              (seq1_chrom, pos, var[0], var[1], var[2],
                               var[3]))
                prev_pos = pos
Esempio n. 7
0
    def plot_cov(self,
                 chrom,
                 imgfile,
                 start=None,
                 end=None,
                 window=10000,
                 step=5000,
                 optimise=True,
                 plot_median=True,
                 primers=None):
        """
        Plot sliding-window median coverage for a chromosome or a region.

        Args:
            chrom(str): Chromosome name (key into ``self.ref_dp``)
            imgfile(str): Name of the output image file
            start(int): Region start; the whole chromosome is plotted unless
                both ``start`` and ``end`` are given
            end(int): Region end
            window(int): Window size for the sliding window coverage calculation
            step(int): Step size for the sliding window coverage calculation
            optimise(bool): Pick window and step size from the region length
                (regions of 1,000,000 bp or more keep ``window``/``step``)
            plot_median(bool): Draw a dashed line at the chromosome-wide
                median depth
            primers: Passed to ``fasta(self.ref).find_primer_positions``; each
                returned location is drawn as a red bar on the x axis

        When a region is given it is padded by 5% of its length on both sides
        (clamped to the chromosome) and the requested boundaries are marked
        with short vertical lines.
        """
        chrom_len = len(self.ref_dp[chrom])
        if plot_median:
            chrom_med_dp = np.median(self.ref_dp[chrom])
        # Compare against None so that a region starting at position 0 is
        # still treated as a region.
        if start is not None and end is not None:
            region_size = end - start
            offset = int(region_size * 0.05)
            # Clamp the padded region: a negative start would otherwise wrap
            # around when slicing the depth list.
            new_start = max(0, start - offset)
            new_end = min(chrom_len, end + offset)
        else:
            offset = 0
            region_size = chrom_len
            start = 0
            end = region_size
            new_start = start
            new_end = end
        # Axis unit. Plain ``elif`` chains so boundary sizes (e.g. exactly
        # 100000 bp) land in the next bucket instead of falling through.
        if region_size < 100000:
            n, d = "K", 1000
        elif region_size < 1000000000:
            n, d = "M", 1000000
        else:
            n, d = "G", 1000000000
        if optimise:
            if region_size < 10000:
                window, step = 2, 1
            elif region_size < 100000:
                window, step = 100, 50
            elif region_size < 1000000:
                window, step = 1000, 500
        log("Outputting coverage plot for region (%sbp) with window=%s and step=%s"
            % (region_size, window, step))
        scale = float(d)  # explicit float so positions are not truncated
        x = []
        y = []
        hw = int(window / 2)
        for i in range(new_start + hw, new_end - hw, step):
            x.append(i / scale)
            y.append(int(np.median(self.ref_dp[chrom][i - hw:i + hw + 1])))
        fig = plt.figure()
        plot = fig.add_subplot(111)
        plot.plot(x, y)
        plot.set_ylim(bottom=0)
        # ``y`` is empty when the region is shorter than one window.
        if y and max(y) > 200:
            plot.set_yscale('symlog')
        plot.set_xlabel("Genome Position (%sb)" % n)
        plot.set_ylabel("Median Coverage (Window size:%s)" % window)
        if plot_median:
            plot.axhline(xmin=0,
                         xmax=1,
                         y=chrom_med_dp,
                         color="orange",
                         linestyle="dashed")
        if offset:
            # Mark the requested (unpadded) region boundaries.
            plot.axvline(ymin=0, ymax=0.05, x=start / scale, color="orange")
            plot.axvline(ymin=0, ymax=0.05, x=end / scale, color="orange")
        if primers:
            locations = fasta(self.ref).find_primer_positions(primers)
            for primer in sorted(locations,
                                 key=lambda name: locations[name]["start"]):
                p = locations[primer]
                plot.plot((p["start"] / scale, p["end"] / scale), (0, 0),
                          'r-', lw=3)
        fig.savefig(imgfile)
        # Release the figure; pyplot keeps every figure alive until closed.
        plt.close(fig)
Esempio n. 8
0
                optdict[k] = line[1]
            aadict[k] = line[0]
f.close()
# Create output folders for fasta files and report files for each sequence.
# If mkdir raises OSError the folder is removed recursively through the shell
# and created again.
# NOTE(review): this happens for any OSError, not only "already exists", and
# the folder name is concatenated into the shell command unquoted.
for folder in (FOLDER_SEQMUT, FOLDER_RAPMUT):
    try:
        mkdir(folder)
    except OSError:
        system("rm -r " + folder)
        mkdir(folder)

# Column header of the tab-separated summary printed to stdout.
print "gene\tnr_mut\tproc_mut\tIe_org\tIq_org\tImin_org\tIe_ost\tIq_ost\tImin_ost"

# Per-gene processing. The snippet ends inside this loop, so only the set-up
# of each iteration is visible here.
for gen in mrnadict.keys():
    seq = mrnadict[gen]
    mutseq = fasta()
    # Per-gene report file with a fixed tab-separated header row.
    rapfile = open(FOLDER_RAPMUT + "/" + gen + "_rapmut.tsv", "w")
    rap_head = [
        "seq_name", "nr_codque", "typ_codque", "aa_codque", "nr_codmut",
        "typ_codmut", "aa_codmut", "Ie", "Iq", "Imin"
    ]
    rapfile.write("\t".join(rap_head) + "\n")
    # Mutation counter; it is embedded in the sequence name below.
    nr_mut = 0
    seq_name = gen + "_mut_" + str(nr_mut)
    # Initial metrics: Iq comes from the first 11 characters of the sequence,
    # Ie is derived from the result of translacja(), and Imin is their sum.
    Iq = elongation_time(seq[0:11])
    evilcod = translacja(seq, Iq)
    Ie = count_ie(evilcod)
    Imin = Iq + Ie

    nr_codque = codque(evilcod)
    nr_codmut = codmut(seq, nr_codque)
Esempio n. 9
0
			aadict[k] = line[0]
f.close()
# Create output folders for fasta files and report files for each sequence.
# If mkdir raises OSError the folder is removed recursively through the shell
# and created again.
# NOTE(review): this happens for any OSError, not only "already exists", and
# the folder name is concatenated into the shell command unquoted.
for folder in (FOLDER_SEQMUT, FOLDER_RAPMUT):
	try:
		mkdir(folder)
	except OSError:
		system("rm -r " + folder)
		mkdir(folder)


# Column header of the tab-separated summary printed to stdout.
print "gene\tnr_mut\tproc_mut\tIe_org\tIq_org\tImin_org\tIe_ost\tIq_ost\tImin_ost"

# Per-gene processing. The snippet ends inside this loop, so only the set-up
# of each iteration is visible here.
for gen in mrnadict.keys():
	seq    = mrnadict[gen]
	mutseq = fasta()
	# Per-gene report file with a fixed tab-separated header row.
	rapfile  = open(FOLDER_RAPMUT+"/"+gen+"_rapmut.tsv", "w")
	rap_head = ["seq_name", "nr_codque", "typ_codque", "aa_codque", "nr_codmut", "typ_codmut", "aa_codmut", "Ie", "Iq", "Imin"]
	rapfile.write("\t".join(rap_head) + "\n")
	# Mutation counter; it is embedded in the sequence name below.
	nr_mut = 0
	seq_name   = gen + "_mut_" + str(nr_mut)
	# Initial metrics: Iq comes from the first 11 characters of the sequence,
	# Ie is derived from the result of translacja(), and Imin is their sum.
	Iq      = elongation_time(seq[0:11])
	evilcod = translacja(seq, Iq)
	Ie      = count_ie(evilcod)
	Imin    = Iq + Ie

	nr_codque  = codque(evilcod)
	nr_codmut  = codmut(seq, nr_codque)

	# Keep the initial Ie and Iq under *_org names.
	Ie_org   = Ie
	Iq_org   = Iq