Example #1
0
def arenosa_snps(pileup_file, o_1_file):
	"""Write the positions parsed from a pileup to a gzipped, tab-separated file.

	``pileup_file`` is parsed with ``Pileup.parse`` restricted to Chr1-Chr5.
	One line is written per entry of the resulting ``pileup_dict``, ordered
	by (chromosome, position), with the columns::

	    At<chr>  <pos>  <ref>  <nuclist entries...>

	Args:
		pileup_file: Path handed to ``Pileup.parse``.
		o_1_file: Path of the gzip-compressed output file to create.

	Returns:
		An empty tuple (kept so existing callers see the same value).
	"""
	aa_snps = Pileup.parse(pileup_file, region_list="Chr1;Chr2;Chr3;Chr4;Chr5")  # Only saves SNP positions by default
	ordered = sorted(aa_snps.pileup_dict.items(), key=lambda item: (item[0][0], item[0][1]))
	with gzip.open(o_1_file, 'wb') as outfile:
		for (chrom, pos), (ref, nuclist) in ordered:
			# str() every entry: join() raises TypeError on non-string items,
			# and for entries that are already strings this is a no-op.
			nuclist_str = '\t'.join(str(nuc) for nuc in nuclist)
			outline = '\t'.join(["At" + str(chrom), str(pos), str(ref), nuclist_str]) + "\n"
			outfile.write(outline.encode("UTF-8"))
	return ()
Example #2
0
def arenosa_snps(pileup_file, o_1_file):
    """Write the positions parsed from a pileup to a gzipped, tab-separated file.

    ``pileup_file`` is parsed with ``Pileup.parse`` restricted to Chr1-Chr5.
    One line is written per entry of the resulting ``pileup_dict``, ordered
    by (chromosome, position), with the columns::

        At<chr>  <pos>  <ref>  <nuclist entries...>

    Args:
        pileup_file: Path handed to ``Pileup.parse``.
        o_1_file: Path of the gzip-compressed output file to create.

    Returns:
        An empty tuple (kept so existing callers see the same value).
    """
    aa_snps = Pileup.parse(
        pileup_file,
        region_list="Chr1;Chr2;Chr3;Chr4;Chr5",
    )  # Only saves SNP positions by default
    ordered = sorted(aa_snps.pileup_dict.items(),
                     key=lambda item: (item[0][0], item[0][1]))
    with gzip.open(o_1_file, 'wb') as outfile:
        for (chrom, pos), (ref, nuclist) in ordered:
            # str() every entry: join() raises TypeError on non-string items,
            # and for entries that are already strings this is a no-op.
            nuclist_str = '\t'.join(str(nuc) for nuc in nuclist)
            outline = '\t'.join(
                ["At" + str(chrom), str(pos), str(ref), nuclist_str]) + "\n"
            outfile.write(outline.encode("UTF-8"))
    return ()
Example #3
0
def _read_has_arenosa_variant(snps, chrom, s_pos, cigar_pieces, nucs, read_name):
	"""Return True when a read shows a variant present in >=5% of the pileup reads.

	Walks the ``(length, operation)`` pairs in ``cigar_pieces`` starting at
	reference position ``s_pos`` and consults ``snps.PosInfo`` for every
	position it examines. ``read_name`` is only used in the debug trace.
	"""
	current_pos = s_pos
	# Last reference position examined by an "M" piece. Non-"M" pieces are
	# looked up at this position. The original relied on the leaked loop
	# variable ``i`` for this, which held an unrelated index (or was unbound)
	# whenever no "M" piece immediately preceded the indel.
	anchor_pos = None
	for val, operation in cigar_pieces:
		val = int(val)
		if operation == "M":  # A, C, T, or G
			# NOTE(review): the read offset is taken as (pos - s_pos), which
			# presumably is only right before any I/D/S piece -- confirm.
			for pos in range(current_pos, current_pos + val):  # Check all nucleotides for match w/ reference
				anchor_pos = pos
				ref, nuclist = snps.PosInfo(chrom, pos)
				if ref == -1:  # Ref == -1 when there isn't an A. arenosa SNP at position (chrom, pos)
					continue
				cur_nuc = nucs[pos - s_pos:pos - s_pos + 1]
				if cur_nuc == ref:  # Read contains the reference nucleotide
					continue
				total_nucs = Pileup.sum_nucs(nuclist, True)
				snp_counts = {
					"A": nuclist[0],
					"C": nuclist[1],
					"G": nuclist[2],
					"T": nuclist[3],
				}
				if cur_nuc in snp_counts and (snp_counts[cur_nuc] / total_nucs) >= 0.05:
					# SNP present in >=5% of A. arenosa reads mapped to TAIR9, so consider it a valid mismap; skip read
					print(str(cur_nuc), " (", str(ref), "): ", str(round(snp_counts[cur_nuc] / total_nucs, 2)), "\t", str(chrom), ":", str(pos), "\t", str(read_name), sep='')  # Debug trace (flagged "Delete me" by the author)
					return True
		elif anchor_pos is not None:  # Insertion or deletion ("S" takes the same path as "D")
			ref, nuclist = snps.PosInfo(chrom, anchor_pos)
			if ref != -1:  # Ref == -1 when there isn't an A. arenosa SNP at position (chrom, anchor_pos)
				indel_str = ("+" if operation == "I" else "-") + str(val)
				# NOTE(review): only the first two characters are compared, so
				# lengths >= 10 can never match -- confirm the entry format.
				for idx in range(4, len(nuclist)):
					if nuclist[idx][:2] == indel_str:
						total_nucs = Pileup.sum_nucs(nuclist, True)
						indel_freq = int(nuclist[idx][3 + val:]) / total_nucs
						if indel_freq >= 0.05:
							# Indel present in >=5% of A. arenosa reads mapped to TAIR9, so consider it a valid mismap; skip read
							print(str(nuclist[idx]), " (", str(ref), "): ", str(round(indel_freq, 2)), str(chrom), ":", str(anchor_pos), "\t", str(read_name), sep='')  # Debug trace (flagged "Delete me" by the author)
							return True
		# NOTE(review): every operation advances the reference position,
		# including "I" and "S" -- confirm this is intended.
		current_pos += val
	return False


def filter_sam(sam_file, SNP_file, o_2_file):
	"""Copy a SAM file to a gzipped file, dropping reads that look like A. arenosa mismaps.

	Lines are written to ``o_2_file`` unchanged when they are header lines
	(first field starts with "@"), when the reference name (third field) is
	"*", or when one of the optional fields equals "NM:i:0". Every other read
	is checked against the pileup parsed from ``SNP_file`` and is dropped if
	it carries a SNP or indel seen in at least 5% of the reads at that
	position; otherwise it is written as well.

	Args:
		sam_file: Input SAM path; read through gzip when it ends in ".gz".
		SNP_file: Pileup path handed to ``Pileup.parse``.
		o_2_file: Path of the gzip-compressed output file to create.
	"""
	recomp = re.compile(r"(\d+)([MSID])")
	Aa_SNPs = Pileup.parse(SNP_file, simplified=True, only_save_SNPs=False, region_list="AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
	is_gz = sam_file[-3:] == ".gz"
	source = gzip.open(sam_file, 'rb') if is_gz else open(sam_file)
	# Both handles are context-managed so the input is closed too (it used to leak).
	with source as infile, gzip.open(o_2_file, 'wb') as outfile:
		for line in infile:
			if is_gz:
				line = str(line, encoding='utf8')
			line_split = line.split("\t")
			if line_split[0][:1] != "@":
				chrom = line_split[2]
				# Unmapped reads and perfect matches (NM:i:0) are kept without inspection.
				if chrom != "*" and "NM:i:0" not in line_split[11:]:
					if _read_has_arenosa_variant(
						Aa_SNPs,
						chrom,
						int(line_split[3]),
						recomp.findall(line_split[5]),
						line_split[9],
						line_split[0],
					):
						continue
			outfile.write(bytes(line, "UTF-8"))
Example #4
0
# Collect the stripped lines of dup_file into dup_list (used below as the set
# of gene names to keep).
with open(dup_file) as infile:
	for line in infile:
		line = line.strip()
		dup_list.append(line)
gff_genes_dict = GFF.parse_genes(gff_file)
# Optionally widen every gene interval by adj_value on both sides.
if adj_value > 0:
	for gene_name, [chr, start_pos, end_pos] in gff_genes_dict.gene_dict.items():
		start_pos = start_pos - adj_value
		end_pos = end_pos + adj_value
		gff_genes_dict.gene_dict[gene_name] = [chr, start_pos, end_pos]
if use_random == "Y":
	# Construct a list of randomly selected genes to replace the list of duplicated genes
	#dup_list = sample(gff_genes_dict.gene_dict.keys(),len(dup_list))
	# NOTE(review): the random sample above is disabled; every gene is selected instead.
	dup_list = gff_genes_dict.gene_dict.keys() # Delete me soon
# Semicolon-separated "chr:start-end" regions for the genes in dup_list,
# ordered by (chromosome, start position).
dup_positions = ";".join([chr + ":" + str(s_pos) + "-" + str(e_pos) for gene_name, [chr,s_pos,e_pos] in sorted(gff_genes_dict.gene_dict.items(), key = lambda x: (x[1][0], x[1][1]) ) if gene_name in dup_list ] )
dup_pileup = Pileup.parse(pileup_file, False, dup_positions)

suns_in_gene = Counter()
suns_in_gene_prob = {}

# SUN-finding algorithm that runs when finding SUNs in As. Looks for 50/50 Con/SNP in As
with open(sun_genes_filename[:-4] + "_SUNs.txt", 'w') as sue_sun_file: # Debugging file
	# Column header for the debugging file.
	fisher_line = "Pos\tRef\tRefValue\tMaxSNP\tMaxSNPValue\tP-value\n"
	sue_sun_file.write(fisher_line)
	for entry in dup_pileup:
		(chr, pos), [ref, nuclist] = entry
		if ref not in ["A","C","G","T"]: continue # Skip ambiguous nucleotide positions
		gene_name = gff_genes_dict.get_gene_name(chr, pos)
		if gene_name in dup_list:
			totalreads, consensus_num, maxSNPs = Pileup.fisher_snp_info(ref,nuclist)
			if totalreads < 12 or sum([int(v) for v in maxSNPs.values()]) == 0: continue # Either the total usable reads (consensus + maxSNP)  is < 12, or no SNPs are present
			# NOTE(review): the snippet is truncated here; the rest of the loop body is not visible.
Example #5
0
def _carries_arenosa_variant(snps, chrom, s_pos, cigar_pieces, nucs):
    """Return True when a read shows a variant present in >=5% of the pileup reads.

    Walks the ``(length, operation)`` pairs in ``cigar_pieces`` starting at
    reference position ``s_pos`` and consults ``snps.PosInfo`` for every
    position it examines.
    """
    current_pos = s_pos
    # Last reference position examined by an "M" piece. Non-"M" pieces are
    # looked up at this position. The original relied on the leaked loop
    # variable ``i`` for this, which held an unrelated index (or was unbound)
    # whenever no "M" piece immediately preceded the indel.
    anchor_pos = None
    for val, operation in cigar_pieces:
        val = int(val)
        if operation == "M":  # A, C, T, or G
            # NOTE(review): the read offset is taken as (pos - s_pos), which
            # presumably is only right before any I/D/S piece -- confirm.
            for pos in range(current_pos, current_pos + val):
                anchor_pos = pos
                ref, nuclist = snps.PosInfo(chrom, pos)
                if ref == -1:  # No A. arenosa SNP at position (chrom, pos)
                    continue
                cur_nuc = nucs[pos - s_pos:pos - s_pos + 1]
                if cur_nuc == ref:  # Read contains the reference nucleotide
                    continue
                total_nucs = Pileup.sum_nucs(nuclist, True)
                snp_counts = {
                    "A": nuclist[0],
                    "C": nuclist[1],
                    "G": nuclist[2],
                    "T": nuclist[3],
                }
                if (cur_nuc in snp_counts
                        and (snp_counts[cur_nuc] / total_nucs) >= 0.05):
                    # SNP present in >=5% of A. arenosa reads mapped to TAIR9,
                    # so consider it a valid mismap; skip read
                    return True
        elif anchor_pos is not None:  # Insertion or deletion ("S" takes the same path as "D")
            ref, nuclist = snps.PosInfo(chrom, anchor_pos)
            if ref != -1:  # An A. arenosa entry exists at (chrom, anchor_pos)
                indel_str = ("+" if operation == "I" else "-") + str(val)
                # NOTE(review): only the first two characters are compared, so
                # lengths >= 10 can never match -- confirm the entry format.
                for idx in range(4, len(nuclist)):
                    if nuclist[idx][:2] == indel_str:
                        total_nucs = Pileup.sum_nucs(nuclist, True)
                        if (int(nuclist[idx][3 + val:]) / total_nucs) >= 0.05:
                            # Indel present in >=5% of A. arenosa reads mapped
                            # to TAIR9, so consider it a valid mismap; skip read
                            return True
        # NOTE(review): every operation advances the reference position,
        # including "I" and "S" -- confirm this is intended.
        current_pos += val
    return False


def filter_sam(sam_file, SNP_file, o_2_file):
    """Copy a SAM file to a gzipped file, dropping reads that look like A. arenosa mismaps.

    Lines are written to ``o_2_file`` unchanged when they are header lines
    (first field starts with "@"), when the reference name (third field) is
    "*", or when one of the optional fields equals "NM:i:0". Every other read
    is checked against the pileup parsed from ``SNP_file`` and is dropped if
    it carries a SNP or indel seen in at least 5% of the reads at that
    position; otherwise it is written as well.

    Args:
        sam_file: Input SAM path; read through gzip when it ends in ".gz".
        SNP_file: Pileup path handed to ``Pileup.parse``.
        o_2_file: Path of the gzip-compressed output file to create.
    """
    recomp = re.compile(r"(\d+)([MSID])")
    aa_SNPs = Pileup.parse(SNP_file,
                           simplified=True,
                           only_save_SNPs=False,
                           region_list="AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
    is_gz = sam_file[-3:] == ".gz"
    source = gzip.open(sam_file, 'rb') if is_gz else open(sam_file)
    # Both handles are context-managed so the input is closed too (it used to leak).
    with source as infile, gzip.open(o_2_file, 'wb') as outfile:
        for line in infile:
            if is_gz:
                line = line.decode('utf-8')
            line_split = line.split("\t")
            if line_split[0][:1] != "@":
                chrom = line_split[2]
                # Unmapped reads and perfect matches (NM:i:0) are kept
                # without inspection.
                if chrom != "*" and "NM:i:0" not in line_split[11:]:
                    if _carries_arenosa_variant(
                            aa_SNPs,
                            chrom,
                            int(line_split[3]),
                            recomp.findall(line_split[5]),
                            line_split[9],
                    ):
                        continue
            outfile.write(bytes(line, "UTF-8"))
Example #6
0
            linelist = line.split()
            if line[:1] == ">":
                if chr != "":
                    chr_lengths[chr] = chr_len
                chr = linelist[0][1:]
                chr_len = 0
            else:
                line = line.split()
                chr_len += len(line[0])
    chr_lengths[chr] = chr_len
    return(chr_lengths)

# Script entry: read the command-line values, measure chromosome lengths from
# the reference, and load the pileup.
pileup, ref, win = command_line()
chr_lengths = get_chr_len(ref)

PileupFile = Pileup.parse(pileup,True)

#windows = [i for i in range(1,1501)]
# Window sizes 1600, 1700, ..., 5000.
windows = [i for i in range(1600,5001,100)]
# NOTE(review): this loop rebinds `win`, so the value returned by
# command_line() above is never used in the visible code.
for win in windows:
    # One output directory per window size, created on demand.
    outdir = "Output/HMMCovWin/" + str(win) + "bp-window/"
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for chr in sorted(chr_lengths):
        histogram = Counter()
        outlist = []
        # Step along the chromosome from position 1 in strides of `win`.
        for i in range(1,chr_lengths[chr]+1,win):
            # Region string "<chr>:<start>-<start + win>" passed to Coverage.
            loc = str(chr) + ":" + str(i) + "-" + str(i+win)
            cov = PileupFile.Coverage(loc,True)
            # NOTE(review): the snippet is truncated after this line; the
            # branch for a coverage value not yet in the histogram is not visible.
            if cov in histogram: histogram[cov] += 1
Example #7
0
#!/usr/bin/env python3
import sys; sys.path.append('../Packages/')
from GenomeTools import Pileup

#filename = "/home/mattchat/SuecicaDupSearch/Data/Sue/sue1/single_bp_preprocess/set5/libSUE1_set5_aln_Aa_Filtered.pileup.gz"
#Pileup.coverage(filename, "AtChr1")
#Pileup.coverage(filename, "AtChr2")
#Pileup.coverage(filename, "AtChr3")
#Pileup.coverage(filename, "AtChr4")
#Pileup.coverage(filename, "AtChr5")
#Pileup.coverage(filename, "AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")

#filename = "/home/mattchat/SuecicaDupSearch/Data/weigel_col-0/SRR013327,SRR013328_aln.pileup.gz"
#filename = "/home/mattchat/SuecicaDupSearch/Data/weigel_bur-0/SRR013331,SRR013333_aln.pileup.gz"
#filename = "/home/mattchat/SuecicaDupSearch/Data/weigel_tsu-1/SRR013335,SRR013337_aln.pileup.gz"
#filename = "/home/mattchat/SuecicaDupSearch/Data/C24/C24_all_reads_aln.pileup.gz"
# Pileup to process; swap in one of the commented-out paths above to change it.
filename = "/home/mattchat/SuecicaDupSearch/Data/Ler-0/ERR031544,SRR279136_TAIR9_aln.pileup.gz"

# Run Pileup.coverage once per chromosome, then once for all five together.
chromosomes = ["Chr" + str(number) for number in range(1, 6)]
for region in chromosomes + [";".join(chromosomes)]:
    Pileup.coverage(filename, region)
Example #8
0
            if line[:1] == ">":
                if chr != "":
                    chr_lengths[chr] = chr_len
                chr = linelist[0][1:]
                chr_len = 0
            else:
                line = line.split()
                chr_len += len(line[0])
    chr_lengths[chr] = chr_len
    return (chr_lengths)


# Script entry: read the command-line values, measure chromosome lengths from
# the reference, and load the pileup.
pileup, ref, win = command_line()
chr_lengths = get_chr_len(ref)

PileupFile = Pileup.parse(pileup, True)

#windows = [i for i in range(1,1501)]
# Window sizes 1600, 1700, ..., 5000.
windows = [i for i in range(1600, 5001, 100)]
# NOTE(review): this loop rebinds `win`, so the value returned by
# command_line() above is never used in the visible code.
for win in windows:
    # One output directory per window size, created on demand.
    outdir = "Output/HMMCovWin/" + str(win) + "bp-window/"
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for chr in sorted(chr_lengths):
        histogram = Counter()
        outlist = []
        # Step along the chromosome from position 1 in strides of `win`.
        for i in range(1, chr_lengths[chr] + 1, win):
            # Region string "<chr>:<start>-<start + win>" passed to Coverage.
            loc = str(chr) + ":" + str(i) + "-" + str(i + win)
            cov = PileupFile.Coverage(loc, True)
            # NOTE(review): the snippet is truncated after this line; the
            # branch for a coverage value not yet in the histogram is not visible.
            if cov in histogram: histogram[cov] += 1