def arenosa_snps(pileup_file, o_1_file):
    """Write the SNP positions parsed from a pileup to a gzipped TSV file.

    Each output line is ``At<chr>\\t<pos>\\t<ref>\\t<nuclist entries...>``,
    with the entries of the position's nucleotide list tab-separated.
    Positions are written sorted by (chromosome, position).

    Args:
        pileup_file: Path of the pileup file handed to ``Pileup.parse``.
        o_1_file: Path of the gzip-compressed output file to create.

    Returns:
        An empty tuple (kept for compatibility with existing callers).
    """
    # Only saves SNP positions by default
    aa_snps = Pileup.parse(pileup_file,
                           region_list="Chr1;Chr2;Chr3;Chr4;Chr5")
    ordered = sorted(aa_snps.pileup_dict.items(),
                     key=lambda item: (item[0][0], item[0][1]))
    with gzip.open(o_1_file, 'wb') as outfile:
        for (chrom, pos), (ref, nuclist) in ordered:
            # str() each entry: joining raw entries raises TypeError as soon
            # as the list holds a non-string value such as an integer count.
            nuclist_str = '\t'.join(str(item) for item in nuclist)
            outline = '\t'.join(
                ["At" + str(chrom), str(pos), str(ref), nuclist_str]) + "\n"
            outfile.write(outline.encode("UTF-8"))
    return ()
def arenosa_snps(pileup_file, o_1_file):
    """Write the SNP positions parsed from a pileup to a gzipped TSV file.

    Each output line is ``At<chr>\\t<pos>\\t<ref>\\t<nuclist entries...>``,
    with the entries of the position's nucleotide list tab-separated.
    Positions are written sorted by (chromosome, position).

    Args:
        pileup_file: Path of the pileup file handed to ``Pileup.parse``.
        o_1_file: Path of the gzip-compressed output file to create.

    Returns:
        An empty tuple (kept for compatibility with existing callers).
    """
    # Only saves SNP positions by default
    aa_snps = Pileup.parse(pileup_file,
                           region_list="Chr1;Chr2;Chr3;Chr4;Chr5")
    ordered = sorted(aa_snps.pileup_dict.items(),
                     key=lambda item: (item[0][0], item[0][1]))
    with gzip.open(o_1_file, 'wb') as outfile:
        for (chrom, pos), (ref, nuclist) in ordered:
            # str() each entry: joining raw entries raises TypeError as soon
            # as the list holds a non-string value such as an integer count.
            nuclist_str = '\t'.join(str(item) for item in nuclist)
            outline = '\t'.join(
                ["At" + str(chrom), str(pos), str(ref), nuclist_str]) + "\n"
            outfile.write(outline.encode("UTF-8"))
    return ()
def filter_sam(sam_file, SNP_file, o_2_file):
    """Copy a SAM file to a gzipped file, dropping likely A. arenosa mismaps.

    Unmapped reads (reference name ``*``) and reads tagged ``NM:i:0`` are
    copied unchanged.  Any other read is dropped when it carries a
    non-reference base or an indel that the SNP pileup reports in at least
    5% of the A. arenosa reads at that position.

    Args:
        sam_file: Input SAM path; read through gzip when it ends in ".gz".
        SNP_file: Simplified pileup of A. arenosa variants, parsed with
            ``Pileup.parse``.
        o_2_file: Path of the gzip-compressed SAM file to create.
    """
    cigar_re = re.compile(r"(\d+)([MSID])")
    aa_snps = Pileup.parse(SNP_file, simplified=True, only_save_SNPs=False,
                           region_list="AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
    is_gzipped = sam_file.endswith(".gz")
    source = gzip.open(sam_file, 'rb') if is_gzipped else open(sam_file)
    # Both handles are context-managed so the input is closed as well.
    with source as infile, gzip.open(o_2_file, 'wb') as outfile:
        for line in infile:
            if is_gzipped:
                line = line.decode('utf8')
            fields = line.split("\t")
            # NOTE(review): the original nesting of the final write was lost;
            # header lines ("@...") are assumed to be copied through - confirm.
            if fields[0][:1] != "@":
                chrom = fields[2]
                # Strip the line ending so a trailing NM:i:0 tag is seen too.
                perfect_match = any(tag.rstrip("\r\n") == "NM:i:0"
                                    for tag in fields[11:])
                if chrom != "*" and not perfect_match:
                    if _read_carries_arenosa_variant(
                            aa_snps, chrom, int(fields[3]),
                            cigar_re.findall(fields[5]), fields[9]):
                        continue  # Valid mismap; skip read
            outfile.write(line.encode("UTF-8"))


def _read_carries_arenosa_variant(aa_snps, chrom, start, cigar_pieces, seq,
                                  min_fraction=0.05):
    """Return True when the read shows a variant common in A. arenosa.

    Walks the CIGAR keeping separate reference and read offsets: M consumes
    both, I and S consume only the read, D consumes only the reference.
    """
    ref_pos = start   # Reference position of the next aligned base
    read_idx = 0      # Index into seq of the next read base
    for length, operation in cigar_pieces:
        length = int(length)
        if operation == "M":
            for offset in range(length):
                ref, nuclist = aa_snps.PosInfo(chrom, ref_pos + offset)
                if ref == -1:
                    continue  # No A. arenosa SNP at this position
                # Slice (not index) so a short or "*" sequence yields "".
                cur_nuc = seq[read_idx + offset:read_idx + offset + 1]
                if cur_nuc == ref:
                    continue
                total_nucs = Pileup.sum_nucs(nuclist, True)
                counts = {"A": nuclist[0], "C": nuclist[1],
                          "G": nuclist[2], "T": nuclist[3]}
                if (cur_nuc in counts
                        and counts[cur_nuc] / total_nucs >= min_fraction):
                    return True
            ref_pos += length
            read_idx += length
        elif operation == "S":
            # Soft-clipped bases are in the read but not aligned.
            read_idx += length
        else:  # Insertion or deletion
            # NOTE(review): assumes the pileup records an indel at the base
            # just before it (samtools convention) - confirm against Pileup.
            ref, nuclist = aa_snps.PosInfo(chrom, ref_pos - 1)
            if ref != -1:
                prefix = ("+" if operation == "I" else "-") + str(length)
                # Entry layout inferred from the original slicing:
                # <sign><length><bases><one separator char><count>.
                count_start = len(prefix) + length + 1
                for allele in nuclist[4:]:
                    next_char = allele[len(prefix):len(prefix) + 1]
                    # Reject "+1" matching "+10...": length must end here.
                    if not allele.startswith(prefix) or next_char.isdigit():
                        continue
                    total_nucs = Pileup.sum_nucs(nuclist, True)
                    if int(allele[count_start:]) / total_nucs >= min_fraction:
                        return True
            if operation == "I":
                read_idx += length
            else:
                ref_pos += length
    return False
with open(dup_file) as infile: for line in infile: line = line.strip() dup_list.append(line) gff_genes_dict = GFF.parse_genes(gff_file) if adj_value > 0: for gene_name, [chr, start_pos, end_pos] in gff_genes_dict.gene_dict.items(): start_pos = start_pos - adj_value end_pos = end_pos + adj_value gff_genes_dict.gene_dict[gene_name] = [chr, start_pos, end_pos] if use_random == "Y": # Construct a list of randomly selected genes to replace the list of duplicated genes #dup_list = sample(gff_genes_dict.gene_dict.keys(),len(dup_list)) dup_list = gff_genes_dict.gene_dict.keys() # Delete me soon dup_positions = ";".join([chr + ":" + str(s_pos) + "-" + str(e_pos) for gene_name, [chr,s_pos,e_pos] in sorted(gff_genes_dict.gene_dict.items(), key = lambda x: (x[1][0], x[1][1]) ) if gene_name in dup_list ] ) dup_pileup = Pileup.parse(pileup_file, False, dup_positions) suns_in_gene = Counter() suns_in_gene_prob = {} # SUN-finding algorithm that runs when finding SUNs in As. Looks for 50/50 Con/SNP in As with open(sun_genes_filename[:-4] + "_SUNs.txt", 'w') as sue_sun_file: # Debugging file fisher_line = "Pos\tRef\tRefValue\tMaxSNP\tMaxSNPValue\tP-value\n" sue_sun_file.write(fisher_line) for entry in dup_pileup: (chr, pos), [ref, nuclist] = entry if ref not in ["A","C","G","T"]: continue # Skip ambiguous nucleotide positions gene_name = gff_genes_dict.get_gene_name(chr, pos) if gene_name in dup_list: totalreads, consensus_num, maxSNPs = Pileup.fisher_snp_info(ref,nuclist) if totalreads < 12 or sum([int(v) for v in maxSNPs.values()]) == 0: continue # Either the total usable reads (consensus + maxSNP) is < 12, or no SNPs are present
def filter_sam(sam_file, SNP_file, o_2_file):
    """Copy a SAM file to a gzipped file, dropping likely A. arenosa mismaps.

    Unmapped reads (reference name ``*``) and reads tagged ``NM:i:0`` are
    copied unchanged.  Any other read is dropped when it carries a
    non-reference base or an indel that the SNP pileup reports in at least
    5% of the A. arenosa reads at that position.

    Args:
        sam_file: Input SAM path; read through gzip when it ends in ".gz".
        SNP_file: Simplified pileup of A. arenosa variants, parsed with
            ``Pileup.parse``.
        o_2_file: Path of the gzip-compressed SAM file to create.
    """
    cigar_re = re.compile(r"(\d+)([MSID])")
    aa_snps = Pileup.parse(SNP_file, simplified=True, only_save_SNPs=False,
                           region_list="AtChr1;AtChr2;AtChr3;AtChr4;AtChr5")
    is_gzipped = sam_file.endswith(".gz")
    source = gzip.open(sam_file, 'rb') if is_gzipped else open(sam_file)
    # Both handles are context-managed so the input is closed as well.
    with source as infile, gzip.open(o_2_file, 'wb') as outfile:
        for line in infile:
            if is_gzipped:
                line = line.decode('utf-8')
            fields = line.split("\t")
            # NOTE(review): the original nesting of the final write was lost;
            # header lines ("@...") are assumed to be copied through - confirm.
            if fields[0][:1] != "@":
                chrom = fields[2]
                # Strip the line ending so a trailing NM:i:0 tag is seen too.
                perfect_match = any(tag.rstrip("\r\n") == "NM:i:0"
                                    for tag in fields[11:])
                if chrom != "*" and not perfect_match:
                    if _read_carries_arenosa_variant(
                            aa_snps, chrom, int(fields[3]),
                            cigar_re.findall(fields[5]), fields[9]):
                        continue  # Valid mismap; skip read
            outfile.write(line.encode("UTF-8"))


def _read_carries_arenosa_variant(aa_snps, chrom, start, cigar_pieces, seq,
                                  min_fraction=0.05):
    """Return True when the read shows a variant common in A. arenosa.

    Walks the CIGAR keeping separate reference and read offsets: M consumes
    both, I and S consume only the read, D consumes only the reference.
    """
    ref_pos = start   # Reference position of the next aligned base
    read_idx = 0      # Index into seq of the next read base
    for length, operation in cigar_pieces:
        length = int(length)
        if operation == "M":
            for offset in range(length):
                ref, nuclist = aa_snps.PosInfo(chrom, ref_pos + offset)
                if ref == -1:
                    continue  # No A. arenosa SNP at this position
                # Slice (not index) so a short or "*" sequence yields "".
                cur_nuc = seq[read_idx + offset:read_idx + offset + 1]
                if cur_nuc == ref:
                    continue
                total_nucs = Pileup.sum_nucs(nuclist, True)
                counts = {"A": nuclist[0], "C": nuclist[1],
                          "G": nuclist[2], "T": nuclist[3]}
                if (cur_nuc in counts
                        and counts[cur_nuc] / total_nucs >= min_fraction):
                    return True
            ref_pos += length
            read_idx += length
        elif operation == "S":
            # Soft-clipped bases are in the read but not aligned.
            read_idx += length
        else:  # Insertion or deletion
            # NOTE(review): assumes the pileup records an indel at the base
            # just before it (samtools convention) - confirm against Pileup.
            ref, nuclist = aa_snps.PosInfo(chrom, ref_pos - 1)
            if ref != -1:
                prefix = ("+" if operation == "I" else "-") + str(length)
                # Entry layout inferred from the original slicing:
                # <sign><length><bases><one separator char><count>.
                count_start = len(prefix) + length + 1
                for allele in nuclist[4:]:
                    next_char = allele[len(prefix):len(prefix) + 1]
                    # Reject "+1" matching "+10...": length must end here.
                    if not allele.startswith(prefix) or next_char.isdigit():
                        continue
                    total_nucs = Pileup.sum_nucs(nuclist, True)
                    if int(allele[count_start:]) / total_nucs >= min_fraction:
                        return True
            if operation == "I":
                read_idx += length
            else:
                ref_pos += length
    return False
linelist = line.split() if line[:1] == ">": if chr != "": chr_lengths[chr] = chr_len chr = linelist[0][1:] chr_len = 0 else: line = line.split() chr_len += len(line[0]) chr_lengths[chr] = chr_len return(chr_lengths) pileup, ref, win = command_line() chr_lengths = get_chr_len(ref) PileupFile = Pileup.parse(pileup,True) #windows = [i for i in range(1,1501)] windows = [i for i in range(1600,5001,100)] for win in windows: outdir = "Output/HMMCovWin/" + str(win) + "bp-window/" if not os.path.exists(outdir): os.makedirs(outdir) for chr in sorted(chr_lengths): histogram = Counter() outlist = [] for i in range(1,chr_lengths[chr]+1,win): loc = str(chr) + ":" + str(i) + "-" + str(i+win) cov = PileupFile.Coverage(loc,True) if cov in histogram: histogram[cov] += 1
if line[:1] == ">": if chr != "": chr_lengths[chr] = chr_len chr = linelist[0][1:] chr_len = 0 else: line = line.split() chr_len += len(line[0]) chr_lengths[chr] = chr_len return (chr_lengths) pileup, ref, win = command_line() chr_lengths = get_chr_len(ref) PileupFile = Pileup.parse(pileup, True) #windows = [i for i in range(1,1501)] windows = [i for i in range(1600, 5001, 100)] for win in windows: outdir = "Output/HMMCovWin/" + str(win) + "bp-window/" if not os.path.exists(outdir): os.makedirs(outdir) for chr in sorted(chr_lengths): histogram = Counter() outlist = [] for i in range(1, chr_lengths[chr] + 1, win): loc = str(chr) + ":" + str(i) + "-" + str(i + win) cov = PileupFile.Coverage(loc, True) if cov in histogram: histogram[cov] += 1