def predict(inFileName, outFileName, strain, binSize):
    """Walks through a file of site calls and predicts whether each region is
    M, N, or H based on a sliding window of the given binSize.

    inFileName  -- tab-delimited input file; column 0 is used as the
                   chromosome key, column 1 is parsed as an integer and
                   column 2 is the site call
    outFileName -- file that receives the header line and the predictions
    strain      -- passed through unchanged to predictChrome
    binSize     -- integer window size; thresholds are gathered for both
                   binSize and binSize // 2

    Returns a tuple with the number of chromosomes and regions predicted."""
    with open(inFileName) as sitesFile, open(outFileName, 'w') as predFile:
        # Load whole file into memory since it is only a couple of MBs
        reader = csv.reader(sitesFile, delimiter='\t')

        # chromosome -> list of (call, int(column 1), number of U rows seen
        # since the previously kept row)
        chromeMap = defaultdict(list)

        # We really only want to work with sites that have been called as
        # N, M, H, so just count up the number of U sites between the more
        # meaningful sites for summary stats later.
        # NOTE(review): uCount is not reset when the chromosome column
        # changes, so U rows carry over to the next chromosome's first kept
        # site -- confirm this is intended.
        uCount = 0
        for row in reader:
            # csv.reader yields [] for a blank line (e.g. a trailing newline);
            # skip it instead of failing on row[2].
            if not row:
                continue
            if row[2] == 'U':
                uCount += 1
            else:
                chromeMap[row[0]].append((row[2], int(row[1]), uCount))
                uCount = 0

        # Write header
        if TABLE_PRINT:
            header = ','.join(
                ("Strain", "Chromosome", "Start", "End", "Size (kbp)",
                 "Number of SNPs", "N2/Genotyped", "MY14/Genotyped",
                 "Heterozygous/Genotyped", "Incongruities/Genotyped",
                 "Unknown/All", "Previous Prediction", "Prediction",
                 "Next Prediction", "Link"))
        else:
            header = '\t'.join(
                ("Chromosome", "Breakpoint Lower", "Breakpoint Upper",
                 "Prediction", "Number of SNPs", "N2/Genotyped",
                 "MY14/Genotyped", "Heterozygous/Genotyped", "Unknown/All"))
        predFile.write(header + '\n')

        # Gather up thresholds for the binSize and binSize/2 because sometimes
        # we break the window in half for more positional knowledge.
        # Floor division keeps the half-window size an integer on Python 3
        # as well as Python 2.
        fullBinThresh = util.cutoffs(binSize)
        halfBinThresh = util.cutoffs(binSize // 2)
        thresholds = (fullBinThresh[0], fullBinThresh[1],
                      halfBinThresh[0], halfBinThresh[1])

        # The total number of chromosomes and regions predicted
        chromeCount = 0
        regCount = 0

        # Make predictions for each chromosome, in sorted key order
        for chrome in sorted(chromeMap):
            chromes, regs = predictChrome(strain, chrome, chromeMap[chrome],
                                          binSize, thresholds, predFile)
            chromeCount += chromes
            regCount += regs

    return chromeCount, regCount
def predict(inFileName, outFileName, strain, binSize):
    """Walks through a file of site calls and predicts whether each region is
    M, N, or H based on a sliding window of the given binSize.

    inFileName  -- tab-delimited input file; column 0 is used as the
                   chromosome key, column 1 is parsed as an integer and
                   column 2 is the site call
    outFileName -- file that receives the header line and the predictions
    strain      -- passed through unchanged to predictChrome
    binSize     -- integer window size; thresholds are gathered for both
                   binSize and binSize // 2

    Returns a tuple with the number of chromosomes and regions predicted."""
    with open(inFileName) as sitesFile, open(outFileName, 'w') as predFile:
        # Load whole file into memory since it is only a couple of MBs
        reader = csv.reader(sitesFile, delimiter='\t')

        # chromosome -> list of (call, int(column 1), number of U rows seen
        # since the previously kept row)
        chromeMap = defaultdict(list)

        # We really only want to work with sites that have been called as
        # N, M, H, so just count up the number of U sites between the more
        # meaningful sites for summary stats later.
        # NOTE(review): uCount is not reset when the chromosome column
        # changes, so U rows carry over to the next chromosome's first kept
        # site -- confirm this is intended.
        uCount = 0
        for row in reader:
            # csv.reader yields [] for a blank line (e.g. a trailing newline);
            # skip it instead of failing on row[2].
            if not row:
                continue
            if row[2] == 'U':
                uCount += 1
            else:
                chromeMap[row[0]].append((row[2], int(row[1]), uCount))
                uCount = 0

        # Write header
        if TABLE_PRINT:
            header = ','.join(
                ("Strain", "Chromosome", "Start", "End", "Size (kbp)",
                 "Number of SNPs", "N2/Genotyped", "MY14/Genotyped",
                 "Heterozygous/Genotyped", "Incongruities/Genotyped",
                 "Unknown/All", "Previous Prediction", "Prediction",
                 "Next Prediction", "Link"))
        else:
            header = '\t'.join(
                ("Chromosome", "Breakpoint Lower", "Breakpoint Upper",
                 "Prediction", "Number of SNPs", "N2/Genotyped",
                 "MY14/Genotyped", "Heterozygous/Genotyped", "Unknown/All"))
        predFile.write(header + '\n')

        # Gather up thresholds for the binSize and binSize/2 because sometimes
        # we break the window in half for more positional knowledge.
        # Floor division keeps the half-window size an integer on Python 3
        # as well as Python 2.
        fullBinThresh = util.cutoffs(binSize)
        halfBinThresh = util.cutoffs(binSize // 2)
        thresholds = (fullBinThresh[0], fullBinThresh[1],
                      halfBinThresh[0], halfBinThresh[1])

        # The total number of chromosomes and regions predicted
        chromeCount = 0
        regCount = 0

        # Make predictions for each chromosome, in sorted key order
        for chrome in sorted(chromeMap):
            chromes, regs = predictChrome(strain, chrome, chromeMap[chrome],
                                          binSize, thresholds, predFile)
            chromeCount += chromes
            regCount += regs

    return chromeCount, regCount
"""Clean up mpileup calls so that we are left with just the bases, not the extra stuff it adds""" # Easy stuff first reads = re.sub(r"[$*<>]|\^.", "", reads) # Now do the trickier inserts/deletes # First find the regex while(1): match = re.search(r"[+-](\d+)", reads) if not match: break num = int(match.group(1)) # now find the index and keep the string before and after the insert/delete startIndex = reads.find(match.group(0)) endIndex = startIndex + 1 + len(match.group(1)) + num reads = reads[0:startIndex] + reads[endIndex:] return reads if __name__ == '__main__': # print clean('*A$C>G<T+23ACACACACACACACACACAGGGGT^JC-2TTC-11AAAAAAAAAAAC+4TTTTC') hetCutoffs = {} for i in range(1, 500): hetCutoffs[i] = util.cutoffs(i) callReads(hetCutoffs, *sys.argv[1:])
"""Clean up mpileup calls so that we are left with just the bases, not the extra stuff it adds""" # Easy stuff first reads = re.sub(r"[$*<>]|\^.", "", reads) # Now do the trickier inserts/deletes # First find the regex while (1): match = re.search(r"[+-](\d+)", reads) if not match: break num = int(match.group(1)) # now find the index and keep the string before and after the insert/delete startIndex = reads.find(match.group(0)) endIndex = startIndex + 1 + len(match.group(1)) + num reads = reads[0:startIndex] + reads[endIndex:] return reads if __name__ == '__main__': # print clean('*A$C>G<T+23ACACACACACACACACACAGGGGT^JC-2TTC-11AAAAAAAAAAAC+4TTTTC') hetCutoffs = {} for i in range(1, 500): hetCutoffs[i] = util.cutoffs(i) callReads(hetCutoffs, *sys.argv[1:])