Example #1
0
def predict(inFileName, outFileName, strain, binSize):
    """Walk through a file of site calls and predict whether each region
    is M, N, or H based on a sliding window of the given binSize.

    The tab-delimited input is read completely, one row per site, using
    the first three columns as (chromosome, position, call).  Rows whose
    call is 'U' are not stored; they are only counted, and the running
    count is attached to the next non-U site and then reset.  Blank lines
    are ignored.

    A header line is written to outFileName (comma separated when
    TABLE_PRINT is set, tab separated otherwise) and then every
    chromosome, in sorted name order, is handed to predictChrome together
    with the open output file.

    Arguments:
    inFileName  -- path of the tab-delimited site-call file
    outFileName -- path of the prediction file; it is overwritten
    strain      -- passed through unchanged to predictChrome
    binSize     -- window size; cutoffs are looked up for binSize and for
                   binSize // 2 (assumes binSize is an int)

    Returns a tuple with the number of chromosomes and regions predicted,
    i.e. the sums of the two values returned by each predictChrome call.
    """

    with open(inFileName) as sitesFile, open(outFileName, 'w') as predFile:
        #Load whole file into memory since it is only a couple of MBs
        reader = csv.reader(sitesFile, delimiter='\t')
        #chromosome name -> list of (call, position, U sites seen before it)
        chromeMap = defaultdict(list)

        #We really only want to work with sites that have been called as N,M,H
        #so just count up the number of U sites between the more meaningful
        #sites for summary stats later
        #NOTE(review): uCount is not reset when the chromosome changes, so U
        #sites trailing one chromosome are credited to the first called site
        #of the next chromosome in the file -- confirm this is intended.
        uCount = 0
        for row in reader:
            if not row:
                #csv.reader yields an empty list for a blank line; indexing
                #it below would raise IndexError
                continue
            if row[2] == 'U':
                uCount += 1
            else:
                chromeMap[row[0]].append((row[2], int(row[1]), uCount))
                uCount = 0

        #Write header
        if TABLE_PRINT:
            predFile.write(','.join(
                ("Strain", "Chromosome", "Start", "End", "Size (kbp)",
                 "Number of SNPs", "N2/Genotyped", "MY14/Genotyped",
                 "Heterozygous/Genotyped", "Incongruities/Genotyped",
                 "Unknown/All", "Previous Prediction", "Prediction",
                 "Next Prediction", "Link")) + '\n')
        else:
            predFile.write('\t'.join(
                ("Chromosome", "Breakpoint Lower", "Breakpoint Upper",
                 "Prediction", "Number of SNPs", "N2/Genotyped",
                 "MY14/Genotyped", "Heterozygous/Genotyped", "Unknown/All")) +
                           '\n')

        #Gather up thresholds for the binSize and binSize/2 because sometimes
        #we break the window in half for more positional knowledge.
        #Floor division keeps the half window an integer on Python 3 too;
        #on Python 2 "/" already floored integer operands.
        fullBinThresh = util.cutoffs(binSize)
        halfBinThresh = util.cutoffs(binSize // 2)
        thresholds = (fullBinThresh[0], fullBinThresh[1], halfBinThresh[0],
                      halfBinThresh[1])

        #The total number of chromosomes and regions predicted
        chromeCount = 0
        regCount = 0

        #Make predictions for each chromosome
        for chrome in sorted(chromeMap):
            chromes, regs = predictChrome(strain, chrome, chromeMap[chrome],
                                          binSize, thresholds, predFile)
            chromeCount += chromes
            regCount += regs

        return chromeCount, regCount
Example #2
0
def predict(inFileName, outFileName, strain, binSize):
	"""Walk through a file of site calls and predict whether each region
	is M, N, or H based on a sliding window of the given binSize.

	The tab-delimited input is read completely, one row per site, using
	the first three columns as (chromosome, position, call).  Rows whose
	call is 'U' are not stored; they are only counted, and the running
	count is attached to the next non-U site and then reset.  Blank lines
	are ignored.

	A header line is written to outFileName (comma separated when
	TABLE_PRINT is set, tab separated otherwise) and then every
	chromosome, in sorted name order, is handed to predictChrome together
	with the open output file.

	Arguments:
	inFileName  -- path of the tab-delimited site-call file
	outFileName -- path of the prediction file; it is overwritten
	strain      -- passed through unchanged to predictChrome
	binSize     -- window size; cutoffs are looked up for binSize and for
	               binSize // 2 (assumes binSize is an int)

	Returns a tuple with the number of chromosomes and regions predicted,
	i.e. the sums of the two values returned by each predictChrome call.
	"""

	with open(inFileName) as sitesFile, open(outFileName, 'w') as predFile:
		#Load whole file into memory since it is only a couple of MBs
		reader = csv.reader(sitesFile, delimiter='\t')
		#chromosome name -> list of (call, position, U sites seen before it)
		chromeMap = defaultdict(list)

		#We really only want to work with sites that have been called as N,M,H
		#so just count up the number of U sites between the more meaningful
		#sites for summary stats later
		#NOTE(review): uCount is not reset when the chromosome changes, so U
		#sites trailing one chromosome are credited to the first called site
		#of the next chromosome in the file -- confirm this is intended.
		uCount = 0
		for row in reader:
			if not row:
				#csv.reader yields an empty list for a blank line; indexing
				#it below would raise IndexError
				continue
			if row[2] == 'U':
				uCount += 1
			else:
				chromeMap[row[0]].append((row[2], int(row[1]), uCount))
				uCount = 0

		#Write header
		if TABLE_PRINT:
			predFile.write(','.join(("Strain", "Chromosome", "Start", "End", "Size (kbp)", "Number of SNPs", "N2/Genotyped", "MY14/Genotyped", "Heterozygous/Genotyped", "Incongruities/Genotyped", "Unknown/All", "Previous Prediction", "Prediction", "Next Prediction", "Link")) + '\n')
		else:
			predFile.write('\t'.join(("Chromosome", "Breakpoint Lower", "Breakpoint Upper", "Prediction", "Number of SNPs", "N2/Genotyped", "MY14/Genotyped", "Heterozygous/Genotyped", "Unknown/All")) + '\n')

		#Gather up thresholds for the binSize and binSize/2 because sometimes
		#we break the window in half for more positional knowledge.
		#Floor division keeps the half window an integer on Python 3 too;
		#on Python 2 "/" already floored integer operands.
		fullBinThresh = util.cutoffs(binSize)
		halfBinThresh = util.cutoffs(binSize // 2)
		thresholds = (fullBinThresh[0], fullBinThresh[1], halfBinThresh[0], halfBinThresh[1])

		#The total number of chromosomes and regions predicted
		chromeCount = 0
		regCount = 0

		#Make predictions for each chromosome
		for chrome in sorted(chromeMap):
			chromes, regs = predictChrome(strain, chrome, chromeMap[chrome], binSize, thresholds, predFile)
			chromeCount += chromes
			regCount += regs

		return chromeCount, regCount
  """Clean up mpileup calls so that we are left with just the bases,
  not the extra stuff it adds"""

  # Easy stuff first
  reads = re.sub(r"[$*<>]|\^.", "", reads)

  # Now do the trickier inserts/deletes
  # First find the regex
  while(1):
    match = re.search(r"[+-](\d+)", reads)
    if not match:
      break

    num = int(match.group(1))
    # now find the index and keep the string before and after the insert/delete
    startIndex = reads.find(match.group(0))
    endIndex = startIndex + 1 + len(match.group(1)) + num
    reads = reads[0:startIndex] + reads[endIndex:]    

  return reads

if __name__ == '__main__':
  # Sample input that exercises clean():
  #   '*A$C>G<T+23ACACACACACACACACACAGGGGT^JC-2TTC-11AAAAAAAAAAAC+4TTTTC'

  # Look up the cutoffs once for every size from 1 through 499 so that
  # callReads receives them as a ready-made table keyed by size.
  hetCutoffs = {size: util.cutoffs(size) for size in range(1, 500)}

  # Everything after the script name is forwarded as positional arguments.
  callReads(hetCutoffs, *sys.argv[1:])
  
    """Clean up mpileup calls so that we are left with just the bases,
  not the extra stuff it adds"""

    # Easy stuff first
    reads = re.sub(r"[$*<>]|\^.", "", reads)

    # Now do the trickier inserts/deletes
    # First find the regex
    while (1):
        match = re.search(r"[+-](\d+)", reads)
        if not match:
            break

        num = int(match.group(1))
        # now find the index and keep the string before and after the insert/delete
        startIndex = reads.find(match.group(0))
        endIndex = startIndex + 1 + len(match.group(1)) + num
        reads = reads[0:startIndex] + reads[endIndex:]

    return reads


if __name__ == '__main__':
    # Sample input that exercises clean():
    #   '*A$C>G<T+23ACACACACACACACACACAGGGGT^JC-2TTC-11AAAAAAAAAAAC+4TTTTC'

    # Look up the cutoffs once for every size from 1 through 499 so that
    # callReads receives them as a ready-made table keyed by size.
    hetCutoffs = {size: util.cutoffs(size) for size in range(1, 500)}

    # Everything after the script name is forwarded as positional arguments.
    callReads(hetCutoffs, *sys.argv[1:])