def main():
    """Summarise a psvp file for all sites and for well-supported sites.

    Two stages are reported, each as a size, caller, and sample distribution
    handed to pp() together with a destination path:

    * ``all`` - every site in the loaded Psvp.
    * ``sup`` - 'well-supported' sites, i.e. those left after requiring
      multicaller support (minCallers=2) and multisample support
      (minSamples=2).

    Output paths have the form ``<outputDirectory>/<p.name>.<stage>.<kind>.txt``.
    """
    setupLogging(fi="psvpTools.log", debug=False)
    logging.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    # NOTE(review): inputFile, name, callers, annotationColumns and
    # outputDirectory are not defined in this function; presumably
    # parseArgs() sets them at module level - confirm against parseArgs.
    cliArgs = sys.argv[1:]
    parseArgs(cliArgs)

    p = Psvp(inputFile, name, callers, annotationColumns)
    outpath = outputDirectory + "/" + p.name

    def writeStage(psvp, stage):
        # One file per distribution; `stage` is the tag embedded in the name.
        prefix = outpath + "." + stage
        sizeTable = sizeDistribution(psvp, sizeBins=SIZE_BINS)
        pp(sizeTable, prefix + ".sizeDist.txt")
        callerTable = callerDistribution(psvp)
        pp(callerTable, prefix + ".callDist.txt")
        sampleTable = sampleDistribution(psvp)
        pp(sampleTable, prefix + ".sampDist.txt")

    # Stage 1: all sites.
    writeStage(p, "all")

    # Stage 2: 'well-supported' sites - multicaller and multisample support.
    p = findCallerSupport(p, minCallers=2)
    p.clearEmptySites(mask=False)
    p = findSampleSupport(p, minSamples=2, mask=False)
    writeStage(p, "sup")

    logging.info("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
def _writeDistributions(p, outpath, tag):
    """'prettyPrint' the size, caller, and sample distributions of `p`.

    Each distribution is passed to pp() with the destination
    ``<outpath>.<tag>.sizeDist.txt``, ``<outpath>.<tag>.callDist.txt`` and
    ``<outpath>.<tag>.sampDist.txt`` respectively.
    """
    prefix = outpath + "." + tag
    pp(sizeDistribution(p, sizeBins=SIZE_BINS), prefix + ".sizeDist.txt")
    pp(callerDistribution(p), prefix + ".callDist.txt")
    pp(sampleDistribution(p), prefix + ".sampDist.txt")


def _recordTotal(totalsPath, label, p, mode="a"):
    """Write one ``<label>:<TAB><site count>`` line for `p` to the totals file.

    `mode` is handed to open(); pass "w" for the first category so that the
    file is cleared before the later categories are appended.
    """
    with open(totalsPath, mode) as out:
        out.write(label + ":\t" + str(countSites(p)) + "\n")


def main():
    """Report site counts and distributions for three filtering stages.

    Stages, in order: all sites (``all``), well-supported sites (``sup``),
    and well-supported sites not annotated by the reference sample
    (``snr``). For each stage a count line is written to the totals file
    and three distribution tables are written via pp().
    """
    setupLogging(fi="psvpTools.log")
    logging.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    p = Psvp(inputFile=INPUT_FILE, name=NAME, callers=CALLERS,
             annotationColumns=ANNOTATION_COLUMNS)
    outpath = OUT_DIR + "/" + NAME
    # NOTE(review): unlike every other output (".all.sizeDist.txt", ...) this
    # name has no "." separator, so the file is "<NAME>totals.txt". Kept as-is
    # because downstream consumers may rely on the path - confirm and rename
    # to ".totals.txt" if that was the intent.
    totalsPath = outpath + "totals.txt"

    # All sites
    # Here, we look at all sites in our data, which consists of a .psvp for
    # each chromosome concatenated together into one file, 'data.psvp'.
    # psvpTools's class Psvp will keep the first header it comes across and
    # discard the rest - important for if you are trying to output a
    # filtered psvp for use with other tools, possibly in other formats.
    #
    # Let's get a count for every category and put it in the totals file,
    # starting with 'All sites'. Opening with "w" clears any previous
    # contents; the later categories are appended.
    _recordTotal(totalsPath, "All", p, mode="w")
    # 'prettyPrint' a size, caller, and sample distribution
    _writeDistributions(p, outpath, "all")

    # 'Well-supported' sites - i.e. multicaller support & multisample support
    # Now that we have some information on all sites, we want to look more
    # closely at sites we're more confident aren't errors. So, let's:
    # 1. Filter out all entries that aren't supported by at least two of our
    #    programs
    # 2. Remove any sites which may not contain any nonzero entries now that
    #    we've modified them. We set mask=False to delete empty sites rather
    #    than mask them, since we won't be using them later.
    # 3. Filter out sites that don't have at least 2 samples with entries.
    p = findCallerSupport(p, minCallers=2)
    p.clearEmptySites(mask=False)
    p = findSampleSupport(p, minSamples=2, mask=False)
    _recordTotal(totalsPath, "Supported", p)
    _writeDistributions(p, outpath, "sup")

    # 'Well-supported' sites NOT in the reference (sample column 73)
    # Finally, let's focus on sites which don't show up as variants in the
    # reference rhesus, which was included in the data so that we might
    # filter out more false calls from our programs. The reference rhesus was
    # the 74th sample column, or 73 columns away from the start of the
    # sample columns, so we defined annotationColumns as [73] when
    # we created our Psvp. The reference data was left out of all previous
    # psvpStat function calls because we had marked that column as an
    # annotation.
    # Now, we filter out sites which have nonzero data in the reference sample.
    # We set reverse=True because we want sites that DO NOT have annotations.
    p = findAnnotated(p, reverse=True, mask=False)
    _recordTotal(totalsPath, "Supported and not in reference", p)
    _writeDistributions(p, outpath, "snr")

    logging.info("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")