Example #1
0
def main():
    """Write size, caller, and sample distributions for a psvp data set.

    Two rounds of reports are produced under the same output prefix:
    ``.all.*`` for every site, then ``.sup.*`` for 'well-supported' sites
    (multicaller support and multisample support).

    Reads the names ``inputFile``, ``name``, ``callers``,
    ``annotationColumns``, ``outputDirectory`` and ``SIZE_BINS`` from the
    enclosing scope.
    NOTE(review): presumably ``parseArgs`` is what populates the first five
    of these - confirm against its definition.
    """
    setupLogging(fi="psvpTools.log", debug=False)

    logging.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    cli_args = sys.argv[1:]
    parseArgs(cli_args)
    sites = Psvp(inputFile, name, callers, annotationColumns)
    prefix = outputDirectory + "/" + sites.name

    def dump_distributions(data, stage):
        # One report per distribution, always in the order size, caller,
        # sample; 'stage' is the tag that separates the rounds on disk.
        base = prefix + "." + stage
        pp(sizeDistribution(data, sizeBins=SIZE_BINS), base + ".sizeDist.txt")
        pp(callerDistribution(data), base + ".callDist.txt")
        pp(sampleDistribution(data), base + ".sampDist.txt")

    # Round 1: all sites.
    dump_distributions(sites, "all")

    # Round 2: 'well-supported' sites - i.e. multicaller support and
    # multisample support. Sites left empty after the caller filter are
    # cleared with mask=False before the sample filter runs.
    sites = findCallerSupport(sites, minCallers=2)
    sites.clearEmptySites(mask=False)
    sites = findSampleSupport(sites, minSamples=2, mask=False)
    dump_distributions(sites, "sup")

    logging.info("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
Example #2
0
def main():
    """Summarise a psvp data set at three successive levels of filtering.

    For each level ('All', 'Supported', 'Supported and not in reference')
    a site count is appended to the totals file and size, caller, and
    sample distributions are pretty-printed to their own files
    (``.all.*``, ``.sup.*``, ``.snr.*``).

    Configuration comes from the names ``INPUT_FILE``, ``NAME``,
    ``CALLERS``, ``ANNOTATION_COLUMNS``, ``OUT_DIR`` and ``SIZE_BINS`` in
    the enclosing scope.
    """
    setupLogging(fi="psvpTools.log")
    logging.info("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    # The input is one .psvp per chromosome, concatenated into a single
    #   file. psvpTools's class Psvp keeps the first header it comes across
    #   and discards the rest - important if you intend to output a filtered
    #   psvp for use with other tools, possibly in other formats.
    sites = Psvp(
        inputFile=INPUT_FILE,
        name=NAME,
        callers=CALLERS,
        annotationColumns=ANNOTATION_COLUMNS,
    )
    prefix = OUT_DIR + "/" + NAME

    # NOTE(review): unlike the ".all.*" / ".sup.*" / ".snr.*" outputs, this
    #   name is joined to the prefix without a "." separator - confirm that
    #   is intended.
    totals_path = prefix + "totals.txt"

    # Start from an empty totals file; every level below appends to it.
    with open(totals_path, "w") as totals:
        totals.write("")

    def record_level(data, label, stage):
        # Append '<label><count>' to the totals file, then 'prettyPrint' a
        #   size, caller, and sample distribution tagged with 'stage'.
        with open(totals_path, "a") as totals:
            totals.write(label)
            totals.write(str(countSites(data)) + "\n")
        base = prefix + "." + stage
        pp(sizeDistribution(data, sizeBins=SIZE_BINS), base + ".sizeDist.txt")
        pp(callerDistribution(data), base + ".callDist.txt")
        pp(sampleDistribution(data), base + ".sampDist.txt")

    # Level 1: all sites.
    record_level(sites, "All:\t", "all")

    # Level 2: 'well-supported' sites - i.e. multicaller support and
    #   multisample support. To focus on sites we're more confident aren't
    #   errors, we:
    #   1.  Filter out all entries that aren't supported by at least two of
    #       our programs.
    #   2.  Remove any sites that no longer contain nonzero entries after
    #       that change. mask=False deletes the empty sites rather than
    #       masking them, since we won't be using them later.
    #   3.  Filter out sites that don't have at least 2 samples with entries.
    sites = findCallerSupport(sites, minCallers=2)
    sites.clearEmptySites(mask=False)
    sites = findSampleSupport(sites, minSamples=2, mask=False)
    record_level(sites, "Supported:\t", "sup")

    # Level 3: 'well-supported' sites NOT in the reference (sample column 73).
    #   The reference rhesus was included in the data so that more false
    #   calls from our programs could be filtered out. It was the 74th sample
    #   column, i.e. 73 columns away from the start of the sample columns, so
    #   annotationColumns was defined as [73] when the Psvp was created.
    #   Because that column is marked as an annotation, the reference data
    #   was left out of all the earlier psvpStat function calls.
    #   Here we drop sites that have nonzero data in the reference sample;
    #   reverse=True because we want sites that DO NOT have annotations.
    sites = findAnnotated(sites, reverse=True, mask=False)
    record_level(sites, "Supported and not in reference:\t", "snr")

    logging.info("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")