Esempio n. 1
0
def main():
    """Parse the command line, set up logging and run the RADIA comparison.

    The positional arguments, read from ``sys.argv``, are ``id``, ``chrom``,
    ``radFile`` and ``compareFile``; the optional arguments are described by
    the help text of the parser that is built below.

    The help text is printed and the script exits with status 1 when the
    number of arguments is wrong.  The script also exits with status 1 when
    ``radiaUtil.check_for_argv_errors`` returns a false value for the input
    and output filenames.

    Raises:
        ValueError: if the requested log level is not recognised.
    """

    # example invocation:
    # python radiaCompare.py TCGA-AB-2995 12 ../data/test/TCGA-AB-2995.vcf ../data/test/TCGA-AB-2995.vcf -c "SOM=Somatic" -s ../stats/radia/cmpRadBB.tab --log=DEBUG

    # create the usage statement
    usage = "usage: python %prog id chrom radFile compareFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-c", "--compareList",
        dest="compareList", metavar="COMPARE_LIST",
        help="a comma separated list of key/values comparisons where the key is in RADIA and the value is in the compare file")
    i_cmdLineParser.add_option(
        "-s", "--statsFilename",
        dest="statsFilename", metavar="STATS_FILE",
        help="the name of the stats file, sys.stdout by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-o", "--overlapFilename",
        dest="overlapFilename", metavar="OVERLAP_FILE",
        help="the name of the overlap file")
    i_cmdLineParser.add_option(
        "-n", "--nonOverlapFilename",
        dest="nonOverlapFilename", metavar="NON_OVERLAP_FILE",
        help="the name of the non-overlap file")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(4, 17, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw argv count above also counts the options, so make sure that
    # all four positional arguments really exist before indexing into them
    if (len(i_cmdLineArgs) < 4):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    i_id = str(i_cmdLineArgs[0])
    i_chr = str(i_cmdLineArgs[1])
    i_radiaFilename = str(i_cmdLineArgs[2])
    i_compareFilename = str(i_cmdLineArgs[3])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    # try to get any optional parameters with no defaults and collect the
    # filenames that have to be checked for read/write access further down
    writeFilenameList = []
    readFilenameList = [i_radiaFilename, i_compareFilename]

    i_statsFilename = None
    i_logFilename = None
    i_compareString = None
    i_overlapFilename = None
    i_nonOverlapFilename = None
    i_compareDict = collections.defaultdict(list)
    if (i_cmdLineOptions.overlapFilename is not None):
        i_overlapFilename = str(i_cmdLineOptions.overlapFilename)
        writeFilenameList += [i_overlapFilename]
    if (i_cmdLineOptions.nonOverlapFilename is not None):
        i_nonOverlapFilename = str(i_cmdLineOptions.nonOverlapFilename)
        writeFilenameList += [i_nonOverlapFilename]
    if (i_cmdLineOptions.statsFilename is not None):
        i_statsFilename = str(i_cmdLineOptions.statsFilename)
        writeFilenameList += [i_statsFilename]
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
        writeFilenameList += [i_logFilename]
    if (i_cmdLineOptions.compareList is not None):
        i_compareString = str(i_cmdLineOptions.compareList)

        # each comma separated entry has the form "radiaKey=compareValue"
        for keyValue in i_compareString.split(","):
            (key, value) = keyValue.split("=")
            i_compareDict[key] = value

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level here; passing it as a second argument to
        # ValueError would leave the '%s' placeholder in the message
        raise ValueError("Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel, filename=i_logFilename, filemode='w', format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel, format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # do some debugging
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chr=%s", i_chr)
        logging.debug("radiaFile=%s", i_radiaFilename)
        logging.debug("overlapFilename=%s", i_overlapFilename)
        logging.debug("nonOverlapFilename=%s", i_nonOverlapFilename)
        logging.debug("compareFile=%s", i_compareFilename)
        logging.debug("statsFile=%s", i_statsFilename)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("compareDict=%s", i_compareDict)

    # check for any errors
    if (not radiaUtil.check_for_argv_errors(None, readFilenameList, writeFilenameList)):
        sys.exit(1)

    compare_events(i_id, i_chr, i_radiaFilename, i_compareFilename, i_statsFilename, i_overlapFilename, i_nonOverlapFilename, i_compareDict, i_debug)

    return
Esempio n. 2
0
def main():
    """Filter the calls of a VCF file using the results of a BLAT run.

    The positional arguments, read from ``sys.argv``, are ``id``,
    ``vcfFile``, ``blatInputFile`` and ``blatOutputFile``; the optional
    arguments are described by the help text of the parser built below.

    For every record produced by ``get_vcf_data`` each modification type in
    the INFO field ``MT`` is evaluated.  The BLAT hits are only consulted
    for ``NOR_EDIT`` (when the normal RNA reads are requested) and for
    ``SOM``/``TUM_EDIT`` (when the tumor RNA reads are requested).  A
    modification type passes when the number of valid reads reaches the read
    depth cutoff and the fraction of valid reads reaches the read percent
    cutoff.  If at least one type passes, the ones that failed are removed
    from ``MT`` and ``MC`` and the filter is set to PASS; otherwise the
    ``blat`` filter is applied.  The records are written tab separated to
    the output file, or to sys.stdout when no output file is given.

    The help text is printed and the script exits with status 1 when the
    number of arguments is wrong.  The script also exits with status 1 when
    ``radiaUtil.check_for_argv_errors`` returns a false value.

    Raises:
        ValueError: if the requested log level is not recognised, or if
            reads have to be validated for a BLAT output format other than
            PSL or BLAST.
    """

    # example invocation with the four positional arguments that are read below:
    # python filterByBlat.py TCGA-00-4454 ../data/test/TCGA-00-4454_EGFR.vcf ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # NOTE(review): the previous example command and usage string also listed
    # a chrom argument and a --dnaNormalFilename option, neither of which is
    # accepted by this parser

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id vcfFile blatInputFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters", action="store_true", default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat filter, include this argument if the previous filters should be kept")

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT", default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    # logging.basicConfig() logs to sys.stderr when no filename is given
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads", action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads", action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads", action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads", action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads should be processed")

    i_cmdLineParser.add_option(
        "-d", "--readDepthCutoff", type="int", default=int(4),
        dest="readDepthCutoff", metavar="READ_DP_CUTOFF",
        help="the minimum number of valid reads that are necessary, %default by default")
    i_cmdLineParser.add_option(
        "-p", "--readPercentCutoff", type="float", default=float(0.10),
        dest="readPercentCutoff", metavar="READ_PERCENT_CUTOFF",
        help="the minimum percentage of valid reads that are necessary, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw argv count above also counts the options, so make sure that
    # all four positional arguments really exist before indexing into them
    if (len(i_cmdLineArgs) < 4):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatInputFilename = str(i_cmdLineArgs[2])
    i_blatOutputFilename = str(i_cmdLineArgs[3])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = i_cmdLineOptions.keepPreviousFilters
    i_blatOutputFormat = i_cmdLineOptions.blatOutputFormat
    i_logLevel = i_cmdLineOptions.logLevel
    i_readDepthCutoff = i_cmdLineOptions.readDepthCutoff
    i_readPercentCutoff = i_cmdLineOptions.readPercentCutoff

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level here; passing it as a second argument to
        # ValueError would leave the '%s' placeholder in the message
        raise ValueError("Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel, filename=i_logFilename, filemode='w', format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel, format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug (true for the DEBUG and INFO levels)
    i_debug = (i_numericLogLevel < logging.WARNING)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatInputFilename=%s", i_blatInputFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("readDepthCutoff=%s", i_readDepthCutoff)
        logging.debug("readPerentCutoff=%s", i_readPercentCutoff)

    # check for any errors; append (instead of assign) so that the output
    # file is still checked when a log file is specified as well
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList += [i_outputFilename]
    if (i_logFilename is not None):
        i_writeFilenameList += [i_logFilename]

    i_readFilenameList = [i_vcfFilename, i_blatInputFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList, i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = get_write_fileHandler(i_outputFilename)

    try:
        # get the BLAT results
        i_blatCoordinateDict = parse_blat_output(i_blatOutputFilename, i_blatOutputFormat, i_debug)

        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_passedVCFCallsOnlyFlag, i_debug)

        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore, vcfFilterSet, vcfInfoDict, restOfLine) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s", vcfChr, str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore, str(vcfFilterSet), str(vcfInfoDict), restOfLine)

            # the BLAT results are keyed by "<chrom>_<stopCoordinate>"
            blatCoordinateKey = vcfChr + "_" + str(vcfStopCoordinate)

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            for modType in modTypes:

                blatHitsDict = dict()
                blatOverallReadDepth = 0
                numValidReads = 0

                # for each coordinate, get a dict of reads and corresponding blat hits
                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    blatReadsDict = i_blatCoordinateDict[blatCoordinateKey]
                    if ("rnaNormal" in blatReadsDict):
                        blatHitsDict = blatReadsDict["rnaNormal"]
                elif ((modType == "SOM" or modType == "TUM_EDIT") and i_blatRnaTumorReads):
                    blatReadsDict = i_blatCoordinateDict[blatCoordinateKey]
                    if ("rnaTumor" in blatReadsDict):
                        blatHitsDict = blatReadsDict["rnaTumor"]

                # for each read, investigate the blat hits to see if this read is valid
                for (readId, blatHitList) in blatHitsDict.items():
                    if (i_debug):
                        logging.info("num of blat hits for read %s=%s", readId, len(blatHitList))

                    blatOverallReadDepth += 1

                    # find out if the read is valid or if it maps to other places in the genome
                    if (i_blatOutputFormat == "PSL"):
                        (isValidRead, validRead) = is_valid_read_psl_format(blatHitList, vcfChr, vcfStopCoordinate, i_debug)
                    elif (i_blatOutputFormat == "BLAST"):
                        (isValidRead, validRead) = is_valid_read_blast_format(blatHitList, vcfChr, vcfStopCoordinate, 0, i_debug)
                    else:
                        # without this branch the validity flag would be
                        # unbound or left over from a previous read
                        raise ValueError("Unsupported BLAT output format: '%s' must be PSL or BLAST" % i_blatOutputFormat)

                    # if we have only one valid blat hit, then the read doesn't map to other places in the genome very well, so let's use it
                    if (isValidRead):
                        numValidReads += 1

                        if (i_debug):
                            logging.debug("ValidRead: %s", validRead)

                if (blatOverallReadDepth > 0):
                    altPercent = round(numValidReads/float(blatOverallReadDepth), 2)
                else:
                    altPercent = 0.0

                if (numValidReads < i_readDepthCutoff or altPercent < i_readPercentCutoff):
                    modTypeFilters[modType] = "blat"
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.info("blatOverallReadDepth=%s, numValidReads=%s, altPercent=%s", str(blatOverallReadDepth), str(numValidReads), str(altPercent))
                    logging.info("modType=%s, passed? %s", modType, modTypeFilters[modType])
                    logging.info("blatFilter originalDepth=%s, afterBlatDepth=%s", str(blatOverallReadDepth), str(numValidReads))

            # if at least one passed, then remove the ones that didn't; new
            # lists are built because removing items from the list that is
            # being iterated skips entries and misaligns the MT/MC pairs
            if (atLeastOnePass):
                passedMods = [(passedType, passedChange)
                              for (passedType, passedChange) in zip(modTypes, vcfInfoDict["MC"])
                              if modTypeFilters[passedType] != "blat"]
                vcfInfoDict["MT"] = [passedType for (passedType, passedChange) in passedMods]
                vcfInfoDict["MC"] = [passedChange for (passedType, passedChange) in passedMods]

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            else:
                # if the user wants to keep the previous filters
                if (i_keepPreviousFiltersFlag):
                    # if the call previous passed, then just set blat
                    if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                        vcfFilterSet = ["blat"]
                    # otherwise, add it to the previous filters
                    else:
                        vcfFilterSet.add("blat")
                # otherwise, just set the blat filter
                else:
                    vcfFilterSet = ["blat"]

            output = [vcfChr, str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore, ";".join(vcfFilterSet)]

            # add the modified info dict: empty entries are skipped, flags
            # are written as the bare key and the rest as key=value1,value2
            infoField = ""
            for key in sorted(vcfInfoDict):
                if (len(vcfInfoDict[key]) == 0):
                    continue
                elif ("True" in vcfInfoDict[key]):
                    infoField += key + ";"
                else:
                    infoField += key + "=" + ",".join(vcfInfoDict[key]) + ";"

            output.append(infoField.rstrip(";"))
            output.append(restOfLine)

            if (i_outputFileHandler is not None):
                i_outputFileHandler.write("\t".join(output) + "\n")
            else:
                sys.stdout.write("\t".join(output) + "\n")

        stopTime = time.time()
        logging.info("Total time for Id %s: Total time=%s hrs, %s mins, %s secs", i_id, ((stopTime-startTime)/(3600)), ((stopTime-startTime)/60), (stopTime-startTime))
    finally:
        # close the files, even when the processing failed part way through
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
Esempio n. 3
0
def main():
    """Parse the command line, set up logging and run the filtering.

    The positional arguments, read from ``sys.argv``, are ``id``, ``chrom``,
    ``filterFile``, ``vcfFile`` and ``filterName``; the optional arguments
    are described by the help text of the parser that is built below.  All
    of the parsed values are handed to ``filter_events``.

    The help text is printed and the script exits with status 1 when the
    number of arguments is wrong.  The script also exits with status 1 when
    ``radiaUtil.check_for_argv_errors`` returns a false value for the input
    and output filenames.

    Raises:
        ValueError: if the requested log level is not recognised.
    """

    # python filterByPybed.py TCGA-AB-2995 12 ../data/test/filterBlacklist.bed
    # ../data/test/TCGA-AB-2995.vcf blck -d FILTER --includeOverlaps
    # --includeFilterName --log=DEBUG -f "##FILTER=<ID=blck,Description=
    # \"Position overlaps 1000 Genomes Project blacklist\">"

    # create the usage statement
    usage = "usage: python %prog id chrom filterFile vcfFile filterName [Opts]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-f", "--filterHeader",
        dest="filterHeader", metavar="FILTER_HEADER",
        help="the INFO or FORMAT line to be included in the VCF header")
    i_cmdLineParser.add_option(
        "-p", "--includeOverlaps", action="store_true", default=False,
        dest="includeOverlaps",
        help="whether the events that overlap should be considered to PASS " +
             "(True) or FILTER (False), %default by default")
    i_cmdLineParser.add_option(
        "-n", "--includeFilterName", action="store_true", default=False,
        dest="includeFilterName",
        help="whether the filter name should be included in the INFO or " +
             "FILTER fields of the VCF output, %default by default")
    i_cmdLineParser.add_option(
        "-c", "--includeFilterCount", action="store_true", default=False,
        dest="includeFilterCount",
        help="whether the number of overlaps with the filters should be " +
             "included, %default by default")
    i_cmdLineParser.add_option(
        "-d", "--filterField", default="FILTER",
        dest="filterField",
        help="the column where the filter name should be included " +
             "(e.g. INFO or FILTER) in the VCF output, %default by default")
    i_cmdLineParser.add_option(
        "-s", "--idField", default="ID",
        dest="idField",
        help="the column where the id should be included " +
             "(e.g. ID or INFO) in the VCF output, %default by default")
    i_cmdLineParser.add_option(
        "-i", "--includeIdName", action="store_true", default=False,
        dest="includeIdName",
        help="whether the name found in the filtering file should " +
             "be included in the VCF output, %default by default")
    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, sys.stdout by default")
    i_cmdLineParser.add_option(
        "-l", "--log", default="WARNING",
        dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    # logging.basicConfig() logs to sys.stderr when no filename is given
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    # type="int" so that a user supplied bin size has the same type as the
    # integer default instead of arriving as a string
    i_cmdLineParser.add_option(
        "-b", "--binSize", type="int",
        dest="binSize", default=int(10000), metavar="BIN_SIZE",
        help="the size of the interval between each bin, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 29, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw argv count above also counts the options, so make sure that
    # all five positional arguments really exist before indexing into them
    if (len(i_cmdLineArgs) < 5):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    i_id = str(i_cmdLineArgs[0])
    i_chr = str(i_cmdLineArgs[1])
    i_filterFilename = str(i_cmdLineArgs[2])
    i_vcfFilename = str(i_cmdLineArgs[3])
    i_filterName = str(i_cmdLineArgs[4])

    # get the optional params with default values
    i_includeOverlapsFlag = i_cmdLineOptions.includeOverlaps
    i_includeFilterName = i_cmdLineOptions.includeFilterName
    i_includeFilterCount = i_cmdLineOptions.includeFilterCount
    i_filterField = i_cmdLineOptions.filterField
    i_idField = i_cmdLineOptions.idField
    i_includeIdName = i_cmdLineOptions.includeIdName
    i_logLevel = i_cmdLineOptions.logLevel
    i_binSize = i_cmdLineOptions.binSize

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_filterHeader = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
    if (i_cmdLineOptions.filterHeader is not None):
        i_filterHeader = str(i_cmdLineOptions.filterHeader)

    # assuming logLevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level here; passing it as a second argument to
        # ValueError would leave the '%s' placeholder in the message
        raise ValueError(("Invalid log level: '%s' must be one of the " +
                          "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
                         % i_logLevel)

    # set up the logging; the log filename is either None or a string, so
    # test for None to decide between the file and the default stream
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # do some debugging
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chr=%s", i_chr)
        logging.debug("filterFile=%s", i_filterFilename)
        logging.debug("vcfFile=%s", i_vcfFilename)
        logging.debug("output=%s", i_outputFilename)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("filterName=%s", i_filterName)
        logging.debug("filterHeader=%s", i_filterHeader)
        logging.debug("includeOverlapsFlag=%s", i_includeOverlapsFlag)
        logging.debug("includeFilterName=%s", i_includeFilterName)
        logging.debug("includeFilterCount=%s", i_includeFilterCount)
        logging.debug("filterField=%s", i_filterField)
        logging.debug("idField=%s", i_idField)
        logging.debug("includeIdName=%s", i_includeIdName)
        logging.debug("binSize=%s", i_binSize)

    # check for any errors
    writeFilenameList = []
    if (i_outputFilename is not None):
        writeFilenameList += [i_outputFilename]
    if (i_logFilename is not None):
        writeFilenameList += [i_logFilename]

    readFilenameList = [i_filterFilename, i_vcfFilename]
    if (not radiaUtil.check_for_argv_errors(None,
                                            readFilenameList,
                                            writeFilenameList)):
        sys.exit(1)

    filter_events(i_id, i_chr, i_filterFilename, i_vcfFilename,
                  i_outputFilename, i_filterName, i_filterField,
                  i_includeOverlapsFlag, i_includeFilterName, i_idField,
                  i_includeIdName, i_includeFilterCount, i_filterHeader,
                  i_binSize, i_debug)

    return
Esempio n. 4
0
def main():
    """
    Command-line entry point that applies the BLAT filter to a VCF file.

    The positional arguments are read as: id, vcfFile, blatInputFile,
    blatOutputFile.  For every call yielded by get_vcf_data() and for every
    modification type listed in its "MT" INFO entry, the reads stored in the
    parsed BLAT output for that coordinate are counted:
      * "NOR_EDIT" uses the "rnaNormal" reads when --blatRnaNormalReads is set
      * "SOM" and "TUM_EDIT" use the "rnaTumor" reads when
        --blatRnaTumorReads is set
    A modification type passes when the number of valid reads reaches
    --readDepthCutoff and the fraction of valid reads reaches
    --readPercentCutoff, otherwise it is tagged "blat".  Each call is then
    written, tab-separated, to --outputFilename or to STDOUT.

    Raises:
        ValueError: if --log does not name a logging level.
        SystemExit: with status 1 when the arguments are unusable.
    """

    # NOTE(review): the example command that used to live here also passed a
    # chromosome and --dnaNormalFilename, neither of which this parser reads.
    # general form:
    # python filterByBlat.py <id> <vcfFile> <blatInputFile> <blatOutputFile> [Options]

    startTime = time.time()

    # create the usage statement, it lists exactly the four positional
    # arguments that are read below
    usage = "usage: python %prog id vcfFile blatInputFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters",
        action="store_true", default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat filter, include this argument if the previous filters should be kept")

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT",
        default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    # logging.basicConfig() without a filename logs to STDERR
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads",
        action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads",
        action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads",
        action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads",
        action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads should be processed")

    i_cmdLineParser.add_option(
        "-d", "--readDepthCutoff",
        type="int", default=4,
        dest="readDepthCutoff", metavar="READ_DP_CUTOFF",
        help="the minimum number of valid reads that are necessary, %default by default")
    i_cmdLineParser.add_option(
        "-p", "--readPercentCutoff",
        type="float", default=0.10,
        dest="readPercentCutoff", metavar="READ_PERCENT_CUTOFF",
        help="the minimum percentage of valid reads that are necessary, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length of sys.argv also counts the options, so make sure that all
    # four positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 4):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatInputFilename = str(i_cmdLineArgs[2])
    i_blatOutputFilename = str(i_cmdLineArgs[3])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = i_cmdLineOptions.keepPreviousFilters
    i_blatOutputFormat = i_cmdLineOptions.blatOutputFormat
    i_logLevel = i_cmdLineOptions.logLevel
    i_readDepthCutoff = i_cmdLineOptions.readDepthCutoff
    i_readPercentCutoff = i_cmdLineOptions.readPercentCutoff

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, ValueError does not interpolate its args
        raise ValueError(
            "Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL"
            % i_logLevel)

    # set up the logging, a file is only used when one was requested
    loggingConfig = dict(level=i_numericLogLevel,
                         format='%(asctime)s\t%(levelname)s\t%(message)s',
                         datefmt='%m/%d/%Y %I:%M:%S %p')
    if (i_logFilename is not None):
        loggingConfig.update(filename=i_logFilename, filemode='w')
    logging.basicConfig(**loggingConfig)

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatInputFilename=%s", i_blatInputFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("readDepthCutoff=%s", i_readDepthCutoff)
        logging.debug("readPerentCutoff=%s", i_readPercentCutoff)

    # check for any errors; both the output file and the log file have to be
    # checked, so append to the list instead of replacing it
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList += [i_outputFilename]
    if (i_logFilename is not None):
        i_writeFilenameList += [i_logFilename]

    i_readFilenameList = [
        i_vcfFilename, i_blatInputFilename, i_blatOutputFilename
    ]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    # NOTE(review): called unqualified here while radiaUtil is used for the
    # argument check above - confirm the name is imported into this module
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = get_write_fileHandler(i_outputFilename)

    try:
        # get the BLAT results
        i_blatCoordinateDict = parse_blat_output(i_blatOutputFilename,
                                                 i_blatOutputFormat, i_debug)

        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_passedVCFCallsOnlyFlag,
                                      i_debug)

        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore,
             vcfFilterSet, vcfInfoDict, restOfLine) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s", vcfChr,
                              str(vcfStopCoordinate), vcfId,
                              vcfRef, vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            # the BLAT results are keyed by "<chrom>_<coordinate>"
            coordinateKey = vcfChr + "_" + str(vcfStopCoordinate)

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            for modType in modTypes:

                blatHitsDict = dict()
                blatOverallReadDepth = 0
                numValidReads = 0

                # for each coordinate, get a dict of reads and the
                # corresponding blat hits; it stays empty when the reads for
                # this modification type were not requested
                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    if ("rnaNormal" in i_blatCoordinateDict[coordinateKey]):
                        blatHitsDict = i_blatCoordinateDict[coordinateKey]["rnaNormal"]
                elif ((modType == "SOM" or modType == "TUM_EDIT")
                      and i_blatRnaTumorReads):
                    if ("rnaTumor" in i_blatCoordinateDict[coordinateKey]):
                        blatHitsDict = i_blatCoordinateDict[coordinateKey]["rnaTumor"]

                # for each read, investigate the blat hits to see if this read is valid
                for (readId, blatHitList) in blatHitsDict.items():
                    if (i_debug):
                        logging.debug("num of blat hits for read %s=%s", readId,
                                      len(blatHitList))

                    blatOverallReadDepth += 1

                    # find out if the read is valid or if it maps to other places in the genome
                    # NOTE: any format other than PSL or BLAST leaves
                    # isValidRead unbound
                    if (i_blatOutputFormat == "PSL"):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList, vcfChr, vcfStopCoordinate, i_debug)
                    elif (i_blatOutputFormat == "BLAST"):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList, vcfChr, vcfStopCoordinate, 0, i_debug)

                    # if we have only one valid blat hit, then the read doesn't map to other places in the genome very well, so let's use it
                    if (isValidRead):
                        numValidReads += 1

                        if (i_debug):
                            logging.debug("ValidRead: %s", validRead)

                if (blatOverallReadDepth > 0):
                    altPercent = round(
                        numValidReads / float(blatOverallReadDepth), 2)
                else:
                    altPercent = 0.0

                if (numValidReads < i_readDepthCutoff
                        or altPercent < i_readPercentCutoff):
                    modTypeFilters[modType] = "blat"
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.debug(
                        "blatOverallReadDepth=%s, numValidReads=%s, altPercent=%s",
                        str(blatOverallReadDepth), str(numValidReads),
                        str(altPercent))
                    logging.debug("modType=%s, passed? %s", modType,
                                  modTypeFilters[modType])
                    logging.debug(
                        "blatFilter originalDepth=%s, afterBlatDepth=%s",
                        str(blatOverallReadDepth), str(numValidReads))

            # work on copies so that nothing is removed from a list while it
            # is being iterated; MT and MC are treated as parallel lists
            keptModTypes = list(modTypes)
            keptModChanges = list(vcfInfoDict["MC"])

            # if at least one passed, then remove the ones that didn't
            if (atLeastOnePass):
                passingPairs = [
                    (pairType, pairChange)
                    for (pairType, pairChange) in zip(keptModTypes,
                                                      keptModChanges)
                    if modTypeFilters[pairType] != "blat"
                ]
                keptModTypes = [pair[0] for pair in passingPairs]
                keptModChanges = [pair[1] for pair in passingPairs]

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = keptModTypes
            vcfInfoDict["MC"] = keptModChanges

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            else:
                # if the user wants to keep the previous filters
                if (i_keepPreviousFiltersFlag):
                    # if the call previous passed, then just set blat
                    if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                        vcfFilterSet = ["blat"]
                    # otherwise, add it to the previous filters
                    else:
                        vcfFilterSet.add("blat")
                # otherwise, just set the blat filter
                else:
                    vcfFilterSet = ["blat"]

                # update the mod filters
                modTypes = vcfInfoDict["MT"]
                modChanges = vcfInfoDict["MC"]
                origins = vcfInfoDict["ORIGIN"]
                modFilters = [] if vcfInfoDict["MF"] is None else vcfInfoDict["MF"]
                modFilterTypes = [] if vcfInfoDict["MFT"] is None else vcfInfoDict["MFT"]

                for origin in origins:
                    for (modType, modChange) in zip(modTypes, modChanges):
                        modFilterTypes.append("_".join(
                            [origin, modType, modChange]))
                        modFilters.append("_".join(vcfFilterSet))

                vcfInfoDict["MF"] = modFilters
                vcfInfoDict["MFT"] = modFilterTypes

            output = [
                vcfChr,
                str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore,
                ";".join(vcfFilterSet)
            ]

            # add the modified info dict: empty entries are skipped, entries
            # containing "True" are written as a bare key, all others as
            # key=comma,separated,values
            infoParts = []
            for key in sorted(vcfInfoDict):
                if (len(vcfInfoDict[key]) == 0):
                    continue
                elif ("True" in vcfInfoDict[key]):
                    infoParts.append(key)
                else:
                    infoParts.append(key + "=" + ",".join(vcfInfoDict[key]))

            output.append(";".join(infoParts).rstrip(";"))
            output.append(restOfLine)

            if (i_outputFileHandler is not None):
                i_outputFileHandler.write("\t".join(output) + "\n")
            else:
                sys.stdout.write("\t".join(output) + "\n")

        stopTime = time.time()
        logging.info(
            "Total time for Id %s: Total time=%s hrs, %s mins, %s secs",
            i_id, ((stopTime - startTime) / (3600)),
            ((stopTime - startTime) / 60), (stopTime - startTime))
    finally:
        # close the files, also when the processing failed half way through
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
def main():
    """
    Command-line entry point that merges a VCF of passing calls back into
    the original VCF.

    The positional arguments are: passingFile, originalFile, outputFile.
    The output starts with the header lines of the original file, followed
    by the info and filter header lines of the original file, each extended
    by the lines that only occur in the passing file, and the column header
    line.  Then, for every coordinate of the original file in numeric order,
    the line from the passing file is written when that coordinate is in the
    passing file, otherwise the line from the original file.

    Raises:
        ValueError: if --log does not name a logging level.
        SystemExit: with status 1 when the arguments are unusable.
    """

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog passingFile originalFile outputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log", default="WARNING",
        dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above also lets two positional arguments through,
    # so make sure that all three are there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_passingFilename = i_cmdLineArgs[0]
    i_originalFilename = i_cmdLineArgs[1]
    i_outputFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, ValueError does not interpolate its args
        raise ValueError(("Invalid log level: '%s' must be one of the " +
                          "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
                         % i_logLevel)

    # set up the logging, a file is only used when one was requested
    loggingConfig = dict(level=i_numericLogLevel,
                         format='%(asctime)s\t%(levelname)s\t%(message)s',
                         datefmt='%m/%d/%Y %I:%M:%S %p')
    if (i_logFilename is not None):
        loggingConfig.update(filename=i_logFilename, filemode='w')
    logging.basicConfig(**loggingConfig)

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("passingFile=%s", i_passingFilename)
        logging.debug("originalFile=%s", i_originalFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    i_readFilenameList = [i_passingFilename, i_originalFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF data of the passing file
    (passHeaderList,
     chromLine,
     passInfoList,
     passFilterList,
     passCoordinateDict) = get_vcf_data(i_passingFilename, i_debug)

    # get the VCF data of the original file; the column header line of the
    # original file replaces the one of the passing file
    (orgHeaderList,
     chromLine,
     orgInfoList,
     orgFilterList,
     orgCoordinateDict) = get_vcf_data(i_originalFilename, i_debug)

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    try:
        for headerLine in orgHeaderList:
            outputFileHandler.write(headerLine + "\n")

        # the info lines of the original file come first, followed by the
        # ones that were only added to the passing file
        for headerLine in orgInfoList:
            outputFileHandler.write(headerLine + "\n")

        orgInfoSet = set(orgInfoList)
        for headerLine in passInfoList:
            if (headerLine not in orgInfoSet):
                outputFileHandler.write(headerLine + "\n")

        # same for the filter lines
        for headerLine in orgFilterList:
            outputFileHandler.write(headerLine + "\n")

        orgFilterSet = set(orgFilterList)
        for headerLine in passFilterList:
            if (headerLine not in orgFilterSet):
                outputFileHandler.write(headerLine + "\n")

        outputFileHandler.write(chromLine + "\n")

        # walk over the coordinates of the original file in numeric order
        # and prefer the line of the passing file when there is one
        # NOTE(review): the lines are written without adding a newline, so
        # presumably get_vcf_data() keeps the line ending - confirm
        for coordinate in sorted(orgCoordinateDict, key=int):
            if (coordinate in passCoordinateDict):
                line = passCoordinateDict[coordinate]
            else:
                line = orgCoordinateDict[coordinate]
            outputFileHandler.write(line)

        stopTime = time.time()
        logging.info("Total time=%s hrs, %s mins, %s secs",
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))
    finally:
        # close the files, also when the writing failed half way through
        outputFileHandler.close()

    return
Esempio n. 6
0
def main():
    """
    Command-line entry point that collects the reads behind VCF calls into
    a BLAT query file.

    The positional arguments are: id, vcfFile, headerFile.  For every call
    yielded by get_vcf_data() and for every modification type listed in its
    "MT" INFO entry, write_to_blat_file() is called for each read source that
    was requested on the command line:
      * "dnaNormal" when --blatDnaNormalReads is set
      * "rnaNormal" when the type is "NOR_EDIT" and --blatRnaNormalReads is set
      * "dnaTumor" when --blatDnaTumorReads is set
      * "rnaTumor" when the type is "SOM" or "TUM_EDIT" and
        --blatRnaTumorReads is set
    For the RNA sources, the transcript name, coordinate and strand INFO
    entries are passed on when --transcriptNameTag is given and present in
    the call, otherwise the genomic chromosome and coordinate are used.

    Raises:
        ValueError: if --log does not name a logging level.
        SystemExit: with status 1 when the arguments are unusable.
    """

    # NOTE(review): the example command that used to live here passed a
    # chromosome and --dnaNormalFilename, neither of which this parser reads.
    # general form:
    # python createBlatFile.py <id> <vcfFile> <headerFile> [Options]

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id vcfFile headerFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option("-c", "--allVCFCalls", action="store_false", default=True, dest="passedVCFCallsOnly", help="by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed")
    i_cmdLineParser.add_option("-b", "--allReadBases", action="store_false", default=True, dest="altBasesOnly", help="by default only the reads with the alternate base are processed, include this argument if all of the reads should be processed")
    i_cmdLineParser.add_option("-d", "--maxReadDepth", type="int", default=8000, dest="maxReadDepth", metavar="MAX_READ_DEPTH", help="the maximum read depth to process from the samtools view command, %default by default")

    i_cmdLineParser.add_option("-o", "--outputFilename", dest="outputFilename", metavar="OUTPUT_FILE", help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option("-l", "--log", dest="logLevel", default="WARNING", metavar="LOG", help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    # logging.basicConfig() without a filename logs to STDERR
    i_cmdLineParser.add_option("-g", "--logFilename", dest="logFilename", metavar="LOG_FILE", help="the name of the log file, STDERR by default")

    # these options only have a long form
    i_cmdLineParser.add_option("--transcriptNameTag", dest="transcriptNameTag", help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option("--transcriptCoordinateTag", dest="transcriptCoordinateTag", help="the INFO key where the original transcript coordinate can be found")
    i_cmdLineParser.add_option("--transcriptStrandTag", dest="transcriptStrandTag", help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option("--rnaIncludeSecondaryAlignments", action="store_true", default=False, dest="rnaIncludeSecondaryAlignments", help="if you align the RNA to transcript isoforms, then you may want to include RNA secondary alignments in the samtools mpileups")

    i_cmdLineParser.add_option("-n", "--blatDnaNormalReads", action="store_true", default=False, dest="blatDnaNormalReads", help="include this argument if the normal DNA reads should be processed")
    i_cmdLineParser.add_option("-x", "--blatRnaNormalReads", action="store_true", default=False, dest="blatRnaNormalReads", help="include this argument if the normal RNA reads should be processed")
    i_cmdLineParser.add_option("-t", "--blatDnaTumorReads", action="store_true", default=False, dest="blatDnaTumorReads", help="include this argument if the tumor DNA reads should be processed")
    i_cmdLineParser.add_option("-r", "--blatRnaTumorReads", action="store_true", default=False, dest="blatRnaTumorReads", help="include this argument if the tumor RNA reads should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3,22,1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above also lets two positional arguments through,
    # so make sure that all three are there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_vcfFilename = i_cmdLineArgs[1]
    i_headerFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_altBasesOnlyFlag = i_cmdLineOptions.altBasesOnly
    i_maxReadDepth = i_cmdLineOptions.maxReadDepth
    i_logLevel = i_cmdLineOptions.logLevel
    i_rnaIncludeSecondaryAlignments = i_cmdLineOptions.rnaIncludeSecondaryAlignments

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_readFilenameList = [i_vcfFilename, i_headerFilename]
    i_writeFilenameList = []

    i_logFilename = None
    i_outputFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
        i_writeFilenameList += [i_logFilename]
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
        i_writeFilenameList += [i_outputFilename]

    # the tags are None when they were not specified
    i_transcriptNameTag = i_cmdLineOptions.transcriptNameTag
    i_transcriptCoordinateTag = i_cmdLineOptions.transcriptCoordinateTag
    i_transcriptStrandTag = i_cmdLineOptions.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, ValueError does not interpolate its args
        raise ValueError("Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging, a file is only used when one was requested
    loggingConfig = dict(level=i_numericLogLevel, format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    if (i_logFilename is not None):
        loggingConfig.update(filename=i_logFilename, filemode='w')
    logging.basicConfig(**loggingConfig)

    # set the debug flag
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("headerFilename=%s", i_headerFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("altBasesOnlyFlag? %s", i_altBasesOnlyFlag)
        logging.debug("maxReadDepth %s", i_maxReadDepth)

        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaIncludeSecondaryAlignments=%s", i_rnaIncludeSecondaryAlignments)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList, i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, it stays None when no output file was given
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    try:
        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_headerFilename, i_passedVCFCallsOnlyFlag, i_debug)

        # for each VCF call that should be investigated
        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore, vcfFilterSet, vcfInfoDict, restOfLine, vcfParamsDict) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s", vcfChr, str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore, str(vcfFilterSet), str(vcfInfoDict), restOfLine)

            modTypes = vcfInfoDict["MT"]
            for modType in modTypes:

                # get the reads contributing to a call and put them in a blat query file
                if (i_blatDnaNormalReads):
                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate,
                                       [vcfChr], [vcfStopCoordinate], [None],
                                       vcfParamsDict, vcfInfoDict, "dnaNormal",
                                       i_altBasesOnlyFlag, False, i_maxReadDepth, i_debug)

                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and (i_transcriptNameTag in vcfInfoDict)):
                        readNames = vcfInfoDict[i_transcriptNameTag]
                        readCoordinates = vcfInfoDict[i_transcriptCoordinateTag]
                        readStrands = vcfInfoDict[i_transcriptStrandTag]
                    # otherwise use the genomic location of the call
                    else:
                        readNames = [vcfChr]
                        readCoordinates = [vcfStopCoordinate]
                        readStrands = [None]

                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate,
                                       readNames, readCoordinates, readStrands,
                                       vcfParamsDict, vcfInfoDict, "rnaNormal",
                                       i_altBasesOnlyFlag, i_rnaIncludeSecondaryAlignments,
                                       i_maxReadDepth, i_debug)

                if (i_blatDnaTumorReads):
                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate,
                                       [vcfChr], [vcfStopCoordinate], [None],
                                       vcfParamsDict, vcfInfoDict, "dnaTumor",
                                       i_altBasesOnlyFlag, False, i_maxReadDepth, i_debug)

                if ((modType == "SOM" or modType == "TUM_EDIT") and i_blatRnaTumorReads):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and (i_transcriptNameTag in vcfInfoDict)):
                        # the transcript names are handed over as a new list
                        readNames = list(vcfInfoDict[i_transcriptNameTag])
                        readCoordinates = vcfInfoDict[i_transcriptCoordinateTag]
                        readStrands = vcfInfoDict[i_transcriptStrandTag]
                    # otherwise use the genomic location of the call
                    else:
                        readNames = [vcfChr]
                        readCoordinates = [vcfStopCoordinate]
                        readStrands = [None]

                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate,
                                       readNames, readCoordinates, readStrands,
                                       vcfParamsDict, vcfInfoDict, "rnaTumor",
                                       i_altBasesOnlyFlag, i_rnaIncludeSecondaryAlignments,
                                       i_maxReadDepth, i_debug)

        stopTime = time.time()
        logging.info("createBlatFile.py Id %s: Total time=%s hrs, %s mins, %s secs", i_id, ((stopTime-startTime)/(3600)), ((stopTime-startTime)/60), (stopTime-startTime))
    finally:
        # close the files, also when the processing failed half way through
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
Esempio n. 7
0
def main():
    """Parse the filterByPybed.py command line and run filter_events().

    Five positional arguments are required, in this order: id, chrom,
    filterFile, vcfFile and filterName.  The optional flags are defined
    below.  The usage is printed and the script exits with status 1 when
    the number of arguments is wrong or when
    radiaUtil.check_for_argv_errors() rejects the input/output file names.

    Raises:
        ValueError: if the --log value is not the name of a logging level.
    """

    #python filterByPybed.py TCGA-AB-2995 12 ../data/test/filterBlacklist.bed ../data/test/TCGA-AB-2995.vcf blck -d FILTER --includeOverlaps --includeFilterName --log=DEBUG -f "##FILTER=<ID=blck,Description=\"Position overlaps 1000 Genomes Project blacklist\">"

    # create the usage statement
    usage = "usage: python %prog id chrom filterFile vcfFile filterName [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-f",
        "--filterHeader",
        dest="filterHeader",
        metavar="FILTER_HEADER",
        help="the INFO or FORMAT line that should be included in the VCF header"
    )
    i_cmdLineParser.add_option(
        "-p",
        "--includeOverlaps",
        action="store_true",
        default=False,
        dest="includeOverlaps",
        help=
        "whether the events that overlap should be considered to PASS (True) or FILTER (False), %default by default"
    )
    i_cmdLineParser.add_option(
        "-n",
        "--includeFilterName",
        action="store_true",
        default=False,
        dest="includeFilterName",
        help=
        "whether the filter name should be included in the INFO or FILTER fields of the VCF output, %default by default"
    )
    i_cmdLineParser.add_option(
        "-c",
        "--includeFilterCount",
        action="store_true",
        default=False,
        dest="includeFilterCount",
        help=
        "whether the number of overlaps with the filters should be included, %default by default"
    )
    i_cmdLineParser.add_option(
        "-d",
        "--filterField",
        default="FILTER",
        dest="filterField",
        help=
        "the column where the filter name should be included (INFO or FILTER) field of the VCF output, %default by default"
    )
    i_cmdLineParser.add_option(
        "-i",
        "--includeIdName",
        action="store_true",
        default=False,
        dest="includeIdName",
        help=
        "whether the name found in the filtering file should be included in the ID field of the VCF output, %default by default"
    )
    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, sys.stdout by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help=
        "the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default"
    )
    # logging.basicConfig() without a filename logs to STDERR
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    # type="int" so that a user supplied value has
    # the same type as the integer default
    i_cmdLineParser.add_option(
        "-b",
        "--binSize",
        type="int",
        dest="binSize",
        default=int(10000),
        metavar="BIN_SIZE",
        help="the size of the interval between each bin, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the sys.argv check above counts the script name and the optional
    # flags as well, so it can't guarantee that all five positional
    # arguments were given; verify that before indexing into the list
    if (len(i_cmdLineArgs) < 5):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_chr = str(i_cmdLineArgs[1])
    i_filterFilename = str(i_cmdLineArgs[2])
    i_vcfFilename = str(i_cmdLineArgs[3])
    i_filterName = str(i_cmdLineArgs[4])

    # get the optional params with default values
    i_includeOverlapsFlag = i_cmdLineOptions.includeOverlaps
    i_includeFilterName = i_cmdLineOptions.includeFilterName
    i_includeFilterCount = i_cmdLineOptions.includeFilterCount
    i_filterField = i_cmdLineOptions.filterField
    i_includeIdName = i_cmdLineOptions.includeIdName
    i_logLevel = i_cmdLineOptions.logLevel
    i_binSize = i_cmdLineOptions.binSize

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_filterHeader = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
    if (i_cmdLineOptions.filterHeader is not None):
        i_filterHeader = str(i_cmdLineOptions.filterHeader)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # exceptions don't interpolate their arguments the way the
        # logging calls do, so format the message explicitly
        raise ValueError(
            "Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL"
            % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # do some debugging
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chr=%s", i_chr)
        logging.debug("filterFile=%s", i_filterFilename)
        logging.debug("vcfFile=%s", i_vcfFilename)
        logging.debug("output=%s", i_outputFilename)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("filterName=%s", i_filterName)
        logging.debug("filterHeader=%s", i_filterHeader)
        logging.debug("includeOverlapsFlag=%s", i_includeOverlapsFlag)
        logging.debug("includeFilterName=%s", i_includeFilterName)
        logging.debug("includeFilterCount=%s", i_includeFilterCount)
        logging.debug("filterField=%s", i_filterField)
        logging.debug("includeIdName=%s", i_includeIdName)
        logging.debug("binSize=%s", i_binSize)

    # check for any errors
    writeFilenameList = []
    if (i_outputFilename is not None):
        writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        writeFilenameList.append(i_logFilename)

    readFilenameList = [i_filterFilename, i_vcfFilename]
    if (not radiaUtil.check_for_argv_errors(None, readFilenameList,
                                            writeFilenameList)):
        sys.exit(1)

    filter_events(i_id, i_chr, i_filterFilename, i_vcfFilename,
                  i_outputFilename, i_filterName, i_filterField,
                  i_includeOverlapsFlag, i_includeFilterName, i_includeIdName,
                  i_includeFilterCount, i_filterHeader, i_binSize, i_debug)

    return
Esempio n. 8
0
def main():
    """Apply the RNA gene and RNA gene family blacklist filters to a VCF.

    Three positional arguments are required, in this order: vcfFile,
    rnaGeneFile and rnaGeneFamilyFile.  Header lines are copied to the
    output and the "rgene"/"rgfam" FILTER definitions are inserted in
    front of the first ##FILTER line.  For every data line that is
    processed, the EFF entries of the INFO column are inspected: "rgene"
    is added to the FILTER column when a blacklisted gene name is
    contained in the gene name of an effect, and "rgfam" is added when
    the transcript biotype of an effect is on the gene family blacklist.

    The usage is printed and the script exits with status 1 when the
    number of arguments is wrong or when
    radiaUtil.check_for_argv_errors() rejects the file names.

    Raises:
        ValueError: if the --log value is not the name of a logging level.
    """

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Opts]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename", default=sys.stdout,
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    # logging.basicConfig() without a filename logs to STDERR
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the sys.argv check above counts the script name and the optional
    # flags as well, so it can't guarantee that all three positional
    # arguments were given; verify that before indexing into the list
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # the output defaults to the sys.stdout stream object itself; it must
    # not be converted to a string, otherwise the identity checks against
    # sys.stdout below could never detect the default
    i_outputFilename = i_cmdLineOptions.outputFilename
    if (i_outputFilename is not sys.stdout):
        i_outputFilename = str(i_outputFilename)

    # try to get any optional parameters with no defaults
    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # exceptions don't interpolate their arguments the way the
        # logging calls do, so format the message explicitly
        raise ValueError(("Invalid log level: '%s' must be one of the " +
                          "following:  DEBUG, INFO, WARNING, ERROR, " +
                          "CRITICAL") % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors; both the output file and
    # the log file need to be checked when both are given
    i_writeFilenameList = []
    if (i_outputFilename is not sys.stdout):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename,
                          i_rnaGeneFilename,
                          i_rnaGeneFamilyFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the input stream
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    # these don't change from line to line, so set them up only once
    effectRegEx = re.compile(r"(\w).*\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    try:
        # get the RNA gene blacklists
        (i_rnaGeneList,
         i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                              i_rnaGeneFamilyFilename,
                                              i_debug)

        hasAddedFilterHeader = False

        for line in i_vcfFileHandler:

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # if we find the FILTER section, then add the filters from here
            elif ((not hasAddedFilterHeader) and
                  (line.startswith("##FILTER"))):
                hasAddedFilterHeader = True
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgene,Description=\"This gene is on the " +
                    "RNA gene blacklist\">\n")
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgfam,Description=\"This gene family is " +
                    "on the RNA gene family blacklist\">\n")
                i_outputFileHandler.write(line)

            # these lines are from previous scripts
            # in the pipeline, so output them
            elif (line.startswith("#")):
                i_outputFileHandler.write(line)

            # if we are only suppose to process the passed calls
            # and this call has not passed, then skip it
            elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputFileHandler.write(line)

            # now we are to the data
            else:

                # strip the carriage return and newline characters
                line = line.rstrip("\r\n")

                # split the line on the tab
                splitLine = line.split("\t")

                filterSet = set(splitLine[6].split(";"))

                # if there are no filters so far, then clear the list
                if (len(filterSet) == 1 and "PASS" in filterSet):
                    filterSet = set()

                # parse the info column and create a dict
                infoList = splitLine[7].split(";")
                infoDict = collections.defaultdict(list)
                for info in infoList:
                    keyValueList = info.split("=")
                    # some keys are just singular
                    # without a value (e.g. DB, etc.)
                    if (len(keyValueList) == 1):
                        infoDict[keyValueList[0]] = ["True"]
                    else:
                        # the value can be a comma separated list
                        infoDict[keyValueList[0]] = keyValueList[1].split(",")

                isRnaBlacklistGene = False
                isRnaBlacklistGeneFamily = False

                for rawEffect in infoDict["EFF"]:
                    rawEffect = rawEffect.rstrip(")")

                    # reset the effect for every entry, so that an entry
                    # without a match doesn't reuse the effect name of
                    # the previous entry (or fail on an unbound variable)
                    effect = None

                    # for each match object in the iterator
                    for match in effectRegEx.finditer(rawEffect):
                        effect = match.group()
                        rawEffect = rawEffect.replace(effect, "")
                        effect = effect.rstrip("(")

                    if (effect in ignoreEffectsList):
                        continue

                    effectParts = rawEffect.split("|")
                    # effectImpact = effectParts[0]
                    # functionalClass = effectParts[1]
                    # codonChange = effectParts[2]
                    # aaChange = effectParts[3]
                    # aaLength = effectParts[4]
                    # geneName = effectParts[5]
                    # transcriptBiotype = effectParts[6]
                    # geneCoding = effectParts[7]
                    # ensembleId = effectParts[8]
                    # exonNumber = effectParts[9]
                    # genotypeNumber = effectParts[10]

                    # the gene name and biotype are needed below, so an
                    # entry without them can't be evaluated
                    if (len(effectParts) < 7):
                        logging.warning("Skipping malformed EFF entry: %s",
                                        rawEffect)
                        continue

                    geneName = effectParts[5]
                    transcriptBiotype = effectParts[6]

                    # the RNA gene list can have "RP11" and that
                    # should filter out any gene with RP11 in it
                    for rnaGene in i_rnaGeneList:
                        if (rnaGene in geneName):
                            isRnaBlacklistGene = True
                            break

                    if (transcriptBiotype in i_rnaGeneFamilyList):
                        isRnaBlacklistGeneFamily = True

                output = ["\t".join(splitLine[0:6])]

                # if the filter should be applied
                if (isRnaBlacklistGene):
                    filterSet.add("rgene")
                # if the filter should be applied
                if (isRnaBlacklistGeneFamily):
                    filterSet.add("rgfam")

                # if there are no filters so far, then this call passes
                if (len(filterSet) == 0):
                    filterSet.add("PASS")

                # sort the filters so that the output is reproducible
                output.append(";".join(sorted(filterSet)))

                output.append("\t".join(splitLine[7:]))

                # the handler is sys.stdout when no output file was
                # specified, so one write call covers both cases
                i_outputFileHandler.write("\t".join(output) + "\n")

    finally:
        # close the files, even when an error occurred
        i_vcfFileHandler.close()
        if (i_outputFilename is not sys.stdout):
            i_outputFileHandler.close()

    return
Esempio n. 9
0
def main():

    # command for running this on a small test case:
    # python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf
    # ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c",
        "--allVCFCalls",
        action="store_false",
        default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
        "thus far are processed, include this argument if all of the " +
        "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k",
        "--keepPreviousFilters",
        action="store_true",
        default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat " +
        "filter, include this argument if the previous filters should " +
        "be kept")

    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        default=sys.stdout,
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option("-b",
                               "--blatOutputFormat",
                               dest="blatOutputFormat",
                               metavar="OUTPUT_FORMAT",
                               default="BLAST",
                               help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
        "%default by default")
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "",
        "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "",
        "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript" +
        "coordinate can be found")
    i_cmdLineParser.add_option(
        "",
        "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "",
        "--rnaIncludeSecondaryAlignments",
        action="store_true",
        default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
        "want to include RNA secondary alignments in the pileup")

    i_cmdLineParser.add_option(
        "-n",
        "--blatDnaNormalReads",
        action="store_true",
        default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
        "should be processed")
    i_cmdLineParser.add_option(
        "-x",
        "--blatRnaNormalReads",
        action="store_true",
        default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
        "should be processed")
    i_cmdLineParser.add_option(
        "-t",
        "--blatDnaTumorReads",
        action="store_true",
        default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
        "should be processed")
    i_cmdLineParser.add_option(
        "-r",
        "--blatRnaTumorReads",
        action="store_true",
        default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
        "should be processed")

    i_cmdLineParser.add_option(
        "-d",
        "--minReadDepth",
        type="int",
        default=int(4),
        dest="minReadDepth",
        metavar="MIN_READ_DP",
        help="the minimum number of valid reads that are necessary, " +
        "%default by default")
    i_cmdLineParser.add_option(
        "-p",
        "--minReadPercent",
        type="float",
        default=float(0.10),
        dest="minReadPercent",
        metavar="MIN_READ_PCT",
        help="the minimum percentage of valid reads that are necessary, " +
        "%default by default")
    i_cmdLineParser.add_option(
        "-m",
        "--minOrderMagnitude",
        type="int",
        default=float(0),
        dest="minOrderMagnitude",
        metavar="MIN_ORDER_MAGNITUDE",
        help="the minimum order of magnitude difference between the blat " +
        "hit at the query position vs. the next best blat hit in order " +
        "for the read to be valid, %default by default")
    '''
    i_cmdLineParser.add_option(
        "-e", "--minEValue", type="float", default=float(10e-6),
        dest="minEValue", metavar="MIN_EVALUE",
        help="the minimum e-value needed for a blat hit to be significant, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-u", "--maxIdentity", type="float", default=float(0.95),
        dest="maxIdentity", metavar="MAX_IDENTITY",
        help="the maximum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    i_cmdLineParser.add_option(
        "-l", "--minIdentity", type="float", default=float(0.5),
        dest="minIdentity", metavar="MIN_IDENTITY",
        help="the minimum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    '''

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (cmdLineOpts, i_cmdLineArgs) = i_cmdLineParser.parse_args()
    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = cmdLineOpts.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = cmdLineOpts.keepPreviousFilters
    i_blatOutputFormat = cmdLineOpts.blatOutputFormat
    i_logLevel = cmdLineOpts.logLevel
    i_rnaIncludeSecondaryAlignments = cmdLineOpts.rnaIncludeSecondaryAlignments
    i_minReadDepth = cmdLineOpts.minReadDepth
    i_minReadPercent = cmdLineOpts.minReadPercent
    i_minOrderMagnitude = cmdLineOpts.minOrderMagnitude
    # i_minEValue = cmdLineOpts.minEValue
    # i_maxIdentity = cmdLineOpts.maxIdentity
    # i_minIdentity = cmdLineOpts.minIdentity

    i_blatDnaNormalReads = cmdLineOpts.blatDnaNormalReads
    i_blatDnaTumorReads = cmdLineOpts.blatDnaTumorReads
    i_blatRnaNormalReads = cmdLineOpts.blatRnaNormalReads
    i_blatRnaTumorReads = cmdLineOpts.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_transcriptNameTag = None
    i_transcriptCoordinateTag = None
    i_transcriptStrandTag = None
    if (cmdLineOpts.outputFilename is not None):
        i_outputFilename = cmdLineOpts.outputFilename
    if (cmdLineOpts.logFilename is not None):
        i_logFilename = cmdLineOpts.logFilename
    if (cmdLineOpts.transcriptNameTag is not None):
        i_transcriptNameTag = cmdLineOpts.transcriptNameTag
    if (cmdLineOpts.transcriptCoordinateTag is not None):
        i_transcriptCoordinateTag = cmdLineOpts.transcriptCoordinateTag
    if (cmdLineOpts.transcriptStrandTag is not None):
        i_transcriptStrandTag = cmdLineOpts.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError(
            "Invalid log level: '%s' must be one of the " +
            "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL", i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaInclSecAlign=%s" % i_rnaIncludeSecondaryAlignments)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("minReadDepth=%s", i_minReadDepth)
        logging.debug("minReadPercent=%s", i_minReadPercent)
        logging.debug("minOrderMagnitude=%s", i_minOrderMagnitude)

    # check for any errors
    i_writeFilenameList = []
    if (cmdLineOpts.outputFilename is not sys.stdout):
        i_writeFilenameList = [i_outputFilename]
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    # get the VCF generator
    i_vcfGenerator = get_vcf_data(i_vcfFilename, i_passedVCFCallsOnlyFlag,
                                  i_debug)

    # get the blat hits generator
    i_blatGenerator = parse_blat_output(i_blatOutputFilename,
                                        i_blatOutputFormat, i_debug)

    for (vcfLine, blatHitsDict) in izip(i_vcfGenerator, i_blatGenerator):

        if (i_debug):
            logging.debug("VCF Line=%s", vcfLine)
            logging.debug("Len Blat Hits=%s", len(blatHitsDict))

        # parse the VCF line
        splitLine = vcfLine.split("\t")

        # the coordinate is the second element
        vcfChr = splitLine[0]
        vcfStopCoordinate = int(splitLine[1])
        vcfIds = splitLine[2]
        vcfRef = splitLine[3]
        vcfAlts = splitLine[4]
        vcfScore = splitLine[5]
        vcfFilterSet = set(splitLine[6].split(";"))
        vcfInfoList = splitLine[7].split(";")
        vcfInfoDict = collections.defaultdict(list)
        for info in vcfInfoList:
            keyValueList = info.split("=")
            # some keys are just singular without a value (e.g. DB, etc.)
            if (len(keyValueList) == 1):
                vcfInfoDict[keyValueList[0]] = ["True"]
            else:
                # the value can be a comma separated list
                vcfInfoDict[keyValueList[0]] = keyValueList[1].split(",")
        vcfRestOfLine = "\t".join(splitLine[8:])

        modTypes = vcfInfoDict["MT"]
        modTypeFilters = dict()
        atLeastOnePass = False
        for modType in modTypes:

            blatOverallReadDepth = 0
            numValidReads = 0

            prefix = ""
            if (modType == "GERM" and i_blatDnaNormalReads):
                prefix = "dnaNormal"
            elif (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                prefix = "rnaNormal"
            elif (modType == "SOM" and i_blatDnaTumorReads):
                prefix = "dnaTumor"
            elif ((modType == "SOM" or modType == "TUM_EDIT")
                  and i_blatRnaTumorReads):
                prefix = "rnaTumor"

            # get the expected prefix
            vcfKey = "_".join([prefix, vcfChr, str(vcfStopCoordinate)])

            # for each read, investigate the blat
            # hits to see if this read is valid
            for (readId, blatHitList) in blatHitsDict.iteritems():
                if (i_debug):
                    logging.debug("num of blat hits for read %s=%s", readId,
                                  len(blatHitList))

                # if the readId does not start with the vcfKey,
                # then something is wrong. the VCF and blat hits
                # need to be in sync...
                if (not readId.startswith(vcfKey)):
                    logging.error("The blat query seems to be out of sync " +
                                  "with the blat hits.")
                    logging.error("VCF Line=%s", vcfLine)
                    logging.error("readId=%s, blatHitsDict=%s", readId,
                                  blatHitsDict[readId][1])
                    sys.exit(1)

                blatOverallReadDepth += 1

                # find out if the read is valid or if it
                # maps to other places in the genome
                if (i_blatOutputFormat == "PSL"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None)
                            and (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList, vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            i_rnaIncludeSecondaryAlignments, i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList, [vcfChr], [vcfStopCoordinate],
                            i_rnaIncludeSecondaryAlignments, i_debug)

                elif (i_blatOutputFormat == "BLAST"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None)
                            and (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList, vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude, i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList, [vcfChr], [vcfStopCoordinate],
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude, i_debug)

                # if we have only one valid blat hit, then the read doesn't
                # map to other places in the genome very well, so let's use it
                if (isValidRead):
                    numValidReads += 1

                    if (i_debug):
                        logging.debug("ValidRead: %s", validRead)
                elif (i_debug):
                    logging.debug("not a valid read")

            if (blatOverallReadDepth > 0):
                tmpAltPct = numValidReads / float(blatOverallReadDepth)
                altPercent = round(tmpAltPct, 2)
            else:
                altPercent = 0.0

            if (numValidReads < i_minReadDepth
                    or altPercent < i_minReadPercent):
                modTypeFilters[modType] = "blat"
            else:
                modTypeFilters[modType] = "PASS"
                atLeastOnePass = True

            if (i_debug):
                logging.debug(
                    "blatOverallReadDepth=%s, numValidReads=%s, " +
                    "altPercent=%s", str(blatOverallReadDepth),
                    str(numValidReads), str(altPercent))
                logging.debug("modType=%s, passed? %s", modType,
                              modTypeFilters[modType])
                logging.debug("blatFilter originalDepth=%s, validBlatDepth=%s",
                              str(blatOverallReadDepth), str(numValidReads))

        # make a copy of the list to manipulate
        modTypesTmpList = list(modTypes)
        modChanges = vcfInfoDict["MC"]
        # if at least one passed, then remove the ones that didn't
        for (modType, modChange) in izip(modTypes, modChanges):
            # if at least one passed, then remove the ones that didn't
            if (atLeastOnePass):
                if (modTypeFilters[modType] == "blat"):
                    modTypesTmpList.remove(modType)
                    modChanges.remove(modChange)

        # set the modTypes and modChanges
        vcfInfoDict["MT"] = modTypesTmpList
        vcfInfoDict["MC"] = modChanges

        # if at least one passed, then set pass
        if (atLeastOnePass):
            vcfFilterSet = ["PASS"]
        else:
            # if the user wants to keep the previous filters
            if (i_keepPreviousFiltersFlag):
                # if the call previous passed, then just set blat
                if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                    vcfFilterSet = ["blat"]
                # otherwise, add it to the previous filters
                else:
                    vcfFilterSet.add("blat")
            # otherwise, just set the blat filter
            else:
                vcfFilterSet = ["blat"]

            # update the mod filters
            modTypes = vcfInfoDict["MT"]
            modChanges = vcfInfoDict["MC"]
            origins = vcfInfoDict["ORIGIN"]
            if vcfInfoDict["MF"] is None:
                modFilters = []
            else:
                modFilters = vcfInfoDict["MF"]
            if vcfInfoDict["MFT"] is None:
                modFilterTypes = []
            else:
                modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in izip(modTypes, modChanges):
                    modFilterTypes.append("_".join(
                        [origin, modType, modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

        output = [
            vcfChr,
            str(vcfStopCoordinate), vcfIds, vcfRef, vcfAlts, vcfScore,
            ";".join(vcfFilterSet)
        ]

        # add the modified info dict
        infoField = ""
        for key in sorted(vcfInfoDict.iterkeys()):
            if (len(vcfInfoDict[key]) == 0):
                continue
            elif ("True" in vcfInfoDict[key]):
                infoField += key + ";"
            else:
                infoField += key + "=" + ",".join(vcfInfoDict[key]) + ";"

        output.append(infoField.rstrip(";"))
        output.append(vcfRestOfLine)

        i_outputFileHandler.write("\t".join(output) + "\n")

    stopTime = time.time()
    logging.info(
        "filterByBlat.py for Id %s: Total time=%s hrs, %s mins, " + "%s secs",
        i_id, ((stopTime - startTime) / (3600)), ((stopTime - startTime) / 60),
        (stopTime - startTime))

    # close the files
    if (i_outputFilename is not sys.stdout):
        i_outputFileHandler.close()

    return
Esempio n. 10
0
def main():
    """
    Flag VCF calls that hit the RNA gene or RNA gene family blacklists.

    Command line:
        python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]

    Header lines are copied through unchanged, and the "rgene" and "rgfam"
    FILTER definitions are inserted in front of the first ##FILTER or ##INFO
    line.  For every call that is processed, the EFF entries of the INFO
    column are inspected:  "rgene" is added to the FILTER column when the
    gene name of an effect contains one of the entries of the RNA gene list,
    and "rgfam" is added when the transcript biotype of an effect is in the
    RNA gene family list.  Effects of type UPSTREAM and DOWNSTREAM are
    ignored.  By default only lines containing "PASS" are processed, all
    other calls are copied through as-is.

    The result is written to the --outputFilename file or to STDOUT.

    Exits with status 1 when the command line is not valid and raises a
    ValueError when the --log level is not a known logging level.
    """

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), "
             "%default by default")
    # logging.basicConfig() logs to STDERR when no filename is given
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters "
             "thus far are processed, include this argument if all of the "
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length check above also counts the script name and the
    # options, so make sure that all 3 required arguments are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, exceptions don't do lazy formatting
        raise ValueError(
            "Invalid log level: '%s' must be one of the following:  "
            "DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors; both the output and the log file are written,
    # so both of them have to be checked
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [
        i_vcfFilename, i_rnaGeneFilename, i_rnaGeneFamilyFilename
    ]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, everything goes to STDOUT by default
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = sys.stdout
    writeOutput = i_outputFileHandler.write

    vcfHeader = "##FILTER=<ID=rgene,Description=\"This gene is on the RNA gene blacklist\">\n"
    vcfHeader += "##FILTER=<ID=rgfam,Description=\"This gene family is on the RNA gene family blacklist\">\n"

    # these don't change from line to line, so set them up only once
    effectRegEx = re.compile("(\\w).*\\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    hasAddedHeader = False
    i_vcfFileHandler = None

    try:
        # get the RNA gene blacklists
        (i_rnaGeneList,
         i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                              i_rnaGeneFamilyFilename, i_debug)

        i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

        for line in i_vcfFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue.  the line
            # endings are already stripped, so an empty line is "" here
            # and "".isspace() is False
            if (not line.strip()):
                continue

            # if we find the FILTER section, then add the filters from here
            if ((not hasAddedHeader)
                    and line.startswith(("##FILTER", "##INFO"))):
                hasAddedHeader = True
                # the header already ends with a newline
                writeOutput(vcfHeader)
                writeOutput(line + "\n")
                continue

            # header lines are from previous scripts in the pipeline, so
            # output them.  if we are only suppose to process the passed
            # calls and this call has not passed, then output it as-is
            if (line.startswith("#")
                    or (i_passedVCFCallsOnlyFlag and "PASS" not in line)):
                writeOutput(line + "\n")
                continue

            # now we are to the data, split the line on the tab
            splitLine = line.split("\t")

            filterSet = set(splitLine[6].split(";"))

            # if there are no filters so far, then clear the list
            if (len(filterSet) == 1 and "PASS" in filterSet):
                filterSet = set()

            # parse the info column and create a dict
            infoDict = collections.defaultdict(list)
            for info in splitLine[7].split(";"):
                # only split on the first "=", so that a value
                # with an "=" in it does not get cut off
                keyValueList = info.split("=", 1)
                # some keys are just singular without a value
                # (e.g. DB, SOMATIC, etc.)
                if (len(keyValueList) == 1):
                    infoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    infoDict[keyValueList[0]] = keyValueList[1].split(",")

            isRnaBlacklistGene = False
            isRnaBlacklistGeneFamily = False

            for rawEffect in infoDict["EFF"]:
                rawEffect = rawEffect.rstrip(")")

                # reset the effect for every entry, so that the effect of
                # a previous entry is never used when there is no match
                effect = None

                # strip the "effect(" prefix off of the effect details
                for match in effectRegEx.finditer(rawEffect):
                    effect = match.group()
                    rawEffect = rawEffect.replace(effect, "")
                    effect = effect.rstrip("(")

                if (effect in ignoreEffectsList):
                    continue

                effectParts = rawEffect.split("|")
                # effectImpact = effectParts[0]
                # functionalClass = effectParts[1]
                # codonChange = effectParts[2]
                # aaChange = effectParts[3]
                # aaLength = effectParts[4]
                # geneName = effectParts[5]
                # transcriptBiotype = effectParts[6]
                # geneCoding = effectParts[7]
                # ensembleId = effectParts[8]
                # exonNumber = effectParts[9]
                # genotypeNumber = effectParts[10]

                # the gene name and biotype are needed below, so an entry
                # without them can't be checked against the blacklists
                if (len(effectParts) < 7):
                    logging.warning("Skipping unexpected EFF entry: %s",
                                    rawEffect)
                    continue

                geneName = effectParts[5]
                transcriptBiotype = effectParts[6]

                # the RNA gene list can have "RP11" and that
                # should filter out any gene with RP11 in it
                if (any(rnaGene in geneName for rnaGene in i_rnaGeneList)):
                    isRnaBlacklistGene = True

                if (transcriptBiotype in i_rnaGeneFamilyList):
                    isRnaBlacklistGeneFamily = True

            # if the filters should be applied
            if (isRnaBlacklistGene):
                filterSet.add("rgene")
            if (isRnaBlacklistGeneFamily):
                filterSet.add("rgfam")

            # if there are no filters so far, then this call passes
            if (len(filterSet) == 0):
                filterSet.add("PASS")

            # the columns before and after the filter stay the same
            output = ["\t".join(splitLine[0:6])]
            output.append(";".join(filterSet))
            output.append("\t".join(splitLine[7:]))

            writeOutput("\t".join(output) + "\n")

    finally:
        # close the files, even when something went wrong
        if (i_vcfFileHandler is not None):
            i_vcfFileHandler.close()
        if (i_outputFileHandler is not sys.stdout):
            i_outputFileHandler.close()

    return
Esempio n. 11
0
def main():
    """
    Merge the VCF data of the single chromosomes for an Id into one VCF.

    Command line:
        python %prog id inputDir outputDir [Options]

    The header and the calls are loaded with get_vcf_data() from the input
    directory.  The merged file is written to "<outputDir>/<id>.vcf" or to
    "<outputDir>/<id>.vcf.gz" when --gzip is specified.  The header is
    output first (metadata, filter, info, format and then the chrom line),
    followed by the calls of the numerical chroms in numerical order and
    then the calls of the remaining chroms in alphabetical order.

    Exits with status 1 when the command line is not valid and raises a
    ValueError when the --log level is not a known logging level.
    """

    # command for running this on a small test case:
    # python mergeChroms.py TCGA-BH-A18P-11A-43D-A12B-09_TCGA-BH-A18P-01A-11D-A12B-09 ../data/test/ ../data/test/ --log=DEBUG

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id inputDir outputDir [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), "
             "%default by default")
    # logging.basicConfig() logs to STDERR when no filename is given
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--gzip",
        action="store_true", default=False,
        dest="gzip",
        help="include this argument if the final VCF should be compressed "
             "with gzip")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length check above also counts the script name and the
    # options, so make sure that all 3 required arguments are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_inputDir = i_cmdLineArgs[1]
    i_outputDir = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_gzip = i_cmdLineOptions.gzip

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, exceptions don't do lazy formatting
        raise ValueError(
            "Invalid log level: '%s' must be one of the following:  "
            "DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("inputDir=%s", i_inputDir)
        logging.debug("outputDir=%s", i_outputDir)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("gzip=%s", i_gzip)

    # check for any errors
    i_readFilenameList = None
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]
    else:
        i_writeFilenameList = None
    i_dirList = [i_inputDir, i_outputDir]

    if (not radiaUtil.check_for_argv_errors(i_dirList, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF header and the calls for all of the chroms
    (headerDict, coordinateDict) = get_vcf_data(i_id, i_inputDir, i_debug)

    if (i_gzip):
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf.gz")
    else:
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf")

    outputFileHandler = get_write_fileHandler(i_outputFilename)

    try:
        # if we have header info to output
        if (len(headerDict["metadata"]) > 0):
            # output the header information in this order.  the chrom
            # line is joined without a separator
            headerSections = [("metadata", "\n"), ("filter", "\n"),
                              ("info", "\n"), ("format", "\n"),
                              ("chrom", "")]
            for (section, separator) in headerSections:
                headerText = separator.join(headerDict[section])
                # skip the empty sections, otherwise there
                # would be an empty line in the VCF header
                if (headerText):
                    outputFileHandler.write(headerText + "\n")

        # first output the numerical chroms in order
        numericChroms = coordinateDict["numbers"]
        for chrom in sorted(numericChroms, key=int):
            outputFileHandler.write("\n".join(numericChroms[chrom]) + "\n")

        # then output the alphabetical chroms in order
        letterChroms = coordinateDict["letters"]
        for chrom in sorted(letterChroms, key=str):
            outputFileHandler.write("\n".join(letterChroms[chrom]) + "\n")

        stopTime = time.time()
        logging.info(
            "Total time for Id %s: Total time=%s hrs, %s mins, %s secs",
            i_id, ((stopTime - startTime) / (3600)),
            ((stopTime - startTime) / 60), (stopTime - startTime))
    finally:
        # close the files, even when something went wrong
        outputFileHandler.close()

    return
Esempio n. 12
0
def main():
    """
    Merge the calls of a passing VCF back into the original VCF.

    Command line:
        python %prog passingFile originalFile outputFile [Options]

    Both files are loaded with get_vcf_data().  The output consists of the
    header lines of the original file, the INFO and FILTER header lines of
    the original file followed by the ones that only exist in the passing
    file, and the chrom line of the original file.  After that, every
    coordinate of the original file is output in numerical order:  the line
    of the passing file is used when the passing file has this coordinate,
    otherwise the line of the original file is used.

    Exits with status 1 when the command line is not valid and raises a
    ValueError when the --log level is not a known logging level.
    """

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog passingFile originalFile outputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), "
             "%default by default")
    # logging.basicConfig() logs to STDERR when no filename is given
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length check above also counts the script name and the
    # options, so make sure that all 3 required arguments are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_passingFilename = i_cmdLineArgs[0]
    i_originalFilename = i_cmdLineArgs[1]
    i_outputFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, exceptions don't do lazy formatting
        raise ValueError(
            "Invalid log level: '%s' must be one of the following:  "
            "DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("passingFile=%s", i_passingFilename)
        logging.debug("originalFile=%s", i_originalFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    i_readFilenameList = [i_passingFilename, i_originalFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF data.  the header and chrom line of the
    # passing file are not needed, the original ones are output
    (passHeaderList, passChromLine, passInfoList, passFilterList,
     passCoordinateDict) = get_vcf_data(i_passingFilename, i_debug)
    (orgHeaderList, chromLine, orgInfoList, orgFilterList,
     orgCoordinateDict) = get_vcf_data(i_originalFilename, i_debug)

    outputFileHandler = get_write_fileHandler(i_outputFilename)

    try:
        for headerLine in orgHeaderList:
            outputFileHandler.write(headerLine + "\n")

        # output the original INFO lines and
        # then the ones that are new in the passing file
        knownInfoLines = set(orgInfoList)
        for headerLine in orgInfoList:
            outputFileHandler.write(headerLine + "\n")
        for headerLine in passInfoList:
            if (headerLine not in knownInfoLines):
                outputFileHandler.write(headerLine + "\n")

        # output the original FILTER lines and
        # then the ones that are new in the passing file
        knownFilterLines = set(orgFilterList)
        for headerLine in orgFilterList:
            outputFileHandler.write(headerLine + "\n")
        for headerLine in passFilterList:
            if (headerLine not in knownFilterLines):
                outputFileHandler.write(headerLine + "\n")

        outputFileHandler.write(chromLine + "\n")

        # output all of the original coordinates in order and use
        # the line from the passing file whenever there is one.
        # the lines are written as they are, without adding a newline
        for coordinate in sorted(orgCoordinateDict, key=int):
            if (coordinate in passCoordinateDict):
                line = passCoordinateDict[coordinate]
            else:
                line = orgCoordinateDict[coordinate]
            outputFileHandler.write(line)

        stopTime = time.time()
        logging.info("Total time=%s hrs, %s mins, %s secs",
                     ((stopTime - startTime) / (3600)),
                     ((stopTime - startTime) / 60), (stopTime - startTime))
    finally:
        # close the files, even when something went wrong
        outputFileHandler.close()

    return
Esempio n. 13
0
def main():

    # command for running this on a small test case:
    # python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf
    # ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters", action="store_true", default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat " +
             "filter, include this argument if the previous filters should " +
             "be kept")

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE", default=sys.stdout,
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT", default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "", "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript" +
             "coordinate can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "", "--rnaIncludeSecondaryAlignments",
        action="store_true", default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
             "want to include RNA secondary alignments in the pileup")

    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads", action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads", action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads", action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads", action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
             "should be processed")

    i_cmdLineParser.add_option(
        "-d", "--minReadDepth", type="int", default=int(4),
        dest="minReadDepth", metavar="MIN_READ_DP",
        help="the minimum number of valid reads that are necessary, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-p", "--minReadPercent", type="float", default=float(0.10),
        dest="minReadPercent", metavar="MIN_READ_PCT",
        help="the minimum percentage of valid reads that are necessary, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-m", "--minOrderMagnitude", type="int", default=float(0),
        dest="minOrderMagnitude", metavar="MIN_ORDER_MAGNITUDE",
        help="the minimum order of magnitude difference between the blat " +
             "hit at the query position vs. the next best blat hit in order " +
             "for the read to be valid, %default by default")

    '''
    i_cmdLineParser.add_option(
        "-e", "--minEValue", type="float", default=float(10e-6),
        dest="minEValue", metavar="MIN_EVALUE",
        help="the minimum e-value needed for a blat hit to be significant, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-u", "--maxIdentity", type="float", default=float(0.95),
        dest="maxIdentity", metavar="MAX_IDENTITY",
        help="the maximum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    i_cmdLineParser.add_option(
        "-l", "--minIdentity", type="float", default=float(0.5),
        dest="minIdentity", metavar="MIN_IDENTITY",
        help="the minimum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    '''

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (cmdLineOpts, i_cmdLineArgs) = i_cmdLineParser.parse_args()
    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = cmdLineOpts.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = cmdLineOpts.keepPreviousFilters
    i_blatOutputFormat = cmdLineOpts.blatOutputFormat
    i_logLevel = cmdLineOpts.logLevel
    i_rnaIncludeSecondaryAlignments = cmdLineOpts.rnaIncludeSecondaryAlignments
    i_minReadDepth = cmdLineOpts.minReadDepth
    i_minReadPercent = cmdLineOpts.minReadPercent
    i_minOrderMagnitude = cmdLineOpts.minOrderMagnitude
    # i_minEValue = cmdLineOpts.minEValue
    # i_maxIdentity = cmdLineOpts.maxIdentity
    # i_minIdentity = cmdLineOpts.minIdentity

    i_blatDnaNormalReads = cmdLineOpts.blatDnaNormalReads
    i_blatDnaTumorReads = cmdLineOpts.blatDnaTumorReads
    i_blatRnaNormalReads = cmdLineOpts.blatRnaNormalReads
    i_blatRnaTumorReads = cmdLineOpts.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_transcriptNameTag = None
    i_transcriptCoordinateTag = None
    i_transcriptStrandTag = None
    if (cmdLineOpts.outputFilename is not None):
        i_outputFilename = cmdLineOpts.outputFilename
    if (cmdLineOpts.logFilename is not None):
        i_logFilename = cmdLineOpts.logFilename
    if (cmdLineOpts.transcriptNameTag is not None):
        i_transcriptNameTag = cmdLineOpts.transcriptNameTag
    if (cmdLineOpts.transcriptCoordinateTag is not None):
        i_transcriptCoordinateTag = cmdLineOpts.transcriptCoordinateTag
    if (cmdLineOpts.transcriptStrandTag is not None):
        i_transcriptStrandTag = cmdLineOpts.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError("Invalid log level: '%s' must be one of the " +
                         "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL",
                         i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaInclSecAlign=%s" % i_rnaIncludeSecondaryAlignments)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("minReadDepth=%s", i_minReadDepth)
        logging.debug("minReadPercent=%s", i_minReadPercent)
        logging.debug("minOrderMagnitude=%s", i_minOrderMagnitude)

    # check for any errors
    i_writeFilenameList = []
    if (cmdLineOpts.outputFilename is not sys.stdout):
        i_writeFilenameList = [i_outputFilename]
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    # get the VCF generator
    i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                  i_passedVCFCallsOnlyFlag,
                                  i_debug)

    # get the blat hits generator
    i_blatGenerator = parse_blat_output(i_blatOutputFilename,
                                        i_blatOutputFormat,
                                        i_debug)

    for (vcfLine, blatHitsDict) in izip(i_vcfGenerator, i_blatGenerator):

        if (i_debug):
            logging.debug("VCF Line=%s", vcfLine)
            logging.debug("Len Blat Hits=%s", len(blatHitsDict))

        # parse the VCF line
        splitLine = vcfLine.split("\t")

        # the coordinate is the second element
        vcfChr = splitLine[0]
        vcfStopCoordinate = int(splitLine[1])
        vcfIds = splitLine[2]
        vcfRef = splitLine[3]
        vcfAlts = splitLine[4]
        vcfScore = splitLine[5]
        vcfFilterSet = set(splitLine[6].split(";"))
        vcfInfoList = splitLine[7].split(";")
        vcfInfoDict = collections.defaultdict(list)
        for info in vcfInfoList:
            keyValueList = info.split("=")
            # some keys are just singular without a value (e.g. DB, etc.)
            if (len(keyValueList) == 1):
                vcfInfoDict[keyValueList[0]] = ["True"]
            else:
                # the value can be a comma separated list
                vcfInfoDict[keyValueList[0]] = keyValueList[1].split(",")
        vcfRestOfLine = "\t".join(splitLine[8:])

        modTypes = vcfInfoDict["MT"]
        modTypeFilters = dict()
        atLeastOnePass = False
        for modType in modTypes:

            blatOverallReadDepth = 0
            numValidReads = 0

            prefix = ""
            if (modType == "GERM" and i_blatDnaNormalReads):
                prefix = "dnaNormal"
            elif (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                prefix = "rnaNormal"
            elif (modType == "SOM" and i_blatDnaTumorReads):
                prefix = "dnaTumor"
            elif ((modType == "SOM" or modType == "TUM_EDIT") and
                  i_blatRnaTumorReads):
                prefix = "rnaTumor"

            # get the expected prefix
            vcfKey = "_".join([prefix, vcfChr, str(vcfStopCoordinate)])

            # for each read, investigate the blat
            # hits to see if this read is valid
            for (readId, blatHitList) in blatHitsDict.iteritems():
                if (i_debug):
                    logging.debug("num of blat hits for read %s=%s",
                                  readId, len(blatHitList))

                # if the readId does not start with the vcfKey,
                # then something is wrong. the VCF and blat hits
                # need to be in sync...
                if (not readId.startswith(vcfKey)):
                    logging.error("The blat query seems to be out of sync " +
                                    "with the blat hits.")
                    logging.error("VCF Line=%s", vcfLine)
                    logging.error("readId=%s, blatHitsDict=%s",
                                    readId, blatHitsDict[readId][1])
                    sys.exit(1)

                blatOverallReadDepth += 1

                # find out if the read is valid or if it
                # maps to other places in the genome
                if (i_blatOutputFormat == "PSL"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and
                        (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                                        blatHitList,
                                        vcfInfoDict[i_transcriptNameTag],
                                        vcfInfoDict[i_transcriptCoordinateTag],
                                        i_rnaIncludeSecondaryAlignments,
                                        i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_psl_format(
                                        blatHitList,
                                        [vcfChr],
                                        [vcfStopCoordinate],
                                        i_rnaIncludeSecondaryAlignments,
                                        i_debug)

                elif (i_blatOutputFormat == "BLAST"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and
                        (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                                        blatHitList,
                                        vcfInfoDict[i_transcriptNameTag],
                                        vcfInfoDict[i_transcriptCoordinateTag],
                                        i_rnaIncludeSecondaryAlignments,
                                        i_minOrderMagnitude,
                                        i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_blast_format(
                                        blatHitList,
                                        [vcfChr],
                                        [vcfStopCoordinate],
                                        i_rnaIncludeSecondaryAlignments,
                                        i_minOrderMagnitude,
                                        i_debug)

                # if we have only one valid blat hit, then the read doesn't
                # map to other places in the genome very well, so let's use it
                if (isValidRead):
                    numValidReads += 1

                    if (i_debug):
                        logging.debug("ValidRead: %s", validRead)
                elif (i_debug):
                    logging.debug("not a valid read")

            if (blatOverallReadDepth > 0):
                tmpAltPct = numValidReads/float(blatOverallReadDepth)
                altPercent = round(tmpAltPct, 2)
            else:
                altPercent = 0.0

            if (numValidReads < i_minReadDepth or
                altPercent < i_minReadPercent):
                modTypeFilters[modType] = "blat"
            else:
                modTypeFilters[modType] = "PASS"
                atLeastOnePass = True

            if (i_debug):
                logging.debug("blatOverallReadDepth=%s, numValidReads=%s, " +
                              "altPercent=%s", str(blatOverallReadDepth),
                              str(numValidReads), str(altPercent))
                logging.debug("modType=%s, passed? %s", modType,
                              modTypeFilters[modType])
                logging.debug("blatFilter originalDepth=%s, validBlatDepth=%s",
                              str(blatOverallReadDepth), str(numValidReads))

        # make a copy of the list to manipulate
        modTypesTmpList = list(modTypes)
        modChanges = vcfInfoDict["MC"]
        # if at least one passed, then remove the ones that didn't
        for (modType, modChange) in izip(modTypes, modChanges):
            # if at least one passed, then remove the ones that didn't
            if (atLeastOnePass):
                if (modTypeFilters[modType] == "blat"):
                    modTypesTmpList.remove(modType)
                    modChanges.remove(modChange)

        # set the modTypes and modChanges
        vcfInfoDict["MT"] = modTypesTmpList
        vcfInfoDict["MC"] = modChanges

        # if at least one passed, then set pass
        if (atLeastOnePass):
            vcfFilterSet = ["PASS"]
        else:
            # if the user wants to keep the previous filters
            if (i_keepPreviousFiltersFlag):
                # if the call previous passed, then just set blat
                if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                    vcfFilterSet = ["blat"]
                # otherwise, add it to the previous filters
                else:
                    vcfFilterSet.add("blat")
            # otherwise, just set the blat filter
            else:
                vcfFilterSet = ["blat"]

            # update the mod filters
            modTypes = vcfInfoDict["MT"]
            modChanges = vcfInfoDict["MC"]
            origins = vcfInfoDict["ORIGIN"]
            if vcfInfoDict["MF"] is None:
                modFilters = []
            else:
                modFilters = vcfInfoDict["MF"]
            if vcfInfoDict["MFT"] is None:
                modFilterTypes = []
            else:
                modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in izip(modTypes, modChanges):
                    modFilterTypes.append("_".join([origin,
                                                    modType,
                                                    modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

        output = [vcfChr, str(vcfStopCoordinate), vcfIds, vcfRef,
                  vcfAlts, vcfScore, ";".join(vcfFilterSet)]

        # add the modified info dict
        infoField = ""
        for key in sorted(vcfInfoDict.iterkeys()):
            if (len(vcfInfoDict[key]) == 0):
                continue
            elif ("True" in vcfInfoDict[key]):
                infoField += key + ";"
            else:
                infoField += key + "=" + ",".join(vcfInfoDict[key]) + ";"

        output.append(infoField.rstrip(";"))
        output.append(vcfRestOfLine)

        i_outputFileHandler.write("\t".join(output) + "\n")

    stopTime = time.time()
    logging.info("filterByBlat.py for Id %s: Total time=%s hrs, %s mins, " +
                 "%s secs", i_id, ((stopTime-startTime)/(3600)),
                 ((stopTime-startTime)/60), (stopTime-startTime))

    # close the files
    if (i_outputFilename is not sys.stdout):
        i_outputFileHandler.close()

    return
Esempio n. 14
0
def main():
    """Command-line entry point that writes the BLAT query data for a VCF.

    Expects three positional arguments (id, vcfFile, headerFile) followed by
    the optional flags declared below.  Every call yielded by get_vcf_data()
    is handed to write_to_blat_file() once per selected read set.

    The usage is printed and the process exits with status 1 when a
    positional argument is missing or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a recognised logging level.
    """

    # command for running this on a small test case:
    #python createBlatFile.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf ../data/test/tmp/ --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads
    # NOTE(review): the example above passes four positionals and an option
    # (--dnaNormalFilename) that this parser does not declare -- it looks
    # like it predates the current usage string; confirm and refresh.

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id vcfFile headerFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option("-c", "--allVCFCalls", action="store_false", default=True, dest="passedVCFCallsOnly", help="by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed")
    i_cmdLineParser.add_option("-o", "--outputFilename", dest="outputFilename", metavar="OUTPUT_FILE", help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option("-b", "--allReadBases", action="store_false", default=True, dest="altBasesOnly", help="by default only the reads with the alternate base are processed, include this argument if all of the reads should be processed")
    i_cmdLineParser.add_option("-l", "--log", dest="logLevel", default="WARNING", metavar="LOG", help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    i_cmdLineParser.add_option("-g", "--logFilename", dest="logFilename", metavar="LOG_FILE", help="the name of the log file, STDOUT by default")

    i_cmdLineParser.add_option("-n", "--blatDnaNormalReads", action="store_true", default=False, dest="blatDnaNormalReads", help="include this argument if the normal DNA reads should be processed")
    i_cmdLineParser.add_option("-x", "--blatRnaNormalReads", action="store_true", default=False, dest="blatRnaNormalReads", help="include this argument if the normal RNA reads should be processed")
    i_cmdLineParser.add_option("-t", "--blatDnaTumorReads", action="store_true", default=False, dest="blatDnaTumorReads", help="include this argument if the tumor DNA reads should be processed")
    i_cmdLineParser.add_option("-r", "--blatRnaTumorReads", action="store_true", default=False, dest="blatRnaTumorReads", help="include this argument if the tumor RNA reads should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3,22,1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length test above counts options as well, so it cannot
    # guarantee that all three positionals are present; check explicitly
    # instead of failing with an IndexError below
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_vcfFilename = i_cmdLineArgs[1]
    i_headerFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_altBasesOnlyFlag = i_cmdLineOptions.altBasesOnly
    i_logLevel = i_cmdLineOptions.logLevel

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_readFilenameList = [i_vcfFilename, i_headerFilename]
    i_writeFilenameList = []

    i_logFilename = None
    i_outputFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
        i_writeFilenameList += [i_logFilename]
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
        i_writeFilenameList += [i_outputFilename]

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError("Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel, filename=i_logFilename, filemode='w', format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel, format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel < logging.WARNING)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("headerFilename=%s", i_headerFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("altBasesOnlyFlag? %s", i_altBasesOnlyFlag)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList, i_writeFilenameList)):
        sys.exit(1)

    # open the output stream; the handler stays None when no output file
    # was requested and is passed on to write_to_blat_file() as such
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = get_write_fileHandler(i_outputFilename)

    try:
        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_headerFilename, i_passedVCFCallsOnlyFlag, i_altBasesOnlyFlag, i_debug)

        # for each VCF call that should be investigated
        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore, vcfFilterSet, vcfInfoDict, restOfLine, vcfParamsDict) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s", vcfChr, str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore, str(vcfFilterSet), str(vcfInfoDict), restOfLine)

            modTypes = vcfInfoDict["MT"]
            for modType in modTypes:
                # get the reads contributing to a call and put them in a blat query file
                # NOTE(review): the two DNA branches do not look at modType,
                # so they run once per entry in MT, unlike the RNA
                # branches -- confirm this is intended.
                if (i_blatDnaNormalReads):
                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate, vcfParamsDict, vcfInfoDict, "dnaNormal", i_debug)

                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate, vcfParamsDict, vcfInfoDict, "rnaNormal", i_debug)

                if (i_blatDnaTumorReads):
                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate, vcfParamsDict, vcfInfoDict, "dnaTumor", i_debug)

                if ((modType == "SOM" or modType == "TUM_EDIT") and i_blatRnaTumorReads):
                    write_to_blat_file(i_outputFileHandler, vcfChr, vcfStopCoordinate, vcfParamsDict, vcfInfoDict, "rnaTumor", i_debug)

        # each figure is the whole elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Id %s: Total time=%s hrs, %s mins, %s secs", i_id, ((stopTime-startTime)/(3600)), ((stopTime-startTime)/60), (stopTime-startTime))
    finally:
        # close the files, also when processing a call raised
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
Esempio n. 15
0
def main():
    """Command-line entry point that merges the VCF data found for one id.

    Expects three positional arguments (id, inputDir, outputDir).  The
    header and coordinate data returned by get_vcf_data() are written to
    <outputDir>/<id>.vcf, or <outputDir>/<id>.vcf.gz when --gzip is given:
    first the header sections, then the "numbers" chromosomes in integer
    order, then the "letters" chromosomes in string order.

    The usage is printed and the process exits with status 1 when a
    positional argument is missing or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a recognised logging level.
    """

    # command for running this on a small test case:
    # python mergeChroms.py TCGA-BH-A18P
    # ../data/test/ ../data/test/ --log=DEBUG

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id inputDir outputDir [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--gzip",
        dest="gzip", action="store_true", default=False,
        help="include this argument if the final VCF should be " +
             "compressed with gzip")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length test above counts options as well, so it cannot
    # guarantee that all three positionals are present; check explicitly
    # instead of failing with an IndexError below
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_inputDir = i_cmdLineArgs[1]
    i_outputDir = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_gzip = i_cmdLineOptions.gzip

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError(("Invalid log level: '%s' must be one of the "
                          "following:  DEBUG, INFO, WARNING, ERROR, "
                          "CRITICAL") % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("inputDir=%s", i_inputDir)
        logging.debug("outputDir=%s", i_outputDir)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("gzip=%s", i_gzip)

    # check for any errors
    i_readFilenameList = None
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]
    else:
        i_writeFilenameList = None
    i_dirList = [i_inputDir, i_outputDir]

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF generator
    (headerDict, coordDict) = get_vcf_data(i_id, i_inputDir, i_debug)

    if (i_gzip):
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf.gz")
    else:
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf")

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    try:
        # if we have header info to output
        if (len(headerDict["metadata"]) > 0):
            # output the header information
            outputFileHandler.write("\n".join(headerDict["metadata"]) + "\n")
            outputFileHandler.write("\n".join(headerDict["filter"]) + "\n")
            outputFileHandler.write("\n".join(headerDict["info"]) + "\n")
            outputFileHandler.write("\n".join(headerDict["format"]) + "\n")
            outputFileHandler.write("".join(headerDict["chrom"]) + "\n")

        # first output the numerical chroms in order; sorted() is used
        # because keys() is only a sortable list on Python 2
        for chrom in sorted(coordDict["numbers"].keys(), key=int):
            outputFileHandler.write(
                "\n".join(coordDict["numbers"][chrom]) + "\n")

        # then output the alphabetical chroms in order
        for chrom in sorted(coordDict["letters"].keys(), key=str):
            outputFileHandler.write(
                "\n".join(coordDict["letters"][chrom]) + "\n")

        # each figure is the whole elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Total time for Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs",
                     i_id, ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60), (stopTime-startTime))
    finally:
        # close the files, also when writing raised
        outputFileHandler.close()

    return
Esempio n. 16
0
def main():
    """Command-line entry point that compares a RADIA file with another file.

    Expects four positional arguments (id, chrom, radFile, compareFile).
    The optional --compareList value is split on "," and each "key=value"
    pair is stored in a dict; that dict and the optional stats, overlap and
    non-overlap filenames are then passed to compare_events().

    The usage is printed and the process exits with status 1 when a
    positional argument is missing or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a recognised logging level,
            or if a --compareList entry does not split into exactly one
            key and one value on "=".
    """

    #python radiaCompare.py TCGA-AB-2995 12 ../data/test/TCGA-AB-2995.vcf ../data/test/TCGA-AB-2995.vcf -c "SOM=Somatic" -s ../stats/radia/cmpRadBB.tab --log=DEBUG

    # create the usage statement
    usage = "usage: python %prog id chrom radFile compareFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option("-c", "--compareList", dest="compareList", metavar="COMPARE_LIST", help="a comma separated list of key/values comparisons where the key is in RADIA and the value is in the compare file")
    i_cmdLineParser.add_option("-s", "--statsFilename", dest="statsFilename", metavar="STATS_FILE", help="the name of the stats file, sys.stdout by default")
    i_cmdLineParser.add_option("-l", "--log", dest="logLevel", default="WARNING", metavar="LOG", help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default")
    i_cmdLineParser.add_option("-g", "--logFilename", dest="logFilename", metavar="LOG_FILE", help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option("-o", "--overlapFilename", dest="overlapFilename", metavar="OVERLAP_FILE", help="the name of the overlap file")
    i_cmdLineParser.add_option("-n", "--nonOverlapFilename", dest="nonOverlapFilename", metavar="NON_OVERLAP_FILE", help="the name of the non-overlap file")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(4,17,1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the argv length test above counts options as well, so it cannot
    # guarantee that all four positionals are present; check explicitly
    # instead of failing with an IndexError below
    if (len(i_cmdLineArgs) < 4):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_chr = str(i_cmdLineArgs[1])
    i_radiaFilename = str(i_cmdLineArgs[2])
    i_compareFilename = str(i_cmdLineArgs[3])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    # try to get any optional parameters with no defaults
    # check for any errors
    writeFilenameList = []
    readFilenameList = [i_radiaFilename, i_compareFilename]

    i_statsFilename = None
    i_logFilename = None
    i_compareString = None
    i_overlapFilename = None
    i_nonOverlapFilename = None
    i_compareDict = collections.defaultdict(list)
    if (i_cmdLineOptions.overlapFilename is not None):
        i_overlapFilename = str(i_cmdLineOptions.overlapFilename)
        writeFilenameList += [i_overlapFilename]
    if (i_cmdLineOptions.nonOverlapFilename is not None):
        i_nonOverlapFilename = str(i_cmdLineOptions.nonOverlapFilename)
        writeFilenameList += [i_nonOverlapFilename]
    if (i_cmdLineOptions.statsFilename is not None):
        i_statsFilename = str(i_cmdLineOptions.statsFilename)
        writeFilenameList += [i_statsFilename]
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
        writeFilenameList += [i_logFilename]
    if (i_cmdLineOptions.compareList is not None):
        i_compareString = str(i_cmdLineOptions.compareList)
        i_compareList = i_compareString.split(",")

        # each entry is "key=value"; a repeated key keeps the last value
        for keyValue in i_compareList:
            (key, value) = keyValue.split("=")
            i_compareDict[key] = value

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError("Invalid log level: '%s' must be one of the following:  DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel, filename=i_logFilename, filemode='w', format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel, format='%(asctime)s\t%(levelname)s\t%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel < logging.WARNING)

    # do some debugging
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chr=%s", i_chr)
        logging.debug("radiaFile=%s", i_radiaFilename)
        logging.debug("overlapFilename=%s", i_overlapFilename)
        logging.debug("nonOverlapFilename=%s", i_nonOverlapFilename)
        logging.debug("compareFile=%s", i_compareFilename)
        logging.debug("statsFile=%s", i_statsFilename)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("compareDict=%s", i_compareDict)

    if (not radiaUtil.check_for_argv_errors(None, readFilenameList, writeFilenameList)):
        sys.exit(1)

    compare_events(i_id, i_chr, i_radiaFilename, i_compareFilename, i_statsFilename, i_overlapFilename, i_nonOverlapFilename, i_compareDict, i_debug)

    return
Esempio n. 17
0
def main():
    """Merge the DNA and RNA VCF data for one id/chromosome into one file.

    Command line:
        python mergeRnaAndDnaFiles.py id chrom dnaFile rnaFile
            rnaOverlapsFile rnaNonOverlapsFile outputFile [Options]

    Example:
        python mergeRnaAndDnaFiles.py TCGA-AB-2995 5
            ../data/test/TCGA-AB-2995_dnaFile.vcf
            ../data/test/TCGA-AB-2995_rnaFile.vcf
            ../data/test/TCGA-AB-2995_rnaFile.vcf
            ../data/test/

    The header lines and the per-coordinate lines returned by
    merge_vcf_data() are written to outputFile.  The coordinate lines are
    written in ascending numeric order of their keys and the INFO column
    (the 8th tab-separated field) is passed through set_sst_field() first.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a valid logging level name.
    """

    startTime = time.time()

    # create the usage statement
    usage = ("usage: python %prog id chrom dnaFile rnaFile rnaOverlapsFile " +
             "rnaNonOverlapsFile outputFile [Options]")
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log", default="WARNING",
        dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(6, 15, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options and positionals together, so
    # make sure that all of the positional arguments are really there
    # instead of failing with an IndexError below
    i_numRequiredArgs = 7
    if (len(i_cmdLineArgs) < i_numRequiredArgs):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_chrom = i_cmdLineArgs[1]
    i_dnaFilename = i_cmdLineArgs[2]
    i_rnaFilename = i_cmdLineArgs[3]
    i_overlapsFilename = i_cmdLineArgs[4]
    i_nonOverlapsFilename = i_cmdLineArgs[5]
    i_outputFilename = i_cmdLineArgs[6]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message; passing it as a second
        # exception argument would leave the '%s' placeholder unformatted
        logLevelMessage = ("Invalid log level: '%s' must be one of the "
                           "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
        raise ValueError(logLevelMessage % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chrom=%s", i_chrom)
        logging.debug("dnaFilename=%s", i_dnaFilename)
        logging.debug("rnaFilename=%s", i_rnaFilename)
        logging.debug("overlapsFilename=%s", i_overlapsFilename)
        logging.debug("nonOverlapsFilename=%s", i_nonOverlapsFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    # NOTE(review): i_nonOverlapsFilename is handed to merge_vcf_data() but
    # is not part of the read list -- confirm whether that is intentional
    i_readFilenameList = [i_dnaFilename, i_rnaFilename, i_overlapsFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the header lines and the lines keyed by coordinate
    (headerList,
     coordinateDict) = merge_vcf_data(i_dnaFilename,
                                      i_rnaFilename,
                                      i_overlapsFilename,
                                      i_nonOverlapsFilename,
                                      i_debug)

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure that the output file is closed even if writing fails
    try:
        for headerLine in headerList:
            outputFileHandler.write(headerLine)

        # the keys are compared as integers; sorted() works on any mapping,
        # whereas keys() only returns a sortable list on Python 2
        for coordinate in sorted(coordinateDict, key=int):
            line = coordinateDict[coordinate]
            line = line.rstrip("\r\n")
            # split the line on the tab
            splitLine = line.split("\t")

            # set the SST field in the INFO
            splitLine[7] = set_sst_field(splitLine[7])
            outputFileHandler.write("\t".join(splitLine) + "\n")
    finally:
        # close the files
        outputFileHandler.close()

    # each of the three values is the whole elapsed time expressed in that
    # unit, not an hours/minutes/seconds breakdown
    elapsedSecs = time.time() - startTime
    logging.info("Total time for Id %s: Total time=%s hrs, %s mins, %s secs",
                 i_id, (elapsedSecs/(3600)), (elapsedSecs/60), elapsedSecs)

    return
Esempio n. 18
0
def main():
    """Apply the positional bias (pbias) filter to the calls of a VCF file.

    Command line:
        python %prog id vcfFile blatInputFile [Options]

    For every VCF line yielded by get_vcf_data(), each modification type in
    the MT INFO field is checked against the reads that parse_blat_input()
    recorded for that coordinate.  The reads are counted by the position tag
    ("start", "middle" or "end") found in the 8th "_"-separated field of the
    read id.  A modification type is flagged as "pbias" when more than
    readDepthCutoff reads are available and the rounded fraction of "start"
    reads, or of "end" reads, is at least readPercentCutoff; otherwise it
    passes.

    If at least one modification type passes, the flagged types and their MC
    entries are dropped from the INFO field.  If none passes, "pbias" is
    added to the FILTER column.  The lines are written to the output file,
    or to STDOUT when no output file was given.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a valid logging level name.
    """

    # create the usage statement; only the three positional arguments that
    # are read below are listed
    usage = "usage: python %prog id vcfFile blatInputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename", dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log", dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    # logging.basicConfig() logs to STDERR when no filename is given
    i_cmdLineParser.add_option(
        "-g", "--logFilename", dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "-d", "--readDepthCutoff", type="int", default=4,
        dest="readDepthCutoff", metavar="READ_DP_CUTOFF",
        help="the minimum number of reads that are necessary before " +
             "applying this filter, %default by default")
    i_cmdLineParser.add_option(
        "-p", "--readPercentCutoff", type="float", default=0.95,
        dest="readPercentCutoff", metavar="READ_PERCENT_CUTOFF",
        help="the maximum percentage of reads with the alternative allele " +
             "at the beginning or end of the reads, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(4, 17, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options and positionals together, so
    # make sure that all of the positional arguments are really there
    # instead of failing with an IndexError below
    i_numRequiredArgs = 3
    if (len(i_cmdLineArgs) < i_numRequiredArgs):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatInputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_readDepthCutoff = i_cmdLineOptions.readDepthCutoff
    i_readPercentCutoff = i_cmdLineOptions.readPercentCutoff

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message; passing it as a second
        # exception argument would leave the '%s' placeholder unformatted
        logLevelMessage = ("Invalid log level: '%s' must be one of the "
                           "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
        raise ValueError(logLevelMessage % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel < logging.WARNING)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatInputFilename=%s", i_blatInputFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("readDepthCutoff=%s", i_readDepthCutoff)
        logging.debug("readPerentCutoff=%s", i_readPercentCutoff)

    # check for any errors; both files are appended so that the output file
    # is still checked when a log file is given as well
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatInputFilename]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = get_write_fileHandler(i_outputFilename)

    # make sure that the output file is closed even if the filtering fails
    try:
        # get the BLAT input
        i_blatCoordinateDict = parse_blat_input(i_blatInputFilename, i_debug)

        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_debug)

        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore,
             vcfFilterSet, vcfInfoDict, restOfLine) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s",
                              vcfChr, str(vcfStopCoordinate), vcfId, vcfRef,
                              vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            # NOTE(review): the hits list is only reset once per VCF line,
            # so a modification type without hits of its own is judged on
            # the hits of the preceding type -- confirm this is intended
            blatHitsList = list()

            # get the coordinate for this position
            coordinate = vcfChr + "_" + str(vcfStopCoordinate)
            for modType in modTypes:
                if (modType == "NOR_EDIT"):
                    if (coordinate in i_blatCoordinateDict and
                            "rnaNormal" in i_blatCoordinateDict[coordinate]):
                        # get the blat hits of the normal RNA
                        blatHitsList = (
                            i_blatCoordinateDict[coordinate]["rnaNormal"])
                elif (modType == "SOM" or modType == "TUM_EDIT"):
                    if (coordinate in i_blatCoordinateDict and
                            "rnaTumor" in i_blatCoordinateDict[coordinate]):
                        # get the blat hits of the tumor RNA
                        blatHitsList = (
                            i_blatCoordinateDict[coordinate]["rnaTumor"])

                if (i_debug):
                    logging.debug("coordinate=%s", coordinate)

                starts = 0
                ends = 0
                middles = 0
                total = 0

                # for each read, investigate the blat input
                for readId in blatHitsList:

                    # the position tag is the 8th "_"-separated field
                    readIdList = readId.split("_")
                    total += 1
                    position = readIdList[7]

                    if (i_debug):
                        logging.debug("readId=%s, position=%s",
                                      readId, position)

                    if (position == "start"):
                        starts += 1
                    elif (position == "end"):
                        ends += 1
                    elif (position == "middle"):
                        middles += 1

                # if we have enough reads
                if (total > i_readDepthCutoff):
                    floatTotal = float(total)
                    if (round(starts/floatTotal, 2) >= i_readPercentCutoff):
                        modTypeFilters[modType] = "pbias"
                    elif (round(ends/floatTotal, 2) >= i_readPercentCutoff):
                        modTypeFilters[modType] = "pbias"
                    else:
                        modTypeFilters[modType] = "PASS"
                        atLeastOnePass = True
                # if we don't have the minimum number of reads, then pass
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.info("coordinate=%s, starts=%s, middles=%s, " +
                                 "ends=%s, total=%s, positionalBias=%s",
                                 coordinate, starts, middles, ends, total,
                                 str(modTypeFilters))

            modChanges = vcfInfoDict["MC"]

            # if at least one passed, then remove the ones that didn't.  The
            # entries are dropped by position from fresh lists: removing
            # from modChanges while iterating over it skips entries, and
            # remove() deletes the first equal value rather than the entry
            # that is paired with the biased modification type
            biasedIndexes = set()
            if (atLeastOnePass):
                for (index, modType) in enumerate(modTypes):
                    if (modTypeFilters[modType] == "pbias"):
                        biasedIndexes.add(index)

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = [
                modType for (index, modType) in enumerate(modTypes)
                if index not in biasedIndexes]
            vcfInfoDict["MC"] = [
                modChange for (index, modChange) in enumerate(modChanges)
                if index not in biasedIndexes]

            # if none of them passed, then set the pbias filter
            if (not atLeastOnePass):
                vcfFilterSet.add("pbias")

            output = [vcfChr, str(vcfStopCoordinate), vcfId,
                      vcfRef, vcfAlt, vcfScore]

            # if there are no filters so far, then this call passes
            if (len(vcfFilterSet) == 0):
                vcfFilterSet.add("PASS")

            output.append(";".join(vcfFilterSet))

            # add the modified info dict: empty entries are skipped, flags
            # are written as the bare key, the rest as key=value,value
            infoList = []
            for key in sorted(vcfInfoDict):
                infoValues = vcfInfoDict[key]
                if (len(infoValues) == 0):
                    continue
                elif ("True" in infoValues):
                    infoList.append(key)
                else:
                    infoList.append(key + "=" + ",".join(infoValues))

            output.append(";".join(infoList).rstrip(";"))

            output.append(restOfLine)

            outputLine = "\t".join(output) + "\n"
            if (i_outputFileHandler is not None):
                i_outputFileHandler.write(outputLine)
            else:
                sys.stdout.write(outputLine)
    finally:
        # close the files
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
Esempio n. 19
0
def main():
    """Apply the positional bias (pbias) filter to the calls of a VCF file.

    Command line:
        python %prog id vcfFile blatInputFile [Options]

    For every VCF line yielded by get_vcf_data(), each modification type in
    the MT INFO field is checked against the reads that parse_blat_input()
    recorded for that coordinate.  The reads are counted by the position tag
    ("start", "middle" or "end") found in the 8th "_"-separated field of the
    read id.  A modification type is flagged as "pbias" when more than
    readDepthCutoff reads are available and the rounded fraction of "start"
    reads, or of "end" reads, is at least readPercentCutoff; otherwise it
    passes.

    If at least one modification type passes, the flagged types and their MC
    entries are dropped from the INFO field and the FILTER column is set to
    PASS.  If none passes, the FILTER column is set to pbias (or pbias is
    added to the previous filters when --keepPreviousFilters is given) and
    one MF/MFT entry is appended per origin and modification type.  The
    lines are written to the output file, or to STDOUT when no output file
    was given.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a valid logging level name.
    """

    # create the usage statement; only the three positional arguments that
    # are read below are listed
    usage = "usage: python %prog id vcfFile blatInputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    # logging.basicConfig() logs to STDERR when no filename is given
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "-d",
        "--readDepthCutoff",
        type="int",
        default=4,
        dest="readDepthCutoff",
        metavar="READ_DP_CUTOFF",
        help="the minimum number of reads that are necessary before " +
             "applying this filter, %default by default")
    i_cmdLineParser.add_option(
        "-p",
        "--readPercentCutoff",
        type="float",
        default=0.95,
        dest="readPercentCutoff",
        metavar="READ_PERCENT_CUTOFF",
        help="the maximum percentage of reads with the alternative allele " +
             "at the beginning or end of the reads, %default by default")

    i_cmdLineParser.add_option(
        "-k",
        "--keepPreviousFilters",
        action="store_true",
        default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the " +
             "pbias filter, include this argument if the previous filters " +
             "should be kept")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(4, 17, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options and positionals together, so
    # make sure that all of the positional arguments are really there
    # instead of failing with an IndexError below
    i_numRequiredArgs = 3
    if (len(i_cmdLineArgs) < i_numRequiredArgs):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatInputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_readDepthCutoff = i_cmdLineOptions.readDepthCutoff
    i_readPercentCutoff = i_cmdLineOptions.readPercentCutoff
    i_keepPreviousFiltersFlag = i_cmdLineOptions.keepPreviousFilters

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message; passing it as a second
        # exception argument would leave the '%s' placeholder unformatted
        logLevelMessage = ("Invalid log level: '%s' must be one of the "
                           "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
        raise ValueError(logLevelMessage % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatInputFilename=%s", i_blatInputFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("readDepthCutoff=%s", i_readDepthCutoff)
        logging.debug("readPerentCutoff=%s", i_readPercentCutoff)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)

    # check for any errors; both files are appended so that the output file
    # is still checked when a log file is given as well
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatInputFilename]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = get_write_fileHandler(i_outputFilename)

    # make sure that the output file is closed even if the filtering fails
    try:
        # get the BLAT input
        i_blatCoordinateDict = parse_blat_input(i_blatInputFilename, i_debug)

        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_debug)

        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore,
             vcfFilterSet, vcfInfoDict, restOfLine) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s", vcfChr,
                              str(vcfStopCoordinate), vcfId,
                              vcfRef, vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            # NOTE(review): the hits list is only reset once per VCF line,
            # so a modification type without hits of its own is judged on
            # the hits of the preceding type -- confirm this is intended
            blatHitsList = list()

            # get the coordinate for this position
            coordinate = vcfChr + "_" + str(vcfStopCoordinate)
            for modType in modTypes:
                if (modType == "NOR_EDIT"):
                    if (coordinate in i_blatCoordinateDict and
                            "rnaNormal" in i_blatCoordinateDict[coordinate]):
                        # get the blat hits of the normal RNA
                        blatHitsList = (
                            i_blatCoordinateDict[coordinate]["rnaNormal"])
                elif (modType == "SOM" or modType == "TUM_EDIT"):
                    if (coordinate in i_blatCoordinateDict and
                            "rnaTumor" in i_blatCoordinateDict[coordinate]):
                        # get the blat hits of the tumor RNA
                        blatHitsList = (
                            i_blatCoordinateDict[coordinate]["rnaTumor"])

                if (i_debug):
                    logging.debug("coordinate=%s", coordinate)

                starts = 0
                ends = 0
                middles = 0
                total = 0

                # for each read, investigate the blat input
                for readId in blatHitsList:

                    # the position tag is the 8th "_"-separated field
                    readIdList = readId.split("_")
                    total += 1
                    position = readIdList[7]

                    if (i_debug):
                        logging.debug("readId=%s, position=%s",
                                      readId, position)

                    if (position == "start"):
                        starts += 1
                    elif (position == "end"):
                        ends += 1
                    elif (position == "middle"):
                        middles += 1

                # if we have enough reads
                if (total > i_readDepthCutoff):
                    floatTotal = float(total)
                    if (round(starts / floatTotal, 2) >= i_readPercentCutoff):
                        modTypeFilters[modType] = "pbias"
                    elif (round(ends / floatTotal, 2) >= i_readPercentCutoff):
                        modTypeFilters[modType] = "pbias"
                    else:
                        modTypeFilters[modType] = "PASS"
                        atLeastOnePass = True
                # if we don't have the minimum number of reads, then pass
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.debug(
                        "coordinate=%s, starts=%s, middles=%s, ends=%s, " +
                        "total=%s, positionalBias=%s",
                        coordinate, starts, middles, ends, total,
                        str(modTypeFilters))

            modChanges = vcfInfoDict["MC"]

            # if at least one passed, then remove the ones that didn't.  The
            # entries are dropped by position from fresh lists: removing
            # from modChanges while iterating over it skips entries, and
            # remove() deletes the first equal value rather than the entry
            # that is paired with the biased modification type
            biasedIndexes = set()
            if (atLeastOnePass):
                for (index, modType) in enumerate(modTypes):
                    if (modTypeFilters[modType] == "pbias"):
                        biasedIndexes.add(index)

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = [
                modType for (index, modType) in enumerate(modTypes)
                if index not in biasedIndexes]
            vcfInfoDict["MC"] = [
                modChange for (index, modChange) in enumerate(modChanges)
                if index not in biasedIndexes]

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            else:
                # if the user wants to keep the previous filters
                if (i_keepPreviousFiltersFlag):
                    # if the call previously passed, then just set pbias
                    if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                        vcfFilterSet = ["pbias"]
                    # otherwise, add it to the previous filters
                    else:
                        vcfFilterSet.add("pbias")
                # otherwise, just set the pbias filter
                else:
                    vcfFilterSet = ["pbias"]

                # update the mod filters
                modTypes = vcfInfoDict["MT"]
                modChanges = vcfInfoDict["MC"]
                origins = vcfInfoDict["ORIGIN"]
                # get() covers both a missing key and a stored None
                modFilters = vcfInfoDict.get("MF")
                if (modFilters is None):
                    modFilters = []
                modFilterTypes = vcfInfoDict.get("MFT")
                if (modFilterTypes is None):
                    modFilterTypes = []

                # the joined filters are the same for every entry
                joinedFilters = "_".join(vcfFilterSet)
                for origin in origins:
                    for (modType, modChange) in zip(modTypes, modChanges):
                        modFilterTypes.append("_".join(
                            [origin, modType, modChange]))
                        modFilters.append(joinedFilters)

                vcfInfoDict["MF"] = modFilters
                vcfInfoDict["MFT"] = modFilterTypes

            output = [
                vcfChr,
                str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore
            ]

            # every branch above leaves at least one filter in
            # vcfFilterSet, so no default PASS has to be added here
            output.append(";".join(vcfFilterSet))

            # add the modified info dict: empty entries are skipped, flags
            # are written as the bare key, the rest as key=value,value
            infoList = []
            for key in sorted(vcfInfoDict):
                infoValues = vcfInfoDict[key]
                if (len(infoValues) == 0):
                    continue
                elif ("True" in infoValues):
                    infoList.append(key)
                else:
                    infoList.append(key + "=" + ",".join(infoValues))

            output.append(";".join(infoList).rstrip(";"))

            output.append(restOfLine)

            outputLine = "\t".join(output) + "\n"
            if (i_outputFileHandler is not None):
                i_outputFileHandler.write(outputLine)
            else:
                sys.stdout.write(outputLine)
    finally:
        # close the files
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return