Esempio n. 1
0
def _parse_info_column(anInfoColumn):
    '''
    ' Parse the INFO column of a VCF line into a dictionary of lists.
    '
    ' Each ";"-separated entry is split on "=".  An entry without a value
    ' (e.g. DB) is stored as ["True"]; otherwise the value is split on ","
    ' and stored as a list.  Keys that are not present yield an empty list
    ' because the result is a defaultdict(list).
    '
    ' anInfoColumn: The raw text of the INFO column
    '''
    infoDict = collections.defaultdict(list)
    for info in anInfoColumn.split(";"):
        keyValueList = info.split("=")
        # some keys are just singular without a value (e.g. DB, etc.)
        if (len(keyValueList) == 1):
            infoDict[keyValueList[0]] = ["True"]
        else:
            # the value can be a comma separated list; only the text
            # between the first and second "=" is used as the value
            infoDict[keyValueList[0]] = keyValueList[1].split(",")
    return infoDict


def _check_rna_blacklists(anEffectList, anEffectRegEx,
                          anRnaGeneList, anRnaGeneFamilyList):
    '''
    ' Decide whether any effect in the EFF annotation hits a blacklist.
    '
    ' Each raw effect has the form "EFFECT_NAME(part0|part1|...)".  The effect
    ' name is stripped off, UPSTREAM and DOWNSTREAM effects are ignored, and
    ' the gene name (part 5) and transcript biotype (part 6) are compared
    ' with the blacklists.  An effect with fewer than 7 parts raises an
    ' IndexError.
    '
    ' anEffectList: The list of raw effect strings from the EFF INFO key
    ' anEffectRegEx: The compiled regular expression matching the effect
    '    name up to and including the opening parenthesis
    ' anRnaGeneList: The blacklisted gene names; an entry matches when it
    '    is a substring of the gene name (e.g. "RP11")
    ' anRnaGeneFamilyList: The blacklisted transcript biotypes
    '
    ' Returns the tuple (isRnaBlacklistGene, isRnaBlacklistGeneFamily).
    '''
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    isRnaBlacklistGene = False
    isRnaBlacklistGeneFamily = False

    for rawEffect in anEffectList:
        rawEffect = rawEffect.rstrip(")")

        # reset the effect name for every entry so that an entry without
        # a match never reuses the name from the previous entry
        effect = None
        match = anEffectRegEx.search(rawEffect)
        if (match is not None):
            effect = match.group()
            rawEffect = rawEffect.replace(effect, "")
            effect = effect.rstrip("(")

        if (effect in ignoreEffectsList):
            continue

        effectParts = rawEffect.split("|")
        # effectImpact = effectParts[0]
        # functionalClass = effectParts[1]
        # codonChange = effectParts[2]
        # aaChange = effectParts[3]
        # aaLength = effectParts[4]
        geneName = effectParts[5]
        transcriptBiotype = effectParts[6]
        # geneCoding = effectParts[7]
        # ensembleId = effectParts[8]
        # exonNumber = effectParts[9]
        # genotypeNumber = effectParts[10]

        # the RNA gene list can have "RP11" and that
        # should filter out any gene with RP11 in it
        if (not isRnaBlacklistGene):
            isRnaBlacklistGene = any(rnaGene in geneName
                                     for rnaGene in anRnaGeneList)

        if (transcriptBiotype in anRnaGeneFamilyList):
            isRnaBlacklistGeneFamily = True

    return (isRnaBlacklistGene, isRnaBlacklistGeneFamily)


def main():
    '''
    ' Filter the calls of a VCF file by the RNA gene and gene family
    ' blacklists.
    '
    ' The command line provides the VCF file, the RNA gene blacklist file and
    ' the RNA gene family blacklist file, plus optional output and logging
    ' settings.  Header lines are copied to the output and two ##FILTER
    ' header lines (rgene, rgfam) are inserted before the first existing
    ' ##FILTER line.  Data lines get "rgene" and/or "rgfam" added to the
    ' FILTER column when one of their EFF annotations hits a blacklist.  When
    ' only passed calls are processed, lines that don't contain "PASS" are
    ' copied unchanged.  The output goes to STDOUT unless an output filename
    ' is given.
    '
    ' Exits with status 1 when the arguments are invalid and raises a
    ' ValueError for an unknown logging level.
    '''

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Opts]"
    i_cmdLineParser = OptionParser(usage=usage)

    # no default here: a missing output filename means STDOUT
    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options as well, so make sure
    # that all three positional arguments are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults,
    # None means STDOUT for the output and STDERR for the log
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors, every file that will be written gets validated
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename,
                          i_rnaGeneFilename,
                          i_rnaGeneFamilyFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the input stream
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

    # the output goes to STDOUT unless an output file was specified
    i_outputFileHandler = sys.stdout

    try:
        # open the output stream
        if (i_outputFilename is not None):
            i_outputFileHandler = radiaUtil.get_write_fileHandler(
                i_outputFilename)

        # get the RNA gene blacklists
        (i_rnaGeneList,
         i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                              i_rnaGeneFamilyFilename,
                                              i_debug)

        # compile the effect name pattern once instead of once per line
        effectRegEx = re.compile("(\\w).*\\({1}")

        hasAddedFilterHeader = False

        for line in i_vcfFileHandler:

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # if we find the FILTER section, then add the filters from here
            elif ((not hasAddedFilterHeader) and
                  (line.startswith("##FILTER"))):
                hasAddedFilterHeader = True
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgene,Description=\"This gene is on the " +
                    "RNA gene blacklist\">\n")
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgfam,Description=\"This gene family is " +
                    "on the RNA gene family blacklist\">\n")
                i_outputFileHandler.write(line)

            # these lines are from previous scripts
            # in the pipeline, so output them
            elif (line.startswith("#")):
                i_outputFileHandler.write(line)

            # if we are only supposed to process the passed calls
            # and this call has not passed, then just output it
            elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputFileHandler.write(line)

            # now we are to the data
            else:

                # strip the carriage return and newline characters
                line = line.rstrip("\r\n")

                # split the line on the tab
                splitLine = line.split("\t")

                filterSet = set(splitLine[6].split(";"))

                # if there are no filters so far, then clear the list
                if (filterSet == set(["PASS"])):
                    filterSet = set()

                # parse the info column and check the annotated effects
                infoDict = _parse_info_column(splitLine[7])
                (isRnaBlacklistGene,
                 isRnaBlacklistGeneFamily) = _check_rna_blacklists(
                     infoDict["EFF"],
                     effectRegEx,
                     i_rnaGeneList,
                     i_rnaGeneFamilyList)

                # if the filter should be applied
                if (isRnaBlacklistGene):
                    filterSet.add("rgene")
                # if the filter should be applied
                if (isRnaBlacklistGeneFamily):
                    filterSet.add("rgfam")

                # if there are no filters so far, then this call passes
                if (len(filterSet) == 0):
                    filterSet.add("PASS")

                # sort the filters so that the output is deterministic
                output = ["\t".join(splitLine[0:6]),
                          ";".join(sorted(filterSet)),
                          "\t".join(splitLine[7:])]

                i_outputFileHandler.write("\t".join(output) + "\n")
    finally:
        # close the files, even when an error occurred
        i_vcfFileHandler.close()
        if (i_outputFileHandler is not sys.stdout):
            i_outputFileHandler.close()

    return
Esempio n. 2
0
def main():
    '''
    ' Parse the command line and write the reads that support the VCF calls
    ' to a BLAT query file.
    '
    ' The command line provides an id, a VCF file and a header file, plus
    ' optional flags selecting which read sets (DNA/RNA, normal/tumor) should
    ' be processed.  For every call returned by get_vcf_data() and for every
    ' modification type in its "MT" INFO key, write_to_blat_file() is called
    ' for each selected read set.  RNA normal reads are only processed for
    ' NOR_EDIT calls and RNA tumor reads only for SOM and TUM_EDIT calls.
    ' When a transcript name tag is given and present in the INFO column, the
    ' transcript names, coordinates and strands are used for the RNA reads
    ' instead of the genomic coordinate.
    '
    ' Exits with status 1 when the arguments are invalid and raises a
    ' ValueError for an unknown logging level.
    '''

    # command for running this on a small test case:
    # python createBlatFile.py TCGA-00-4454 7
    # ../data/test/TCGA-00-4454_EGFR.vcf ../data/test/tmp/
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id vcfFile headerFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-b", "--allReadBases", action="store_false", default=True,
        dest="altBasesOnly",
        help="by default only the reads with the alternate base are " +
             "processed, include this argument if all of the reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-d", "--maxReadDepth", type="int", default=8000,
        dest="maxReadDepth", metavar="MAX_READ_DEPTH",
        help="the maximum read depth to process from the samtools view " +
             "command, %default by default")

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "", "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript coordinate " +
             "can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand " +
             "can be found")
    i_cmdLineParser.add_option(
        "", "--rnaIncludeSecondaryAlignments",
        action="store_true", default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
             "want to include RNA secondary alignments in the " +
             "samtools mpileups")

    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads", action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads", action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads", action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads", action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
             "should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 22, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options as well, so make sure
    # that all three positional arguments are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_vcfFilename = i_cmdLineArgs[1]
    i_headerFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_altBasesOnlyFlag = i_cmdLineOptions.altBasesOnly
    i_maxReadDepth = i_cmdLineOptions.maxReadDepth
    i_logLevel = i_cmdLineOptions.logLevel
    i_rnaIncludeSecondaryAlignments = \
        i_cmdLineOptions.rnaIncludeSecondaryAlignments

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_readFilenameList = [i_vcfFilename, i_headerFilename]
    i_writeFilenameList = []

    i_logFilename = None
    i_outputFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
        i_writeFilenameList.append(i_logFilename)
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
        i_writeFilenameList.append(i_outputFilename)

    # the tags are None when they were not specified
    i_transcriptNameTag = i_cmdLineOptions.transcriptNameTag
    i_transcriptCoordinateTag = i_cmdLineOptions.transcriptCoordinateTag
    i_transcriptStrandTag = i_cmdLineOptions.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug flag
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("headerFilename=%s", i_headerFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("altBasesOnlyFlag? %s", i_altBasesOnlyFlag)
        logging.debug("maxReadDepth %s", i_maxReadDepth)

        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaIncludeSecondaryAlignments=%s",
                      i_rnaIncludeSecondaryAlignments)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, it stays None
    # when no output filename was specified
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    try:
        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                      i_headerFilename,
                                      i_passedVCFCallsOnlyFlag,
                                      i_debug)

        # for each VCF call that should be investigated
        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt,
             vcfScore, vcfFilterSet, vcfInfoDict, restOfLine,
             vcfParamsDict) in i_vcfGenerator:

            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s",
                              vcfChr, str(vcfStopCoordinate), vcfId,
                              vcfRef, vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            modTypes = vcfInfoDict["MT"]
            for modType in modTypes:

                # get the reads contributing to a call
                # and put them in a blat query file
                if (i_blatDnaNormalReads):
                    write_to_blat_file(i_outputFileHandler,
                                       vcfChr,
                                       vcfStopCoordinate,
                                       [vcfChr],
                                       [vcfStopCoordinate],
                                       [None],
                                       vcfParamsDict,
                                       vcfInfoDict,
                                       "dnaNormal",
                                       i_altBasesOnlyFlag,
                                       False,
                                       i_maxReadDepth,
                                       i_debug)

                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and
                            (i_transcriptNameTag in vcfInfoDict)):
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr,
                            vcfStopCoordinate,
                            vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            vcfInfoDict[i_transcriptStrandTag],
                            vcfParamsDict,
                            vcfInfoDict,
                            "rnaNormal",
                            i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth,
                            i_debug)
                    else:
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr,
                            vcfStopCoordinate,
                            [vcfChr],
                            [vcfStopCoordinate],
                            [None],
                            vcfParamsDict,
                            vcfInfoDict,
                            "rnaNormal",
                            i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth,
                            i_debug)

                if (i_blatDnaTumorReads):
                    write_to_blat_file(i_outputFileHandler,
                                       vcfChr,
                                       vcfStopCoordinate,
                                       [vcfChr],
                                       [vcfStopCoordinate],
                                       [None],
                                       vcfParamsDict,
                                       vcfInfoDict,
                                       "dnaTumor",
                                       i_altBasesOnlyFlag,
                                       False,
                                       i_maxReadDepth,
                                       i_debug)

                if ((modType == "SOM" or modType == "TUM_EDIT") and
                        i_blatRnaTumorReads):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and
                            (i_transcriptNameTag in vcfInfoDict)):
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr,
                            vcfStopCoordinate,
                            list(vcfInfoDict[i_transcriptNameTag]),
                            vcfInfoDict[i_transcriptCoordinateTag],
                            vcfInfoDict[i_transcriptStrandTag],
                            vcfParamsDict,
                            vcfInfoDict,
                            "rnaTumor",
                            i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth,
                            i_debug)
                    else:
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr,
                            vcfStopCoordinate,
                            [vcfChr],
                            [vcfStopCoordinate],
                            [None],
                            vcfParamsDict,
                            vcfInfoDict,
                            "rnaTumor",
                            i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth,
                            i_debug)

        # each value is the complete elapsed time in a different unit
        stopTime = time.time()
        logging.info("createBlatFile.py Id %s: Total time=%s hrs, " +
                     "%s mins, %s secs", i_id,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))
    finally:
        # close the files, even when an error occurred
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
Esempio n. 3
0
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName, anIdField,
                  anIncludeId, anIncludeCount, aFilterHeaderLine,
                  aBinSize, anIsDebug):
    '''
    ' This function reads from a .bed file and a .vcf file line by line and
    ' looks for variants that should be filtered or tagged. The .bed file
    ' specifies coordinates for areas where variants should either be included
    ' or excluded.  For example, a .bed file specifying transcription or exon
    ' start and stop coordinates can be provided along with the
    ' --includeOverlaps flag to indicate that the variants in these regions
    ' should be kept, and variants outside of these regions should be flagged
    ' or filtered out.  Conversely, a bed file specifying areas of the genome
    ' that are accessible (as defined by the 1000 Genomes project) can be given
    ' without the --includeOverlaps flag to indicate that the variants outside
    ' of the accessible genome should be flagged or filtered out, and variants
    ' overlapping the accessible regions should not be flagged or filtered out.
    '
    ' Every variant is written to the output.  A variant is annotated when it
    ' overlaps and anIncludeOverlapInfo is set, or when it does not overlap
    ' and anIncludeOverlapInfo is not set; all other variants are written
    ' unchanged.  The variants go to STDOUT when anOutputFilename is None.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aBedFilename: A .bed file with at least 3 columns specifying the chrom,
    '    start, and stop coordinates and possibly a 4th column with an id
    ' aVCFFilename: A .vcf file with variants that will be either
    '    included or excluded
    ' anOutputFilename: An output file where the filtered variants are output
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '    (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: A flag specifying whether the variants should be
    '    included or excluded when they overlap
    ' anIncludeFilterName: A flag specifying whether the filtering name should
    '    be included in the output or not
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in the
    '    output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps should
    '    be included in the output or not
    ' aFilterHeaderLine: A filter header line that should be added to the VCF
    '    header describing this filter
    ' aBinSize:  The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages to the log
    '''

    # initialize pybed with the filtering file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # get the vcf file
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)

    # the output file stays None when the events go to STDOUT
    i_outputFileHandler = None

    try:
        # get the output file
        if (anOutputFilename is not None):
            i_outputFileHandler = radiaUtil.get_write_fileHandler(
                anOutputFilename)

        # the stream that the events are written to
        if (i_outputFileHandler is not None):
            eventStream = i_outputFileHandler
        else:
            eventStream = sys.stdout

        # create the generator for the vcf file
        vcfGenerator = get_vcf_data(i_vcfFileHandler,
                                    i_outputFileHandler,
                                    aFilterHeaderLine,
                                    anIsDebug)

        # initialize some variables
        overlappingEvents = 0
        nonOverlappingEvents = 0
        totalEvents = 0
        startTime = time.time()

        # for each vcf line
        for (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate,
             vcf_id, vcf_ref, vcf_alt, vcf_qual, vcf_filter,
             vcf_info, vcf_restLine, vcf_line) in (vcfGenerator):

            totalEvents += 1

            if (anIsDebug):
                logging.debug("VCF: %s", vcf_line)

            # check if this vcf coordinate overlaps with the filter coordinates
            posTuple = (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate)
            (isOverlap, idValue, count) = filterPybed.overlaps_with(
                posTuple, anIncludeCount)

            # count the overlaps and the non-overlaps
            if (isOverlap):
                overlappingEvents += 1
            else:
                nonOverlappingEvents += 1

            # if we want to add info about overlaps, then the overlapping
            # events get annotated, otherwise the non-overlapping events do
            if (bool(isOverlap) == bool(anIncludeOverlapInfo)):

                # alter the filter and id name if appropriate
                if (anIncludeFilterName):
                    (vcf_filter, vcf_info) = add_filter(vcf_filter,
                                                        vcf_info,
                                                        aFilterName,
                                                        aFilterField,
                                                        anIncludeCount,
                                                        count,
                                                        anIncludeId,
                                                        anIdField,
                                                        idValue)

                if (anIncludeId and anIdField == "ID"):
                    vcf_id = add_id(vcf_id, idValue)

                # output the event
                outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                              vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info)
                eventStream.write("\t".join(outputList) + "\t" +
                                  "\t".join(vcf_restLine) + "\n")
            # all other events are just output without any changes
            else:
                eventStream.write(vcf_line + "\n")

        # each value is the complete elapsed time in a different unit
        stopTime = time.time()
        logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                     aChrom, aTCGAId, ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60), (stopTime-startTime))

        if (overlappingEvents + nonOverlappingEvents == totalEvents):
            logging.info("For chrom %s and Id %s: %s (overlapping events) + " +
                         "%s (non-overlapping events) = %s", aChrom, aTCGAId,
                         overlappingEvents, nonOverlappingEvents, totalEvents)
        else:
            # the counts don't add up, so report this as a warning
            logging.warning("filterByPybed Warning: For chrom %s and Id %s: " +
                            "%s (overlapping events) + %s (non-overlapping " +
                            "events) = %s", aChrom, aTCGAId, overlappingEvents,
                            nonOverlappingEvents, totalEvents)
    finally:
        # close the files, even when an error occurred
        i_vcfFileHandler.close()
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()
    return
Esempio n. 4
0
def main():
    """
    Merge the VCF data gathered for one id into a single VCF file.

    Command line: python %prog id inputDir outputDir [Options]

    The header and coordinate data come from get_vcf_data(id, inputDir,
    debug).  The result is written to <outputDir>/<id>.vcf, or to
    <outputDir>/<id>.vcf.gz when --gzip is given, in this order: the header
    sections (only when the "metadata" section is non-empty), the entries of
    coordDict["numbers"] with keys sorted as integers, then the entries of
    coordDict["letters"] with keys sorted as strings.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a logging level name.
    """

    # command for running this on a small test case:
    # python mergeChroms.py TCGA-BH-A18P
    # ../data/test/ ../data/test/ --log=DEBUG

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id inputDir outputDir [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--gzip",
        dest="gzip", action="store_true", default=False,
        help="include this argument if the final VCF should be " +
             "compressed with gzip")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw argv length above counts options as well as positionals, so
    # it cannot guarantee that all three required arguments are present
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_inputDir = i_cmdLineArgs[1]
    i_outputDir = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_gzip = i_cmdLineOptions.gzip

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError(
            ("Invalid log level: '%s' must be one of the "
             "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
            % (i_logLevel,))

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("inputDir=%s", i_inputDir)
        logging.debug("outputDir=%s", i_outputDir)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("gzip=%s", i_gzip)

    # check for any errors
    i_readFilenameList = None
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]
    else:
        i_writeFilenameList = None
    i_dirList = [i_inputDir, i_outputDir]

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF data
    (headerDict, coordDict) = get_vcf_data(i_id, i_inputDir, i_debug)

    if (i_gzip):
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf.gz")
    else:
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf")

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure the output file is closed even if a write fails
    try:
        # if we have header info to output
        if (len(headerDict["metadata"]) > 0):
            # output the header information
            outputFileHandler.write("\n".join(headerDict["metadata"]) + "\n")
            outputFileHandler.write("\n".join(headerDict["filter"]) + "\n")
            outputFileHandler.write("\n".join(headerDict["info"]) + "\n")
            outputFileHandler.write("\n".join(headerDict["format"]) + "\n")
            outputFileHandler.write("".join(headerDict["chrom"]) + "\n")

        # first output the numerical chroms in order
        for chrom in sorted(coordDict["numbers"].keys(), key=int):
            outputFileHandler.write(
                "\n".join(coordDict["numbers"][chrom]) + "\n")

        # then output the alphabetical chroms in order
        for chrom in sorted(coordDict["letters"].keys(), key=str):
            outputFileHandler.write(
                "\n".join(coordDict["letters"][chrom]) + "\n")
    finally:
        # close the files
        outputFileHandler.close()

    # each value below is the whole elapsed time expressed in that unit
    stopTime = time.time()
    logging.info("Total time for Id %s: Total time=%s hrs, %s mins, %s secs",
                 i_id, ((stopTime-startTime)/(3600)),
                 ((stopTime-startTime)/60), (stopTime-startTime))

    return
Esempio n. 5
0
def main():
    """
    Merge the DNA and RNA VCF data for one id and write a single VCF file.

    Command line: python %prog id chrom dnaFile rnaFile rnaOverlapsFile
                  rnaNonOverlapsFile outputFile [Options]

    merge_vcf_data() supplies the header lines and a dict of VCF lines keyed
    by coordinate.  The header lines are written as returned, followed by
    the VCF lines with keys sorted as integers; the INFO column (the eighth
    tab-separated column) of every line is passed through set_sst_field()
    before it is written.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a logging level name.
    """

    # python mergeRnaAndDnaFiles.py TCGA-AB-2995 5
    # ../data/test/TCGA-AB-2995_dnaFile.vcf
    # ../data/test/TCGA-AB-2995_rnaFile.vcf
    # ../data/test/TCGA-AB-2995_rnaFile.vcf
    # ../data/test/

    startTime = time.time()

    # create the usage statement
    usage = ("usage: python %prog id chrom dnaFile rnaFile rnaOverlapsFile " +
             "rnaNonOverlapsFile outputFile [Options]")
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log", default="WARNING",
        dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(6, 15, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # seven positional arguments are read below; the raw argv length check
    # above lets shorter command lines through, so verify the count here
    if (len(i_cmdLineArgs) < 7):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_chrom = i_cmdLineArgs[1]
    i_dnaFilename = i_cmdLineArgs[2]
    i_rnaFilename = i_cmdLineArgs[3]
    i_overlapsFilename = i_cmdLineArgs[4]
    i_nonOverlapsFilename = i_cmdLineArgs[5]
    i_outputFilename = i_cmdLineArgs[6]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError(
            ("Invalid log level: '%s' must be one of the "
             "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
            % (i_logLevel,))

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chrom=%s", i_chrom)
        logging.debug("dnaFilename=%s", i_dnaFilename)
        logging.debug("rnaFilename=%s", i_rnaFilename)
        logging.debug("overlapsFilename=%s", i_overlapsFilename)
        logging.debug("nonOverlapsFilename=%s", i_nonOverlapsFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    # NOTE(review): i_nonOverlapsFilename is handed to merge_vcf_data() but
    # is not validated here -- confirm whether that omission is intentional
    i_readFilenameList = [i_dnaFilename, i_rnaFilename, i_overlapsFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the merged VCF data
    (headerList,
     coordinateDict) = merge_vcf_data(i_dnaFilename,
                                      i_rnaFilename,
                                      i_overlapsFilename,
                                      i_nonOverlapsFilename,
                                      i_debug)

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure the output file is closed even if a write fails
    try:
        for headerLine in headerList:
            outputFileHandler.write(headerLine)

        # output the calls in coordinate order
        for coordinate in sorted(coordinateDict.keys(), key=int):
            line = coordinateDict[coordinate].rstrip("\r\n")
            # split the line on the tab
            splitLine = line.split("\t")

            # set the SST field in the INFO
            splitLine[7] = set_sst_field(splitLine[7])
            outputFileHandler.write("\t".join(splitLine) + "\n")
    finally:
        # close the files
        outputFileHandler.close()

    # each value below is the whole elapsed time expressed in that unit
    stopTime = time.time()
    logging.info("Total time for Id %s: Total time=%s hrs, %s mins, %s secs",
                 i_id, ((stopTime-startTime)/(3600)),
                 ((stopTime-startTime)/60), (stopTime-startTime))

    return
Esempio n. 6
0
def main():
    """
    Flag VCF calls whose EFF annotations hit the RNA gene blacklists.

    Command line: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]

    Header lines are copied through, with the "rgene" and "rgfam" FILTER
    definitions inserted before the first ##FILTER or ##INFO line.  When only
    passed calls are processed (the default), lines without the text "PASS"
    are copied through unchanged.  For every other call the EFF entries of
    the INFO column are inspected (UPSTREAM and DOWNSTREAM effects are
    ignored):

    * "rgene" is added to the FILTER column when any entry of the RNA gene
      list is a substring of the sixth '|'-separated EFF field (geneName);
    * "rgfam" is added when the seventh field (transcriptBiotype) is in the
      RNA gene family list.

    Output goes to --outputFilename, or to STDOUT when it is not given.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a logging level name.
    """

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help=
        "the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default"
    )
    # without a log file, logging.basicConfig() logs to STDERR
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-c",
        "--allVCFCalls",
        action="store_false",
        default=True,
        dest="passedVCFCallsOnly",
        help=
        "by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed"
    )

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw argv length above counts options as well as positionals, so
    # it cannot guarantee that all three required arguments are present
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError(
            ("Invalid log level: '%s' must be one of the "
             "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
            % (i_logLevel,))

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors; collect both write targets so that neither one
    # replaces the other in the list
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [
        i_vcfFilename, i_rnaGeneFilename, i_rnaGeneFamilyFilename
    ]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, falling back to STDOUT
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    i_outputStream = sys.stdout
    if (i_outputFileHandler is not None):
        i_outputStream = i_outputFileHandler

    vcfHeader = "##FILTER=<ID=rgene,Description=\"This gene is on the RNA gene blacklist\">\n"
    vcfHeader += "##FILTER=<ID=rgfam,Description=\"This gene family is on the RNA gene family blacklist\">\n"

    # these do not change from line to line, so build them once
    effectRegEx = re.compile("(\\w).*\\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    hasAddedHeader = False
    i_vcfFileHandler = None

    # make sure both files are closed even if a line cannot be processed
    try:
        # get the RNA gene blacklists
        (i_rnaGeneList,
         i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                              i_rnaGeneFamilyFilename,
                                              i_debug)

        i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

        for line in i_vcfFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue; "".isspace() is
            # False, so test the stripped line instead
            if (not line.strip()):
                continue
            # if we find the FILTER section, then add the filters from here
            elif ((not hasAddedHeader)
                  and line.startswith(("##FILTER", "##INFO"))):
                hasAddedHeader = True
                # vcfHeader already ends with a newline
                i_outputStream.write(vcfHeader)
                i_outputStream.write(line + "\n")

            # these lines are from previous scripts in the pipeline,
            # so output them
            elif (line.startswith("#")):
                i_outputStream.write(line + "\n")

            # if we are only suppose to process the passed calls
            # and this call has not passed, then output it unchanged
            elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputStream.write(line + "\n")

            # now we are to the data
            else:

                # split the line on the tab
                splitLine = line.split("\t")

                filterSet = set(splitLine[6].split(";"))

                # if there are no filters so far, then clear the list
                if (len(filterSet) == 1 and "PASS" in filterSet):
                    filterSet = set()

                # parse the info column and create a dict
                infoDict = collections.defaultdict(list)
                for info in splitLine[7].split(";"):
                    keyValueList = info.split("=")
                    # some keys are just singular without a value
                    # (e.g. DB, SOMATIC, etc.)
                    if (len(keyValueList) == 1):
                        infoDict[keyValueList[0]] = ["True"]
                    else:
                        # the value can be a comma separated list
                        infoDict[keyValueList[0]] = keyValueList[1].split(",")

                isRnaBlacklistGene = False
                isRnaBlacklistGeneFamily = False

                for rawEffect in infoDict["EFF"]:
                    rawEffect = rawEffect.rstrip(")")

                    # reset for every entry so that an entry without a match
                    # can neither reuse the previous effect nor hit an
                    # unbound name
                    effect = None

                    # for each match object in the iterator
                    for match in effectRegEx.finditer(rawEffect):
                        effect = match.group()
                        rawEffect = rawEffect.replace(effect, "")
                        effect = effect.rstrip("(")

                    if (effect in ignoreEffectsList):
                        continue

                    # only the gene name (index 5) and the transcript
                    # biotype (index 6) of the '|'-separated fields are used
                    effectParts = rawEffect.split("|")
                    geneName = effectParts[5]
                    transcriptBiotype = effectParts[6]

                    # the RNA gene list can have "RP11" and that
                    # should filter out any gene with RP11 in it
                    for rnaGene in i_rnaGeneList:
                        if (rnaGene in geneName):
                            isRnaBlacklistGene = True
                            break

                    if (transcriptBiotype in i_rnaGeneFamilyList):
                        isRnaBlacklistGeneFamily = True

                output = ["\t".join(splitLine[0:6])]

                # if the filters should be applied
                if (isRnaBlacklistGene):
                    filterSet.add("rgene")
                if (isRnaBlacklistGeneFamily):
                    filterSet.add("rgfam")

                # if there are no filters so far, then this call passes
                if (len(filterSet) == 0):
                    filterSet.add("PASS")

                output.append(";".join(filterSet))

                output.append("\t".join(splitLine[7:]))

                i_outputStream.write("\t".join(output) + "\n")
    finally:
        # close the files
        if (i_vcfFileHandler is not None):
            i_vcfFileHandler.close()
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
Esempio n. 7
0
def main():
    """
    Apply the "blat" filter to VCF calls using previously parsed BLAT hits.

    Three positional arguments are read: id, vcfFile and blatOutputFile.
    NOTE(review): the usage string (and the example command below) also list
    a "chrom" argument that this function never reads -- confirm which of
    the two is correct.

    For every call yielded by get_vcf_data() and every modification type in
    its MT INFO entry, the reads stored for that coordinate are checked with
    is_valid_read_psl_format() or is_valid_read_blast_format().  A
    modification type passes when the number of valid reads reaches
    --readDepthCutoff and the valid fraction reaches --readPercentCutoff:

    * NOR_EDIT uses the "rnaNormal" reads, and only with --blatRnaNormalReads;
    * SOM and TUM_EDIT use the "rnaTumor" reads, and only with
      --blatRnaTumorReads;
    * any other type, or a disabled read set, has no reads and so fails.

    If at least one modification type passes, the failing ones are removed
    from MT/MC and FILTER becomes PASS.  Otherwise "blat" is set (or added to
    the previous filters with --keepPreviousFilters) and the MF/MFT INFO
    entries are extended.  Output goes to --outputFilename or STDOUT.

    Exits with status 1 when the command line is malformed or when
    radiaUtil.check_for_argv_errors() reports a problem.

    Raises:
        ValueError: if the --log value is not a logging level name, or if
            reads have to be checked and --blatOutputFormat is neither PSL
            nor BLAST.
    """

    # command for running this on a small test case:
    #python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c",
        "--allVCFCalls",
        action="store_false",
        default=True,
        dest="passedVCFCallsOnly",
        help=
        "by default only the VCF calls that have passed all filters thus far are processed, include this argument if all of the VCF calls should be processed"
    )
    i_cmdLineParser.add_option(
        "-k",
        "--keepPreviousFilters",
        action="store_true",
        default=False,
        dest="keepPreviousFilters",
        help=
        "by default the previous filters are overwritten with the blat filter, include this argument if the previous filters should be kept"
    )

    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option("-b",
                               "--blatOutputFormat",
                               dest="blatOutputFormat",
                               metavar="OUTPUT_FORMAT",
                               default="BLAST",
                               help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help=
        "the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), %default by default"
    )
    # without a log file, logging.basicConfig() logs to STDERR
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "-n",
        "--blatDnaNormalReads",
        action="store_true",
        default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads should be processed"
    )
    i_cmdLineParser.add_option(
        "-x",
        "--blatRnaNormalReads",
        action="store_true",
        default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads should be processed"
    )
    i_cmdLineParser.add_option(
        "-t",
        "--blatDnaTumorReads",
        action="store_true",
        default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads should be processed"
    )
    i_cmdLineParser.add_option(
        "-r",
        "--blatRnaTumorReads",
        action="store_true",
        default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads should be processed"
    )

    i_cmdLineParser.add_option(
        "-d",
        "--readDepthCutoff",
        type="int",
        default=4,
        dest="readDepthCutoff",
        metavar="READ_DP_CUTOFF",
        help=
        "the minimum number of valid reads that are necessary, %default by default"
    )
    i_cmdLineParser.add_option(
        "-p",
        "--readPercentCutoff",
        type="float",
        default=0.10,
        dest="readPercentCutoff",
        metavar="READ_PERCENT_CUTOFF",
        help=
        "the minimum percentage of valid reads that are necessary, %default by default"
    )

    # NOTE: the e-value and upper/lower identity cutoff options are
    # currently disabled

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw argv length above counts options as well as positionals, so
    # it cannot guarantee that the three arguments read below are present
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = i_cmdLineOptions.keepPreviousFilters
    i_blatOutputFormat = i_cmdLineOptions.blatOutputFormat
    i_logLevel = i_cmdLineOptions.logLevel
    i_readDepthCutoff = i_cmdLineOptions.readDepthCutoff
    i_readPercentCutoff = i_cmdLineOptions.readPercentCutoff

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate here: exceptions do not apply %-formatting to their
        # arguments the way the logging calls do
        raise ValueError(
            ("Invalid log level: '%s' must be one of the "
             "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL")
            % (i_logLevel,))

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("readDepthCutoff=%s", i_readDepthCutoff)
        logging.debug("readPerentCutoff=%s", i_readPercentCutoff)

    # check for any errors; collect both write targets so that neither one
    # replaces the other in the list
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, falling back to STDOUT
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    i_outputStream = sys.stdout
    if (i_outputFileHandler is not None):
        i_outputStream = i_outputFileHandler

    # make sure the output file is closed even if a call cannot be processed
    try:
        # get the BLAT results
        i_blatCoordinateDict = parse_blat_output(i_blatOutputFilename,
                                                 i_blatOutputFormat, i_debug)

        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename, i_passedVCFCallsOnlyFlag,
                                      i_debug)

        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore,
             vcfFilterSet, vcfInfoDict, restOfLine) in i_vcfGenerator:
            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s", vcfChr,
                              str(vcfStopCoordinate), vcfId,
                              vcfRef, vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            # the BLAT results are keyed by "<chrom>_<coordinate>"
            coordinateKey = vcfChr + "_" + str(vcfStopCoordinate)

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            for modType in modTypes:

                blatHitsDict = dict()
                blatOverallReadDepth = 0
                numValidReads = 0

                # pick the read set that belongs to this modification type
                readSource = None
                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    readSource = "rnaNormal"
                elif (modType in ("SOM", "TUM_EDIT")
                      and i_blatRnaTumorReads):
                    readSource = "rnaTumor"

                if (readSource is not None):
                    coordinateHits = i_blatCoordinateDict[coordinateKey]
                    if (readSource in coordinateHits):
                        # for each coordinate, get a dict of reads and
                        # corresponding blat hits
                        blatHitsDict = coordinateHits[readSource]

                # for each read, investigate the blat hits to see if this
                # read is valid
                for (readId, blatHitList) in blatHitsDict.items():
                    if (i_debug):
                        logging.debug("num of blat hits for read %s=%s",
                                      readId, len(blatHitList))

                    blatOverallReadDepth += 1

                    # find out if the read is valid or if it maps to other
                    # places in the genome
                    if (i_blatOutputFormat == "PSL"):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList, vcfChr, vcfStopCoordinate, i_debug)
                    elif (i_blatOutputFormat == "BLAST"):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList, vcfChr, vcfStopCoordinate, 0,
                            i_debug)
                    else:
                        # otherwise isValidRead would be unbound below
                        raise ValueError(
                            "Unsupported BLAT output format: '%s' must be "
                            "PSL or BLAST" % (i_blatOutputFormat,))

                    # if we have only one valid blat hit, then the read
                    # doesn't map to other places in the genome very well,
                    # so let's use it
                    if (isValidRead):
                        numValidReads += 1

                        if (i_debug):
                            logging.debug("ValidRead: %s", validRead)

                if (blatOverallReadDepth > 0):
                    altPercent = round(
                        numValidReads / float(blatOverallReadDepth), 2)
                else:
                    altPercent = 0.0

                if (numValidReads < i_readDepthCutoff
                        or altPercent < i_readPercentCutoff):
                    modTypeFilters[modType] = "blat"
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.debug(
                        "blatOverallReadDepth=%s, numValidReads=%s, altPercent=%s",
                        str(blatOverallReadDepth), str(numValidReads),
                        str(altPercent))
                    logging.debug("modType=%s, passed? %s", modType,
                                  modTypeFilters[modType])
                    logging.debug(
                        "blatFilter originalDepth=%s, afterBlatDepth=%s",
                        str(blatOverallReadDepth), str(numValidReads))

            modTypesTmpList = list(modTypes)
            modChanges = vcfInfoDict["MC"]

            # if at least one passed, then remove the ones that didn't.
            # New lists are built instead of removing from modChanges while
            # it is being iterated, which skipped entries and broke the
            # MT/MC pairing.
            if (atLeastOnePass):
                pairedCount = min(len(modTypes), len(modChanges))
                keptModTypes = []
                keptModChanges = []
                for (modType, modChange) in zip(modTypes, modChanges):
                    if (modTypeFilters[modType] != "blat"):
                        keptModTypes.append(modType)
                        keptModChanges.append(modChange)
                # entries without a partner in the other list are kept as is
                modTypesTmpList = keptModTypes + list(modTypes[pairedCount:])
                modChanges = keptModChanges + list(modChanges[pairedCount:])

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = modTypesTmpList
            vcfInfoDict["MC"] = modChanges

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            else:
                # if the user wants to keep the previous filters
                if (i_keepPreviousFiltersFlag):
                    # if the call previous passed, then just set blat
                    if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                        vcfFilterSet = ["blat"]
                    # otherwise, add it to the previous filters
                    else:
                        vcfFilterSet.add("blat")
                # otherwise, just set the blat filter
                else:
                    vcfFilterSet = ["blat"]

                # update the mod filters
                modTypes = vcfInfoDict["MT"]
                modChanges = vcfInfoDict["MC"]
                origins = vcfInfoDict["ORIGIN"]
                modFilters = vcfInfoDict["MF"]
                if (modFilters is None):
                    modFilters = []
                modFilterTypes = vcfInfoDict["MFT"]
                if (modFilterTypes is None):
                    modFilterTypes = []

                for origin in origins:
                    for (modType, modChange) in zip(modTypes, modChanges):
                        modFilterTypes.append("_".join(
                            [origin, modType, modChange]))
                        modFilters.append("_".join(vcfFilterSet))

                vcfInfoDict["MF"] = modFilters
                vcfInfoDict["MFT"] = modFilterTypes

            output = [
                vcfChr,
                str(vcfStopCoordinate), vcfId, vcfRef, vcfAlt, vcfScore,
                ";".join(vcfFilterSet)
            ]

            # add the modified info dict: empty entries are dropped, flag
            # entries (value "True") are written as the bare key
            infoParts = []
            for key in sorted(vcfInfoDict.keys()):
                infoValues = vcfInfoDict[key]
                if (len(infoValues) == 0):
                    continue
                elif ("True" in infoValues):
                    infoParts.append(key)
                else:
                    infoParts.append(key + "=" + ",".join(infoValues))

            output.append(";".join(infoParts).rstrip(";"))
            output.append(restOfLine)

            i_outputStream.write("\t".join(output) + "\n")
    finally:
        # close the files
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    # each value below is the whole elapsed time expressed in that unit
    stopTime = time.time()
    logging.info(
        "filterByBlat.py for Id %s: Total time=%s hrs, %s mins, %s secs", i_id,
        ((stopTime - startTime) / (3600)), ((stopTime - startTime) / 60),
        (stopTime - startTime))

    return
Esempio n. 8
0
def main():
    """Apply the blat filter to the calls of a VCF file.

    Reads three positional command line arguments (an id, the VCF filename
    and the BLAT output filename) plus the options declared below.  The VCF
    lines and the BLAT hits are consumed in lockstep: for every modification
    type listed in the INFO "MT" field the reads in the BLAT hits are
    counted and validated, and the modification type is marked "PASS" or
    "blat" depending on --minReadDepth and --minReadPercent.

    If at least one modification type passes, the FILTER column is set to
    PASS and the failing MT/MC entries are dropped.  Otherwise the "blat"
    filter is set (or added to the previous filters when
    --keepPreviousFilters is given) and the MF/MFT INFO fields are extended.
    Each processed record is written to the output file (STDOUT by default).

    Exits with status 1 when the command line is incomplete, when
    radiaUtil.check_for_argv_errors() reports a problem, or when a read id
    does not match the current VCF line.

    Raises:
        ValueError: if the log level is not a valid logging level name, or
            if a read has to be validated with a BLAT output format other
            than PSL or BLAST.
    """

    # command for running this on a small test case:
    # python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf
    # ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    # NOTE(review): the usage string (and the example above) mention a
    # "chrom" argument, but only three positional arguments are read below
    # (id, vcfFile, blatOutputFile) - confirm which one is intended
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters", action="store_true", default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat " +
             "filter, include this argument if the previous filters should " +
             "be kept")

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE", default=sys.stdout,
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT", default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "", "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript" +
             "coordinate can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "", "--rnaIncludeSecondaryAlignments",
        action="store_true", default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
             "want to include RNA secondary alignments in the pileup")

    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads", action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads", action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads", action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads", action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
             "should be processed")

    i_cmdLineParser.add_option(
        "-d", "--minReadDepth", type="int", default=int(4),
        dest="minReadDepth", metavar="MIN_READ_DP",
        help="the minimum number of valid reads that are necessary, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-p", "--minReadPercent", type="float", default=float(0.10),
        dest="minReadPercent", metavar="MIN_READ_PCT",
        help="the minimum percentage of valid reads that are necessary, " +
             "%default by default")
    # the default matches the declared option type (int)
    i_cmdLineParser.add_option(
        "-m", "--minOrderMagnitude", type="int", default=int(0),
        dest="minOrderMagnitude", metavar="MIN_ORDER_MAGNITUDE",
        help="the minimum order of magnitude difference between the blat " +
             "hit at the query position vs. the next best blat hit in order " +
             "for the read to be valid, %default by default")

    # the following options are disabled: the string literal below is
    # never executed, it only keeps the old definitions for reference
    '''
    i_cmdLineParser.add_option(
        "-e", "--minEValue", type="float", default=float(10e-6),
        dest="minEValue", metavar="MIN_EVALUE",
        help="the minimum e-value needed for a blat hit to be significant, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-u", "--maxIdentity", type="float", default=float(0.95),
        dest="maxIdentity", metavar="MAX_IDENTITY",
        help="the maximum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    i_cmdLineParser.add_option(
        "-l", "--minIdentity", type="float", default=float(0.5),
        dest="minIdentity", metavar="MIN_IDENTITY",
        help="the minimum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    '''

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (cmdLineOpts, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts the options as well, so make sure
    # that all of the positional arguments read below are really there
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = cmdLineOpts.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = cmdLineOpts.keepPreviousFilters
    i_blatOutputFormat = cmdLineOpts.blatOutputFormat
    i_logLevel = cmdLineOpts.logLevel
    i_rnaIncludeSecondaryAlignments = cmdLineOpts.rnaIncludeSecondaryAlignments
    i_minReadDepth = cmdLineOpts.minReadDepth
    i_minReadPercent = cmdLineOpts.minReadPercent
    i_minOrderMagnitude = cmdLineOpts.minOrderMagnitude
    # i_minEValue = cmdLineOpts.minEValue
    # i_maxIdentity = cmdLineOpts.maxIdentity
    # i_minIdentity = cmdLineOpts.minIdentity

    i_blatDnaNormalReads = cmdLineOpts.blatDnaNormalReads
    i_blatDnaTumorReads = cmdLineOpts.blatDnaTumorReads
    i_blatRnaNormalReads = cmdLineOpts.blatRnaNormalReads
    i_blatRnaTumorReads = cmdLineOpts.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_transcriptNameTag = None
    i_transcriptCoordinateTag = None
    i_transcriptStrandTag = None
    if (cmdLineOpts.outputFilename is not None):
        i_outputFilename = cmdLineOpts.outputFilename
    if (cmdLineOpts.logFilename is not None):
        i_logFilename = cmdLineOpts.logFilename
    if (cmdLineOpts.transcriptNameTag is not None):
        i_transcriptNameTag = cmdLineOpts.transcriptNameTag
    if (cmdLineOpts.transcriptCoordinateTag is not None):
        i_transcriptCoordinateTag = cmdLineOpts.transcriptCoordinateTag
    if (cmdLineOpts.transcriptStrandTag is not None):
        i_transcriptStrandTag = cmdLineOpts.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message, passing it as a second
        # argument to ValueError would leave the '%s' unformatted
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaInclSecAlign=%s", i_rnaIncludeSecondaryAlignments)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("minReadDepth=%s", i_minReadDepth)
        logging.debug("minReadPercent=%s", i_minReadPercent)
        logging.debug("minOrderMagnitude=%s", i_minOrderMagnitude)

    # check for any errors
    # both the output file and the log file have to be checked, so
    # append to the list instead of overwriting it
    i_writeFilenameList = []
    if (cmdLineOpts.outputFilename is not sys.stdout):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    # get the VCF generator
    i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                  i_passedVCFCallsOnlyFlag,
                                  i_debug)

    # get the blat hits generator
    i_blatGenerator = parse_blat_output(i_blatOutputFilename,
                                        i_blatOutputFormat,
                                        i_debug)

    # the VCF lines and the blat hits are consumed in lockstep
    for (vcfLine, blatHitsDict) in izip(i_vcfGenerator, i_blatGenerator):

        if (i_debug):
            logging.debug("VCF Line=%s", vcfLine)
            logging.debug("Len Blat Hits=%s", len(blatHitsDict))

        # parse the VCF line
        splitLine = vcfLine.split("\t")

        # the coordinate is the second element
        vcfChr = splitLine[0]
        vcfStopCoordinate = int(splitLine[1])
        vcfIds = splitLine[2]
        vcfRef = splitLine[3]
        vcfAlts = splitLine[4]
        vcfScore = splitLine[5]
        vcfFilterSet = set(splitLine[6].split(";"))
        vcfInfoList = splitLine[7].split(";")
        vcfInfoDict = collections.defaultdict(list)
        for info in vcfInfoList:
            keyValueList = info.split("=")
            # some keys are just singular without a value (e.g. DB, etc.)
            if (len(keyValueList) == 1):
                vcfInfoDict[keyValueList[0]] = ["True"]
            else:
                # the value can be a comma separated list
                vcfInfoDict[keyValueList[0]] = keyValueList[1].split(",")
        vcfRestOfLine = "\t".join(splitLine[8:])

        modTypes = vcfInfoDict["MT"]
        modTypeFilters = dict()
        atLeastOnePass = False
        for modType in modTypes:

            blatOverallReadDepth = 0
            numValidReads = 0

            prefix = ""
            if (modType == "GERM" and i_blatDnaNormalReads):
                prefix = "dnaNormal"
            elif (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                prefix = "rnaNormal"
            elif (modType == "SOM" and i_blatDnaTumorReads):
                prefix = "dnaTumor"
            elif ((modType == "SOM" or modType == "TUM_EDIT") and
                  i_blatRnaTumorReads):
                prefix = "rnaTumor"

            # get the expected prefix
            vcfKey = "_".join([prefix, vcfChr, str(vcfStopCoordinate)])

            # for each read, investigate the blat
            # hits to see if this read is valid
            for (readId, blatHitList) in blatHitsDict.iteritems():
                if (i_debug):
                    logging.debug("num of blat hits for read %s=%s",
                                  readId, len(blatHitList))

                # if the readId does not start with the vcfKey,
                # then something is wrong. the VCF and blat hits
                # need to be in sync...
                if (not readId.startswith(vcfKey)):
                    logging.error("The blat query seems to be out of sync " +
                                  "with the blat hits.")
                    logging.error("VCF Line=%s", vcfLine)
                    logging.error("readId=%s, blatHitsDict=%s",
                                  readId, blatHitsDict[readId][1])
                    sys.exit(1)

                blatOverallReadDepth += 1

                # find out if the read is valid or if it
                # maps to other places in the genome
                if (i_blatOutputFormat == "PSL"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and
                            (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList,
                            vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            i_rnaIncludeSecondaryAlignments,
                            i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList,
                            [vcfChr],
                            [vcfStopCoordinate],
                            i_rnaIncludeSecondaryAlignments,
                            i_debug)

                elif (i_blatOutputFormat == "BLAST"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None) and
                            (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList,
                            vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude,
                            i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList,
                            [vcfChr],
                            [vcfStopCoordinate],
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude,
                            i_debug)

                else:
                    # without this branch isValidRead would be unbound
                    # below, so fail with a message that names the cause
                    raise ValueError("Unsupported BLAT output format: '%s' "
                                     "must be one of the following:  "
                                     "PSL, BLAST" % i_blatOutputFormat)

                # if we have only one valid blat hit, then the read doesn't
                # map to other places in the genome very well, so let's use it
                if (isValidRead):
                    numValidReads += 1

                    if (i_debug):
                        logging.debug("ValidRead: %s", validRead)
                elif (i_debug):
                    logging.debug("not a valid read")

            if (blatOverallReadDepth > 0):
                tmpAltPct = numValidReads/float(blatOverallReadDepth)
                altPercent = round(tmpAltPct, 2)
            else:
                altPercent = 0.0

            if (numValidReads < i_minReadDepth or
                    altPercent < i_minReadPercent):
                modTypeFilters[modType] = "blat"
            else:
                modTypeFilters[modType] = "PASS"
                atLeastOnePass = True

            if (i_debug):
                logging.debug("blatOverallReadDepth=%s, numValidReads=%s, " +
                              "altPercent=%s", str(blatOverallReadDepth),
                              str(numValidReads), str(altPercent))
                logging.debug("modType=%s, passed? %s", modType,
                              modTypeFilters[modType])
                logging.debug("blatFilter originalDepth=%s, validBlatDepth=%s",
                              str(blatOverallReadDepth), str(numValidReads))

        modChanges = vcfInfoDict["MC"]
        # if at least one passed, then remove the ones that didn't.
        # new lists are built instead of removing items from the lists
        # that are being iterated over, which would skip and mispair
        # entries. the MC entries are matched to the MT entries by
        # position - assumes MT and MC are parallel lists, TODO confirm
        if (atLeastOnePass):
            passedModTypes = [mt for mt in modTypes
                              if modTypeFilters[mt] != "blat"]
            passedModChanges = [mc for (mt, mc) in izip(modTypes, modChanges)
                                if modTypeFilters[mt] != "blat"]

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = passedModTypes
            vcfInfoDict["MC"] = passedModChanges

        # if at least one passed, then set pass
        if (atLeastOnePass):
            vcfFilterSet = ["PASS"]
        else:
            # if the user wants to keep the previous filters
            if (i_keepPreviousFiltersFlag):
                # if the call previous passed, then just set blat
                if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                    vcfFilterSet = ["blat"]
                # otherwise, add it to the previous filters
                else:
                    vcfFilterSet.add("blat")
            # otherwise, just set the blat filter
            else:
                vcfFilterSet = ["blat"]

            # update the mod filters
            modTypes = vcfInfoDict["MT"]
            modChanges = vcfInfoDict["MC"]
            origins = vcfInfoDict["ORIGIN"]
            # the info dict is a defaultdict(list), so a missing key
            # yields an empty list and never None
            modFilters = vcfInfoDict["MF"]
            modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in izip(modTypes, modChanges):
                    modFilterTypes.append("_".join([origin,
                                                    modType,
                                                    modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

        output = [vcfChr, str(vcfStopCoordinate), vcfIds, vcfRef,
                  vcfAlts, vcfScore, ";".join(vcfFilterSet)]

        # add the modified info dict
        infoField = ""
        for key in sorted(vcfInfoDict.iterkeys()):
            # keys without any values are left out of the INFO column
            if (len(vcfInfoDict[key]) == 0):
                continue
            # flags are written without a value
            elif ("True" in vcfInfoDict[key]):
                infoField += key + ";"
            else:
                infoField += key + "=" + ",".join(vcfInfoDict[key]) + ";"

        output.append(infoField.rstrip(";"))
        output.append(vcfRestOfLine)

        i_outputFileHandler.write("\t".join(output) + "\n")

    stopTime = time.time()
    # the elapsed time is reported once per unit (hrs, mins and secs
    # each hold the total), it is not broken down into components
    logging.info("filterByBlat.py for Id %s: Total time=%s hrs, %s mins, " +
                 "%s secs", i_id, ((stopTime-startTime)/(3600)),
                 ((stopTime-startTime)/60), (stopTime-startTime))

    # close the files
    if (i_outputFilename is not sys.stdout):
        i_outputFileHandler.close()

    return
Esempio n. 9
0
def main():

    # command for running this on a small test case:
    # python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf
    # ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c",
        "--allVCFCalls",
        action="store_false",
        default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
        "thus far are processed, include this argument if all of the " +
        "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k",
        "--keepPreviousFilters",
        action="store_true",
        default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat " +
        "filter, include this argument if the previous filters should " +
        "be kept")

    i_cmdLineParser.add_option(
        "-o",
        "--outputFilename",
        dest="outputFilename",
        metavar="OUTPUT_FILE",
        default=sys.stdout,
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option("-b",
                               "--blatOutputFormat",
                               dest="blatOutputFormat",
                               metavar="OUTPUT_FORMAT",
                               default="BLAST",
                               help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l",
        "--log",
        dest="logLevel",
        default="WARNING",
        metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
        "%default by default")
    i_cmdLineParser.add_option(
        "-g",
        "--logFilename",
        dest="logFilename",
        metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    i_cmdLineParser.add_option(
        "",
        "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "",
        "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript" +
        "coordinate can be found")
    i_cmdLineParser.add_option(
        "",
        "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "",
        "--rnaIncludeSecondaryAlignments",
        action="store_true",
        default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
        "want to include RNA secondary alignments in the pileup")

    i_cmdLineParser.add_option(
        "-n",
        "--blatDnaNormalReads",
        action="store_true",
        default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
        "should be processed")
    i_cmdLineParser.add_option(
        "-x",
        "--blatRnaNormalReads",
        action="store_true",
        default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
        "should be processed")
    i_cmdLineParser.add_option(
        "-t",
        "--blatDnaTumorReads",
        action="store_true",
        default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
        "should be processed")
    i_cmdLineParser.add_option(
        "-r",
        "--blatRnaTumorReads",
        action="store_true",
        default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
        "should be processed")

    i_cmdLineParser.add_option(
        "-d",
        "--minReadDepth",
        type="int",
        default=int(4),
        dest="minReadDepth",
        metavar="MIN_READ_DP",
        help="the minimum number of valid reads that are necessary, " +
        "%default by default")
    i_cmdLineParser.add_option(
        "-p",
        "--minReadPercent",
        type="float",
        default=float(0.10),
        dest="minReadPercent",
        metavar="MIN_READ_PCT",
        help="the minimum percentage of valid reads that are necessary, " +
        "%default by default")
    i_cmdLineParser.add_option(
        "-m",
        "--minOrderMagnitude",
        type="int",
        default=float(0),
        dest="minOrderMagnitude",
        metavar="MIN_ORDER_MAGNITUDE",
        help="the minimum order of magnitude difference between the blat " +
        "hit at the query position vs. the next best blat hit in order " +
        "for the read to be valid, %default by default")
    '''
    i_cmdLineParser.add_option(
        "-e", "--minEValue", type="float", default=float(10e-6),
        dest="minEValue", metavar="MIN_EVALUE",
        help="the minimum e-value needed for a blat hit to be significant, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-u", "--maxIdentity", type="float", default=float(0.95),
        dest="maxIdentity", metavar="MAX_IDENTITY",
        help="the maximum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    i_cmdLineParser.add_option(
        "-l", "--minIdentity", type="float", default=float(0.5),
        dest="minIdentity", metavar="MIN_IDENTITY",
        help="the minimum match length adjusted identity for a blat hit to " +
             "be significant, %default by default")
    '''

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (cmdLineOpts, i_cmdLineArgs) = i_cmdLineParser.parse_args()
    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = cmdLineOpts.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = cmdLineOpts.keepPreviousFilters
    i_blatOutputFormat = cmdLineOpts.blatOutputFormat
    i_logLevel = cmdLineOpts.logLevel
    i_rnaIncludeSecondaryAlignments = cmdLineOpts.rnaIncludeSecondaryAlignments
    i_minReadDepth = cmdLineOpts.minReadDepth
    i_minReadPercent = cmdLineOpts.minReadPercent
    i_minOrderMagnitude = cmdLineOpts.minOrderMagnitude
    # i_minEValue = cmdLineOpts.minEValue
    # i_maxIdentity = cmdLineOpts.maxIdentity
    # i_minIdentity = cmdLineOpts.minIdentity

    i_blatDnaNormalReads = cmdLineOpts.blatDnaNormalReads
    i_blatDnaTumorReads = cmdLineOpts.blatDnaTumorReads
    i_blatRnaNormalReads = cmdLineOpts.blatRnaNormalReads
    i_blatRnaTumorReads = cmdLineOpts.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_transcriptNameTag = None
    i_transcriptCoordinateTag = None
    i_transcriptStrandTag = None
    if (cmdLineOpts.outputFilename is not None):
        i_outputFilename = cmdLineOpts.outputFilename
    if (cmdLineOpts.logFilename is not None):
        i_logFilename = cmdLineOpts.logFilename
    if (cmdLineOpts.transcriptNameTag is not None):
        i_transcriptNameTag = cmdLineOpts.transcriptNameTag
    if (cmdLineOpts.transcriptCoordinateTag is not None):
        i_transcriptCoordinateTag = cmdLineOpts.transcriptCoordinateTag
    if (cmdLineOpts.transcriptStrandTag is not None):
        i_transcriptStrandTag = cmdLineOpts.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError(
            "Invalid log level: '%s' must be one of the " +
            "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL", i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)

        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaInclSecAlign=%s" % i_rnaIncludeSecondaryAlignments)

        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

        logging.debug("minReadDepth=%s", i_minReadDepth)
        logging.debug("minReadPercent=%s", i_minReadPercent)
        logging.debug("minOrderMagnitude=%s", i_minOrderMagnitude)

    # check for any errors
    i_writeFilenameList = []
    if (cmdLineOpts.outputFilename is not sys.stdout):
        i_writeFilenameList = [i_outputFilename]
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    # get the VCF generator
    i_vcfGenerator = get_vcf_data(i_vcfFilename, i_passedVCFCallsOnlyFlag,
                                  i_debug)

    # get the blat hits generator
    i_blatGenerator = parse_blat_output(i_blatOutputFilename,
                                        i_blatOutputFormat, i_debug)

    for (vcfLine, blatHitsDict) in izip(i_vcfGenerator, i_blatGenerator):

        if (i_debug):
            logging.debug("VCF Line=%s", vcfLine)
            logging.debug("Len Blat Hits=%s", len(blatHitsDict))

        # parse the VCF line
        splitLine = vcfLine.split("\t")

        # the coordinate is the second element
        vcfChr = splitLine[0]
        vcfStopCoordinate = int(splitLine[1])
        vcfIds = splitLine[2]
        vcfRef = splitLine[3]
        vcfAlts = splitLine[4]
        vcfScore = splitLine[5]
        vcfFilterSet = set(splitLine[6].split(";"))
        vcfInfoList = splitLine[7].split(";")
        vcfInfoDict = collections.defaultdict(list)
        for info in vcfInfoList:
            keyValueList = info.split("=")
            # some keys are just singular without a value (e.g. DB, etc.)
            if (len(keyValueList) == 1):
                vcfInfoDict[keyValueList[0]] = ["True"]
            else:
                # the value can be a comma separated list
                vcfInfoDict[keyValueList[0]] = keyValueList[1].split(",")
        vcfRestOfLine = "\t".join(splitLine[8:])

        modTypes = vcfInfoDict["MT"]
        modTypeFilters = dict()
        atLeastOnePass = False
        for modType in modTypes:

            blatOverallReadDepth = 0
            numValidReads = 0

            prefix = ""
            if (modType == "GERM" and i_blatDnaNormalReads):
                prefix = "dnaNormal"
            elif (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                prefix = "rnaNormal"
            elif (modType == "SOM" and i_blatDnaTumorReads):
                prefix = "dnaTumor"
            elif ((modType == "SOM" or modType == "TUM_EDIT")
                  and i_blatRnaTumorReads):
                prefix = "rnaTumor"

            # get the expected prefix
            vcfKey = "_".join([prefix, vcfChr, str(vcfStopCoordinate)])

            # for each read, investigate the blat
            # hits to see if this read is valid
            for (readId, blatHitList) in blatHitsDict.iteritems():
                if (i_debug):
                    logging.debug("num of blat hits for read %s=%s", readId,
                                  len(blatHitList))

                # if the readId does not start with the vcfKey,
                # then something is wrong. the VCF and blat hits
                # need to be in sync...
                if (not readId.startswith(vcfKey)):
                    logging.error("The blat query seems to be out of sync " +
                                  "with the blat hits.")
                    logging.error("VCF Line=%s", vcfLine)
                    logging.error("readId=%s, blatHitsDict=%s", readId,
                                  blatHitsDict[readId][1])
                    sys.exit(1)

                blatOverallReadDepth += 1

                # find out if the read is valid or if it
                # maps to other places in the genome
                if (i_blatOutputFormat == "PSL"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None)
                            and (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList, vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            i_rnaIncludeSecondaryAlignments, i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList, [vcfChr], [vcfStopCoordinate],
                            i_rnaIncludeSecondaryAlignments, i_debug)

                elif (i_blatOutputFormat == "BLAST"):
                    # if we should process the transcripts
                    if ((i_transcriptNameTag is not None)
                            and (i_transcriptNameTag in vcfInfoDict)):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList, vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude, i_debug)
                    else:
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList, [vcfChr], [vcfStopCoordinate],
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude, i_debug)

                # if we have only one valid blat hit, then the read doesn't
                # map to other places in the genome very well, so let's use it
                if (isValidRead):
                    numValidReads += 1

                    if (i_debug):
                        logging.debug("ValidRead: %s", validRead)
                elif (i_debug):
                    logging.debug("not a valid read")

            if (blatOverallReadDepth > 0):
                tmpAltPct = numValidReads / float(blatOverallReadDepth)
                altPercent = round(tmpAltPct, 2)
            else:
                altPercent = 0.0

            if (numValidReads < i_minReadDepth
                    or altPercent < i_minReadPercent):
                modTypeFilters[modType] = "blat"
            else:
                modTypeFilters[modType] = "PASS"
                atLeastOnePass = True

            if (i_debug):
                logging.debug(
                    "blatOverallReadDepth=%s, numValidReads=%s, " +
                    "altPercent=%s", str(blatOverallReadDepth),
                    str(numValidReads), str(altPercent))
                logging.debug("modType=%s, passed? %s", modType,
                              modTypeFilters[modType])
                logging.debug("blatFilter originalDepth=%s, validBlatDepth=%s",
                              str(blatOverallReadDepth), str(numValidReads))

        # make a copy of the list to manipulate
        modTypesTmpList = list(modTypes)
        modChanges = vcfInfoDict["MC"]
        # if at least one passed, then remove the ones that didn't
        for (modType, modChange) in izip(modTypes, modChanges):
            # if at least one passed, then remove the ones that didn't
            if (atLeastOnePass):
                if (modTypeFilters[modType] == "blat"):
                    modTypesTmpList.remove(modType)
                    modChanges.remove(modChange)

        # set the modTypes and modChanges
        vcfInfoDict["MT"] = modTypesTmpList
        vcfInfoDict["MC"] = modChanges

        # if at least one passed, then set pass
        if (atLeastOnePass):
            vcfFilterSet = ["PASS"]
        else:
            # if the user wants to keep the previous filters
            if (i_keepPreviousFiltersFlag):
                # if the call previous passed, then just set blat
                if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                    vcfFilterSet = ["blat"]
                # otherwise, add it to the previous filters
                else:
                    vcfFilterSet.add("blat")
            # otherwise, just set the blat filter
            else:
                vcfFilterSet = ["blat"]

            # update the mod filters
            modTypes = vcfInfoDict["MT"]
            modChanges = vcfInfoDict["MC"]
            origins = vcfInfoDict["ORIGIN"]
            if vcfInfoDict["MF"] is None:
                modFilters = []
            else:
                modFilters = vcfInfoDict["MF"]
            if vcfInfoDict["MFT"] is None:
                modFilterTypes = []
            else:
                modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in izip(modTypes, modChanges):
                    modFilterTypes.append("_".join(
                        [origin, modType, modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

        output = [
            vcfChr,
            str(vcfStopCoordinate), vcfIds, vcfRef, vcfAlts, vcfScore,
            ";".join(vcfFilterSet)
        ]

        # add the modified info dict
        infoField = ""
        for key in sorted(vcfInfoDict.iterkeys()):
            if (len(vcfInfoDict[key]) == 0):
                continue
            elif ("True" in vcfInfoDict[key]):
                infoField += key + ";"
            else:
                infoField += key + "=" + ",".join(vcfInfoDict[key]) + ";"

        output.append(infoField.rstrip(";"))
        output.append(vcfRestOfLine)

        i_outputFileHandler.write("\t".join(output) + "\n")

    stopTime = time.time()
    logging.info(
        "filterByBlat.py for Id %s: Total time=%s hrs, %s mins, " + "%s secs",
        i_id, ((stopTime - startTime) / (3600)), ((stopTime - startTime) / 60),
        (stopTime - startTime))

    # close the files
    if (i_outputFilename is not sys.stdout):
        i_outputFileHandler.close()

    return
Esempio n. 10
0
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName, anIdField,
                  anIncludeId, anIncludeCount, aFilterHeaderLine, aBinSize,
                  anIsDebug):
    '''
    ' This function reads a .bed file and a .vcf file and tags the variants
    ' according to whether they overlap the regions in the .bed file.  Every
    ' VCF event is written to the output; depending on anIncludeOverlapInfo,
    ' either the overlapping or the non-overlapping events are annotated with
    ' the filter name and/or id, and all other events are written unchanged.
    '
    ' For example, a .bed file specifying transcription or exon start and stop
    ' coordinates can be provided along with the --includeOverlaps flag so that
    ' the variants inside these regions are the ones that get annotated.
    ' Conversely, a .bed file specifying areas of the genome that are
    ' accessible (as defined by the 1000 Genomes project) can be given without
    ' the --includeOverlaps flag so that the variants outside of the accessible
    ' genome are the ones that get annotated.
    '
    ' aTCGAId: The TCGA Id for this sample (only used in the log messages)
    ' aChrom: The chromosome being filtered (only used in the log messages)
    ' aBedFilename: A .bed file with at least 3 columns specifying the chrom,
    '    start, and stop coordinates and possibly a 4th column with an id
    ' aVCFFilename: A .vcf file with the variants to process
    ' anOutputFilename: The output file for the processed variants, or None
    '    to write them to STDOUT
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '    (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: When true, the overlapping events are annotated
    '    and the non-overlapping events are output unchanged; when false,
    '    it is the other way around
    ' anIncludeFilterName: A flag specifying whether the filtering name should
    '    be included in the output or not
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in the
    '    output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps should
    '    be included in the output or not
    ' aFilterHeaderLine: A filter header line that is handed to get_vcf_data
    '    together with the output file handler
    ' aBinSize:  The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages
    '''

    # initialize pybed with the filtering file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # get the vcf file
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)
    i_outputFileHandler = None

    # everything that can fail after the first file has been opened runs
    # inside the try so that the handlers are always closed
    try:
        # get the output file, STDOUT is used when no filename is given
        if (anOutputFilename is not None):
            i_outputFileHandler = radiaUtil.get_write_fileHandler(
                anOutputFilename)
            i_outputStream = i_outputFileHandler
        else:
            i_outputStream = sys.stdout

        # create the generator for the vcf file, the output file
        # handler is passed as is (it is None when writing to STDOUT)
        vcfGenerator = get_vcf_data(i_vcfFileHandler, i_outputFileHandler,
                                    aFilterHeaderLine, anIsDebug)

        # initialize some variables
        overlappingEvents = 0
        nonOverlappingEvents = 0
        totalEvents = 0
        startTime = time.time()

        # for each vcf line
        for (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate, vcf_id,
             vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_restLine,
             vcf_line) in (vcfGenerator):

            totalEvents += 1

            if (anIsDebug):
                logging.debug("VCF: %s", vcf_line)

            # check if this vcf coordinate overlaps with the filter coordinates
            posTuple = (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate)
            (isOverlap, idValue,
             count) = filterPybed.overlaps_with(posTuple, anIncludeCount)

            # count the event
            if (isOverlap):
                overlappingEvents += 1
            else:
                nonOverlappingEvents += 1

            # overlapping events are annotated when the overlap info was
            # requested, non-overlapping events are annotated when it was
            # not; everything else is output exactly as it was read
            if (bool(isOverlap) != bool(anIncludeOverlapInfo)):
                i_outputStream.write(vcf_line + "\n")
                continue

            # alter the filter and id name if appropriate
            if (anIncludeFilterName):
                (vcf_filter, vcf_info) = add_filter(
                    vcf_filter, vcf_info, aFilterName, aFilterField,
                    anIncludeCount, count, anIncludeId, anIdField, idValue)

            if (anIncludeId and anIdField == "ID"):
                vcf_id = add_id(vcf_id, idValue)

            # output the annotated event
            outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                          vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info)
            i_outputStream.write("\t".join(outputList) + "\t" +
                                 "\t".join(vcf_restLine) + "\n")

        # each value is the total elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                     aChrom, aTCGAId, ((stopTime - startTime) / (3600)),
                     ((stopTime - startTime) / 60), (stopTime - startTime))

        # sanity check of the counters
        if (overlappingEvents + nonOverlappingEvents == totalEvents):
            logging.info(
                "For chrom %s and Id %s: %s (overlapping events) + "
                "%s (non-overlapping events) = %s", aChrom, aTCGAId,
                overlappingEvents, nonOverlappingEvents, totalEvents)
        else:
            logging.info(
                "filterByPybed Warning: For chrom %s and Id %s: %s "
                "(overlapping events) + %s (non-overlapping events) = %s",
                aChrom, aTCGAId, overlappingEvents, nonOverlappingEvents,
                totalEvents)
    finally:
        # close the files, STDOUT is never closed here
        i_vcfFileHandler.close()
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()
    return
Esempio n. 11
0
def _contains_passing_call(aLine):
    '''
    ' Substring test used by compare_events to decide if a line is a passing
    ' call: the line has to contain "PASS" and at least one of "SOM", "EDIT",
    ' "RNA_TUM_VAR" or "RNA_NOR_VAR".  The whole line is searched, no VCF
    ' columns are parsed.
    '''
    if ("PASS" not in aLine):
        return False
    return ("SOM" in aLine or "EDIT" in aLine or
            "RNA_TUM_VAR" in aLine or "RNA_NOR_VAR" in aLine)


def _first_key_in_line(aKeyList, aLine):
    '''
    ' Return the first key of aKeyList that is a substring of aLine, or None
    ' when none of the keys is found.
    '''
    for key in aKeyList:
        if (key in aLine):
            return key
    return None


def compare_events(aTCGAId, aChrom, aRadiaFilename, aCompareFilename,
                   aStatsFilename, anOverlapFilename, aNonOverlapFilename,
                   aCompareDict, anIsDebug):
    '''
    ' The function compares variants in one file with variants in another
    ' file.  This can be used to compare variants from different methods, MAF
    ' files, or validation files.  At a minimum, the coordinates are compared.
    ' The user can also specify additional comparisons that should be done
    ' such as comparing if the call was classified as somatic in both methods
    ' (e.g. SOM=Somatic).  The keys and values can be comma-separated lists.
    ' For example, a call may be labeled as blacklisted in one file with
    ' "blck" and in another file with "blq" or "bldp", then the comparison
    ' string would be blck=blq,bldp.
    '
    ' All of the tests on the lines ("PASS", "SOM", the compare keys, etc.)
    ' are substring tests on the complete line.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aRadiaFilename: A .vcf file from RADIA
    ' aCompareFilename: A file to compare to
    ' aStatsFilename: A stats file that one tab-separated line of counts is
    '    appended to, or None
    ' anOverlapFilename: A file where all the overlaps are output, or None
    ' aNonOverlapFilename: A file where all the non-overlaps are output,
    '    or None
    ' aCompareDict: A dictionary of key=value to be compared (the coordinate
    '    is always compared)
    ' anIsDebug: A flag for outputting debug messages
    '''

    # load both files, the stats dictionary is handed
    # through both calls and comes back updated
    i_statsDict = collections.defaultdict(int)
    i_filterDict = collections.defaultdict(int)
    (i_radDict, i_statsDict) = get_vcf_data(
        aRadiaFilename, i_statsDict, aCompareDict, "rad", anIsDebug)
    (i_cmpDict, i_statsDict) = get_vcf_data(
        aCompareFilename, i_statsDict, aCompareDict, "cmp", anIsDebug)

    overlapFileHandler = None
    nonOverlapFileHandler = None

    # the output files are opened inside the try so that
    # they are closed again even when the comparison fails
    try:
        if (anOverlapFilename is not None):
            overlapFileHandler = radiaUtil.get_write_fileHandler(
                anOverlapFilename)
        if (aNonOverlapFilename is not None):
            nonOverlapFileHandler = radiaUtil.get_write_fileHandler(
                aNonOverlapFilename)

        # initialize some variables
        startTime = time.time()

        # the passing compare events without a RADIA call are only reported
        # in the debug log, so there is nothing to scan when debug is off
        # (this is the criteria for comparing BB, Radia, or Maf results)
        if (anIsDebug):
            for (cmpCoordinate, cmpLine) in i_cmpDict.iteritems():
                if (cmpCoordinate not in i_radDict and
                        _contains_passing_call(cmpLine)):
                    logging.debug("no radia call %s", cmpLine)

        # for each rad event
        for (radCoordinate, radLine) in i_radDict.iteritems():

            # the coordinate is only in the RADIA file
            if (radCoordinate not in i_cmpDict):
                # only the passing SNP calls are reported as new
                if (_contains_passing_call(radLine) and "SNP" in radLine):
                    if (anIsDebug):
                        logging.debug("new radia call %s", radLine)
                    if (nonOverlapFileHandler is not None):
                        nonOverlapFileHandler.write(radLine + "\n")
                continue

            # the coordinates overlap, so count them
            i_statsDict["overlap_events"] += 1
            compareLine = i_cmpDict[radCoordinate]

            # both lines are passing calls
            # (this is the criteria for Radia and validation)
            if (_contains_passing_call(radLine) and
                    _contains_passing_call(compareLine)):
                i_statsDict["overlap_pass_events"] += 1

            # for each key to compare
            # there can be multiple keys for one filter
            # such as blq and bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
                # break up the strings and search for one of the keys
                radKey = _first_key_in_line(radKeyString.split(","), radLine)
                cmpKey = _first_key_in_line(cmpKeyString.split(","),
                                            compareLine)
                foundInRad = (radKey is not None)
                foundInCmp = (cmpKey is not None)

                # if the keys exist in both files at the same position
                if (foundInRad and foundInCmp):
                    # only the passing compare calls are counted
                    if ("PASS" not in compareLine):
                        continue

                    # the counters are stored under the complete key string,
                    # because that is the key that the stats output below
                    # reads; using the single matched key would leave the
                    # counts of comma-separated keys at 0 in the output
                    i_statsDict["overlap_" + radKeyString] += 1

                    if ("PASS" in radLine):
                        i_statsDict["overlap_pass_" + radKeyString] += 1
                        if (anIsDebug):
                            logging.debug("found call %s", compareLine)
                        if (overlapFileHandler is not None):
                            overlapFileHandler.write(radLine + "\n")
                            # we only want to write the line to the overlap
                            # file once even if it matches as a SOM and an
                            # EDIT
                            # NOTE(review): this also stops the counting of
                            # the remaining compare keys for this event, but
                            # only when an overlap file is written - confirm
                            # that this is intended
                            break
                    else:
                        if (anIsDebug):
                            logging.debug("found but no radia pass %s %s",
                                          radLine, compareLine)

                        # count the filters (7th column)
                        # that prevented the RADIA pass
                        for filterKey in radLine.split("\t")[6].split(";"):
                            i_filterDict[filterKey] += 1

                        if (nonOverlapFileHandler is not None):
                            nonOverlapFileHandler.write(radLine + "\n")

                elif (anIsDebug and foundInRad):
                    logging.debug(
                        "overlap but not found in compare file %s %s %s",
                        radKey, radLine, compareLine)
                elif (anIsDebug and foundInCmp):
                    logging.debug("overlap but not found in RADIA %s %s %s",
                                  cmpKey, radLine, compareLine)
                elif (anIsDebug):
                    logging.debug("overlap but not same type %s %s %s %s",
                                  radKeyString.split(","),
                                  cmpKeyString.split(","),
                                  radLine, compareLine)

        # output columns: aTCGAId, aChrom, rad_events, cmp_events,
        # overlap_events, rad_pass_events, cmp_pass_events,
        # overlap_pass_events and then 6 columns for each compare key
        outputList = [aTCGAId, aChrom]
        outputList += [str(i_statsDict["rad_events"]),
                       str(i_statsDict["cmp_events"]),
                       str(i_statsDict["overlap_events"])]
        outputList += [str(i_statsDict["rad_pass_events"]),
                       str(i_statsDict["cmp_pass_events"]),
                       str(i_statsDict["overlap_pass_events"])]

        # for each key to compare, get the
        # total radias, total cmps, and overlaps
        for radKey in sorted(aCompareDict.iterkeys()):
            cmpKey = aCompareDict[radKey]
            outputList += [str(i_statsDict["rad_" + radKey]),
                           str(i_statsDict["cmp_" + cmpKey]),
                           str(i_statsDict["overlap_" + radKey])]
            outputList += [str(i_statsDict["rad_pass_" + radKey]),
                           str(i_statsDict["cmp_pass_" + cmpKey]),
                           str(i_statsDict["overlap_pass_" + radKey])]

        for (filterKey, count) in i_filterDict.iteritems():
            logging.debug("filter: %s\t%s", filterKey, count)

        # append the counts to the stats file
        if (aStatsFilename is not None):
            i_statsFileHandler = radiaUtil.get_append_fileHandler(
                aStatsFilename)
            try:
                i_statsFileHandler.write("\t".join(outputList) + "\n")
            finally:
                i_statsFileHandler.close()

        # each value is the total elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                     aChrom, aTCGAId, ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60), (stopTime-startTime))
        logging.info("\t".join(outputList))
    finally:
        # close the files
        if (overlapFileHandler is not None):
            overlapFileHandler.close()
        if (nonOverlapFileHandler is not None):
            nonOverlapFileHandler.close()

    return
Esempio n. 12
0
def main():
    '''
    ' Command-line entry point that merges the calls from a "passing" VCF
    ' file back into the original VCF file.
    '
    ' The header sections returned by get_vcf_data() for both files are
    ' combined: the original file's lines come first, followed by any info
    ' and filter lines from the passing file that the original file does not
    ' already contain.  The chromLine of the original file is written next,
    ' and then every coordinate of the original file is written in numerical
    ' order.  Whenever a coordinate is also present in the passing file, the
    ' line from the passing file replaces the original line.  Coordinates
    ' that only exist in the passing file are not written.
    '
    ' Positional command-line arguments:
    ' passingFile:     The file with the calls that should take precedence
    ' originalFile:    The file that defines which coordinates are output
    ' outputFile:      The file that the merged result is written to
    '
    ' The function exits with status 1 when the arguments are incomplete or
    ' radiaUtil.check_for_argv_errors() reports a problem, and raises a
    ' ValueError when the requested log level is not a valid logging level.
    '''

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog passingFile originalFile outputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log", default="WARNING",
        dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # coarse check on the raw number of command-line tokens
    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw count above can not tell options from positional arguments,
    # so make sure that all three required parameters are really there
    # instead of failing with an IndexError below
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    i_passingFilename = i_cmdLineArgs[0]
    i_originalFilename = i_cmdLineArgs[1]
    i_outputFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the value here: exceptions do not apply %-formatting
        # to their arguments the way the logging calls do
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("passingFile=%s", i_passingFilename)
        logging.debug("originalFile=%s", i_originalFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    i_readFilenameList = [i_passingFilename, i_originalFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # load the passing file
    (passHeaderList,
     chromLine,
     passInfoList,
     passFilterList,
     passCoordinateDict) = get_vcf_data(i_passingFilename, i_debug)

    # load the original file, the chromLine of the original file
    # deliberately replaces the one from the passing file
    (orgHeaderList,
     chromLine,
     orgInfoList,
     orgFilterList,
     orgCoordinateDict) = get_vcf_data(i_originalFilename, i_debug)

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure the output file is closed even if one of the writes fails
    try:
        for headerLine in orgHeaderList:
            outputFileHandler.write(headerLine + "\n")

        # the original info lines followed by the new ones
        for headerLine in orgInfoList:
            outputFileHandler.write(headerLine + "\n")

        for headerLine in passInfoList:
            if (headerLine not in orgInfoList):
                outputFileHandler.write(headerLine + "\n")

        # the original filter lines followed by the new ones
        for headerLine in orgFilterList:
            outputFileHandler.write(headerLine + "\n")

        for headerLine in passFilterList:
            if (headerLine not in orgFilterList):
                outputFileHandler.write(headerLine + "\n")

        outputFileHandler.write(chromLine + "\n")

        # sorted() works on the keys of the dictionary in both Python 2
        # and Python 3, the coordinates are compared numerically
        for coordinate in sorted(orgCoordinateDict, key=int):
            if (coordinate in passCoordinateDict):
                line = passCoordinateDict[coordinate]
            else:
                line = orgCoordinateDict[coordinate]
            # no newline is added here, presumably get_vcf_data() keeps the
            # line terminator on the coordinate lines - TODO confirm
            outputFileHandler.write(line)

        stopTime = time.time()
        logging.info("Total time=%s hrs, %s mins, %s secs",
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))
    finally:
        # close the files
        outputFileHandler.close()

    return
Esempio n. 13
0
def compare_events(anId, aChrom, aRadiaFilename, aCompareFilename,
                   aStatsFilename, anOverlapFilename, aNonOverlapFilename,
                   aCompareDict, anIsDebug):
    '''
    ' The function compares variants in one file with variants in another file.
    ' This can be used to compare variants from different methods, MAF files,
    ' or validation files.  At a minimum, the coordinates are compared.  The
    ' user can also specify additional comparisons that should be done such as
    ' comparing if the call was classified as somatic in both methods
    ' (e.g. SOM=Somatic).  The keys and values can be comma-separated lists.
    ' For example, a call may be labeled as blacklisted in one file with "blck"
    ' and in another file with "blq" or "bldp", then the comparison string
    ' would be blck=blq,bldp.
    '
    ' A call counts as a passing variant when its line contains "PASS" and at
    ' least one of "SOM", "EDIT", "RNA_TUM_VAR" or "RNA_NOR_VAR".  All tests
    ' are plain substring tests on the whole line.
    '
    ' One tab-separated summary line (Id, chrom, event counts, and six counts
    ' per comparison key) is appended to the stats file and logged at the
    ' INFO level.  Nothing is returned.
    '
    ' anId:                   The Id for this sample
    ' aChrom:                 The chromosome being filtered
    ' aRadiaFilename:         A .vcf file from RADIA
    ' aCompareFilename:       A file to compare to
    ' aStatsFilename:         A stats file that is appended to, or None
    ' anOverlapFilename:      A file where all the overlaps are output, or None
    ' aNonOverlapFilename:    A file where all the non-overlaps are output,
    '                         or None
    ' aCompareDict:           A dictionary of key=value to be compare
    ' anIsDebug:              A flag for outputting debug messages to STDERR
    '''

    def isPassingVariant(aLine):
        # a passing call of one of the variant types that are compared
        return ("PASS" in aLine and
                ("SOM" in aLine or "EDIT" in aLine or
                 "RNA_TUM_VAR" in aLine or "RNA_NOR_VAR" in aLine))

    # load the calls of both files, the "rad_*" and "cmp_*" counters that
    # are reported below are presumably filled in by get_vcf_data()
    # NOTE: notes in earlier revisions mention get_maf_data(),
    # get_validation_data() and get_simulation_data() as drop-in
    # replacements for other input types
    i_statsDict = collections.defaultdict(int)
    i_filterDict = collections.defaultdict(int)
    (i_radDict, i_statsDict) = get_vcf_data(aRadiaFilename, i_statsDict,
                                            aCompareDict, "rad", anIsDebug)
    (i_cmpDict, i_statsDict) = get_vcf_data(aCompareFilename, i_statsDict,
                                            aCompareDict, "cmp", anIsDebug)

    overlapFileHandler = None
    nonOverlapFileHandler = None

    # initialize some variables
    startTime = time.time()

    # make sure the output files are closed even if the comparison fails
    try:
        if (anOverlapFilename is not None):
            overlapFileHandler = radiaUtil.get_write_fileHandler(
                                                        anOverlapFilename)
        if (aNonOverlapFilename is not None):
            nonOverlapFileHandler = radiaUtil.get_write_fileHandler(
                                                        aNonOverlapFilename)

        # the calls that are only in the compare file are just reported
        # in the debug output, so skip the pass over them otherwise
        if (anIsDebug):
            for cmpCoordinate in i_cmpDict:
                cmpLine = i_cmpDict[cmpCoordinate]
                if ((cmpCoordinate not in i_radDict) and
                        isPassingVariant(cmpLine)):
                    logging.debug("no radia call %s", cmpLine)

        # for each rad event
        for radCoordinate in i_radDict:
            radLine = i_radDict[radCoordinate]

            # a call that is only in the RADIA file
            if (radCoordinate not in i_cmpDict):
                if ("SNP" in radLine and isPassingVariant(radLine)):
                    if (anIsDebug):
                        logging.debug("new radia call %s", radLine)
                    if (aNonOverlapFilename is not None):
                        nonOverlapFileHandler.write(radLine + "\n")
                continue

            # the coordinates overlap, so count them
            i_statsDict["overlap_events"] += 1
            compareLine = i_cmpDict[radCoordinate]

            if (isPassingVariant(radLine) and isPassingVariant(compareLine)):
                i_statsDict["overlap_pass_events"] += 1

            # for each key to compare
            # there can be multiple keys for one filter
            # such as blq and bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.items():
                # break up the strings to get the individual keys
                radKeyList = radKeyString.split(",")
                cmpKeyList = cmpKeyString.split(",")

                # the first key of each list that occurs in the line
                radKey = next((key for key in radKeyList
                               if key in radLine), None)
                cmpKey = next((key for key in cmpKeyList
                               if key in compareLine), None)
                foundInRad = (radKey is not None)
                foundInCmp = (cmpKey is not None)

                # if the keys exist in both files at the
                # same position, then count them
                if (foundInRad and foundInCmp):
                    # the counters are keyed by the complete key string,
                    # because that is the key that is used when the summary
                    # is built below; keying them by the individual key that
                    # matched lost the counts of comma-separated key lists
                    i_statsDict["overlap_" + radKeyString] += 1

                    if ("PASS" in radLine):
                        i_statsDict["overlap_pass_" + radKeyString] += 1
                        if (anIsDebug):
                            logging.debug("found call %s", compareLine)
                        if (anOverlapFilename is not None):
                            overlapFileHandler.write(radLine + "\n")
                            # we only want to write the line to the
                            # overlap file once even if it matches
                            # as a SOM and an EDIT, note that this also
                            # stops the counting for the remaining keys
                            break
                    else:
                        if (anIsDebug):
                            logging.debug("found but no radia pass %s %s",
                                          radLine, compareLine)

                        # count the filters (7th column) that
                        # kept the RADIA call from passing
                        filterString = radLine.split("\t")[6]
                        for filterKey in filterString.split(";"):
                            i_filterDict[filterKey] += 1

                        if (aNonOverlapFilename is not None):
                            nonOverlapFileHandler.write(radLine + "\n")

                elif (anIsDebug):
                    if (foundInRad):
                        logging.debug("overlap but not found in compare " +
                                      "file %s %s %s",
                                      radKey, radLine, compareLine)
                    elif (foundInCmp):
                        logging.debug("overlap but not found in RADIA " +
                                      "%s %s %s",
                                      cmpKey, radLine, compareLine)
                    else:
                        logging.debug("overlap but not same type " +
                                      "%s %s %s %s",
                                      radKeyList, cmpKeyList,
                                      radLine, compareLine)
    finally:
        if (overlapFileHandler is not None):
            overlapFileHandler.close()
        if (nonOverlapFileHandler is not None):
            nonOverlapFileHandler.close()

    # anId, aChrom, rad_events, cmp_events, overlap_events,
    # rad_pass_events, cmp_pass_events, overlap_pass_events,
    # [rad_key, cmp_key, overlap_key,
    #  rad_pass_key, cmp_pass_key, overlap_pass_key]{n}
    outputList = [anId, aChrom]
    outputList += [str(i_statsDict["rad_events"]),
                   str(i_statsDict["cmp_events"]),
                   str(i_statsDict["overlap_events"])]
    outputList += [str(i_statsDict["rad_pass_events"]),
                   str(i_statsDict["cmp_pass_events"]),
                   str(i_statsDict["overlap_pass_events"])]

    # for each key to compare, get the total radias, total cmps, and overlaps
    for radKey in sorted(aCompareDict):
        cmpKey = aCompareDict[radKey]
        outputList += [str(i_statsDict["rad_" + radKey]),
                       str(i_statsDict["cmp_" + cmpKey]),
                       str(i_statsDict["overlap_" + radKey])]
        outputList += [str(i_statsDict["rad_pass_" + radKey]),
                       str(i_statsDict["cmp_pass_" + cmpKey]),
                       str(i_statsDict["overlap_pass_" + radKey])]

    for (filterKey, count) in i_filterDict.items():
        logging.debug("filter: %s\t%s", filterKey, count)

    # append the summary to the stats file
    if (aStatsFilename is not None):
        i_statsFileHandler = radiaUtil.get_append_fileHandler(aStatsFilename)
        try:
            i_statsFileHandler.write("\t".join(outputList) + "\n")
        finally:
            i_statsFileHandler.close()

    stopTime = time.time()
    logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs",
                 aChrom, anId, ((stopTime-startTime)/(3600)),
                 ((stopTime-startTime)/60), (stopTime-startTime))
    logging.info("\t".join(outputList))

    return
Esempio n. 14
0
def main():
    '''
    ' Command-line entry point that merges the per-chromosome data of one Id
    ' into a single VCF file.
    '
    ' The header and the coordinate lines are loaded with get_vcf_data() from
    ' the input directory.  The output file is named <id>.vcf, or <id>.vcf.gz
    ' when --gzip is given, and is created in the output directory.  The
    ' header sections are written in the order metadata, filter, info, format
    ' and chrom, followed by the lines of the chromosomes with numerical
    ' names in numerical order and then the lines of the remaining
    ' chromosomes in alphabetical order.
    '
    ' Positional command-line arguments:
    ' id:           The Id that is passed to get_vcf_data() and that is used
    '               as the name of the output file
    ' inputDir:     The directory that the data is read from
    ' outputDir:    The directory that the merged file is written to
    '
    ' The function exits with status 1 when the arguments are incomplete or
    ' radiaUtil.check_for_argv_errors() reports a problem, and raises a
    ' ValueError when the requested log level is not a valid logging level.
    '
    ' command for running this on a small test case:
    ' python mergeChroms.py TCGA-BH-A18P
    ' ../data/test/ ../data/test/ --log=DEBUG
    '''

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id inputDir outputDir [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    # this option only has a long form
    i_cmdLineParser.add_option(
        "--gzip",
        dest="gzip", action="store_true", default=False,
        help="include this argument if the final VCF should be " +
             "compressed with gzip")

    # coarse check on the raw number of command-line tokens
    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the raw count above can not tell options from positional arguments,
    # so make sure that all three required parameters are really there
    # instead of failing with an IndexError below
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    i_id = i_cmdLineArgs[0]
    i_inputDir = i_cmdLineArgs[1]
    i_outputDir = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_gzip = i_cmdLineOptions.gzip

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the value here: exceptions do not apply %-formatting
        # to their arguments the way the logging calls do
        raise ValueError(
            "Invalid log level: '%s' must be one of the "
            "following:  DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("inputDir=%s", i_inputDir)
        logging.debug("outputDir=%s", i_outputDir)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("gzip=%s", i_gzip)

    # check for any errors
    i_readFilenameList = None
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]
    else:
        i_writeFilenameList = None
    i_dirList = [i_inputDir, i_outputDir]

    if (not radiaUtil.check_for_argv_errors(i_dirList, i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # load the header and the coordinate lines
    (headerDict, coordDict) = get_vcf_data(i_id, i_inputDir, i_debug)

    # presumably the file handler compresses the output based on the
    # ".gz" extension - TODO confirm in radiaUtil
    if (i_gzip):
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf.gz")
    else:
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf")

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure the output file is closed even if one of the writes fails
    try:
        # if we have header info to output
        if (len(headerDict["metadata"]) > 0):
            # output the header information, sections without
            # any lines are skipped so that no blank lines end
            # up in the header of the merged file
            for section in ("metadata", "filter", "info", "format"):
                sectionLines = headerDict[section]
                if (len(sectionLines) > 0):
                    outputFileHandler.write("\n".join(sectionLines) + "\n")
            outputFileHandler.write("".join(headerDict["chrom"]) + "\n")

        # first output the numerical chroms in order, sorted() works
        # on the keys of the dictionary in both Python 2 and Python 3
        numbersDict = coordDict["numbers"]
        for chrom in sorted(numbersDict, key=int):
            outputFileHandler.write("\n".join(numbersDict[chrom]) + "\n")

        # then output the alphabetical chroms in order
        lettersDict = coordDict["letters"]
        for chrom in sorted(lettersDict, key=str):
            outputFileHandler.write("\n".join(lettersDict[chrom]) + "\n")

        stopTime = time.time()
        logging.info("Total time for Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs",
                     i_id, ((stopTime - startTime) / (3600)),
                     ((stopTime - startTime) / 60), (stopTime - startTime))
    finally:
        # close the files
        outputFileHandler.close()

    return