def main():
    '''
    ' Command line entry point: reads a VCF file and tags the calls whose
    ' snpEff "EFF" annotation names a gene on the RNA gene blacklist (filter
    ' "rgene") or a transcript biotype on the RNA gene family blacklist
    ' (filter "rgfam").  All other lines are passed through unchanged.
    '
    ' Positional arguments: vcfFile rnaGeneFile rnaGeneFamilyFile
    ' Options: -o/--outputFilename, -l/--log, -g/--logFilename,
    '          -c/--allVCFCalls
    '
    ' The output goes to the file given with -o, or to STDOUT when -o is
    ' not specified.  The process exits with status 1 when the command line
    ' is incomplete or radiaUtil.check_for_argv_errors() reports a problem.
    ' A ValueError is raised for an unknown logging level.
    '''

    # create the usage statement
    usage = "usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile [Opts]"
    i_cmdLineParser = OptionParser(usage=usage)

    # no default here:  a missing -o is detected as None below and means
    # STDOUT (a sys.stdout default used to be turned into a string and was
    # then mistaken for a filename)
    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts option tokens as well, so make sure
    # that all three positional arguments are really there before indexing
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    # (None means "not specified" for both of them)
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message (it used to be passed as
        # a second exception argument and was never formatted)
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors
    # (append, so that the log file does not replace the output file)
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename,
                          i_rnaGeneFilename,
                          i_rnaGeneFamilyFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # the pattern and the ignore list are the same for every line,
    # so build them once instead of once per VCF call
    effectRegEx = re.compile("(\\w).*\\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    # open the input stream
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)
    i_outputFileHandler = None

    try:
        # open the output stream
        if (i_outputFilename is not None):
            i_outputFileHandler = radiaUtil.get_write_fileHandler(
                i_outputFilename)
        else:
            i_outputFileHandler = sys.stdout

        # get the RNA gene blacklists
        (i_rnaGeneList,
         i_rnaGeneFamilyList) = get_rna_genes(i_rnaGeneFilename,
                                              i_rnaGeneFamilyFilename,
                                              i_debug)

        hasAddedFilterHeader = False

        for line in i_vcfFileHandler:

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue
            if (line.isspace()):
                continue

            # if we find the FILTER section, then add the filters from here
            elif ((not hasAddedFilterHeader) and
                  (line.startswith("##FILTER"))):
                hasAddedFilterHeader = True
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgene,Description=\"This gene is on the " +
                    "RNA gene blacklist\">\n")
                i_outputFileHandler.write(
                    "##FILTER=<ID=rgfam,Description=\"This gene family is on " +
                    "the RNA gene family blacklist\">\n")
                i_outputFileHandler.write(line)

            # these lines are from previous scripts in the pipeline,
            # so output them
            elif (line.startswith("#")):
                i_outputFileHandler.write(line)

            # if we are only supposed to process the passed calls
            # and this call has not passed, then output it untouched
            elif (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputFileHandler.write(line)

            # now we are to the data
            else:
                # strip the carriage return and newline characters
                line = line.rstrip("\r\n")

                # split the line on the tab
                splitLine = line.split("\t")

                filterSet = set(splitLine[6].split(";"))

                # if there are no filters so far, then clear the list
                if (len(filterSet) == 1 and "PASS" in filterSet):
                    filterSet = set()

                # parse the info column and create a dict
                infoList = splitLine[7].split(";")
                infoDict = collections.defaultdict(list)
                for info in infoList:
                    keyValueList = info.split("=")
                    # some keys are just singular without a value
                    # (e.g. DB, etc.)
                    if (len(keyValueList) == 1):
                        infoDict[keyValueList[0]] = ["True"]
                    else:
                        # the value can be a comma separated list
                        infoDict[keyValueList[0]] = keyValueList[1].split(",")

                # a missing EFF key yields an empty list (defaultdict)
                effectList = infoDict["EFF"]

                isRnaBlacklistGene = False
                isRnaBlacklistGeneFamily = False

                for rawEffect in effectList:
                    rawEffect = rawEffect.rstrip(")")

                    # the pattern is greedy up to the last "(", so there is
                    # at most one match per effect
                    match = effectRegEx.search(rawEffect)
                    if (match is None):
                        # without the leading "effect(" part there is
                        # nothing that can be classified
                        logging.warning("Skipping unparsable EFF entry: %s",
                                        rawEffect)
                        continue

                    effect = match.group()
                    rawEffect = rawEffect.replace(effect, "")
                    effect = effect.rstrip("(")

                    if (effect in ignoreEffectsList):
                        continue

                    effectParts = rawEffect.split("|")
                    # effectImpact = effectParts[0]
                    # functionalClass = effectParts[1]
                    # codonChange = effectParts[2]
                    # aaChange = effectParts[3]
                    # aaLength = effectParts[4]
                    geneName = effectParts[5]
                    transcriptBiotype = effectParts[6]
                    # geneCoding = effectParts[7]
                    # ensembleId = effectParts[8]
                    # exonNumber = effectParts[9]
                    # genotypeNumber = effectParts[10]

                    # the RNA gene list can have "RP11" and that
                    # should filter out any gene with RP11 in it
                    for rnaGene in i_rnaGeneList:
                        if (rnaGene in geneName):
                            isRnaBlacklistGene = True
                            break

                    if (transcriptBiotype in i_rnaGeneFamilyList):
                        isRnaBlacklistGeneFamily = True

                output = ["\t".join(splitLine[0:6])]

                # if the filter should be applied
                if (isRnaBlacklistGene):
                    filterSet.add("rgene")

                # if the filter should be applied
                if (isRnaBlacklistGeneFamily):
                    filterSet.add("rgfam")

                # if there are no filters so far, then this call passes
                if (len(filterSet) == 0):
                    filterSet.add("PASS")

                output.append(";".join(filterSet))
                output.append("\t".join(splitLine[7:]))

                # the handler is either the output file or STDOUT
                i_outputFileHandler.write("\t".join(output) + "\n")

    finally:
        # close the files, even when an error interrupted the processing
        i_vcfFileHandler.close()
        if (i_outputFilename is not None and
                i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
def main():
    '''
    ' Command line entry point: for each VCF call returned by get_vcf_data(),
    ' hands the call to write_to_blat_file() once per selected read source
    ' (normal/tumor DNA, normal/tumor RNA) so that the contributing reads
    ' end up in a BLAT query file.
    '
    ' Positional arguments: id vcfFile headerFile
    ' Options: see the add_option() calls below.
    '
    ' The process exits with status 1 when the command line is incomplete
    ' or radiaUtil.check_for_argv_errors() reports a problem.  A ValueError
    ' is raised for an unknown logging level.
    '''

    # command for running this on a small test case:
    # python createBlatFile.py TCGA-00-4454 7
    #   ../data/test/TCGA-00-4454_EGFR.vcf ../data/test/tmp/
    #   --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id vcfFile headerFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters "
             "thus far are processed, include this argument if all of the "
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-b", "--allReadBases", action="store_false", default=True,
        dest="altBasesOnly",
        help="by default only the reads with the alternate base are "
             "processed, include this argument if all of the reads should "
             "be processed")
    i_cmdLineParser.add_option(
        "-d", "--maxReadDepth", type="int", default=int(8000),
        dest="maxReadDepth", metavar="MAX_READ_DEPTH",
        help="the maximum read depth to process from the samtools view "
             "command, %default by default")

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), "
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDOUT by default")

    i_cmdLineParser.add_option(
        "", "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript coordinate can "
             "be found")
    i_cmdLineParser.add_option(
        "", "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "", "--rnaIncludeSecondaryAlignments",
        action="store_true", default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may "
             "want to include RNA secondary alignments in the samtools "
             "mpileups")

    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads", action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads should be "
             "processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads", action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads should be "
             "processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads", action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads should be "
             "processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads", action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads should be "
             "processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 22, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts option tokens as well, so make sure
    # that all three positional arguments are really there before indexing
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_vcfFilename = i_cmdLineArgs[1]
    i_headerFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_altBasesOnlyFlag = i_cmdLineOptions.altBasesOnly
    i_maxReadDepth = i_cmdLineOptions.maxReadDepth
    i_logLevel = i_cmdLineOptions.logLevel
    i_rnaIncludeSecondaryAlignments = (
        i_cmdLineOptions.rnaIncludeSecondaryAlignments)

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_readFilenameList = [i_vcfFilename, i_headerFilename]
    i_writeFilenameList = []

    i_logFilename = None
    i_outputFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)
        i_writeFilenameList += [i_logFilename]
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
        i_writeFilenameList += [i_outputFilename]

    # these options have no default, so they are None when not specified
    i_transcriptNameTag = i_cmdLineOptions.transcriptNameTag
    i_transcriptCoordinateTag = i_cmdLineOptions.transcriptCoordinateTag
    i_transcriptStrandTag = i_cmdLineOptions.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message (it used to be passed as
        # a second exception argument and was never formatted)
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug flag
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("headerFilename=%s", i_headerFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("altBasesOnlyFlag? %s", i_altBasesOnlyFlag)
        logging.debug("maxReadDepth %s", i_maxReadDepth)
        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaIncludeSecondaryAlignments=%s",
                      i_rnaIncludeSecondaryAlignments)
        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream;  the handler stays None when no output file
    # was given and is passed on to write_to_blat_file() like that
    i_outputFileHandler = None
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(
            i_outputFilename)

    try:
        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                      i_headerFilename,
                                      i_passedVCFCallsOnlyFlag,
                                      i_debug)

        # for each VCF call that should be investigated
        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore,
             vcfFilterSet, vcfInfoDict, restOfLine,
             vcfParamsDict) in i_vcfGenerator:

            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s",
                              vcfChr, str(vcfStopCoordinate), vcfId, vcfRef,
                              vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            # the transcript coordinates are only used when the tag was
            # given and this call carries it in its INFO
            useTranscripts = ((i_transcriptNameTag is not None) and
                              (i_transcriptNameTag in vcfInfoDict))

            modTypes = vcfInfoDict["MT"]
            for modType in modTypes:

                # get the reads contributing to a call and
                # put them in a blat query file
                if (i_blatDnaNormalReads):
                    write_to_blat_file(i_outputFileHandler,
                                       vcfChr, vcfStopCoordinate,
                                       [vcfChr], [vcfStopCoordinate], [None],
                                       vcfParamsDict, vcfInfoDict,
                                       "dnaNormal", i_altBasesOnlyFlag,
                                       False, i_maxReadDepth, i_debug)

                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    # if we should process the transcripts
                    if (useTranscripts):
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr, vcfStopCoordinate,
                            vcfInfoDict[i_transcriptNameTag],
                            vcfInfoDict[i_transcriptCoordinateTag],
                            vcfInfoDict[i_transcriptStrandTag],
                            vcfParamsDict, vcfInfoDict,
                            "rnaNormal", i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth, i_debug)
                    else:
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr, vcfStopCoordinate,
                            [vcfChr], [vcfStopCoordinate], [None],
                            vcfParamsDict, vcfInfoDict,
                            "rnaNormal", i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth, i_debug)

                if (i_blatDnaTumorReads):
                    write_to_blat_file(i_outputFileHandler,
                                       vcfChr, vcfStopCoordinate,
                                       [vcfChr], [vcfStopCoordinate], [None],
                                       vcfParamsDict, vcfInfoDict,
                                       "dnaTumor", i_altBasesOnlyFlag,
                                       False, i_maxReadDepth, i_debug)

                if ((modType == "SOM" or modType == "TUM_EDIT") and
                        i_blatRnaTumorReads):
                    # if we should process the transcripts
                    if (useTranscripts):
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr, vcfStopCoordinate,
                            list(vcfInfoDict[i_transcriptNameTag]),
                            vcfInfoDict[i_transcriptCoordinateTag],
                            vcfInfoDict[i_transcriptStrandTag],
                            vcfParamsDict, vcfInfoDict,
                            "rnaTumor", i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth, i_debug)
                    else:
                        write_to_blat_file(
                            i_outputFileHandler,
                            vcfChr, vcfStopCoordinate,
                            [vcfChr], [vcfStopCoordinate], [None],
                            vcfParamsDict, vcfInfoDict,
                            "rnaTumor", i_altBasesOnlyFlag,
                            i_rnaIncludeSecondaryAlignments,
                            i_maxReadDepth, i_debug)

        # each value is the total elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("createBlatFile.py Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", i_id,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))

    finally:
        # close the files, even when an error interrupted the processing
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName,
                  anIdField, anIncludeId, anIncludeCount,
                  aFilterHeaderLine, aBinSize, anIsDebug):
    '''
    ' This function reads from a .bed file and a .vcf file line by line and
    ' looks for variants that should be filtered or tagged.  The .bed file
    ' specifies coordinates for areas where variants should either be included
    ' or excluded.  For example, a .bed file specifying transcription or exon
    ' start and stop coordinates can be provided along with the
    ' --includeOverlaps flag to indicate that the variants in these regions
    ' should be kept, and variants outside of these regions should be flagged
    ' or filtered out.  Conversely, a bed file specifying areas of the genome
    ' that are accessible (as defined by the 1000 Genomes project) can be given
    ' without the --includeOverlaps flag to indicate that the variants outside
    ' of the accessible genome should be flagged or filtered out, and variants
    ' overlapping the accessible regions should not be flagged or filtered out.
    '
    ' Every variant is written out.  A variant is annotated when its overlap
    ' status matches anIncludeOverlapInfo (overlapping variants when the flag
    ' is set, non-overlapping variants when it is not);  all other variants
    ' are output exactly as they were read.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aBedFilename: A .bed file with at least 3 columns specifying the chrom,
    '    start, and stop coordinates and possibly a 4th column with an id
    ' aVCFFilename: A .vcf file with variants that will be either
    '    included or excluded
    ' anOutputFilename: An output file where the filtered variants are output,
    '    or None to write them to STDOUT
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '    (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: A flag specifying whether the variants should be
    '    included or excluded when they overlap
    ' anIncludeFilterName: A flag specifying whether the filtering name should
    '    be included in the output or not
    ' anIdField: The field where the ID should be specified (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in the
    '    output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps should
    '    be included in the output or not
    ' aFilterHeaderLine: A filter header line that should be added to the VCF
    '    header describing this filter
    ' aBinSize: The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # initialize pybed with the filtering file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # get the vcf file
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)
    i_outputFileHandler = None

    try:
        # get the output file
        if (anOutputFilename is not None):
            i_outputFileHandler = radiaUtil.get_write_fileHandler(
                anOutputFilename)

        # the events go to the output file or to STDOUT;  the generator
        # below still gets the (possibly None) file handler
        if (i_outputFileHandler is not None):
            eventStream = i_outputFileHandler
        else:
            eventStream = sys.stdout

        # create the generator for the vcf file
        vcfGenerator = get_vcf_data(i_vcfFileHandler,
                                    i_outputFileHandler,
                                    aFilterHeaderLine,
                                    anIsDebug)

        # initialize some variables
        overlappingEvents = 0
        nonOverlappingEvents = 0
        totalEvents = 0
        startTime = time.time()

        # for each vcf line
        for (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate, vcf_id,
             vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info,
             vcf_restLine, vcf_line) in (vcfGenerator):

            totalEvents += 1

            if (anIsDebug):
                logging.debug("VCF: %s", vcf_line)

            # check if this vcf coordinate overlaps
            # with the filter coordinates
            posTuple = (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate)
            (isOverlap, idValue, count) = filterPybed.overlaps_with(
                posTuple, anIncludeCount)

            # count the event
            if (isOverlap):
                overlappingEvents += 1
            else:
                nonOverlappingEvents += 1

            # overlaps are annotated when info about overlaps is wanted,
            # otherwise the non-overlaps are the ones that get annotated
            if (isOverlap):
                shouldAnnotate = bool(anIncludeOverlapInfo)
            else:
                shouldAnnotate = not anIncludeOverlapInfo

            if (not shouldAnnotate):
                # output the event as it came in
                eventStream.write(vcf_line + "\n")
                continue

            # alter the filter and id name if appropriate
            if (anIncludeFilterName):
                (vcf_filter, vcf_info) = add_filter(vcf_filter, vcf_info,
                                                    aFilterName,
                                                    aFilterField,
                                                    anIncludeCount, count,
                                                    anIncludeId, anIdField,
                                                    idValue)
            if (anIncludeId and anIdField == "ID"):
                vcf_id = add_id(vcf_id, idValue)

            # output the event
            outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id, vcf_ref,
                          vcf_alt, vcf_qual, vcf_filter, vcf_info)
            eventStream.write("\t".join(outputList) + "\t" +
                              "\t".join(vcf_restLine) + "\n")

        # each value is the total elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", aChrom, aTCGAId,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))

        if (overlappingEvents + nonOverlappingEvents == totalEvents):
            logging.info("For chrom %s and Id %s: %s (overlapping events) + " +
                         "%s (non-overlapping events) = %s", aChrom, aTCGAId,
                         overlappingEvents, nonOverlappingEvents, totalEvents)
        else:
            # the counts don't add up, so report it at the warning level
            # that the message itself announces
            logging.warning("filterByPybed Warning: For chrom %s and Id %s: " +
                            "%s (overlapping events) + %s (non-overlapping " +
                            "events) = %s", aChrom, aTCGAId,
                            overlappingEvents, nonOverlappingEvents,
                            totalEvents)

    finally:
        # close the files, even when an error interrupted the processing
        i_vcfFileHandler.close()
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
def main():
    '''
    ' Command line entry point: merges the per-chromosome VCF data returned
    ' by get_vcf_data() into one VCF file named <id>.vcf (or <id>.vcf.gz with
    ' --gzip) in the output directory.  The header is written first, then
    ' the chromosomes with numeric names in numeric order, then the
    ' remaining chromosomes in alphabetical order.
    '
    ' Positional arguments: id inputDir outputDir
    ' Options: -l/--log, -g/--logFilename, --gzip
    '
    ' The process exits with status 1 when the command line is incomplete
    ' or radiaUtil.check_for_argv_errors() reports a problem.  A ValueError
    ' is raised for an unknown logging level.
    '''

    # command for running this on a small test case:
    # python mergeChroms.py TCGA-BH-A18P
    # ../data/test/ ../data/test/ --log=DEBUG

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id inputDir outputDir [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--gzip",
        dest="gzip", action="store_true", default=False,
        help="include this argument if the final VCF should be " +
             "compressed with gzip")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts option tokens as well, so make sure
    # that all three positional arguments are really there before indexing
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_inputDir = i_cmdLineArgs[1]
    i_outputDir = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_gzip = i_cmdLineOptions.gzip

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message (it used to be passed as
        # a second exception argument and was never formatted)
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("inputDir=%s", i_inputDir)
        logging.debug("outputDir=%s", i_outputDir)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("gzip=%s", i_gzip)

    # check for any errors
    i_readFilenameList = None
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]
    else:
        i_writeFilenameList = None
    i_dirList = [i_inputDir, i_outputDir]

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the merged header and coordinate data
    (headerDict, coordDict) = get_vcf_data(i_id, i_inputDir, i_debug)

    if (i_gzip):
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf.gz")
    else:
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf")

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    try:
        # if we have header info to output
        if (len(headerDict["metadata"]) > 0):
            # output the header information;  a section without any
            # lines is left out so that no blank line ends up in the header
            for headerKey in ("metadata", "filter", "info", "format"):
                headerLines = headerDict[headerKey]
                if (len(headerLines) > 0):
                    outputFileHandler.write("\n".join(headerLines) + "\n")
            if (len(headerDict["chrom"]) > 0):
                outputFileHandler.write("".join(headerDict["chrom"]) + "\n")

        # first output the numerical chroms in order
        # (sorted() works on the keys in both python 2 and 3)
        numericChromKeys = sorted(coordDict["numbers"].keys(), key=int)
        for chrom in numericChromKeys:
            outputFileHandler.write(
                "\n".join(coordDict["numbers"][chrom]) + "\n")

        # then output the alphabetical chroms in order
        letterChromKeys = sorted(coordDict["letters"].keys(), key=str)
        for chrom in letterChromKeys:
            outputFileHandler.write(
                "\n".join(coordDict["letters"][chrom]) + "\n")

        # each value is the total elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Total time for Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", i_id,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))

    finally:
        # close the files, even when an error interrupted the writing
        outputFileHandler.close()

    return
def main():
    '''
    ' Command line entry point: merges DNA and RNA VCF data with
    ' merge_vcf_data() and writes the header lines followed by the calls in
    ' numeric coordinate order to the output file.  The INFO column of each
    ' call is passed through set_sst_field() before it is written.
    '
    ' Positional arguments: id chrom dnaFile rnaFile rnaOverlapsFile
    '                       rnaNonOverlapsFile outputFile
    ' Options: -l/--log, -g/--logFilename
    '
    ' The process exits with status 1 when the command line is incomplete
    ' or radiaUtil.check_for_argv_errors() reports a problem.  A ValueError
    ' is raised for an unknown logging level.
    '''

    # python mergeRnaAndDnaFiles.py TCGA-AB-2995 5
    # ../data/test/TCGA-AB-2995_dnaFile.vcf
    # ../data/test/TCGA-AB-2995_rnaFile.vcf
    # ../data/test/TCGA-AB-2995_rnaFile.vcf
    # ../data/test/

    startTime = time.time()

    # create the usage statement
    usage = ("usage: python %prog id chrom dnaFile rnaFile rnaOverlapsFile " +
             "rnaNonOverlapsFile outputFile [Options]")
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        default="WARNING", dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(6, 15, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above lets command lines through that have fewer
    # than the seven positional arguments that are read below, so make
    # sure that they are really all there before indexing
    if (len(i_cmdLineArgs) < 7):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_chrom = i_cmdLineArgs[1]
    i_dnaFilename = i_cmdLineArgs[2]
    i_rnaFilename = i_cmdLineArgs[3]
    i_overlapsFilename = i_cmdLineArgs[4]
    i_nonOverlapsFilename = i_cmdLineArgs[5]
    i_outputFilename = i_cmdLineArgs[6]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the level into the message (it used to be passed as
        # a second exception argument and was never formatted)
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("chrom=%s", i_chrom)
        logging.debug("dnaFilename=%s", i_dnaFilename)
        logging.debug("rnaFilename=%s", i_rnaFilename)
        logging.debug("overlapsFilename=%s", i_overlapsFilename)
        logging.debug("nonOverlapsFilename=%s", i_nonOverlapsFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    # NOTE(review): the non-overlaps file is handed to merge_vcf_data()
    # below but is not part of this check - confirm whether that is wanted
    i_readFilenameList = [i_dnaFilename, i_rnaFilename, i_overlapsFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the merged header lines and the calls keyed by coordinate
    (headerList, coordinateDict) = merge_vcf_data(i_dnaFilename,
                                                  i_rnaFilename,
                                                  i_overlapsFilename,
                                                  i_nonOverlapsFilename,
                                                  i_debug)

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    try:
        # the header lines are written as they are
        for headerLine in headerList:
            outputFileHandler.write(headerLine)

        # output the calls in numeric coordinate order
        # (sorted() works on the keys in both python 2 and 3)
        numericKeys = sorted(coordinateDict.keys(), key=int)
        for coordinate in numericKeys:
            line = coordinateDict[coordinate]
            line = line.rstrip("\r\n")

            # split the line on the tab
            splitLine = line.split("\t")

            # set the SST field in the INFO
            splitLine[7] = set_sst_field(splitLine[7])

            outputFileHandler.write("\t".join(splitLine) + "\n")

        # each value is the total elapsed time expressed in that unit
        stopTime = time.time()
        logging.info("Total time for Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", i_id,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))

    finally:
        # close the files, even when an error interrupted the writing
        outputFileHandler.close()

    return
def main():
    """
    Command-line entry point that flags VCF calls on the RNA blacklists.

    Positional arguments (taken from sys.argv):
        vcfFile, rnaGeneFile, rnaGeneFamilyFile

    Options:
        -o/--outputFilename   output file, STDOUT when omitted
        -l/--log              logging level name, WARNING by default
        -g/--logFilename      log file, STDERR when omitted
        -c/--allVCFCalls      process every call instead of only the
                              calls that contain "PASS"

    Header lines are copied to the output and the "rgene" and "rgfam"
    FILTER definitions are inserted in front of the first ##FILTER or
    ##INFO line.  For every processed data line the EFF entries of the
    INFO column are inspected; "rgene" is added to the FILTER column when
    a blacklisted gene name is found and "rgfam" when the transcript
    biotype is on the gene family blacklist.

    Exits with status 1 when the command line is incomplete or when
    radiaUtil.check_for_argv_errors() reports a problem.  Raises
    ValueError for an unknown logging level.
    """
    # create the usage statement
    usage = ("usage: python %prog vcfFile rnaGeneFile rnaGeneFamilyFile " +
             "[Options]")
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    # logging.basicConfig() without a filename writes to STDERR
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false", default=True, dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 14, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options as well, so make sure that all
    # three positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_vcfFilename = str(i_cmdLineArgs[0])
    i_rnaGeneFilename = str(i_cmdLineArgs[1])
    i_rnaGeneFamilyFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError(
            "Invalid log level: '%s' must be one of the following: "
            "DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("rnaGeneFilename=%s", i_rnaGeneFilename)
        logging.debug("rnaGeneFamilyFilename=%s", i_rnaGeneFamilyFilename)
        logging.debug("outputFilename=%s", i_outputFilename)
        logging.debug("logFilename=%s", i_logFilename)
        logging.debug("passedOnly?=%s", i_passedVCFCallsOnlyFlag)

    # check for any errors; both the output file and the log file have to
    # be checked, so append instead of overwriting the list
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename,
                          i_rnaGeneFilename,
                          i_rnaGeneFamilyFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # the filter definitions that get added to the VCF header
    vcfHeader = ("##FILTER=<ID=rgene,Description=\"This gene is on the " +
                 "RNA gene blacklist\">\n")
    vcfHeader += ("##FILTER=<ID=rgfam,Description=\"This gene family is " +
                  "on the RNA gene family blacklist\">\n")

    # these don't change from line to line, so only build them once
    effectRegEx = re.compile("(\\w).*\\({1}")
    ignoreEffectsList = ["UPSTREAM", "DOWNSTREAM"]

    # open the output stream, everything is written through one handle so
    # that STDOUT and file output are byte-for-byte the same
    i_outputFileHandler = sys.stdout
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(
            i_outputFilename)

    i_vcfFileHandler = None
    try:
        # get the RNA gene blacklists
        (i_rnaGeneList, i_rnaGeneFamilyList) = get_rna_genes(
            i_rnaGeneFilename, i_rnaGeneFamilyFilename, i_debug)

        hasAddedHeader = False
        i_vcfFileHandler = radiaUtil.get_read_fileHandler(i_vcfFilename)

        for line in i_vcfFileHandler:

            # strip the carriage return and newline characters
            line = line.rstrip("\r\n")

            if (i_debug):
                logging.debug("vcfLine: %s", line)

            # if it is an empty line, then just continue; "".isspace() is
            # False, so test the stripped line instead
            if (not line.strip()):
                continue

            # if we find the FILTER section, then add the filters from here
            if ((not hasAddedHeader) and
                    line.startswith(("##FILTER", "##INFO"))):
                hasAddedHeader = True
                i_outputFileHandler.write(vcfHeader)
                i_outputFileHandler.write(line + "\n")
                continue

            # these lines are from previous scripts in the pipeline,
            # so output them
            if (line.startswith("#")):
                i_outputFileHandler.write(line + "\n")
                continue

            # if we are only suppose to process the passed calls
            # and this call has not passed, then just output it
            if (i_passedVCFCallsOnlyFlag and "PASS" not in line):
                i_outputFileHandler.write(line + "\n")
                continue

            # now we are to the data
            # split the line on the tab
            splitLine = line.split("\t")

            filterSet = set(splitLine[6].split(";"))

            # if there are no filters so far, then clear the list
            if (len(filterSet) == 1 and "PASS" in filterSet):
                filterSet = set()

            # parse the info column and create a dict
            infoDict = collections.defaultdict(list)
            for info in splitLine[7].split(";"):
                keyValueList = info.split("=")
                # some keys are just singular without a value
                # (e.g. DB, SOMATIC, etc.)
                if (len(keyValueList) == 1):
                    infoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    infoDict[keyValueList[0]] = keyValueList[1].split(",")

            isRnaBlacklistGene = False
            isRnaBlacklistGeneFamily = False

            for rawEffect in infoDict["EFF"]:

                rawEffect = rawEffect.rstrip(")")

                # for each match object in the iterator
                for match in effectRegEx.finditer(rawEffect):
                    effect = match.group()
                    # what is left after removing the effect name are the
                    # pipe-separated details of the effect
                    rawEffect = rawEffect.replace(effect, "")
                    effect = effect.rstrip("(")

                    if (effect in ignoreEffectsList):
                        continue

                    # index 5 is read as the gene name and
                    # index 6 as the transcript biotype
                    effectParts = rawEffect.split("|")
                    geneName = effectParts[5]
                    transcriptBiotype = effectParts[6]

                    # the RNA gene list can have "RP11" and that
                    # should filter out any gene with RP11 in it
                    for rnaGene in i_rnaGeneList:
                        if (rnaGene in geneName):
                            isRnaBlacklistGene = True
                            break

                    if (transcriptBiotype in i_rnaGeneFamilyList):
                        isRnaBlacklistGeneFamily = True

            output = ["\t".join(splitLine[0:6])]

            # if the filters should be applied
            if (isRnaBlacklistGene):
                filterSet.add("rgene")
            if (isRnaBlacklistGeneFamily):
                filterSet.add("rgfam")

            # if there are no filters so far, then this call passes
            if (len(filterSet) == 0):
                filterSet.add("PASS")

            output.append(";".join(filterSet))
            output.append("\t".join(splitLine[7:]))

            i_outputFileHandler.write("\t".join(output) + "\n")

    finally:
        # close the files, but never close STDOUT
        if (i_vcfFileHandler is not None):
            i_vcfFileHandler.close()
        if (i_outputFilename is not None):
            i_outputFileHandler.close()

    return
def main():
    """
    Command-line entry point that filters VCF calls by their BLAT results.

    Positional arguments (taken from sys.argv): id, vcfFile, blatOutputFile.

    For every call yielded by get_vcf_data() and for each of its
    modification types (INFO key "MT"), the reads found in the parsed BLAT
    output for that coordinate are counted and validated.  A modification
    type passes when the number of valid reads reaches --readDepthCutoff
    and the fraction of valid reads reaches --readPercentCutoff; otherwise
    it is marked "blat".  When at least one type passes, the types marked
    "blat" are dropped (together with their "MC" entries) and the FILTER
    becomes PASS; otherwise the FILTER becomes "blat", or "blat" is added to
    the existing filters when --keepPreviousFilters is given.  The "MF" and
    "MFT" INFO entries are extended and the call is written out.

    Exits with status 1 when the command line is incomplete or when
    radiaUtil.check_for_argv_errors() reports a problem.  Raises
    ValueError for an unknown logging level.
    """
    # command for running this on a small test case:
    #python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    # NOTE(review): the usage text lists four positional arguments
    # (including chrom) but only three are read below - confirm which
    # one is intended
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false", default=True, dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters",
        action="store_true", default=False, dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the " +
             "blat filter, include this argument if the previous filters " +
             "should be kept")
    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE",
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT", default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    # logging.basicConfig() without a filename writes to STDERR
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads",
        action="store_true", default=False, dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads",
        action="store_true", default=False, dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads",
        action="store_true", default=False, dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads",
        action="store_true", default=False, dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-d", "--readDepthCutoff",
        type="int", default=int(4), dest="readDepthCutoff",
        metavar="READ_DP_CUTOFF",
        help="the minimum number of valid reads that are necessary, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-p", "--readPercentCutoff",
        type="float", default=float(0.10), dest="readPercentCutoff",
        metavar="READ_PERCENT_CUTOFF",
        help="the minimum percentage of valid reads that are necessary, " +
             "%default by default")
    #i_cmdLineParser.add_option("-e", "--eValueCutoff", type="float", default=float(10e-6), dest="eValueCutoff", metavar="EVAL_CUTOFF", help="the e-value cutoff for determining if a blat hit is significant, %default by default")
    #i_cmdLineParser.add_option("-u", "--upperIdentityCutoff", type="float", default=float(0.95), dest="upperIdentityCutoff", metavar="UPPER_CUTOFF", help="the upper cutoff for the match length adjusted identity to determine if a blat hit is significant, %default by default")
    #i_cmdLineParser.add_option("-l", "--lowerIdentityCutoff", type="float", default=float(0.5), dest="lowerIdentityCutoff", metavar="LOWER_CUTOFF", help="the lower cutoff for the match length adjusted identity to determine if a second blat hit is significant, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options as well, so make sure that all
    # three positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = i_cmdLineOptions.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = i_cmdLineOptions.keepPreviousFilters
    i_blatOutputFormat = i_cmdLineOptions.blatOutputFormat
    i_logLevel = i_cmdLineOptions.logLevel
    i_readDepthCutoff = i_cmdLineOptions.readDepthCutoff
    i_readPercentCutoff = i_cmdLineOptions.readPercentCutoff
    #i_eValueCutoff = i_cmdLineOptions.eValueCutoff
    #i_upperIdentityCutoff = i_cmdLineOptions.upperIdentityCutoff
    #i_lowerIdentityCutoff = i_cmdLineOptions.lowerIdentityCutoff

    i_blatDnaNormalReads = i_cmdLineOptions.blatDnaNormalReads
    i_blatDnaTumorReads = i_cmdLineOptions.blatDnaTumorReads
    i_blatRnaNormalReads = i_cmdLineOptions.blatRnaNormalReads
    i_blatRnaTumorReads = i_cmdLineOptions.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    if (i_cmdLineOptions.outputFilename is not None):
        i_outputFilename = str(i_cmdLineOptions.outputFilename)
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError(
            "Invalid log level: '%s' must be one of the following: "
            "DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s",
                      i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)
        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)
        logging.debug("readDepthCutoff=%s", i_readDepthCutoff)
        logging.debug("readPerentCutoff=%s", i_readPercentCutoff)

    # check for any errors; both the output file and the log file have to
    # be checked, so append instead of overwriting the list
    i_writeFilenameList = []
    if (i_outputFilename is not None):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream, everything is written through one handle
    i_outputFileHandler = sys.stdout
    if (i_outputFilename is not None):
        i_outputFileHandler = radiaUtil.get_write_fileHandler(
            i_outputFilename)

    try:
        # get the BLAT results
        i_blatCoordinateDict = parse_blat_output(i_blatOutputFilename,
                                                 i_blatOutputFormat,
                                                 i_debug)

        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                      i_passedVCFCallsOnlyFlag,
                                      i_debug)

        for (vcfChr, vcfStopCoordinate, vcfId, vcfRef, vcfAlt, vcfScore,
             vcfFilterSet, vcfInfoDict, restOfLine) in i_vcfGenerator:

            if (i_debug):
                logging.debug("VCF Data: %s %s %s %s %s %s %s %s %s",
                              vcfChr, str(vcfStopCoordinate), vcfId, vcfRef,
                              vcfAlt, vcfScore, str(vcfFilterSet),
                              str(vcfInfoDict), restOfLine)

            # the blat results are keyed by "<chrom>_<coordinate>"
            blatKey = vcfChr + "_" + str(vcfStopCoordinate)

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            for modType in modTypes:

                blatHitsDict = dict()
                blatOverallReadDepth = 0
                numValidReads = 0

                # for each coordinate, get a dict of reads
                # and corresponding blat hits
                if (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    coordinateHits = i_blatCoordinateDict[blatKey]
                    if ("rnaNormal" in coordinateHits):
                        blatHitsDict = coordinateHits["rnaNormal"]
                elif ((modType == "SOM" or modType == "TUM_EDIT") and
                      i_blatRnaTumorReads):
                    coordinateHits = i_blatCoordinateDict[blatKey]
                    if ("rnaTumor" in coordinateHits):
                        blatHitsDict = coordinateHits["rnaTumor"]

                # for each read, investigate the blat hits
                # to see if this read is valid
                for (readId, blatHitList) in blatHitsDict.iteritems():

                    if (i_debug):
                        logging.debug("num of blat hits for read %s=%s",
                                      readId, len(blatHitList))

                    blatOverallReadDepth += 1

                    # find out if the read is valid or if it
                    # maps to other places in the genome
                    if (i_blatOutputFormat == "PSL"):
                        (isValidRead,
                         validRead) = is_valid_read_psl_format(
                            blatHitList, vcfChr, vcfStopCoordinate, i_debug)
                    elif (i_blatOutputFormat == "BLAST"):
                        (isValidRead,
                         validRead) = is_valid_read_blast_format(
                            blatHitList, vcfChr, vcfStopCoordinate, 0,
                            i_debug)
                        #(isValidRead, validRead) = is_valid_read_blast_format(blatHitList, vcfChr, vcfStopCoordinate, 1, i_debug)
                        #(isValidRead, validRead) = is_valid_read_blast_format(blatHitList, vcfChr, vcfStopCoordinate, 2, i_debug)

                    # if we have only one valid blat hit, then the read
                    # doesn't map to other places in the genome very well,
                    # so let's use it
                    if (isValidRead):
                        numValidReads += 1
                        if (i_debug):
                            logging.debug("ValidRead: %s", validRead)

                if (blatOverallReadDepth > 0):
                    altPercent = round(
                        numValidReads / float(blatOverallReadDepth), 2)
                else:
                    altPercent = 0.0

                if (numValidReads < i_readDepthCutoff or
                        altPercent < i_readPercentCutoff):
                    modTypeFilters[modType] = "blat"
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.debug("blatOverallReadDepth=%s, " +
                                  "numValidReads=%s, altPercent=%s",
                                  str(blatOverallReadDepth),
                                  str(numValidReads), str(altPercent))
                    logging.debug("modType=%s, passed? %s",
                                  modType, modTypeFilters[modType])
                    logging.debug("blatFilter originalDepth=%s, " +
                                  "afterBlatDepth=%s",
                                  str(blatOverallReadDepth),
                                  str(numValidReads))

            modChanges = vcfInfoDict["MC"]

            # if at least one passed, then remove the ones that didn't;
            # build new lists instead of removing from the lists that are
            # being iterated over, otherwise pairs get skipped
            # (assumes MT and MC are parallel lists)
            if (atLeastOnePass):
                keptModTypes = []
                keptModChanges = []
                for (modType, modChange) in izip(modTypes, modChanges):
                    if (modTypeFilters[modType] != "blat"):
                        keptModTypes.append(modType)
                        keptModChanges.append(modChange)
                modTypes = keptModTypes
                modChanges = keptModChanges

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = modTypes
            vcfInfoDict["MC"] = modChanges

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            # if the user wants to keep the previous filters
            elif (i_keepPreviousFiltersFlag):
                # if the call previous passed, then just set blat
                if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                    vcfFilterSet = ["blat"]
                # otherwise, add it to the previous filters
                else:
                    vcfFilterSet.add("blat")
            # otherwise, just set the blat filter
            else:
                vcfFilterSet = ["blat"]

            # update the mod filters
            origins = vcfInfoDict["ORIGIN"]
            if (vcfInfoDict["MF"] is None):
                modFilters = []
            else:
                modFilters = vcfInfoDict["MF"]
            if (vcfInfoDict["MFT"] is None):
                modFilterTypes = []
            else:
                modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in izip(modTypes, modChanges):
                    modFilterTypes.append(
                        "_".join([origin, modType, modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

            output = [vcfChr, str(vcfStopCoordinate), vcfId, vcfRef,
                      vcfAlt, vcfScore, ";".join(vcfFilterSet)]

            # add the modified info dict: empty entries are dropped and
            # flags (value "True") are written as the bare key
            infoParts = []
            for key in sorted(vcfInfoDict.iterkeys()):
                if (len(vcfInfoDict[key]) == 0):
                    continue
                elif ("True" in vcfInfoDict[key]):
                    infoParts.append(key)
                else:
                    infoParts.append(key + "=" + ",".join(vcfInfoDict[key]))

            output.append(";".join(infoParts))
            output.append(restOfLine)

            i_outputFileHandler.write("\t".join(output) + "\n")

        stopTime = time.time()
        # each value is the total elapsed time in that unit
        logging.info("filterByBlat.py for Id %s: Total time=%s hrs, " +
                     "%s mins, %s secs",
                     i_id,
                     ((stopTime - startTime) / (3600)),
                     ((stopTime - startTime) / 60),
                     (stopTime - startTime))

    finally:
        # close the files, but never close STDOUT
        if (i_outputFilename is not None):
            i_outputFileHandler.close()

    return
def main():
    """
    Command-line entry point that filters VCF calls by their BLAT results.

    Positional arguments (taken from sys.argv): id, vcfFile, blatOutputFile.

    The VCF lines from get_vcf_data() and the BLAT hits from
    parse_blat_output() are consumed in lock-step.  For each modification
    type of a call (INFO key "MT") every read in the BLAT hits must have an
    id starting with "<prefix>_<chrom>_<coordinate>", otherwise the two
    inputs are considered out of sync and the script exits with status 1.
    A modification type passes when the number of valid reads reaches
    --minReadDepth and the fraction of valid reads reaches
    --minReadPercent; otherwise it is marked "blat".  When at least one
    type passes, the types marked "blat" are dropped (together with their
    "MC" entries) and the FILTER becomes PASS; otherwise the FILTER becomes
    "blat", or "blat" is added to the existing filters when
    --keepPreviousFilters is given.  The "MF" and "MFT" INFO entries are
    extended and the call is written out.

    Exits with status 1 when the command line is incomplete or when
    radiaUtil.check_for_argv_errors() reports a problem.  Raises
    ValueError for an unknown logging level.
    """
    # command for running this on a small test case:
    # python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf
    # ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads

    startTime = time.time()

    # create the usage statement
    # NOTE(review): the usage text lists four positional arguments
    # (including chrom) but only three are read below - confirm which
    # one is intended
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls",
        action="store_false", default=True, dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters",
        action="store_true", default=False, dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the " +
             "blat filter, include this argument if the previous filters " +
             "should be kept")
    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE", default=sys.stdout,
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT", default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript " +
             "coordinate can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "", "--rnaIncludeSecondaryAlignments",
        action="store_true", default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
             "want to include RNA secondary alignments in the pileup")
    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads",
        action="store_true", default=False, dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads",
        action="store_true", default=False, dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads",
        action="store_true", default=False, dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads",
        action="store_true", default=False, dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-d", "--minReadDepth",
        type="int", default=int(4), dest="minReadDepth",
        metavar="MIN_READ_DP",
        help="the minimum number of valid reads that are necessary, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-p", "--minReadPercent",
        type="float", default=float(0.10), dest="minReadPercent",
        metavar="MIN_READ_PCT",
        help="the minimum percentage of valid reads that are necessary, " +
             "%default by default")
    # the option is declared as an int, so the default is an int as well
    i_cmdLineParser.add_option(
        "-m", "--minOrderMagnitude",
        type="int", default=int(0), dest="minOrderMagnitude",
        metavar="MIN_ORDER_MAGNITUDE",
        help="the minimum order of magnitude difference between the blat " +
             "hit at the query position vs. the next best blat hit in " +
             "order for the read to be valid, %default by default")
    # disabled options:
    #   -e/--minEValue    (float, default 10e-6, metavar MIN_EVALUE)
    #       the minimum e-value needed for a blat hit to be significant
    #   -u/--maxIdentity  (float, default 0.95, metavar MAX_IDENTITY)
    #       the maximum match length adjusted identity for a blat hit
    #       to be significant
    #   -l/--minIdentity  (float, default 0.5, metavar MIN_IDENTITY)
    #       the minimum match length adjusted identity for a blat hit
    #       to be significant

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (cmdLineOpts, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options as well, so make sure that all
    # three positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = cmdLineOpts.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = cmdLineOpts.keepPreviousFilters
    i_blatOutputFormat = cmdLineOpts.blatOutputFormat
    i_logLevel = cmdLineOpts.logLevel
    i_rnaIncludeSecondaryAlignments = cmdLineOpts.rnaIncludeSecondaryAlignments
    i_minReadDepth = cmdLineOpts.minReadDepth
    i_minReadPercent = cmdLineOpts.minReadPercent
    i_minOrderMagnitude = cmdLineOpts.minOrderMagnitude
    # i_minEValue = cmdLineOpts.minEValue
    # i_maxIdentity = cmdLineOpts.maxIdentity
    # i_minIdentity = cmdLineOpts.minIdentity

    i_blatDnaNormalReads = cmdLineOpts.blatDnaNormalReads
    i_blatDnaTumorReads = cmdLineOpts.blatDnaTumorReads
    i_blatRnaNormalReads = cmdLineOpts.blatRnaNormalReads
    i_blatRnaTumorReads = cmdLineOpts.blatRnaTumorReads

    # try to get any optional parameters with no defaults
    i_outputFilename = None
    i_logFilename = None
    i_transcriptNameTag = None
    i_transcriptCoordinateTag = None
    i_transcriptStrandTag = None
    if (cmdLineOpts.outputFilename is not None):
        i_outputFilename = cmdLineOpts.outputFilename
    if (cmdLineOpts.logFilename is not None):
        i_logFilename = cmdLineOpts.logFilename
    if (cmdLineOpts.transcriptNameTag is not None):
        i_transcriptNameTag = cmdLineOpts.transcriptNameTag
    if (cmdLineOpts.transcriptCoordinateTag is not None):
        i_transcriptCoordinateTag = cmdLineOpts.transcriptCoordinateTag
    if (cmdLineOpts.transcriptStrandTag is not None):
        i_transcriptStrandTag = cmdLineOpts.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        raise ValueError(
            "Invalid log level: '%s' must be one of the following: "
            "DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s",
                      i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)
        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaInclSecAlign=%s", i_rnaIncludeSecondaryAlignments)
        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)
        logging.debug("minReadDepth=%s", i_minReadDepth)
        logging.debug("minReadPercent=%s", i_minReadPercent)
        logging.debug("minOrderMagnitude=%s", i_minOrderMagnitude)

    # check for any errors; both the output file and the log file have to
    # be checked, so append instead of overwriting the list
    i_writeFilenameList = []
    if (cmdLineOpts.outputFilename is not sys.stdout):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(
            i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    try:
        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                      i_passedVCFCallsOnlyFlag,
                                      i_debug)

        # get the blat hits generator
        i_blatGenerator = parse_blat_output(i_blatOutputFilename,
                                            i_blatOutputFormat,
                                            i_debug)

        # the two generators are consumed in lock-step
        for (vcfLine, blatHitsDict) in izip(i_vcfGenerator, i_blatGenerator):

            if (i_debug):
                logging.debug("VCF Line=%s", vcfLine)
                logging.debug("Len Blat Hits=%s", len(blatHitsDict))

            # parse the VCF line
            splitLine = vcfLine.split("\t")

            # the coordinate is the second element
            vcfChr = splitLine[0]
            vcfStopCoordinate = int(splitLine[1])
            vcfIds = splitLine[2]
            vcfRef = splitLine[3]
            vcfAlts = splitLine[4]
            vcfScore = splitLine[5]
            vcfFilterSet = set(splitLine[6].split(";"))
            vcfInfoList = splitLine[7].split(";")
            vcfInfoDict = collections.defaultdict(list)
            for info in vcfInfoList:
                keyValueList = info.split("=")
                # some keys are just singular without a value (e.g. DB, etc.)
                if (len(keyValueList) == 1):
                    vcfInfoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    vcfInfoDict[keyValueList[0]] = keyValueList[1].split(",")
            vcfRestOfLine = "\t".join(splitLine[8:])

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            for modType in modTypes:

                blatOverallReadDepth = 0
                numValidReads = 0

                # the prefix stays empty when the reads for this
                # modification type were not requested
                prefix = ""
                if (modType == "GERM" and i_blatDnaNormalReads):
                    prefix = "dnaNormal"
                elif (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    prefix = "rnaNormal"
                elif (modType == "SOM" and i_blatDnaTumorReads):
                    prefix = "dnaTumor"
                elif ((modType == "SOM" or modType == "TUM_EDIT") and
                      i_blatRnaTumorReads):
                    prefix = "rnaTumor"

                # get the expected prefix
                vcfKey = "_".join([prefix, vcfChr, str(vcfStopCoordinate)])

                # for each read, investigate the blat
                # hits to see if this read is valid
                for (readId, blatHitList) in blatHitsDict.iteritems():

                    if (i_debug):
                        logging.debug("num of blat hits for read %s=%s",
                                      readId, len(blatHitList))

                    # if the readId does not start with the vcfKey,
                    # then something is wrong. the VCF and blat hits
                    # need to be in sync...
                    if (not readId.startswith(vcfKey)):
                        logging.error("The blat query seems to be out of " +
                                      "sync with the blat hits.")
                        logging.error("VCF Line=%s", vcfLine)
                        logging.error("readId=%s, blatHitsDict=%s",
                                      readId, blatHitsDict[readId][1])
                        sys.exit(1)

                    blatOverallReadDepth += 1

                    # if we should process the transcripts, then validate
                    # against the transcript names and coordinates from the
                    # INFO column, otherwise against the call itself
                    if ((i_transcriptNameTag is not None) and
                            (i_transcriptNameTag in vcfInfoDict)):
                        targetNames = vcfInfoDict[i_transcriptNameTag]
                        targetCoordinates = vcfInfoDict[
                            i_transcriptCoordinateTag]
                    else:
                        targetNames = [vcfChr]
                        targetCoordinates = [vcfStopCoordinate]

                    # find out if the read is valid or if it
                    # maps to other places in the genome
                    if (i_blatOutputFormat == "PSL"):
                        (isValidRead,
                         validRead) = is_valid_read_psl_format(
                            blatHitList,
                            targetNames,
                            targetCoordinates,
                            i_rnaIncludeSecondaryAlignments,
                            i_debug)
                    elif (i_blatOutputFormat == "BLAST"):
                        (isValidRead,
                         validRead) = is_valid_read_blast_format(
                            blatHitList,
                            targetNames,
                            targetCoordinates,
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude,
                            i_debug)

                    # if we have only one valid blat hit, then the read
                    # doesn't map to other places in the genome very well,
                    # so let's use it
                    if (isValidRead):
                        numValidReads += 1
                        if (i_debug):
                            logging.debug("ValidRead: %s", validRead)
                    elif (i_debug):
                        logging.debug("not a valid read")

                if (blatOverallReadDepth > 0):
                    tmpAltPct = numValidReads/float(blatOverallReadDepth)
                    altPercent = round(tmpAltPct, 2)
                else:
                    altPercent = 0.0

                if (numValidReads < i_minReadDepth or
                        altPercent < i_minReadPercent):
                    modTypeFilters[modType] = "blat"
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.debug("blatOverallReadDepth=%s, " +
                                  "numValidReads=%s, altPercent=%s",
                                  str(blatOverallReadDepth),
                                  str(numValidReads), str(altPercent))
                    logging.debug("modType=%s, passed? %s",
                                  modType, modTypeFilters[modType])
                    logging.debug("blatFilter originalDepth=%s, " +
                                  "validBlatDepth=%s",
                                  str(blatOverallReadDepth),
                                  str(numValidReads))

            modChanges = vcfInfoDict["MC"]

            # if at least one passed, then remove the ones that didn't;
            # build new lists instead of removing from the lists that are
            # being iterated over, otherwise pairs get skipped
            # (assumes MT and MC are parallel lists)
            if (atLeastOnePass):
                keptModTypes = []
                keptModChanges = []
                for (modType, modChange) in izip(modTypes, modChanges):
                    if (modTypeFilters[modType] != "blat"):
                        keptModTypes.append(modType)
                        keptModChanges.append(modChange)
                modTypes = keptModTypes
                modChanges = keptModChanges

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = modTypes
            vcfInfoDict["MC"] = modChanges

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            # if the user wants to keep the previous filters
            elif (i_keepPreviousFiltersFlag):
                # if the call previous passed, then just set blat
                if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                    vcfFilterSet = ["blat"]
                # otherwise, add it to the previous filters
                else:
                    vcfFilterSet.add("blat")
            # otherwise, just set the blat filter
            else:
                vcfFilterSet = ["blat"]

            # update the mod filters
            origins = vcfInfoDict["ORIGIN"]
            if vcfInfoDict["MF"] is None:
                modFilters = []
            else:
                modFilters = vcfInfoDict["MF"]
            if vcfInfoDict["MFT"] is None:
                modFilterTypes = []
            else:
                modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in izip(modTypes, modChanges):
                    modFilterTypes.append(
                        "_".join([origin, modType, modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

            output = [vcfChr, str(vcfStopCoordinate), vcfIds, vcfRef,
                      vcfAlts, vcfScore, ";".join(vcfFilterSet)]

            # add the modified info dict: empty entries are dropped and
            # flags (value "True") are written as the bare key
            infoParts = []
            for key in sorted(vcfInfoDict.iterkeys()):
                if (len(vcfInfoDict[key]) == 0):
                    continue
                elif ("True" in vcfInfoDict[key]):
                    infoParts.append(key)
                else:
                    infoParts.append(key + "=" + ",".join(vcfInfoDict[key]))

            output.append(";".join(infoParts))
            output.append(vcfRestOfLine)

            i_outputFileHandler.write("\t".join(output) + "\n")

        stopTime = time.time()
        # each value is the total elapsed time in that unit
        logging.info("filterByBlat.py for Id %s: Total time=%s hrs, " +
                     "%s mins, %s secs",
                     i_id,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))

    finally:
        # close the files, but never close STDOUT
        if (i_outputFilename is not sys.stdout):
            i_outputFileHandler.close()

    return
def main():
    '''
    ' Command-line entry point for the BLAT filter.
    '
    ' Parses the command line, pairs each VCF call produced by
    ' get_vcf_data() with the BLAT hits produced by parse_blat_output(),
    ' counts for every modification type (INFO key MT) how many of the
    ' blatted reads are still valid, and rewrites the FILTER column and the
    ' MT, MC, MF and MFT INFO keys before writing the call to the output.
    '
    ' Positional arguments read from the command line: the sample id, the
    ' VCF filename and the BLAT output filename.
    '
    ' Exits with status 1 when the argument count is not accepted, when
    ' radiaUtil.check_for_argv_errors() reports a problem, when a BLAT read
    ' id does not start with the key of the current VCF call, or when the
    ' BLAT output format is neither PSL nor BLAST.  Raises ValueError when
    ' the log level is not a known logging level.
    '''

    # command for running this on a small test case:
    # python filterByBlat.py TCGA-00-4454 7 ../data/test/TCGA-00-4454_EGFR.vcf
    # ../data/test/TCGA-00-4454_EGFR.fa ../data/test/TCGA-00-4454_EGFR.psl
    # --dnaNormalFilename=../data/test/TCGA-00-4454_EGFR.reads
    # NOTE(review): the example above uses an option that this parser does
    # not define -- it looks stale, confirm and refresh it.

    startTime = time.time()

    # create the usage statement
    # NOTE(review): the usage text lists four positional arguments but only
    # three are read below (id, vcfFile, blatOutputFile) -- confirm which of
    # the two is intended before changing either.
    usage = "usage: python %prog id chrom vcfFile blatOutputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    # add the optional parameters
    i_cmdLineParser.add_option(
        "-c", "--allVCFCalls", action="store_false", default=True,
        dest="passedVCFCallsOnly",
        help="by default only the VCF calls that have passed all filters " +
             "thus far are processed, include this argument if all of the " +
             "VCF calls should be processed")
    i_cmdLineParser.add_option(
        "-k", "--keepPreviousFilters", action="store_true", default=False,
        dest="keepPreviousFilters",
        help="by default the previous filters are overwritten with the blat " +
             "filter, include this argument if the previous filters should " +
             "be kept")
    i_cmdLineParser.add_option(
        "-o", "--outputFilename",
        dest="outputFilename", metavar="OUTPUT_FILE", default=sys.stdout,
        help="the name of the output file, STDOUT by default")
    i_cmdLineParser.add_option(
        "-b", "--blatOutputFormat",
        dest="blatOutputFormat", metavar="OUTPUT_FORMAT", default="BLAST",
        help="the BLAT output format, BLAST by default")
    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--transcriptNameTag",
        dest="transcriptNameTag",
        help="the INFO key where the original transcript name can be found")
    # the original help text was missing the space between "transcript" and
    # "coordinate"
    i_cmdLineParser.add_option(
        "", "--transcriptCoordinateTag",
        dest="transcriptCoordinateTag",
        help="the INFO key where the original transcript " +
             "coordinate can be found")
    i_cmdLineParser.add_option(
        "", "--transcriptStrandTag",
        dest="transcriptStrandTag",
        help="the INFO key where the original transcript strand can be found")
    i_cmdLineParser.add_option(
        "", "--rnaIncludeSecondaryAlignments",
        action="store_true", default=False,
        dest="rnaIncludeSecondaryAlignments",
        help="if you align the RNA to transcript isoforms, then you may " +
             "want to include RNA secondary alignments in the pileup")
    i_cmdLineParser.add_option(
        "-n", "--blatDnaNormalReads", action="store_true", default=False,
        dest="blatDnaNormalReads",
        help="include this argument if the normal DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-x", "--blatRnaNormalReads", action="store_true", default=False,
        dest="blatRnaNormalReads",
        help="include this argument if the normal RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-t", "--blatDnaTumorReads", action="store_true", default=False,
        dest="blatDnaTumorReads",
        help="include this argument if the tumor DNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-r", "--blatRnaTumorReads", action="store_true", default=False,
        dest="blatRnaTumorReads",
        help="include this argument if the tumor RNA reads " +
             "should be processed")
    i_cmdLineParser.add_option(
        "-d", "--minReadDepth", type="int", default=int(4),
        dest="minReadDepth", metavar="MIN_READ_DP",
        help="the minimum number of valid reads that are necessary, " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-p", "--minReadPercent", type="float", default=float(0.10),
        dest="minReadPercent", metavar="MIN_READ_PCT",
        help="the minimum percentage of valid reads that are necessary, " +
             "%default by default")
    # the option is declared as an int, so the default is an int as well
    i_cmdLineParser.add_option(
        "-m", "--minOrderMagnitude", type="int", default=int(0),
        dest="minOrderMagnitude", metavar="MIN_ORDER_MAGNITUDE",
        help="the minimum order of magnitude difference between the blat " +
             "hit at the query position vs. the next best blat hit in order " +
             "for the read to be valid, %default by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(5, 27, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (cmdLineOpts, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length check above counts options too, so make sure that all of
    # the positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = str(i_cmdLineArgs[0])
    i_vcfFilename = str(i_cmdLineArgs[1])
    i_blatOutputFilename = str(i_cmdLineArgs[2])

    # get the optional params with default values
    i_passedVCFCallsOnlyFlag = cmdLineOpts.passedVCFCallsOnly
    i_keepPreviousFiltersFlag = cmdLineOpts.keepPreviousFilters
    i_blatOutputFormat = cmdLineOpts.blatOutputFormat
    i_logLevel = cmdLineOpts.logLevel
    i_rnaIncludeSecondaryAlignments = cmdLineOpts.rnaIncludeSecondaryAlignments
    i_minReadDepth = cmdLineOpts.minReadDepth
    i_minReadPercent = cmdLineOpts.minReadPercent
    i_minOrderMagnitude = cmdLineOpts.minOrderMagnitude
    i_blatDnaNormalReads = cmdLineOpts.blatDnaNormalReads
    i_blatDnaTumorReads = cmdLineOpts.blatDnaTumorReads
    i_blatRnaNormalReads = cmdLineOpts.blatRnaNormalReads
    i_blatRnaTumorReads = cmdLineOpts.blatRnaTumorReads

    # get the optional parameters with no defaults, they are None when the
    # user did not specify them (the output defaults to sys.stdout)
    i_outputFilename = cmdLineOpts.outputFilename
    i_logFilename = cmdLineOpts.logFilename
    i_transcriptNameTag = cmdLineOpts.transcriptNameTag
    i_transcriptCoordinateTag = cmdLineOpts.transcriptCoordinateTag
    i_transcriptStrandTag = cmdLineOpts.transcriptStrandTag

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # format the message here, ValueError does not do it for us
        raise ValueError(
            "Invalid log level: '%s' must be one of the "
            "following: DEBUG, INFO, WARNING, ERROR, CRITICAL" % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("vcfFilename=%s", i_vcfFilename)
        logging.debug("blatOutputFilename=%s", i_blatOutputFilename)
        logging.debug("passedCallsOnly? %s", i_passedVCFCallsOnlyFlag)
        logging.debug("keepPreviousFiltersFlag? %s", i_keepPreviousFiltersFlag)
        logging.debug("blatOutputFormat=%s", i_blatOutputFormat)
        logging.debug("transcriptNameTag %s", i_transcriptNameTag)
        logging.debug("transcriptCoordinateTag %s", i_transcriptCoordinateTag)
        logging.debug("transcriptStrandTag %s", i_transcriptStrandTag)
        logging.debug("rnaInclSecAlign=%s", i_rnaIncludeSecondaryAlignments)
        logging.debug("blatDnaNormal? %s", i_blatDnaNormalReads)
        logging.debug("blatDnaTumor? %s", i_blatDnaTumorReads)
        logging.debug("blatRnaNormal? %s", i_blatRnaNormalReads)
        logging.debug("blatRnaTumor? %s", i_blatRnaTumorReads)
        logging.debug("minReadDepth=%s", i_minReadDepth)
        logging.debug("minReadPercent=%s", i_minReadPercent)
        logging.debug("minOrderMagnitude=%s", i_minOrderMagnitude)

    # check for any errors
    # both the output file and the log file have to be checked, so the log
    # file is added to the list instead of replacing the output file
    i_writeFilenameList = []
    if (cmdLineOpts.outputFilename is not sys.stdout):
        i_writeFilenameList.append(i_outputFilename)
    if (i_logFilename is not None):
        i_writeFilenameList.append(i_logFilename)

    i_readFilenameList = [i_vcfFilename, i_blatOutputFilename]

    if (not radiaUtil.check_for_argv_errors(None,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # open the output stream
    if i_outputFilename is not sys.stdout:
        i_outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)
    else:
        i_outputFileHandler = i_outputFilename

    try:
        # get the VCF generator
        i_vcfGenerator = get_vcf_data(i_vcfFilename,
                                      i_passedVCFCallsOnlyFlag,
                                      i_debug)

        # get the blat hits generator
        i_blatGenerator = parse_blat_output(i_blatOutputFilename,
                                            i_blatOutputFormat,
                                            i_debug)

        # the two generators are consumed in lockstep: the n-th VCF call
        # is expected to belong to the n-th group of blat hits
        for (vcfLine, blatHitsDict) in izip(i_vcfGenerator, i_blatGenerator):

            if (i_debug):
                logging.debug("VCF Line=%s", vcfLine)
                logging.debug("Len Blat Hits=%s", len(blatHitsDict))

            # parse the VCF line
            splitLine = vcfLine.split("\t")

            # the coordinate is the second element
            vcfChr = splitLine[0]
            vcfStopCoordinate = int(splitLine[1])
            vcfIds = splitLine[2]
            vcfRef = splitLine[3]
            vcfAlts = splitLine[4]
            vcfScore = splitLine[5]
            vcfFilterSet = set(splitLine[6].split(";"))
            vcfInfoList = splitLine[7].split(";")
            vcfInfoDict = collections.defaultdict(list)
            for info in vcfInfoList:
                keyValueList = info.split("=")
                # some keys are just singular without a value (e.g. DB, etc.)
                if (len(keyValueList) == 1):
                    vcfInfoDict[keyValueList[0]] = ["True"]
                else:
                    # the value can be a comma separated list
                    vcfInfoDict[keyValueList[0]] = keyValueList[1].split(",")
            vcfRestOfLine = "\t".join(splitLine[8:])

            # the names and coordinates that the blat hits are checked
            # against: the transcript ones if they should be processed
            # and are available for this call, otherwise the genomic ones
            if ((i_transcriptNameTag is not None) and
                    (i_transcriptNameTag in vcfInfoDict)):
                targetNames = vcfInfoDict[i_transcriptNameTag]
                targetCoordinates = vcfInfoDict[i_transcriptCoordinateTag]
            else:
                targetNames = [vcfChr]
                targetCoordinates = [vcfStopCoordinate]

            modTypes = vcfInfoDict["MT"]
            modTypeFilters = dict()
            atLeastOnePass = False
            for modType in modTypes:

                blatOverallReadDepth = 0
                numValidReads = 0

                prefix = ""
                if (modType == "GERM" and i_blatDnaNormalReads):
                    prefix = "dnaNormal"
                elif (modType == "NOR_EDIT" and i_blatRnaNormalReads):
                    prefix = "rnaNormal"
                elif (modType == "SOM" and i_blatDnaTumorReads):
                    prefix = "dnaTumor"
                elif ((modType == "SOM" or modType == "TUM_EDIT") and
                      i_blatRnaTumorReads):
                    prefix = "rnaTumor"

                # get the expected prefix
                vcfKey = "_".join([prefix, vcfChr, str(vcfStopCoordinate)])

                # for each read, investigate the blat
                # hits to see if this read is valid
                for (readId, blatHitList) in blatHitsDict.iteritems():

                    if (i_debug):
                        logging.debug("num of blat hits for read %s=%s",
                                      readId, len(blatHitList))

                    # if the readId does not start with the vcfKey,
                    # then something is wrong. the VCF and blat hits
                    # need to be in sync...
                    if (not readId.startswith(vcfKey)):
                        logging.error("The blat query seems to be out of " +
                                      "sync with the blat hits.")
                        logging.error("VCF Line=%s", vcfLine)
                        logging.error("readId=%s, blatHitsDict=%s",
                                      readId, blatHitsDict[readId][1])
                        sys.exit(1)

                    blatOverallReadDepth += 1

                    # find out if the read is valid or if it
                    # maps to other places in the genome
                    if (i_blatOutputFormat == "PSL"):
                        (isValidRead, validRead) = is_valid_read_psl_format(
                            blatHitList,
                            targetNames,
                            targetCoordinates,
                            i_rnaIncludeSecondaryAlignments,
                            i_debug)
                    elif (i_blatOutputFormat == "BLAST"):
                        (isValidRead, validRead) = is_valid_read_blast_format(
                            blatHitList,
                            targetNames,
                            targetCoordinates,
                            i_rnaIncludeSecondaryAlignments,
                            i_minOrderMagnitude,
                            i_debug)
                    else:
                        # without this branch isValidRead would be unbound
                        # (or left over from a previous read)
                        logging.error("Unsupported blat output format '%s', " +
                                      "it must be one of: PSL, BLAST",
                                      i_blatOutputFormat)
                        sys.exit(1)

                    # if we have only one valid blat hit, then the read
                    # doesn't map to other places in the genome very well,
                    # so let's use it
                    if (isValidRead):
                        numValidReads += 1
                        if (i_debug):
                            logging.debug("ValidRead: %s", validRead)
                    elif (i_debug):
                        logging.debug("not a valid read")

                # the fraction of valid reads, rounded to two decimals
                # before it is compared with the minimum
                if (blatOverallReadDepth > 0):
                    tmpAltPct = numValidReads / float(blatOverallReadDepth)
                    altPercent = round(tmpAltPct, 2)
                else:
                    altPercent = 0.0

                if (numValidReads < i_minReadDepth or
                        altPercent < i_minReadPercent):
                    modTypeFilters[modType] = "blat"
                else:
                    modTypeFilters[modType] = "PASS"
                    atLeastOnePass = True

                if (i_debug):
                    logging.debug("blatOverallReadDepth=%s, " +
                                  "numValidReads=%s, altPercent=%s",
                                  str(blatOverallReadDepth),
                                  str(numValidReads), str(altPercent))
                    logging.debug("modType=%s, passed? %s",
                                  modType, modTypeFilters[modType])
                    logging.debug("blatFilter originalDepth=%s, " +
                                  "validBlatDepth=%s",
                                  str(blatOverallReadDepth),
                                  str(numValidReads))

            # if at least one passed, then remove the ones that didn't.
            # new lists are built from a snapshot of the (type, change)
            # pairs: removing entries from the MC list while it is being
            # iterated skips elements, and removing by value can drop the
            # change of a different type when two types share a change
            modChanges = vcfInfoDict["MC"]
            modPairs = list(zip(modTypes, modChanges))
            if (atLeastOnePass):
                keptPairs = [(aType, aChange)
                             for (aType, aChange) in modPairs
                             if modTypeFilters[aType] != "blat"]
            else:
                keptPairs = modPairs

            # entries without a partner in the other list are kept as is
            modTypes = ([aType for (aType, aChange) in keptPairs] +
                        modTypes[len(modPairs):])
            modChanges = ([aChange for (aType, aChange) in keptPairs] +
                          modChanges[len(modPairs):])

            # set the modTypes and modChanges
            vcfInfoDict["MT"] = modTypes
            vcfInfoDict["MC"] = modChanges

            # if at least one passed, then set pass
            if (atLeastOnePass):
                vcfFilterSet = ["PASS"]
            else:
                # if the user wants to keep the previous filters
                if (i_keepPreviousFiltersFlag):
                    # if the call previous passed, then just set blat
                    if (len(vcfFilterSet) == 1 and "PASS" in vcfFilterSet):
                        vcfFilterSet = ["blat"]
                    # otherwise, add it to the previous filters
                    else:
                        vcfFilterSet.add("blat")
                # otherwise, just set the blat filter
                else:
                    vcfFilterSet = ["blat"]

            # update the mod filters
            # the info dict is a defaultdict(list), so a missing key
            # already yields an empty list
            origins = vcfInfoDict["ORIGIN"]
            modFilters = vcfInfoDict["MF"]
            modFilterTypes = vcfInfoDict["MFT"]

            for origin in origins:
                for (modType, modChange) in zip(modTypes, modChanges):
                    modFilterTypes.append("_".join([origin,
                                                    modType,
                                                    modChange]))
                    modFilters.append("_".join(vcfFilterSet))

            vcfInfoDict["MF"] = modFilters
            vcfInfoDict["MFT"] = modFilterTypes

            output = [vcfChr, str(vcfStopCoordinate), vcfIds, vcfRef,
                      vcfAlts, vcfScore, ";".join(vcfFilterSet)]

            # add the modified info dict: empty entries are skipped, flags
            # are written without a value, everything else as key=v1,v2
            infoFieldList = []
            for key in sorted(vcfInfoDict.iterkeys()):
                if (len(vcfInfoDict[key]) == 0):
                    continue
                elif ("True" in vcfInfoDict[key]):
                    infoFieldList.append(key)
                else:
                    infoFieldList.append(key + "=" +
                                         ",".join(vcfInfoDict[key]))

            output.append(";".join(infoFieldList))
            output.append(vcfRestOfLine)

            i_outputFileHandler.write("\t".join(output) + "\n")

        stopTime = time.time()
        logging.info(
            "filterByBlat.py for Id %s: Total time=%s hrs, %s mins, " +
            "%s secs", i_id,
            ((stopTime - startTime) / (3600)),
            ((stopTime - startTime) / 60),
            (stopTime - startTime))
    finally:
        # close the files, also when we bail out early
        if (i_outputFilename is not sys.stdout):
            i_outputFileHandler.close()

    return
def filter_events(aTCGAId, aChrom, aBedFilename, aVCFFilename,
                  anOutputFilename, aFilterName, aFilterField,
                  anIncludeOverlapInfo, anIncludeFilterName, anIdField,
                  anIncludeId, anIncludeCount, aFilterHeaderLine,
                  aBinSize, anIsDebug):
    '''
    ' This function reads from a .bed file and a .vcf file line by line and
    ' looks for variants that should be filtered or tagged.  The .bed file
    ' specifies coordinates for areas where variants should either be
    ' included or excluded.  For example, a .bed file specifying
    ' transcription or exon start and stop coordinates can be provided along
    ' with the --includeOverlaps flag to indicate that the variants in these
    ' regions should be kept, and variants outside of these regions should
    ' be flagged or filtered out.  Conversely, a bed file specifying areas
    ' of the genome that are accessible (as defined by the 1000 Genomes
    ' project) can be given without the --includeOverlaps flag to indicate
    ' that the variants outside of the accessible genome should be flagged
    ' or filtered out, and variants overlapping the accessible regions
    ' should not be flagged or filtered out.
    '
    ' Every variant is written exactly once: either re-assembled from its
    ' (possibly tagged) fields or as the unchanged VCF line.  The variants
    ' go to anOutputFilename, or to STDOUT when anOutputFilename is None.
    ' Both files are closed when the function is left, also on errors.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aBedFilename: A .bed file with at least 3 columns specifying the
    '     chrom, start, and stop coordinates and possibly a 4th column
    '     with an id
    ' aVCFFilename: A .vcf file with variants that will be either
    '     included or excluded
    ' anOutputFilename: An output file where the filtered variants are
    '     output
    ' aFilterName: The name of the filter
    ' aFilterField: The field where the filter name should be included
    '     (e.g. INFO or FILTER)
    ' anIncludeOverlapInfo: A flag specifying whether the variants should
    '     be included or excluded when they overlap
    ' anIncludeFilterName: A flag specifying whether the filtering name
    '     should be included in the output or not
    ' anIdField: The field where the ID should be specified
    '     (e.g. ID or INFO)
    ' anIncludeId: A flag specifying whether the id should be included in
    '     the output or not
    ' anIncludeCount: A flag specifying whether the number of overlaps
    '     should be included in the output or not
    ' aFilterHeaderLine: A filter header line that should be added to the
    '     VCF header describing this filter
    ' aBinSize: The size of the interval between each bin
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # initialize pybed with the filtering file
    filterPybed = pybed(binsize=aBinSize)
    filterPybed.load_from_file(aBedFilename)

    # get the vcf file
    i_vcfFileHandler = radiaUtil.get_read_fileHandler(aVCFFilename)
    i_outputFileHandler = None

    try:
        # get the output file, everything goes to STDOUT without one
        if (anOutputFilename is not None):
            i_outputFileHandler = radiaUtil.get_write_fileHandler(
                anOutputFilename)
            writeOutput = i_outputFileHandler.write
        else:
            writeOutput = sys.stdout.write

        # create the generator for the vcf file
        vcfGenerator = get_vcf_data(i_vcfFileHandler,
                                    i_outputFileHandler,
                                    aFilterHeaderLine,
                                    anIsDebug)

        # initialize some variables
        overlappingEvents = 0
        nonOverlappingEvents = 0
        totalEvents = 0
        startTime = time.time()

        # for each vcf line
        for (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate, vcf_id,
             vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info,
             vcf_restLine, vcf_line) in (vcfGenerator):

            totalEvents += 1

            if (anIsDebug):
                logging.debug("VCF: %s", vcf_line)

            # check if this vcf coordinate overlaps with the filter
            # coordinates
            posTuple = (vcf_chr, vcf_startCoordinate, vcf_stopCoordinate)
            (isOverlap, idValue, count) = filterPybed.overlaps_with(
                posTuple, anIncludeCount)

            # count the event and decide whether it gets the additional
            # info: overlapping events get it when info about the overlaps
            # is wanted, the non-overlapping events get it otherwise
            if (isOverlap):
                overlappingEvents += 1
                addInfo = anIncludeOverlapInfo
            else:
                nonOverlappingEvents += 1
                addInfo = not anIncludeOverlapInfo

            if (addInfo):
                # alter the filter and id name if appropriate
                if (anIncludeFilterName):
                    (vcf_filter, vcf_info) = add_filter(
                        vcf_filter, vcf_info, aFilterName, aFilterField,
                        anIncludeCount, count, anIncludeId, anIdField,
                        idValue)
                if (anIncludeId and anIdField == "ID"):
                    vcf_id = add_id(vcf_id, idValue)

                # output the event re-assembled from its fields
                outputList = (vcf_chr, str(vcf_stopCoordinate), vcf_id,
                              vcf_ref, vcf_alt, vcf_qual, vcf_filter,
                              vcf_info)
                writeOutput("\t".join(outputList) + "\t" +
                            "\t".join(vcf_restLine) + "\n")
            else:
                # nothing to add, just output the original line
                writeOutput(vcf_line + "\n")

        stopTime = time.time()
        logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", aChrom, aTCGAId,
                     ((stopTime - startTime) / (3600)),
                     ((stopTime - startTime) / 60),
                     (stopTime - startTime))

        # sanity check on the counters
        if (overlappingEvents + nonOverlappingEvents == totalEvents):
            logging.info(
                "For chrom %s and Id %s: %s (overlapping events) + " +
                "%s (non-overlapping events) = %s", aChrom, aTCGAId,
                overlappingEvents, nonOverlappingEvents, totalEvents)
        else:
            logging.info(
                "filterByPybed Warning: For chrom %s and Id %s: %s " +
                "(overlapping events) + %s (non-overlapping events) = %s",
                aChrom, aTCGAId, overlappingEvents, nonOverlappingEvents,
                totalEvents)
    finally:
        # close the files
        i_vcfFileHandler.close()
        if (i_outputFileHandler is not None):
            i_outputFileHandler.close()

    return
def compare_events(aTCGAId, aChrom, aRadiaFilename, aCompareFilename, aStatsFilename, anOverlapFilename, aNonOverlapFilename, aCompareDict, anIsDebug):
    '''
    ' The function compares variants in one file with variants in another
    ' file.  This can be used to compare variants from different methods,
    ' MAF files, or validation files.  At a minimum, the coordinates are
    ' compared.  The user can also specify additional comparisons that
    ' should be done such as comparing if the call was classified as somatic
    ' in both methods (e.g. SOM=Somatic).  The keys and values can be
    ' comma-separated lists.  For example, a call may be labeled as
    ' blacklisted in one file with "blck" and in another file with "blq" or
    ' "bldp", then the comparison string would be blck=blq,bldp.
    '
    ' The counts are collected in a stats dictionary, one tab-separated
    ' stats line is appended to aStatsFilename (when given) and is also
    ' logged at INFO level.
    '
    ' aTCGAId: The TCGA Id for this sample
    ' aChrom: The chromosome being filtered
    ' aRadiaFilename: A .vcf file from RADIA
    ' aCompareFilename: A file to compare to
    ' aStatsFilename: A stats file
    ' anOverlapFilename: A file where all the overlaps are output
    ' aNonOverlapFilename: A file where all the non-overlaps are output
    ' aCompareDict: A dictionary of key=value to be compared
    '     (coordinate is always compared)
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # load both files; each call returns a dictionary that is iterated
    # below as (coordinate, line) pairs plus the updated stats dictionary
    # (get_maf_data, get_validation_data and get_simulation_data were used
    # here instead for other kinds of compare files)
    i_statsDict = collections.defaultdict(int)
    i_filterDict = collections.defaultdict(int)
    (i_radDict, i_statsDict) = get_vcf_data(aRadiaFilename, i_statsDict, aCompareDict, "rad", anIsDebug)
    (i_cmpDict, i_statsDict) = get_vcf_data(aCompareFilename, i_statsDict, aCompareDict, "cmp", anIsDebug)

    # the overlap and non-overlap files are optional, the handlers only
    # exist when the corresponding filename was given
    if (anOverlapFilename != None):
        overlapFileHandler = radiaUtil.get_write_fileHandler(anOverlapFilename)

    if (aNonOverlapFilename != None):
        nonOverlapFileHandler = radiaUtil.get_write_fileHandler(aNonOverlapFilename)

    # initialize some variables
    startTime = time.time()

    # for each cmp event: passing calls in the compare file that RADIA
    # does not have at all are only reported in debug mode
    for (cmpCoordinate, cmpLine) in i_cmpDict.iteritems():
        # this predicate is the one for comparing BB, Radia, or Maf results
        # (other predicates were used for blacklist and validation data)
        # note that the tests are plain substring tests on the whole line
        if ("PASS" in cmpLine and ("SOM" in cmpLine or "EDIT" in cmpLine or "RNA_TUM_VAR" in cmpLine or "RNA_NOR_VAR" in cmpLine) and cmpCoordinate not in i_radDict):
            if (anIsDebug):
                logging.debug("no radia call %s", cmpLine)

    # for each rad event
    for (radCoordinate, radLine) in i_radDict.iteritems():

        # passing RADIA SNP calls without a counterpart in the compare
        # file are written to the non-overlap file
        if ("PASS" in radLine and ("SOM" in radLine or "EDIT" in radLine or "RNA_TUM_VAR" in radLine or "RNA_NOR_VAR" in radLine) and "SNP" in radLine and radCoordinate not in i_cmpDict):
            if (anIsDebug):
                logging.debug("new radia call %s", radLine)

            if (aNonOverlapFilename != None):
                nonOverlapFileHandler.write(radLine + "\n")

        # if the coordinates overlap, then count them
        if (radCoordinate in i_cmpDict):
            i_statsDict["overlap_events"] += 1
            compareLine = i_cmpDict[radCoordinate]

            # this predicate is the one for Radia and validation
            # (other predicates were used for BB/Maf and Radia to Radia
            # comparisons): both calls have to pass with one of the types
            if ("PASS" in radLine and ("SOM" in radLine or "EDIT" in radLine or "RNA_TUM_VAR" in radLine or "RNA_NOR_VAR" in radLine) and ("PASS" in compareLine and ("SOM" in compareLine or "EDIT" in compareLine or "RNA_TUM_VAR" in compareLine or "RNA_NOR_VAR" in compareLine))):
                i_statsDict["overlap_pass_events"] += 1

            # for each key to compare
            # there can be multiple keys for one filter such as blq and
            # bldp for blacklists
            for (radKeyString, cmpKeyString) in aCompareDict.iteritems():
                # break up the strings to get the individual keys
                radKeyList = radKeyString.split(",")
                cmpKeyList = cmpKeyString.split(",")

                # set some booleans
                foundInRad = False
                foundInCmp = False

                # search for one of them; after the loop radKey is the
                # key that matched (or the last key when none matched)
                for radKey in radKeyList:
                    # if we find one
                    if (radKey in radLine):
                        foundInRad = True
                        break;

                # search for one of them; same convention for cmpKey
                for cmpKey in cmpKeyList:
                    # if we find one
                    if (cmpKey in compareLine):
                        foundInCmp = True
                        break;

                # if the keys exist in both files at the same position,
                # then count them
                if (foundInRad and foundInCmp):
                    # NOTE(review): the RADIA pass check below is assumed
                    # to be nested inside this compare pass check --
                    # confirm against the intended indentation
                    if ("PASS" in compareLine):
                        i_statsDict["overlap_" + radKey] += 1
                        # NOTE(review): these three values are not used in
                        # the passing branch and are recomputed in the
                        # else branch
                        splitLine = radLine.split("\t")
                        filterString = splitLine[6]
                        filterList = filterString.split(";")

                        if ("PASS" in radLine):
                            i_statsDict["overlap_pass_" + radKey] += 1

                            if (anIsDebug):
                                logging.debug("found call %s", compareLine)

                            if (anOverlapFilename != None):
                                overlapFileHandler.write(radLine + "\n")
                                # we only want to write the line to the
                                # overlap file once even if it matches as
                                # a SOM and an EDIT
                                # NOTE(review): this also ends the counting
                                # for the remaining keys, but only when an
                                # overlap file is given -- confirm
                                break;
                        else:
                            if (anIsDebug):
                                logging.debug("found but no radia pass %s %s", radLine, compareLine)

                            # count the filters (7th column) that kept
                            # the RADIA call from passing
                            splitLine = radLine.split("\t")
                            filterString = splitLine[6]
                            filterList = filterString.split(";")
                            for filterKey in filterList:
                                i_filterDict[filterKey] += 1

                            if (aNonOverlapFilename != None):
                                nonOverlapFileHandler.write(radLine + "\n")
                # the remaining cases are only reported in debug mode
                elif (anIsDebug and foundInRad):
                    logging.debug("overlap but not found in compare file %s %s %s", radKey, radLine, compareLine)
                elif (anIsDebug and foundInCmp):
                    logging.debug("overlap but not found in RADIA %s %s %s", cmpKey, radLine, compareLine)
                elif (anIsDebug):
                    logging.debug("overlap but not same type %s %s %s %s", radKeyList, cmpKeyList, radLine, compareLine)

    # the stats line has these columns:
    # aTCGAId, aChrom, rad_events, cmp_events, overlap_events,
    # rad_pass_events, cmp_pass_events, overlap_pass_events,
    # followed by six columns for every key to compare
    outputList = [aTCGAId, aChrom]
    outputList += [str(i_statsDict["rad_events"]), str(i_statsDict["cmp_events"]), str(i_statsDict["overlap_events"])]
    outputList += [str(i_statsDict["rad_pass_events"]), str(i_statsDict["cmp_pass_events"]), str(i_statsDict["overlap_pass_events"])]

    # for each key to compare, get the total radias, total cmps, and overlaps
    # NOTE(review): radKey and cmpKey are the complete (possibly
    # comma-separated) strings from aCompareDict here, while the overlap
    # counters above are keyed by the individual key that matched --
    # confirm that these agree for entries with more than one key
    for radKey in sorted(aCompareDict.iterkeys()):
        cmpKey = aCompareDict[radKey]
        outputList += [str(i_statsDict["rad_" + radKey]), str(i_statsDict["cmp_" + cmpKey]), str(i_statsDict["overlap_" + radKey])]
        outputList += [str(i_statsDict["rad_pass_" + radKey]), str(i_statsDict["cmp_pass_" + cmpKey]), str(i_statsDict["overlap_pass_" + radKey])]

    # how often each filter was seen on the overlapping RADIA calls that
    # did not pass
    for (filterKey, count) in i_filterDict.iteritems():
        logging.debug("filter: %s\t%s", filterKey, count)

    # append the stats line to the stats file if there is one
    i_statsFileHandler = None
    if (aStatsFilename != None):
        i_statsFileHandler = radiaUtil.get_append_fileHandler(aStatsFilename)
        i_statsFileHandler.write("\t".join(outputList) + "\n")
        i_statsFileHandler.close()

    stopTime = time.time()
    logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, %s secs", aChrom, aTCGAId, ((stopTime-startTime)/(3600)), ((stopTime-startTime)/60), (stopTime-startTime))
    logging.info("\t".join(outputList))

    # close the optional files
    if (anOverlapFilename != None):
        overlapFileHandler.close()
    if (aNonOverlapFilename != None):
        nonOverlapFileHandler.close()
    return
def main():
    '''
    ' Command line entry point that merges a "passing" VCF file with the
    ' original VCF file and writes the merged result to the output file.
    '
    ' The output starts with the header lines of the original file, followed
    ' by the info lines (original first, then any passing-only lines), the
    ' filter lines (same order), and the chrom line of the original file.
    ' Then, for every coordinate of the original file in numerical order,
    ' the line from the passing file is written if that coordinate exists
    ' there, otherwise the line from the original file is written.
    '
    ' Positional args:  passingFile originalFile outputFile
    ' Options:          -l/--log LOG, -g/--logFilename LOG_FILE
    '
    ' Exits with status 1 when the command line is incomplete or when
    ' radiaUtil.check_for_argv_errors() reports a problem.  Raises a
    ' ValueError when the log level is not a valid logging level name.
    '''
    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog passingFile originalFile outputFile [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log", default="WARNING",
        dest="logLevel", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length of sys.argv also counts the options, so make sure that all
    # three positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_passingFilename = i_cmdLineArgs[0]
    i_originalFilename = i_cmdLineArgs[1]
    i_outputFilename = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the value here, exceptions don't apply format args
        raise ValueError("Invalid log level: '%s' must be one of the "
                         "following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
                         % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(
            level=i_numericLogLevel,
            filename=i_logFilename,
            filemode='w',
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(
            level=i_numericLogLevel,
            format='%(asctime)s\t%(levelname)s\t%(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("passingFile=%s", i_passingFilename)
        logging.debug("originalFile=%s", i_originalFilename)
        logging.debug("outputFilename=%s", i_outputFilename)

    # check for any errors
    i_readFilenameList = [i_passingFilename, i_originalFilename]
    i_writeFilenameList = [i_outputFilename]
    i_dirList = None

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF data, the chrom line of the original file is the one
    # that ends up in the output because it is read last
    (passHeaderList, chromLine, passInfoList,
     passFilterList, passCoordinateDict) = get_vcf_data(i_passingFilename,
                                                        i_debug)
    (orgHeaderList, chromLine, orgInfoList,
     orgFilterList, orgCoordinateDict) = get_vcf_data(i_originalFilename,
                                                      i_debug)

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure the output file gets closed even if a write fails
    try:
        for headerLine in orgHeaderList:
            outputFileHandler.write(headerLine + "\n")

        # the original info lines first, then the new ones from passing
        for headerLine in orgInfoList:
            outputFileHandler.write(headerLine + "\n")
        for headerLine in passInfoList:
            if (headerLine not in orgInfoList):
                outputFileHandler.write(headerLine + "\n")

        # the original filter lines first, then the new ones from passing
        for headerLine in orgFilterList:
            outputFileHandler.write(headerLine + "\n")
        for headerLine in passFilterList:
            if (headerLine not in orgFilterList):
                outputFileHandler.write(headerLine + "\n")

        outputFileHandler.write(chromLine + "\n")

        # sorted() works on both Python 2 and 3, keys().sort() doesn't
        for coordinate in sorted(orgCoordinateDict, key=int):
            # the line from the passing file replaces the original one
            if (coordinate in passCoordinateDict):
                line = passCoordinateDict[coordinate]
            else:
                line = orgCoordinateDict[coordinate]

            # no newline is added here, same as for the original lines
            outputFileHandler.write(line)

        stopTime = time.time()
        # each value is the total elapsed time in that unit
        logging.info("Total time=%s hrs, %s mins, %s secs",
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))
    finally:
        # close the files
        outputFileHandler.close()

    return
def compare_events(anId, aChrom, aRadiaFilename, aCompareFilename,
                   aStatsFilename, anOverlapFilename, aNonOverlapFilename,
                   aCompareDict, anIsDebug):
    '''
    ' The function compares variants in one file with variants in another file.
    ' This can be used to compare variants from different methods, MAF files,
    ' or validation files.  At a minimum, the coordinates are compared.  The
    ' user can also specify additional comparisons that should be done such as
    ' comparing if the call was classified as somatic in both methods
    ' (e.g. SOM=Somatic).  The keys and values can be comma-separated lists.
    ' For example, a call may be labeled as blacklisted in one file with "blck"
    ' and in another file with "blq" or "bldp", then the comparison string
    ' would be blck=blq,bldp.
    '
    ' One tab-separated line of statistics is appended to the stats file (when
    ' a stats filename is given) and logged at the INFO level.  Nothing is
    ' returned.
    '
    ' anId: The Id for this sample
    ' aChrom: The chromosome being filtered
    ' aRadiaFilename: A .vcf file from RADIA
    ' aCompareFilename: A file to compare to
    ' aStatsFilename: A stats file (appended to), or None
    ' anOverlapFilename: A file where all the overlaps are output, or None
    ' aNonOverlapFilename: A file where all the non-overlaps are output,
    '                      or None
    ' aCompareDict: A dictionary of key=value to be compared
    ' anIsDebug: A flag for outputting debug messages to STDERR
    '''

    # the call types that are considered when counting passing calls
    callTypes = ("SOM", "EDIT", "RNA_TUM_VAR", "RNA_NOR_VAR")

    def is_passing_call(aLine):
        # a passing call contains PASS and at least one of the call types
        return (("PASS" in aLine) and
                any(callType in aLine for callType in callTypes))

    # load the data and the initial stats for both files
    i_statsDict = collections.defaultdict(int)
    i_filterDict = collections.defaultdict(int)

    # NOTE(review): other input formats were loaded here in the past with
    # get_maf_data(), get_validation_data(), or get_simulation_data() -
    # presumably defined elsewhere with the same arguments, TODO confirm
    (i_radDict, i_statsDict) = get_vcf_data(aRadiaFilename, i_statsDict,
                                            aCompareDict, "rad", anIsDebug)
    (i_cmpDict, i_statsDict) = get_vcf_data(aCompareFilename, i_statsDict,
                                            aCompareDict, "cmp", anIsDebug)

    overlapFileHandler = None
    nonOverlapFileHandler = None

    # the try/finally makes sure that the overlap and non-overlap files
    # get closed even when an error occurs while comparing
    try:
        if (anOverlapFilename is not None):
            overlapFileHandler = radiaUtil.get_write_fileHandler(
                anOverlapFilename)
        if (aNonOverlapFilename is not None):
            nonOverlapFileHandler = radiaUtil.get_write_fileHandler(
                aNonOverlapFilename)

        # initialize some variables
        startTime = time.time()

        # for each cmp event, this loop only produces debug messages,
        # so there is no need to run it when debugging is turned off
        if (anIsDebug):
            # items() works on both Python 2 and 3
            for (cmpCoordinate, cmpLine) in i_cmpDict.items():
                # this one is for comparing BB, Radia, or Maf results
                if ((cmpCoordinate not in i_radDict) and
                        is_passing_call(cmpLine)):
                    logging.debug("no radia call %s", cmpLine)

        # for each rad event
        for (radCoordinate, radLine) in i_radDict.items():

            # a passing SNP from RADIA that isn't in the compare file
            if ((radCoordinate not in i_cmpDict) and
                    ("SNP" in radLine) and
                    is_passing_call(radLine)):
                if (anIsDebug):
                    logging.debug("new radia call %s", radLine)

                if (aNonOverlapFilename is not None):
                    nonOverlapFileHandler.write(radLine + "\n")

            # if the coordinates overlap, then count them
            if (radCoordinate in i_cmpDict):
                i_statsDict["overlap_events"] += 1
                compareLine = i_cmpDict[radCoordinate]

                # this one is for Radia to Radia comparisons
                if (is_passing_call(radLine) and
                        is_passing_call(compareLine)):
                    i_statsDict["overlap_pass_events"] += 1

                # for each key to compare
                # there can be multiple keys for one filter
                # such as blq and bldp for blacklists
                for (radKeyString, cmpKeyString) in aCompareDict.items():
                    # break up the strings to get the individual keys
                    radKeyList = radKeyString.split(",")
                    cmpKeyList = cmpKeyString.split(",")

                    # set some booleans
                    foundInRad = False
                    foundInCmp = False

                    # search for one of them, after the loop radKey is
                    # the key that was found (or the last one in the list)
                    for radKey in radKeyList:
                        # if we find one
                        if (radKey in radLine):
                            foundInRad = True
                            break

                    # search for one of them, after the loop cmpKey is
                    # the key that was found (or the last one in the list)
                    for cmpKey in cmpKeyList:
                        # if we find one
                        if (cmpKey in compareLine):
                            foundInCmp = True
                            break

                    # if the keys exist in both files at the
                    # same position, then count them
                    if (foundInRad and foundInCmp):
                        i_statsDict["overlap_" + radKey] += 1

                        if ("PASS" in radLine):
                            i_statsDict["overlap_pass_" + radKey] += 1

                            if (anIsDebug):
                                logging.debug("found call %s", compareLine)

                            if (anOverlapFilename is not None):
                                overlapFileHandler.write(radLine + "\n")
                                # we only want to write the line to the
                                # overlap file once even if it matches
                                # as a SOM and an EDIT
                                # NOTE(review): this also stops counting
                                # the remaining keys for this line, but
                                # only when an overlap file is given -
                                # confirm that this is intended
                                break
                        else:
                            if (anIsDebug):
                                logging.debug("found but no radia pass %s %s",
                                              radLine, compareLine)

                            # count the filters (7th column) of the call
                            splitLine = radLine.split("\t")
                            filterString = splitLine[6]
                            filterList = filterString.split(";")
                            for filterKey in filterList:
                                i_filterDict[filterKey] += 1

                            if (aNonOverlapFilename is not None):
                                nonOverlapFileHandler.write(radLine + "\n")

                    elif (anIsDebug and foundInRad):
                        logging.debug("overlap but not found in compare "
                                      "file %s %s %s",
                                      radKey, radLine, compareLine)
                    elif (anIsDebug and foundInCmp):
                        logging.debug("overlap but not found in RADIA "
                                      "%s %s %s",
                                      cmpKey, radLine, compareLine)
                    elif (anIsDebug):
                        logging.debug("overlap but not same type %s %s %s %s",
                                      radKeyList, cmpKeyList,
                                      radLine, compareLine)

        # anId, aChrom, rad_events, cmp_events, overlap_events,
        # rad_pass_events, cmp_pass_events, overlap_pass_events,
        # [rad_key, cmp_key, overlap_radKey,
        #  rad_pass_key, cmp_pass_key, overlap_pass_radKey]{n}
        outputList = [anId, aChrom]
        outputList += [str(i_statsDict["rad_events"]),
                       str(i_statsDict["cmp_events"]),
                       str(i_statsDict["overlap_events"])]
        outputList += [str(i_statsDict["rad_pass_events"]),
                       str(i_statsDict["cmp_pass_events"]),
                       str(i_statsDict["overlap_pass_events"])]

        # for each key to compare, get the total radias,
        # total cmps, and overlaps
        for radKey in sorted(aCompareDict):
            cmpKey = aCompareDict[radKey]
            outputList += [str(i_statsDict["rad_" + radKey]),
                           str(i_statsDict["cmp_" + cmpKey]),
                           str(i_statsDict["overlap_" + radKey])]
            outputList += [str(i_statsDict["rad_pass_" + radKey]),
                           str(i_statsDict["cmp_pass_" + cmpKey]),
                           str(i_statsDict["overlap_pass_" + radKey])]

        for (filterKey, count) in i_filterDict.items():
            logging.debug("filter: %s\t%s", filterKey, count)

        # append the stats to the stats file
        if (aStatsFilename is not None):
            i_statsFileHandler = radiaUtil.get_append_fileHandler(
                aStatsFilename)
            try:
                i_statsFileHandler.write("\t".join(outputList) + "\n")
            finally:
                i_statsFileHandler.close()

        stopTime = time.time()
        # each value is the total elapsed time in that unit
        logging.info("Chrom %s and Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", aChrom, anId,
                     ((stopTime-startTime)/(3600)),
                     ((stopTime-startTime)/60),
                     (stopTime-startTime))
        logging.info("\t".join(outputList))
    finally:
        if (overlapFileHandler is not None):
            overlapFileHandler.close()
        if (nonOverlapFileHandler is not None):
            nonOverlapFileHandler.close()

    return
def main():
    '''
    ' Command line entry point that merges the VCF data for one Id from the
    ' input directory into one VCF file in the output directory.
    '
    ' The output file is named <id>.vcf, or <id>.vcf.gz when --gzip is
    ' given.  The header sections are written in the order metadata, filter,
    ' info, format, and chrom line.  The chroms with numerical names are
    ' written first in numerical order, followed by the remaining chroms in
    ' alphabetical order.
    '
    ' Positional args:  id inputDir outputDir
    ' Options:          -l/--log LOG, -g/--logFilename LOG_FILE, --gzip
    '
    ' Exits with status 1 when the command line is incomplete or when
    ' radiaUtil.check_for_argv_errors() reports a problem.  Raises a
    ' ValueError when the log level is not a valid logging level name.
    '''
    # command for running this on a small test case:
    # python mergeChroms.py TCGA-BH-A18P
    # ../data/test/ ../data/test/ --log=DEBUG

    startTime = time.time()

    # create the usage statement
    usage = "usage: python %prog id inputDir outputDir [Options]"
    i_cmdLineParser = OptionParser(usage=usage)

    i_cmdLineParser.add_option(
        "-l", "--log",
        dest="logLevel", default="WARNING", metavar="LOG",
        help="the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL), " +
             "%default by default")
    i_cmdLineParser.add_option(
        "-g", "--logFilename",
        dest="logFilename", metavar="LOG_FILE",
        help="the name of the log file, STDERR by default")
    i_cmdLineParser.add_option(
        "", "--gzip",
        dest="gzip", action="store_true", default=False,
        help="include this argument if the final VCF should be " +
             "compressed with gzip")

    # range(inclusiveFrom, exclusiveTo, by)
    i_possibleArgLengths = range(3, 10, 1)
    i_argLength = len(sys.argv)

    # check if this is one of the possible correct commands
    if (i_argLength not in i_possibleArgLengths):
        i_cmdLineParser.print_help()
        sys.exit(1)

    # get the required parameters
    (i_cmdLineOptions, i_cmdLineArgs) = i_cmdLineParser.parse_args()

    # the length of sys.argv also counts the options, so make sure that all
    # three positional arguments are really there before indexing them
    if (len(i_cmdLineArgs) < 3):
        i_cmdLineParser.print_help()
        sys.exit(1)

    i_id = i_cmdLineArgs[0]
    i_inputDir = i_cmdLineArgs[1]
    i_outputDir = i_cmdLineArgs[2]

    # get the optional params with default values
    i_logLevel = i_cmdLineOptions.logLevel
    i_gzip = i_cmdLineOptions.gzip

    i_logFilename = None
    if (i_cmdLineOptions.logFilename is not None):
        i_logFilename = str(i_cmdLineOptions.logFilename)

    # assuming loglevel is bound to the string value obtained from the
    # command line argument. Convert to upper case to allow the user to
    # specify --log=DEBUG or --log=debug
    i_numericLogLevel = getattr(logging, i_logLevel.upper(), None)
    if not isinstance(i_numericLogLevel, int):
        # interpolate the value here, exceptions don't apply format args
        raise ValueError(
            "Invalid log level: '%s' must be one of the "
            "following: DEBUG, INFO, WARNING, ERROR, CRITICAL"
            % i_logLevel)

    # set up the logging
    if (i_logFilename is not None):
        logging.basicConfig(level=i_numericLogLevel,
                            filename=i_logFilename,
                            filemode='w',
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
    else:
        logging.basicConfig(level=i_numericLogLevel,
                            format='%(asctime)s\t%(levelname)s\t%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')

    # set the debug
    i_debug = (i_numericLogLevel == logging.DEBUG)

    # output some debug info
    if (i_debug):
        logging.debug("id=%s", i_id)
        logging.debug("inputDir=%s", i_inputDir)
        logging.debug("outputDir=%s", i_outputDir)
        logging.debug("logLevel=%s", i_logLevel)
        logging.debug("logFile=%s", i_logFilename)
        logging.debug("gzip=%s", i_gzip)

    # check for any errors
    i_readFilenameList = None
    if (i_logFilename is not None):
        i_writeFilenameList = [i_logFilename]
    else:
        i_writeFilenameList = None
    i_dirList = [i_inputDir, i_outputDir]

    if (not radiaUtil.check_for_argv_errors(i_dirList,
                                            i_readFilenameList,
                                            i_writeFilenameList)):
        sys.exit(1)

    # get the VCF data
    (headerDict, coordDict) = get_vcf_data(i_id, i_inputDir, i_debug)

    # NOTE(review): presumably radiaUtil.get_write_fileHandler() picks the
    # gzip compression based on the .gz extension - TODO confirm
    if (i_gzip):
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf.gz")
    else:
        i_outputFilename = os.path.join(i_outputDir, i_id + ".vcf")

    outputFileHandler = radiaUtil.get_write_fileHandler(i_outputFilename)

    # make sure the output file gets closed even if a write fails
    try:
        # if we have header info to output
        if (len(headerDict["metadata"]) > 0):
            # output the header information
            for headerKey in ("metadata", "filter", "info", "format"):
                headerLines = headerDict[headerKey]
                # an empty section would otherwise add a blank line
                # to the header of the VCF
                if (len(headerLines) > 0):
                    outputFileHandler.write("\n".join(headerLines) + "\n")
            outputFileHandler.write("".join(headerDict["chrom"]) + "\n")

        # first output the numerical chroms in order
        # sorted() works on both Python 2 and 3, keys().sort() doesn't
        for chrom in sorted(coordDict["numbers"], key=int):
            outputFileHandler.write(
                "\n".join(coordDict["numbers"][chrom]) + "\n")

        # then output the alphabetical chroms in order
        for chrom in sorted(coordDict["letters"], key=str):
            outputFileHandler.write(
                "\n".join(coordDict["letters"][chrom]) + "\n")

        stopTime = time.time()
        # each value is the total elapsed time in that unit
        logging.info("Total time for Id %s: Total time=%s hrs, %s mins, " +
                     "%s secs", i_id,
                     ((stopTime - startTime) / (3600)),
                     ((stopTime - startTime) / 60),
                     (stopTime - startTime))
    finally:
        # close the files
        outputFileHandler.close()

    return