def findMutationType(self, genomeFn, chromSizesDict):
    '''
    Set self.mutationType from the alleles of this mutation.

    Calls made by the 'SomaticIndelDetector' algorithm are classified only by
    allele length: a longer mutant allele is Mutation.INS, a shorter one is
    Mutation.DEL (equal lengths leave mutationType untouched).  Any other call
    must have a single-base reference allele; the sequence returned by getDNA
    around the position is padded with "N" at chromosome edges and handed to
    Mutation.getMutationType.
    '''
    if self.algorithm == 'SomaticIndelDetector':
        refLen, mutLen = len(self.refAllele), len(self.mutAllele)
        if refLen < mutLen:
            self.mutationType = Mutation.INS
        elif refLen > mutLen:
            self.mutationType = Mutation.DEL
        return

    if len(self.refAllele) != 1:
        errAbort("Can't handle non-single-bp mutations at this point.")

    chromSize = chromSizesDict[self.chrom]
    windowEnd = min(self.chromStart + 1 + 1, chromSize)
    mutDNA = getDNA(self.chrom, max(0, self.chromStart - 1), windowEnd,
                    genomeFn, noMask=True)

    # Pad out any edge effects so the window is always three characters wide
    if self.chromStart == 0:
        mutDNA = "N" + mutDNA
    if self.chromStart + 1 == chromSize:
        mutDNA += "N"

    if len(mutDNA) != 3:
        errAbort("Bad input for mutation analysis: %s" % mutDNA)
    else:
        coords = "%s:%d-%d" % (self.chrom, max(0, self.chromStart), windowEnd)
        self.mutationType = Mutation.getMutationType(mutDNA, coords=coords)
def filterPointMutations(mutFn, retList): ''' Append valid point mutations to the retList ''' global AllFilters try: f = open(mutFn) except IOError: errAbort("Point mutation file %s does not exist." % mutFn) version = f.readline().strip() rawHeader = f.readline().strip() numCols = validateMutectorFile(version, rawHeader.replace('\t', ' ')) snpRemovalFilters = AllFilters['SNPRemovalFilters'] snpAnnotationFilters = AllFilters['SNPAnnotationFilters'] print "Screening SNP mutations with the following parameters:" print " ", "For removal of the call:" for k in sorted(snpRemovalFilters.keys()): print " ", k, ":", snpRemovalFilters[k] print " ", "For annotation of the call:" for k in sorted(snpAnnotationFilters.keys()): print " ", k, ":", snpAnnotationFilters[k] # Massage removal filters to be more efficient snpRemovalFilters['muTectJudgmentsAllowed'] = set( snpRemovalFilters['muTectJudgmentsAllowed'].split(',')) # Loop through each successive line in the file while True: line = f.readline() if not line: break mutation = MuTectorLine(line, snpAnnotationFilters) if mutation.shouldBeKept(snpRemovalFilters): retList.append(mutation)
def configure(fn):
    '''
    Read the configuration file fn and overlay its values onto AllFilters.

    Only sections that already exist in AllFilters are consulted.  Each option
    in such a section must name an existing filter; its value is converted to
    the type (int, float or str) of the value currently stored for that
    filter.  Returns the parser object.
    '''
    global AllFilters

    def getVal(val, defaultVal):
        # Convert val to the type of the default it replaces
        for caster in (int, float, str):
            if isinstance(defaultVal, caster):
                return caster(val)
        errAbort("Unsure how to convert %s" % val)

    config = ConfigParser.SafeConfigParser()
    config.optionxform = str  # keep option names case-sensitive
    config.read(fn)

    for section in AllFilters.keys():
        if not config.has_section(section):
            continue
        knownFilters = AllFilters[section]
        for var, val in config.items(section):
            if var not in knownFilters:
                errAbort("%s filter %s is not a valid filter." % (section, var))
            knownFilters[var] = getVal(val, knownFilters[var])
    return config
def getVal(val, defaultVal):
    '''
    Convert val to the type of defaultVal.

    The default is tested against int, float and str in that order and val is
    passed through the first matching constructor.  Any other default type is
    reported through errAbort.
    '''
    for caster in (int, float, str):
        if isinstance(defaultVal, caster):
            return caster(val)
    errAbort("Unsure how to convert %s" % val)
def map_Q0_reads(self):
    '''
    Return the number of Q0 reads mapped here

    The value is taken from the "MQ0=" entry of the ';'-separated info field.
    NA is returned when there is no such entry; more than one entry is
    reported through errAbort.
    '''
    tag = "MQ0="
    q0Fields = [field for field in self.info.split(';')
                if field.startswith(tag)]
    if not q0Fields:
        return NA
    if len(q0Fields) > 1:
        errAbort("MQ0 should only be specified once in the info file: %s" %
                 self.__str__())
    return int(q0Fields[0].replace(tag, '', 1))
def homopolymerContext(self):
    '''
    Return the homopolymer context (or 0 if it is not reported)

    The value is read from the "HRun=" entry of the ';'-separated info field;
    more than one such entry is reported through errAbort.
    '''
    hrunFields = [field for field in self.info.split(';')
                  if field.startswith("HRun=")]
    if not hrunFields:
        return 0
    if len(hrunFields) > 1:
        errAbort("Homopolymer information should be listed at most once: %s" %
                 self.info)
    pieces = hrunFields[0].split('=')
    return int(pieces[1])
def getMutationType(dna, coords=""): ''' Class method to return the type of mutation given this DNA. Assumes DNA is a 3-letter uppercase sequence and the mutation is a point mutation in the middle base pair ''' if len(dna) != 3: errAbort("getMutationType assumes DNA is a 3-letter sequence!") mut_type = min(dna, reverseComplement(dna)) if mut_type not in Mutation.ALL_MUTATIONS: print "Error: Mutation %s is not one we expect: %s" % (mut_type, coords) return mut_type
def endPosition(self):
    '''
    Return the end position of the alteration

    An insertion (ref_allele == '-') ends where it starts.  A deletion
    (alt_allele == '-') ends at the last deleted base.  Anything else is
    reported through errAbort.
    '''
    if self.ref_allele == '-':
        # Insertion: no reference bases are consumed
        return self.position
    if self.alt_allele == '-':
        # Deletion: one-based position system, so the span is inclusive
        deletedLen = len(self.ref_allele)
        return self.position + deletedLen - 1
    errAbort("For indels, either ref_allele or alt_allele must be '-': %s" %
             self.__str__())
def __init__(self, line, snpAnnotationFilters):
    '''
    Initialize a MuTectorLine object with all columns properly instantiated.

    Assumes columns are in the exact ordering given by the Columns variable.
    The line is split on tabs and must have exactly NumMuTectorCols columns.
    After the attributes are set the line is validated and
    makeOurJudgment(snpAnnotationFilters) is run.
    '''
    if line.endswith('\n'):
        line = line[:-1]
    columns = line.split('\t')
    if len(columns) != NumMuTectorCols:
        errAbort("MuTectorLine has %d columns (not %d): %s" %
                 (len(columns), NumMuTectorCols, "\t".join(columns)))

    asIs = lambda raw: raw

    # (attribute, column index, converter), in column order.  Columns 2, 24
    # and 25 (context, t_ref_max_mapq, t_alt_max_mapq) are not stored.
    layout = (
        ('contig', 0, asIs),            # Chromosome
        ('position', 1, int),           # Chrom position
        ('ref_allele', 3, asIs),        # DNA
        ('alt_allele', 4, asIs),        # DNA
        ('tumor_name', 5, asIs),
        ('normal_name', 6, asIs),
        ('score', 7, int),
        ('dbsnp_site', 8, asIs),
        ('covered', 9, asIs),
        ('power', 10, float),
        ('tumor_power', 11, float),
        ('normal_power', 12, float),
        ('total_pairs', 13, int),
        ('improper_pairs', 14, int),
        ('map_Q0_reads', 15, int),
        ('t_lod_fstar', 16, float),
        ('tumor_f', 17, float),
        ('contaminant_fraction', 18, float),
        ('contaminant_lod', 19, float),
        ('t_ref_count', 20, int),
        ('t_alt_count', 21, int),
        ('t_ref_sum', 22, int),
        ('t_alt_sum', 23, int),
        ('t_ins_count', 26, int),
        ('t_del_count', 27, int),
        ('normal_best_gt', 28, asIs),   # DNA
        ('init_n_lod', 29, float),
        ('n_ref_count', 30, int),
        ('n_alt_count', 31, int),
        ('n_ref_sum', 32, int),
        ('n_alt_sum', 33, int),
        ('judgement', 34, asIs),
    )
    for attrName, colIdx, convert in layout:
        setattr(self, attrName, convert(columns[colIdx]))

    self.algo = MuTectorAlgorithm
    self.validateMuTectorLine()
    self.makeOurJudgment(snpAnnotationFilters)
def getPossibleMutations(bedList, genomeFn, chromSizesDict, debug=True):
    '''
    Return a dictionary keyed by mutation type with the total possible number
    of mutations in this bed list

    Regions must be sorted by chromosome.  For each region every position
    contributes one count for the type Mutation.getMutationType assigns to the
    three-character window centred on it ("N" stands in for the missing
    neighbour at a chromosome edge).  When USE_INDELS is set, each region also
    adds (length - 1) to both the Mutation.INS and Mutation.DEL counts.
    When debug is true the regions are first checked with
    BedTools.isNonContiguous.
    '''
    global USE_INDELS
    if debug and not BedTools.isNonContiguous(bedList):
        errAbort("Input regions must be non-contiguous")

    d = {}
    curr_chrom = None
    curr_seq = None
    for bed in bedList:
        # The first region (no chromosome loaded yet) and every step to a
        # later chromosome load that chromosome's sequence.
        if curr_chrom is None or bed.chrom > curr_chrom:
            curr_chrom = bed.chrom
            curr_seq = getDNA(curr_chrom, 0, chromSizesDict[curr_chrom],
                              genomeFn, noMask=True)
        elif bed.chrom < curr_chrom:
            errAbort("Err: Regions must be sorted by chromosome")

        if USE_INDELS:
            indelSites = bed.chromEnd - bed.chromStart - 1
            for indelType in (Mutation.INS, Mutation.DEL):
                d[indelType] = d.get(indelType, 0) + indelSites

        prefix = "N" if bed.chromStart == 0 else ""
        chromSize = chromSizesDict[bed.chrom]
        suffix = "N" if bed.chromEnd == chromSize else ""
        flankedStart = max(0, bed.chromStart - 1)
        flankedEnd = min(bed.chromEnd + 1, chromSize)
        bedSeq = prefix + curr_seq[flankedStart:flankedEnd] + suffix

        for center in xrange(1, len(bedSeq) - 1):
            lo, hi = center - 1, center + 2
            mut_type = Mutation.getMutationType(
                bedSeq[lo:hi],
                coords="%s:%d-%d" % (bed.chrom, bed.chromStart - 1 + lo,
                                     bed.chromStart - 1 + hi))
            d[mut_type] = d.get(mut_type, 0) + 1
    return d
def __init__(self, line, tumorName, normalName, indelAnnotationFilters):
    '''
    Initialize a SomaticIndelLine

    line is one tab-separated .vcf data line with at least
    SomaticIndelNumRequiredColumns columns.  If extra columns are present
    there must be exactly three of them: a ':'-separated list of field names
    followed by one ':'-separated value list for the tumor and one for the
    normal sample; these are stored in self.addlInfo.  A value list whose
    length differs from the field-name list is reported through errAbort.

    After parsing, the line is validated, converted to AnnoVar conventions
    and judged against indelAnnotationFilters.
    '''
    if line.endswith('\n'):
        line = line[:-1]
    line = line.split('\t')
    if len(line) < SomaticIndelNumRequiredColumns:
        errAbort("Improperly formatted .vcf line: %s" % " ".join(line))

    self.contig = line[0]
    self.position = int(line[1])
    self.id = line[2]
    self.ref_allele = line[3]
    self.alt_allele = line[4]
    self.tumor_name = tumorName
    self.normal_name = normalName
    self.quality = line[5]
    self.filter = line[6]
    self.info = line[7]

    # Fields that are not provided by this input format
    self.score = NA
    self.power = NA
    self.tumor_power = NA
    self.normal_power = NA
    self.improper_pairs = NA
    self.addlInfo = None
    self.algo = SomaticIndelAlgorithm

    if len(line) > SomaticIndelNumRequiredColumns:
        if len(line) != SomaticIndelNumRequiredColumns + 3:
            errAbort(
                "Do not know how to handle non-3-extra-column .vcf files.")
        dataCols = line[8].split(':')
        firstData = line[9].split(':')
        secondData = line[10].split(':')
        # Malformed input is reported the same way as everywhere else in this
        # file; an assert would disappear when Python runs with -O.
        if len(firstData) != len(dataCols) or len(secondData) != len(dataCols):
            errAbort(
                "Per-sample data does not match the format column in .vcf line: %s"
                % " ".join(line))
        self.addlInfo = {
            "tumor": dict(zip(dataCols, firstData)),
            "normal": dict(zip(dataCols, secondData)),
            "cols": dataCols,
        }

    self.validateSomaticIndelLine()
    self.convertToAnnovar()
    self.makeOurJudgment(indelAnnotationFilters)
def __init__(self, seqFn, mutFn, genomeFn, chromSizesDict, debug=True): """ Initialize a patient based on what has been sequenced and what mutations exist """ self.sequencedRegions = BedTools.bedChromDictFromFile( seqFn, chromSizes=chromSizesDict) if debug: if not BedTools.isNonContiguousDict(self.sequencedRegions): errAbort("Sequenced regions must be non-contiguous") # Find all the possible mutations based on the amount of DNA sequenced in the patient self.totalSequencedBp = 0 self.totalPossibleMuts = {} for chromBedList in self.sequencedRegions.values(): d = getPossibleMutations(chromBedList, genomeFn, chromSizesDict) for k, v in d.items(): self.totalPossibleMuts[k] = self.totalPossibleMuts.get(k, 0) + v for sequencedBed in chromBedList: self.totalSequencedBp += (sequencedBed.chromEnd - sequencedBed.chromStart) # Read in the actual mutations in the patient self.mutations = [ Mutation(line, genomeFn, chromSizesDict) for line in open(mutFn) if not line.startswith("gene") ] self.mutations[:] = sorted(self.mutations, key=operator.attrgetter( 'chrom', 'chromStart')) for mutation in self.mutations: chromBedList = self.sequencedRegions.get(mutation.chrom, []) #endToTest = mutation.chromStart + max(len(mutation.refAllele), len(mutation.mutAllele)) if mutation.mutationType not in (Mutation.INS, Mutation.DEL) else mutation.chromStart+1 endToTest = mutation.chromStart + 1 if chromBedList == [] or not BedTools.isEncompassedByBed( mutation.chrom, mutation.chromStart, endToTest, chromBedList): errAbort("Mutation is not encompassed by sequenced regions.") if len( set(self.totalPossibleMuts.iterkeys()).difference( set(Mutation.ALL_MUTATIONS.keys()))) > 0: print "Error: Patient %s has some mutations not found..." % mutFn print self.totalPossibleMuts
def convertToAnnovar(self):
    '''
    Convert reference and alternate allele and position to type of position
    that AnnoVar uses

    Input alleles share a one-base anchor prefix (e.g. ref "ACG", alt "A" for
    a deletion).  The anchor is removed from the longer allele and the shorter
    allele becomes '-'.  For a deletion the position is advanced past the
    anchor; for an insertion the position is left unchanged.  Alleles of equal
    length, or alleles that do not share the anchor, are reported through
    errAbort.
    '''
    refLen = len(self.ref_allele)
    altLen = len(self.alt_allele)
    if refLen > altLen:  # Deletion
        if (altLen != 1) or not self.ref_allele.startswith(self.alt_allele):
            errAbort(
                "Deletions expected to be formatted in a different way: %s, %s"
                % (self.ref_allele, self.alt_allele))
        self.position += altLen
        # Strip the anchor BEFORE alt_allele is overwritten with '-';
        # otherwise there is nothing left to identify the prefix by.
        self.ref_allele = self.ref_allele[altLen:]
        self.alt_allele = '-'
    elif refLen < altLen:  # Insertion
        if (refLen != 1) or not self.alt_allele.startswith(self.ref_allele):
            errAbort(
                "Insertions expected to be formatted in a different way: %s, %s"
                % (self.ref_allele, self.alt_allele))
        # Same ordering concern as above: strip first, then blank the ref.
        self.alt_allele = self.alt_allele[refLen:]
        self.ref_allele = '-'
    else:
        errAbort(
            "For indels, either ref_allele or alt_allele must be larger: %s" %
            self.__str__())
def validateSomaticIndelLine(self):
    '''
    Make sure all SomaticIndel fields make sense

    Checks that the position is not negative and that both alleles satisfy
    isDNA; each failure is reported through errAbort.
    '''
    if self.position < 0:
        errAbort("Position must be non-negative: %s" % self.__str__())
    for label, allele in (("Ref", self.ref_allele), ("Alt", self.alt_allele)):
        if not isDNA(allele):
            errAbort("%s allele must be DNA: %s" % (label, self.__str__()))
def filterIndels(indelFn, retList): ''' Append valid indel mutations to the retList ''' global AllFilters try: f = open(indelFn) except IOError: errAbort("Indel mutation file %s does not exist." % indelFn) inDataLine = False tumorName = "" normalName = "" indelRemovalFilters = AllFilters['IndelRemovalFilters'] indelAnnotationFilters = AllFilters['IndelAnnotationFilters'] print "Screening indels with the following parameters:" print " ", "For removal of the call:" for k in sorted(indelRemovalFilters.keys()): print " ", k, ":", indelRemovalFilters[k] print " ", "For annotation of the call:" for k in sorted(indelAnnotationFilters.keys()): print " ", k, ":", indelAnnotationFilters[k] # Loop through each successive line in the file while True: line = f.readline() if not line: break if line.startswith("##"): if inDataLine: errAbort( "Invalid format for .vcf, should not have any comments after initial headers: %s" % line) elif line.startswith("#"): if inDataLine: errAbort( "Invalid format for .vcf, should not have any comments after initial headers: %s" % line) spaceLine = line.strip().replace('\t', ' ') if not spaceLine.startswith(PindelRequiredColumns): errAbort("Expecting .vcf to have different columns than: %s" % line) tumorName, normalName = line.strip().split('\t')[-2:] else: inDataLine = True mutation = PindelLine(line, tumorName, normalName, indelAnnotationFilters) if mutation.shouldBeKept(indelRemovalFilters): retList.append(mutation)
def validateMutectorFile(version, cols): ''' Abort if MuTect file is not a recognized version or does not have the proper headers ''' global MuTectorVersion, MuTectorColumns, NumMuTectorColumns if version.strip() != MuTectorVersion: print "Warning: MuTector version (%s) not what we are expecting (%s)..." % (version, MuTectorVersion) if cols != MuTectorColumns: print "MuTectorc olumns not the expected columns." actualCols = cols.split(' ') expectedCols = MuTectorColumns.split(' ') actualLen = len(actualCols) print "#Col\tActual\tExpected" for i in xrange(max(actualLen, NumMutectorColumns)): a = "" if i < actualLen: a = actualCols[i] e = "" if i < expectedLen: e = expectedCols[i] note = "" if a != e: note = "*" print "%d\t%s\t%s\t%s" % (i, a, e, note) errAbort("")
def main():
    '''
    Main function for PadBed

    Parses the command line into (chromSizesFn, inFn, outFn), decides whether
    chromSizesFn is a .2bit or a .sizes file (from the flags if one was given,
    otherwise from the file extension), loads the chromosome sizes and runs
    padBed.
    '''
    global IS_TWO_BIT, IS_SIZES
    args = parseArgv()
    chromSizesFn, inFn, outFn = args

    # Check if chromSizesFn type has been set explicitly by arguments, if
    # not, try to guess
    if (not IS_TWO_BIT) and (not IS_SIZES):
        if chromSizesFn.endswith(".2bit"):
            IS_TWO_BIT = True
        elif chromSizesFn.endswith(".sizes"):
            IS_SIZES = True
        else:
            # The message reports the file name that was actually given
            errAbort(
                "Unknown file type for %s. Please either end in '.2bit' or '.sizes' or set the appropriate flag."
                % chromSizesFn)

    if IS_TWO_BIT:
        chromSizesDict = getChromSizesDict(chromSizesFn)
    else:
        chromSizesDict = getChromSizesDictFromText(chromSizesFn)
    padBed(chromSizesDict, inFn, outFn)
def padBed(sizes, inFn, outFn):
    '''
    Perform the actual padding of the Beds.

    A non-negative PADDING is applied to both ends of every element.  A
    negative PADDING selects strand-specific padding: UPSTREAM and DOWNSTREAM
    (each clamped to at least 0) are applied relative to the element's strand,
    and every element must then have a '+' or '-' strand.  Results are clipped
    to [0, sizes[chrom]] and written one element per line to outFn.
    '''
    global PADDING, UPSTREAM, DOWNSTREAM
    isStrandedPadding = (PADDING < 0)
    if isStrandedPadding:
        UPSTREAM = max(0, UPSTREAM)
        DOWNSTREAM = max(0, DOWNSTREAM)
    else:
        UPSTREAM = PADDING
        DOWNSTREAM = PADDING

    # (padding before chromStart, padding after chromEnd) for each strand
    strandPads = {'+': (UPSTREAM, DOWNSTREAM), '-': (DOWNSTREAM, UPSTREAM)}
    validStrands = ('+', '-')

    f = open(inFn)
    g = open(outFn, "w")
    for line in f:
        b = Bed.Bed(line, chromSizes=sizes)
        if not isStrandedPadding:
            pads = (UPSTREAM, DOWNSTREAM)
        elif (not hasattr(b, 'strand')) or (b.strand not in validStrands):
            errAbort(
                "Strand-specific padding requires all input elements have valid strands: %s"
                % b)
            pads = None
        else:
            pads = strandPads.get(b.strand)
            if pads is None:
                errAbort("Should not happen, programmer error.")

        if pads is not None:
            padBefore, padAfter = pads
            b.chromStart = max(0, b.chromStart - padBefore)
            b.chromEnd = min(sizes[b.chrom], b.chromEnd + padAfter)
        g.write("%s\n" % b)
    f.close()
    g.close()
def __init__(self, line, genomeFn, chromSizesDict): if line.endswith('\n'): line = line[:-1] line = line.split('\t') if len(line) != 31: errAbort("Mutation lines must have 31 columns.", "\t".join(line)) self.geneSymbol = line[0] self.chrom = line[1] self.chromStart = int(line[2]) - 1 if self.chromStart < 0: errAbort("Invalid position for mutation to occur.") if self.chromStart + 1 > chromSizesDict[self.chrom]: errAbort("Invalid position for mutation to occur.") self.refAllele = line[3] self.mutAllele = line[4] self.ntMut = line[5] self.aaMut = line[6] self.mutContext = line[7] self.geneMutType = line[8] self.mutStatus = line[9] # "KEEP" self.tumorName = line[10] self.normalName = line[11] self.score = line[12] self.power = line[13] self.tumorPower = line[14] self.normalPower = line[15] self.totalPairs = int(line[16]) self.improperPairs = -1 if canBeInt(line[17]): self.improperPairs = int(line[17]) self.mapQ0Reads = -1 if canBeInt(line[18]): self.mapQ0Reads = int(line[18]) self.contamFrac = float(line[19]) self.contamLOD = -1 if canBeNum(line[20]): self.contamLOD = float(line[20]) self.tumorRefCount = int(line[21]) self.tumorMutCount = int(line[22]) self.normalRefCount = int(line[23]) self.normalMutCount = int(line[24]) self.tumorVarFreq = float(line[25]) self.normalVarFreq = float(line[26]) self.accessionList = line[27].strip().split(',') self.exon = map( int, map(lambda x: x.replace('NA', '-1').replace('UNKNOWN', '-1'), line[28].strip().split(','))) self.knownVariant = line[29] self.algorithm = line[30] self.findMutationType(genomeFn, chromSizesDict) if not roughlyEqual(self.tumorVarFreq, (self.tumorMutCount * 1. / (self.tumorMutCount + self.tumorRefCount)), epsilon=0.005): print "Tumor variant frequency is not accurate! %f %f" % ( self.tumorVarFreq, (self.tumorMutCount * 1. / (self.tumorMutCount + self.tumorRefCount))) if not roughlyEqual(self.normalVarFreq, (self.normalMutCount * 1. 
/ (self.normalMutCount + self.normalRefCount)), epsilon=0.005): print "Normal variant frequency is not accurate! %f %f" % ( self.normalVarFreq, (self.normalMutCount * 1. / (self.normalMutCount + self.normalRefCount)))
len(args)) print __doc__ sys.exit(-1) for o, a in opts: if o in ("-h", "-?", "--help"): print __doc__ sys.exit(0) elif o == "--twoBit": IS_TWO_BIT = True elif o == "--sizes": IS_SIZES = True elif o == "--padding": PADDING = int(a) if PADDING < 0: errAbort("Padding must be a non-negative integer: %d" % PADDING) elif o == "--upstream": UPSTREAM = int(a) if UPSTREAM < 0: errAbort( "Upstream padding must be a non-negative integer: %d" % UPSTREAM) elif o == "--downstream": DOWNSTREAM = int(a) if DOWNSTREAM < 0: errAbort( "Downstream padding must be a non-negative integer: %d" % DOWNSTREAM) if IS_TWO_BIT and IS_SIZES: errAbort("Must specify only one of '--twoBit', '--sizes' flags.")
def validateMuTectorLine(self):
    '''
    Make sure all MuTector fields make sense

    Runs a fixed sequence of range checks over the parsed attributes and
    reports each failing check through errAbort, with the line's string
    representation appended to the message.
    '''
    # Failure predicates shared by several checks
    outsideUnit = lambda value: value < 0 or value > 1
    badCountSum = lambda count, total: count < 0 or total < 0 or count > total

    # (failure test, message template), evaluated lazily and in this order
    checks = (
        (lambda: self.position < 0,
         "Position must be non-negative: %s"),
        (lambda: not isDNA(self.ref_allele),
         "Ref allele must be DNA: %s"),
        (lambda: not isDNA(self.alt_allele),
         "Alt allele must be DNA: %s"),
        (lambda: self.score < 0,
         "Score must be non-negative: %s"),
        (lambda: outsideUnit(self.power),
         "Power must be in [0,1]: %s"),
        (lambda: outsideUnit(self.tumor_power),
         "Tumor power must be in [0,1]: %s"),
        (lambda: outsideUnit(self.normal_power),
         "Normal power must be in [0,1]: %s"),
        (lambda: self.total_pairs < 0,
         "Total pairs must be non-negative: %s"),
        (lambda: self.improper_pairs < 0 or
         self.improper_pairs > self.total_pairs,
         "Improper pairs must be non-negative and less than total pairs: %s"),
        (lambda: self.map_Q0_reads < 0,
         "map_Q0_reads must be non-negative: %s"),
        (lambda: outsideUnit(self.contaminant_fraction),
         "Contaminant fraction must be in [0,1]: %s"),
        (lambda: badCountSum(self.t_ref_count, self.t_ref_sum),
         "Tumor ref count/sum are bad: %s"),
        (lambda: badCountSum(self.t_alt_count, self.t_alt_sum),
         "Tumor alt count/sum are bad: %s"),
        (lambda: self.t_ins_count < 0 or self.t_del_count < 0,
         "Tumor insertion/deletion counts must be non-negative: %s"),
        (lambda: not isDNA(self.normal_best_gt),
         "Normal best GT must be DNA: %s"),
        (lambda: badCountSum(self.n_ref_count, self.n_ref_sum),
         "Normal ref count/sum are bad: %s"),
        (lambda: badCountSum(self.n_alt_count, self.n_alt_sum),
         "Normal alt count/sum are bad: %s"),
    )
    for hasFailed, template in checks:
        if hasFailed():
            errAbort(template % self.__str__())