def make_bed(homeDir):
    """
    Input: Regions.04.txt
    Output: PhyloCSFNovel.bed.
    Output PhyloCSF Candidate Coding Regions in bed format for browser tracks.

    Each record of the input becomes one bed line whose name is its 1-based
    position in the input (recInd + 1) and whose color encodes strand and
    position: records on the + strand are green, records on the - strand are
    red, and records later in the input are progressively blended towards
    white.

    homeDir: directory passed to get_input_fileName to locate the input, and
        the directory in which PhyloCSFNovel.bed is written.
    """
    err_msg('Writing .bed file.')
    inFileName = get_input_fileName(homeDir, 4)
    novelFileName = pjoin(homeDir, 'PhyloCSFNovel.bed')
    recs = list(get_reader(inFileName))

    # Color regions on +/- strands green/red to match PhyloCSF tracks, and dim
    # ones with higher ranks. Ranks matter more at the start so use a
    # logarithmicish scale. Put the middle of the range at the somewhat
    # arbitrary rank 5000, which is sort of where they aren't as useful.
    # UCSC says to limit to 8 colors to keep browser working well.
    numBins = 8
    midInd = 5000
    colorStrs = {'+': '0,175,0', '-': '200,0,0'}
    whiteRGB = (255, 255, 255)
    numRecs = len(recs)  # Slightly more than the largest index

    # Explicit float arithmetic so the result does not depend on whether the
    # module enables true division.
    a = (numRecs - 2.0 * midInd) / midInd**2

    def scale_rank(recInd):
        # 0 -> 0, midInd -> 0.5, numRecs - 1 -> 1 - epsilon
        if numRecs == 2 * midInd:
            # a == 0 here, so the log ratio is 0/0; its limit as a -> 0 is
            # the linear scale.
            return recInd / float(numRecs)
        if numRecs == midInd:
            # 1 + a * numRecs == ((numRecs - midInd) / midInd)**2 == 0 here,
            # so the denominator is log(0); the limit of the ratio is 0.
            return 0.0
        return math.log(1 + a * recInd) / math.log(1 + a * numRecs)

    def color_str(recInd, strand):
        # bin 0 -> colorStrs.
        # bin numBins -> white = (255,255,255) (never happens cause bin < numBins)
        binInd = min(int(numBins * scale_rank(recInd)), numBins - 1)
        # Build a real list: the components are indexed below, which an
        # iterator returned by map() would not support.
        fullRGB = [int(part) for part in colorStrs[strand].split(',')]
        whiteFrac = binInd / float(numBins)
        return ','.join(
            '%d' % (fullRGB[ii] * (1 - whiteFrac) + whiteRGB[ii] * whiteFrac)
            for ii in range(3))

    with myopen(novelFileName, 'w') as novelFile:
        for recInd, rec in enumerate(recs):
            chrom = rec.Chrom
            bedLine = intervals_to_bed_line(
                chrom, [(rec.Start, rec.End)], rec.Strand, recInd + 1,
                rec.Start, rec.End, color=color_str(recInd, rec.Strand))
            print(bedLine, file=novelFile)
def __init__(self, fileName, fieldNames, delimiter='\t', writeHeader=True,
             backup=False):
    """
    Open fileName for writing delimited records with the given fields.

    fileName: path of the output file.
    fieldNames: sequence of column names; a copy is stored in self.fieldNames.
        Every name must be unique.
    delimiter: string placed between fields.
    writeHeader: if true, write the delimiter-joined field names as the
        first line of the file.
    backup: if true, open the file with open_with_backup(fileName) instead
        of myopen(fileName, 'w').

    Raises AssertionError ('Duplicate field NAME.') if a field name occurs
    more than once. The check runs before the file is opened, so an invalid
    field list neither truncates an existing file nor leaves an open handle.
    """
    names = fieldNames[:]  # Make a copy

    # Count occurrences once instead of rescanning the list for every name.
    counts = {}
    for name in names:
        counts[name] = counts.get(name, 0) + 1
    for name in names:
        if counts[name] != 1:
            # Explicit raise (not assert) so the check survives python -O.
            raise AssertionError('Duplicate field %s.' % name)

    self.outFile = (open_with_backup(fileName) if backup
                    else myopen(fileName, 'w'))
    self.delimiter = delimiter
    self.fieldNames = names
    if writeHeader:
        print(delimiter.join(fieldNames), file=self.outFile)
def _write_phyloCSF_in(homeDir):
    """
    Input: Regions.01.txt
    Output: Regions.pcsf.in

    For every input region write two Tab-separated lines: the region itself
    with its strand, followed by its antisense region (as computed by
    anti_interval) with the antisense strand.
    """
    regionsFileName = get_input_fileName(homeDir, 1)
    pcsfInFileName = pjoin(homeDir, 'Regions.pcsf.in')
    with myopen(pcsfInFileName, 'w') as pcsfInFile:
        for region in get_reader(regionsFileName):
            # Sense line.
            senseRegionStr = chromInt2Str(region.Chrom,
                                          [(region.Start, region.End)])
            print(senseRegionStr, region.Strand, sep=Tab, file=pcsfInFile)

            # Antisense line.
            antiInterval, antiStrand = anti_interval(
                [region.Start, region.End], region.Strand)
            antiRegionStr = chromInt2Str(region.Chrom, [antiInterval])
            print(antiRegionStr, antiStrand, sep=Tab, file=pcsfInFile)
def __init__(self, fileName, delimiter='\t', fieldNames=None, intFields=None,
             floatFields=None, skipComments=True, allowDupFields=False,
             skipQuotes=False, skipEmptyLines=False):
    """
    Open fileName for reading delimited records.

    If fieldNames is None, get from the first line of the file; o.w., assume
        there is no header line.
    Integer and float fields may be added later using add_mapper instead of
        here.
    If skipComments, ignore lines that start with '#'.
    If allowDupFields, allow several fields to have the same name; in that
        case the result of accessing that field is ambiguous.
    If skipQuotes, strip any double quote characters from start or end of
        string. Excel sometimes adds these when saving Tab delimited files.
    If skipEmptyLines, allow blank lines, and ignore them.

    Raises AssertionError ('Dup field NAME') if allowDupFields is false and a
    field name occurs more than once. Raises StopIteration if fieldNames is
    None and the file has no header line to read.
    """
    self.inFile = myopen(fileName)
    self.fileIter = (_skip_pounds(self.inFile, skipEmptyLines)
                     if skipComments else self.inFile)
    self.delimiter = delimiter

    if fieldNames is None:
        # Builtin next() works with both the Python 2 and the Python 3
        # iterator protocol; calling the .next() method does not.
        self.prevLine = next(self.fileIter)
        self.fieldNames = strip_nl(self.prevLine).split(self.delimiter)
    else:
        self.prevLine = None
        self.fieldNames = list(fieldNames)

    if not allowDupFields:
        # Count occurrences once instead of rescanning the list per name.
        counts = {}
        for name in self.fieldNames:
            counts[name] = counts.get(name, 0) + 1
        for name in self.fieldNames:
            if counts[name] != 1:
                # Explicit raise (not assert) so the check survives python -O.
                raise AssertionError('Dup field %s' % name)

    self.fieldMappers = {}
    if intFields is not None:
        self.add_mapper(intFields, int)
    if floatFields is not None:
        self.add_mapper(floatFields, float)
    self.skipQuotes = skipQuotes
    self.skipEmptyLines = skipEmptyLines
def create_PhyloCSF_Regions(
        hmmParams,  # Parameters for get_coding_hmm
        phyloCSFoutputDir,  # Directory containing input file with phyloCSF scores
        phyloCSFregionDir,  # Directory to contain output bed file
        chrom,
        strand,
        frame):
    """
    Given a file with PhyloCSF scores (including bls) of consecutive codons for
    one frame of one strand of a chromosome, create corresponding .bed file of
    the PhyloCSF Regions in most likely path.

    The PhyloCSF scores file must be named
        {chrom}.Strand{strand}.Frame{frame}.fixed.out,
    e.g., chr1.Strand+.Frame2.fixed.out. Frame is 0, 1, or 2.

    Each line of the PhyloCSF scores file must be of the form:
        CHROM:CODON_START-CODON_END STRAND score(decibans) PHYLOCSF_SCORE BLS
    where PHYLOCSF_SCORE and BLS are the output of PhyloCSF using the
    --strategy=fixed and --bls options. Codons should be consecutive, in
    increasing nominal order (i.e., the numbers are increasing even for scores
    on the minus strand). The separator between fields must be a Tab character.
    For codons for which there is no alignment, put a single field containing
    "No_Alignment" instead of the last 3 fields.
    For the resulting bed file to be usable in the UCSC browser, chromosome
    names should be UCSC chromosome names, and positions must be 1-based.
    For example:
        chr1:10915-10917 - No_Alignment
        chr1:10918-10920 - score(decibans) 0.6017 0.0023

    The output bed file will be named
        {chrom}.Strand{strand}.Frame{frame}.coding.bed.

    The chrom, strand and frame arguments are only used to build the two file
    names; the values written to the bed file are parsed from the scores file.
    Completely blank lines in the scores file are ignored. Both files are
    closed before returning, including when an error is raised.

    Raises AssertionError if the scores file mixes strands ('Strand mismatch.')
    or contains a codon start whose remainder mod 3 differs from that of the
    first scored codon.
    """
    phyloCSFfileName = pjoin(
        phyloCSFoutputDir,
        '%s.Strand%s.Frame%s.fixed.out' % (chrom, strand, frame))
    outputBedFileName = pjoin(
        phyloCSFregionDir,
        '%s.Strand%s.Frame%s.coding.bed' % (chrom, strand, frame))
    # .write() rather than a print statement so this runs on Python 2 and 3.
    sys.stderr.write('Processing %s.\n' % phyloCSFfileName)

    # A codon whose relative branch length is at or below this threshold is
    # not scored and terminates the current block.
    minRelBranchLength = 0.1

    bedFile = myopen(outputBedFileName, 'w')

    def processScores(scores, blockStartPos, chrom, strand):
        # Process and write one block of scores.
        if len(scores) == 0:
            return
        hmm = get_coding_hmm(*hmmParams)
        codingProbabilities = hmm.state_probabilities(scores)[:, 0]
        bestPath = [state == 0 for state in hmm.best_path(scores)]
        curCodonCount = 0
        for chunk in _chunkify(bestPath, lambda elt1, elt2: elt1 == elt2):
            # Iterate through chunks of positions in the same state
            if chunk[0]:  # Only write coding chunks
                maxCodingProb = max(
                    islice(codingProbabilities, curCodonCount,
                           curCodonCount + len(chunk)))
                maxLogOdds = prob_to_log_odds(maxCodingProb)
                # 8 possible gray scales, 0, 30, ..., 210 (more than 210 is too light)
                # Divide into 8 bins based on maxLogOdds from 0-8 (but handle extremes)
                # (In sample region, 90th percentile of positive scores was 7.)
                if maxLogOdds < 1:
                    grayScale = 210
                elif maxLogOdds > 7:
                    grayScale = 0
                else:
                    grayScale = 210 - 30 * int(floor(maxLogOdds))
                chunkStartPos = blockStartPos + 3 * curCodonCount
                chunkEndPos = chunkStartPos + 3 * len(chunk) - 1
                bedFields = [
                    chrom,
                    chunkStartPos - 1,  # Bed counts from 0 instead of 1.
                    chunkEndPos,  # Count from 0, but chromEnd is first position _after_ end
                    '%s:%d-%d' % (chrom, chunkStartPos, chunkEndPos),  # Name
                    0,
                    strand,
                    chunkStartPos - 1,  # thickStart
                    chunkEndPos,  # thickEnd
                    '%d,%d,%d' % (grayScale, grayScale, grayScale),  # itemRgb
                ]
                bedFile.write('\t'.join(map(str, bedFields)) + '\n')
            curCodonCount += len(chunk)

    try:
        scores = []
        # From here on strand and frame hold the values seen in the scores
        # file (None until the first scored codon), not the arguments.
        strand = None
        frame = None
        with open(phyloCSFfileName) as phyloCSFfile:
            for line in phyloCSFfile:
                words = line.split()
                if not words:
                    continue  # Tolerate blank lines, e.g. at end of file
                if words[2] == 'No_Alignment':
                    continue
                region = words[0]
                score = float(words[3])
                relBranchLength = float(words[4])
                chrom, interval = region.split(':')
                pos = int(interval.split('-')[0])
                if strand is None:
                    strand = words[1]
                else:
                    assert words[1] == strand, 'Strand mismatch.'
                if frame is None:
                    frame = pos % 3
                else:
                    assert pos % 3 == frame, \
                        'Position %d does not match frame %d.' % (pos, frame)

                # The current block ends at a low-BLS codon, a change of
                # chromosome, or a gap in the codon positions.
                if len(scores) > 0 and (relBranchLength <= minRelBranchLength
                                        or chrom != prevChrom
                                        or pos != prevPos + 3):
                    processScores(scores, blockStartPos, prevChrom, strand)
                    del scores[:]  # Reset to 0 but allow Python to reuse list (don't know if it does)
                if relBranchLength > minRelBranchLength:
                    if len(scores) == 0:
                        blockStartPos = pos
                    scores.append(score)
                    prevChrom = chrom
                    prevPos = pos

        # Flush the final block. scores is only non-empty here if the last
        # parsed codon was appended, in which case prevChrom is its chromosome.
        if len(scores) > 0:
            processScores(scores, blockStartPos, prevChrom, strand)
    finally:
        bedFile.close()
def estimate_hmm_params_for_genome(codingExonsFileName, genomeLength):
    """
    codingExonsFileName is the name of a file containing information about every
        annotated coding exon in the genome. Each line contains five
        Tab-separated fields, namely chromosome, strand, frame, start, end
        (start <= end) for the annotated coding portion of every exon in the
        genome. Frame is the remainder mod 3 of the chromosomal coordinate of
        the first base of a codon, first being counted along the + strand even
        if the exon is on the - strand. Blank lines are ignored.
    genomeLength is the sum of the number of nucleotides of all chromosomes (and
        scaffolds) in the genome assembly.
    Return a 4-tuple containing the parameters needed by get_coding_hmm, namely:
        codingPrior: probability that a random codon is coding
        codingNumCodons: typical length of a coding region, in codons; the
            reciprocal of the probability of going from a coding state to a
            noncoding state.
        nonCodingWeights: a list representing the probability that a coding to
            noncoding transition will go into a particular noncoding state
            (sum is 1). len(nonCodingWeights) determines the number of
            noncoding states.
        nonCodingNumCodonsList: typical region length for each of the noncoding
            states, in codons; the reciprocal of the probability of going from
            that state to the coding state. Must have same length as
            nonCodingWeights.
        Priors for each noncoding state will be computed from the weights and
        lengths.
    """
    exonList = []
    exonsFile = myopen(codingExonsFileName)
    try:
        for line in exonsFile:
            line = line.strip()
            if not line:
                continue  # Tolerate blank lines, e.g. at end of file
            words = line.split('\t')
            exonList.append((words[0], words[1], int(words[2]), int(words[3]),
                             int(words[4])))
    finally:
        exonsFile.close()

    # {(chrom, strand, frame) : [(start1, end1), (start2, end2),...]
    # Sorting first leaves every sublist ordered by (start, end).
    exonsByChrStrFr = {}
    exonList.sort()
    for (chrom, strand, frame, start, end) in exonList:
        exonsByChrStrFr.setdefault((chrom, strand, frame), []).append(
            (start, end))

    gapsNT = []  # Gaps between consecutive non-overlapping coding exons in same frame
    # Note: ideally this should be adjusting e1 and s2 to codon boundaries,
    # since we are interested in the gap between in-frame codons (even if we
    # are reporting it in NT).
    numExons = 0
    totalCodingLengthNT = 0
    for pairsList in exonsByChrStrFr.values():  # Deleting from sublist is much faster
        # Whenever two neighboring exons overlap, keep only the longer one
        # (the earlier one on a tie).
        index = 0
        while index < len(pairsList) - 1:
            start1, end1 = pairsList[index]
            start2, end2 = pairsList[index + 1]
            if start2 <= end1:
                if end1 - start1 >= end2 - start2:
                    del pairsList[index + 1]
                else:
                    del pairsList[index]
            else:
                index += 1
        gapsNT.extend(start2 - end1 - 1
                      for (start1, end1), (start2, end2) in neighbors(pairsList)
                      if start2 > end1 + 1)  # Exclude 0 length
        numExons += len(pairsList)
        totalCodingLengthNT += sum(end - start + 1 for start, end in pairsList)

    # Estimate distribution of gaps between coding regions as a mixture of
    # exponential distributions
    nonCodingLengthsNT, nonCodingWeights = estimate_gap_mixture_model(
        gapsNT, 3, numSteps=20)

    # Divisions below use float operands so they never truncate, whether or
    # not the module enables true division.
    nonCodingLengthsInCodons = [x / 3.0 for x in nonCodingLengthsNT]
    # Prior for being coding in a particular frame
    codingPrior = totalCodingLengthNT / float(genomeLength) / 6
    # Mean length in codons
    codingLengthInCodons = totalCodingLengthNT / float(numExons) / 3
    return (codingPrior, codingLengthInCodons, nonCodingWeights,
            nonCodingLengthsInCodons)
def classify_regions(homeDir, regionsDir, codingBedFileName, pseudoBedFileName):
    """
    Inputs:
        - PhyloCSF Regions bed files in regionsDir, produced by the PhyloCSF
          HMM. The file names should have the format:
              {CHROMOSOME}.Strand{STRAND}.Frame{FRAME}.coding.bed.gz
          where STRAND is + or - and FRAME is 0, 1, or 2. Other files in
          regionsDir are ignored.
        - Coding and pseudogene annotated transcripts (bed files).
    Output: Regions.01.txt (the path returned by get_output_fileName(homeDir, 1))

    Find overlapping transcripts, classify regions, and create extension
    regions. Also create input file for running PhyloCSF using the
    strategy=mle option.

    Every input region becomes one record classified, in this order of
    precedence, as PseudoOverlap, CodingOverlap, AntisenseOverlap or NoOverlap.
    Every region other than NoOverlap additionally yields one 'Extension'
    record per segment returned by subtract_trs, with Parent set to the name
    of the region it came from.

    Every input file is closed as soon as it has been read, and the output
    writer is closed even if writing fails.
    """
    assure_dir(homeDir)
    outFileName = get_output_fileName(homeDir, 1)

    err_msg('Reading transcripts')
    with myopen(codingBedFileName) as codingBedFile:
        codingTrs = [bed_line_to_tr(line) for line in codingBedFile]
    with myopen(pseudoBedFileName) as pseudoBedFile:
        pseudoTrs = [bed_line_to_tr(line) for line in pseudoBedFile]

    err_msg('Creating overlap checkers')
    codingOverlapChecker = OverlapChecker(codingTrs, onlyCDS=True)
    pseudoOverlapChecker = OverlapChecker(pseudoTrs, onlyCDS=False)

    outRecs = []

    def get_rec_name(rec):
        return '%s:%d-%d%s' % (rec.Chrom, rec.Start, rec.End, rec.Strand)

    err_msg('Processing input PhyloCSF Regions.')
    for bedFileName in ls(regionsDir):
        if not bedFileName.endswith('.coding.bed.gz'):
            continue
        with myopen(pjoin(regionsDir, bedFileName)) as regionsFile:
            for line in regionsFile:
                dummyName, chrom, bedIntervals, strand = \
                    bed_line_to_intervals(line)
                assert len(bedIntervals) == 1
                interval = bedIntervals[0]

                rec = DictClass()
                rec.Chrom = chrom
                rec.Start = interval[0]
                rec.End = interval[1]
                rec.NumCodons = (rec.End - rec.Start + 1) // 3
                rec.Strand = strand
                rec.Name = get_rec_name(rec)
                rec.Parent = 'NA'
                assert (rec.End - rec.Start + 1) % 3 == 0, rec.Name

                # Find overlaps and set up RegType
                codingOverlapTrs = codingOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                pseudoOverlapTrs = pseudoOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                if len(pseudoOverlapTrs) > 0:
                    rec.RegType = PseudoOverlap
                elif any_same_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = CodingOverlap
                elif any_anti_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = AntisenseOverlap
                else:
                    rec.RegType = NoOverlap
                outRecs.append(rec)

                # Subtract from intervals overlaps with pseudogenes in any
                # frame or coding in same or antisense frame.
                # Make a new record for each resulting segment
                if rec.RegType != NoOverlap:
                    segments = subtract_trs(interval, rec.Strand,
                                            pseudoOverlapTrs, codingOverlapTrs)
                    for segment in segments:
                        subRec = DictClass()
                        subRec.RegType = 'Extension'
                        subRec.Chrom = rec.Chrom
                        subRec.Start = segment[0]
                        subRec.End = segment[1]
                        subRec.NumCodons = (subRec.End - subRec.Start + 1) // 3
                        subRec.Strand = rec.Strand
                        subRec.Name = get_rec_name(subRec)
                        subRec.Parent = rec.Name
                        assert (subRec.End - subRec.Start + 1) % 3 == 0, \
                            subRec.Name
                        outRecs.append(subRec)

    # Group by region type (in RegTypes order), then sort by position.
    outRecs.sort(key=lambda rec: (RegTypes.index(rec.RegType), rec.Chrom,
                                  rec.Strand, rec.Start, rec.End))

    fields = list(Fields)
    fields.remove(Rank)  # We'll insert it at the beginning later
    outDFW = DFW(outFileName, fields)
    try:
        for rec in outRecs:
            outDFW.write_line(rec)
    finally:
        outDFW.close()

    err_msg('Writing PhyloCSF input file.')
    _write_phyloCSF_in(homeDir)