def make_bed(homeDir):
    """
    Output PhyloCSF Candidate Coding Regions in bed format for browser tracks.

    Input: Regions.04.txt
    Output: PhyloCSFNovel.bed.

    Each record read from the input becomes one single-interval bed line whose
    name is its 1-based position in the input and whose color encodes strand
    and rank (see comments below).
    """
    err_msg('Writing .bed file.')
    inFileName = get_input_fileName(homeDir, 4)
    novelFileName = pjoin(homeDir, 'PhyloCSFNovel.bed')
    recs = list(get_reader(inFileName))

    # Color regions on +/- strands green/red to match PhyloCSF tracks, and dim
    # ones with higher ranks. Ranks matter more at the start so use a
    # logarithmicish scale. Put the middle of the range at the somewhat
    # arbitrary rank 5000, which is sort of where they aren't as useful.
    # UCSC says to limit to 8 colors to keep browser working well.
    numBins = 8
    midInd = 5000

    colorStrs = {'+': '0,175,0', '-': '200,0,0'}
    whiteRGB = (255, 255, 255)

    numRecs = len(recs)  # Slightly more than the largest index

    # Curve parameter chosen so that scale_rank(midInd) == 0.5.
    # The 2.0 forces float division regardless of the module's division mode.
    a = (numRecs - 2.0 * midInd) / midInd**2
    logArgAtEnd = 1 + a * numRecs

    def scale_rank(recInd):
        # 0 -> 0, midInd -> 0.5, numRecs - 1 -> 1 - epsilon
        if a == 0:
            # numRecs == 2 * midInd: the log curve degenerates to its linear
            # limit (the original formula would compute 0 / 0 here).
            return recInd / float(numRecs)
        if logArgAtEnd <= 0:
            # numRecs == midInd: log(0) is undefined; the limiting curve is
            # identically 0, so every record gets the full color.
            return 0.0
        return math.log(1 + a * recInd) / math.log(logArgAtEnd)

    def color_str(recInd, strand):
        binInd = int(numBins * scale_rank(recInd))
        # bin 0 -> colorStrs.
        # bin numBins -> white = (255,255,255) (never happens cause bin < numBins)
        # Build a real list: map() is not subscriptable on Python 3.
        fullRGB = [int(channel) for channel in colorStrs[strand].split(',')]
        return ','.join(
            '%d' % (fullRGB[ii] * (1 - binInd / float(numBins)) +
                    whiteRGB[ii] * binInd / float(numBins))
            for ii in range(3))

    with myopen(novelFileName, 'w') as novelFile:
        for recInd, rec in enumerate(recs):
            bedLine = intervals_to_bed_line(rec.Chrom,
                                            [(rec.Start, rec.End)],
                                            rec.Strand,
                                            recInd + 1,
                                            rec.Start,
                                            rec.End,
                                            color=color_str(
                                                recInd, rec.Strand))
            print(bedLine, file=novelFile)
Exemple #2
0
 def __init__(self,
              fileName,
              fieldNames,
              delimiter='\t',
              writeHeader=True,
              backup=False):
     """ Open fileName for writing delimited records with the given fieldNames.
         If backup, open via open_with_backup instead of myopen(fileName, 'w').
         A copy of fieldNames is kept; every name must occur exactly once
             (checked with an assert).
         If writeHeader, the field names joined by delimiter are written
             as the first line.
     """
     if backup:
         self.outFile = open_with_backup(fileName)
     else:
         self.outFile = myopen(fileName, 'w')
     self.delimiter = delimiter
     self.fieldNames = fieldNames[:]  # Private copy; caller may mutate theirs
     for candidate in self.fieldNames:
         occurrences = sum(1 for other in self.fieldNames
                           if other == candidate)
         assert occurrences == 1, 'Duplicate field %s.' % candidate
     if writeHeader:
         headerLine = delimiter.join(fieldNames)
         print(headerLine, file=self.outFile)
def _write_phyloCSF_in(homeDir):
    """
    Input:  Regions.01.txt
    Output: Regions.pcsf.in containing each region and its antisense region

    For every input record two Tab-separated lines are written: the region's
    location string with its strand, then the location string and strand
    returned by anti_interval for that region.
    """
    regionsFileName = get_input_fileName(homeDir, 1)
    pcsfInFileName = pjoin(homeDir, 'Regions.pcsf.in')
    with myopen(pcsfInFileName, 'w') as pcsfInFile:
        for region in get_reader(regionsFileName):
            # Sense line
            senseLocation = chromInt2Str(region.Chrom,
                                         [(region.Start, region.End)])
            print(senseLocation, region.Strand, sep=Tab, file=pcsfInFile)

            # Antisense line
            flipped = anti_interval([region.Start, region.End], region.Strand)
            antiInterval, antiStrand = flipped
            antiLocation = chromInt2Str(region.Chrom, [antiInterval])
            print(antiLocation, antiStrand, sep=Tab, file=pcsfInFile)
Exemple #4
0
 def __init__(self,
              fileName,
              delimiter='\t',
              fieldNames=None,
              intFields=None,
              floatFields=None,
              skipComments=True,
              allowDupFields=False,
              skipQuotes=False,
              skipEmptyLines=False):
     """ If fieldNames is None, get from the first line of the file;
             o.w., assume there is no header line.
         Integer and float fields may be added later using add_mapper instead of here.
         If skipComments, ignore lines that start with '#'.
         If allowDupFields, allow several fields to have the same name; in that case
             the result of accessing that field is ambiguous.
         If skipQuotes, strip any double quote characters from start or end of string.
             Excel sometimes adds these when saving Tab delimited files.
         If skipEmptyLines, allow blank lines, and ignore them.
         Note that skipEmptyLines is only passed to the line filter when
             skipComments is true; otherwise the raw file is iterated.
         Raises StopIteration if fieldNames is None and there is no header line
             to read, and AssertionError on duplicate field names unless
             allowDupFields.
     """
     self.inFile = myopen(fileName)
     self.fileIter = (_skip_pounds(self.inFile, skipEmptyLines)
                      if skipComments else self.inFile)
     self.delimiter = delimiter
     if fieldNames is None:
         # Built-in next() works on both Python 2 and 3 iterators, unlike
         # the Python-2-only .next() method.
         self.prevLine = next(self.fileIter)
         self.fieldNames = strip_nl(self.prevLine).split(self.delimiter)
     else:
         self.prevLine = None
         self.fieldNames = list(fieldNames)
     if not allowDupFields:
         for name in self.fieldNames:
             assert sum(
                 n == name
                 for n in self.fieldNames) == 1, 'Dup field %s' % name
     self.fieldMappers = {}
     if intFields is not None:
         self.add_mapper(intFields, int)
     if floatFields is not None:
         self.add_mapper(floatFields, float)
     self.skipQuotes = skipQuotes
     self.skipEmptyLines = skipEmptyLines
def create_PhyloCSF_Regions(
        hmmParams,  # Parameters for get_coding_hmm
        phyloCSFoutputDir,  # Directory containing input file with phyloCSF scores
        phyloCSFregionDir,  # Directory to contain output bed file
        chrom,
        strand,
        frame):
    """
    Given a file with PhyloCSF scores (including bls) of consecutive codons for one frame
        of one strand of a chromosome, create corresponding .bed file of the PhyloCSF
        Regions in most likely path.
    The PhyloCSF scores file must be named {chrom}.Strand{strand}.Frame{frame}.fixed.out,
        e.g., chr1.Strand+.Frame2.fixed.out. Frame is 0, 1, or 2.
    Each line of the PhyloCSF scores file must be of the form:
        CHROM:CODON_START-CODON_END   STRAND  score(decibans) PHYLOCSF_SCORE  BLS
    where PHYLOCSF_SCORE and BLS are the output of PhyloCSF using the --strategy=fixed
    and --bls options. Codons should be consecutive, in increasing nominal order (i.e.,
    the numbers are increasing even for scores on the minus strand). The separator between
    fields must be a Tab character. For codons for which there is no alignment, put
    a single field containing "No_Alignment" instead of the last 3 fields.
    For the resulting bed file to be usable in the UCSC browser, chromosome names should
    be UCSC chromosome names, and positions must be 1-based.
    For example:
        chr1:10915-10917        -       No_Alignment
        chr1:10918-10920        -       score(decibans) 0.6017  0.0023
    The output bed file will be named {chrom}.Strand{strand}.Frame{frame}.coding.bed.

    The chrom, strand and frame arguments are used only to build the two file
    names. The chromosome, strand and frame written to the bed file are taken
    from the scores file itself; an AssertionError is raised if the strand or
    frame (position mod 3) is not the same on every scored line.
    Completely blank lines in the scores file are ignored.
    """
    phyloCSFfileName = pjoin(
        phyloCSFoutputDir,
        '%s.Strand%s.Frame%s.fixed.out' % (chrom, strand, frame))
    outputBedFileName = pjoin(
        phyloCSFregionDir,
        '%s.Strand%s.Frame%s.coding.bed' % (chrom, strand, frame))

    # write() instead of the Python-2-only "print >>" statement.
    sys.stderr.write('Processing %s.\n' % phyloCSFfileName)
    minRelBranchLength = 0.1
    bedFile = myopen(outputBedFileName, 'w')

    def processScores(scores, blockStartPos, chrom, strand):
        # Process and write one block of scores.
        if len(scores) == 0:
            return

        hmm = get_coding_hmm(*hmmParams)
        codingProbabilities = hmm.state_probabilities(scores)[:, 0]
        bestPath = [state == 0 for state in hmm.best_path(scores)]

        curCodonCount = 0
        for chunk in _chunkify(bestPath, lambda elt1, elt2: elt1 == elt2):
            # Iterate through chunks of positions in the same state
            if chunk[0]:  # Only write coding chunks
                maxCodingProb = max(
                    islice(codingProbabilities, curCodonCount,
                           curCodonCount + len(chunk)))
                maxLogOdds = prob_to_log_odds(maxCodingProb)
                # 8 possible gray scales, 0, 30, ..., 210 (more than 210 is too light)
                # Divide into 8 bins based on maxLogOdds from 0-8 (but handle extremes)
                # (In sample region, 90th percentile of positive scores was 7.)
                grayScale = 210 if maxLogOdds < 1 else \
                            0 if maxLogOdds > 7 else   \
                            210 - 30 * int(floor(maxLogOdds))
                chunkStartPos = blockStartPos + 3 * curCodonCount
                chunkEndPos = chunkStartPos + 3 * len(chunk) - 1
                bedFields = [
                    chrom,
                    chunkStartPos - 1,  # Bed counts from 0 instead of 1.
                    chunkEndPos,  # Count from 0, but chromEnd is first position _after_ end
                    '%s:%d-%d' % (chrom, chunkStartPos, chunkEndPos),  # Name
                    0,
                    strand,
                    chunkStartPos - 1,  # thickStart
                    chunkEndPos,  # thickEnd
                    '%d,%d,%d' % (grayScale, grayScale, grayScale),  # itemRgb
                ]
                bedFile.write('\t'.join(map(str, bedFields)) + '\n')
            curCodonCount += len(chunk)

    scores = []
    # Strand and frame observed in the file (deliberately not the arguments).
    fileStrand = None
    fileFrame = None
    # State of the block of consecutive, well-aligned codons being collected.
    blockStartPos = None
    prevChrom = None
    prevPos = None
    try:
        with open(phyloCSFfileName) as scoresFile:
            for line in scoresFile:
                words = line.split()
                if not words:
                    continue  # Tolerate blank lines (e.g., at end of file)
                if words[2] == 'No_Alignment':
                    continue
                region = words[0]
                score = float(words[3])
                relBranchLength = float(words[4])
                lineChrom, interval = region.split(':')
                pos = int(interval.split('-')[0])
                if fileStrand is None:
                    fileStrand = words[1]
                else:
                    assert words[1] == fileStrand, 'Strand mismatch.'
                if fileFrame is None:
                    fileFrame = pos % 3
                else:
                    assert pos % 3 == fileFrame, \
                        'Position %d does not match frame %d.' % (pos,
                                                                  fileFrame)

                # A poorly aligned codon, a new chromosome, or a gap in
                # positions ends the current block.
                if len(scores) > 0 and (relBranchLength <= minRelBranchLength
                                        or lineChrom != prevChrom
                                        or pos != prevPos + 3):
                    processScores(scores, blockStartPos, prevChrom,
                                  fileStrand)
                    del scores[:]  # Reset to 0 but allow Python to reuse list (don't know if it does)

                if relBranchLength > minRelBranchLength:
                    if len(scores) == 0:
                        blockStartPos = pos
                    scores.append(score)
                    prevChrom = lineChrom
                    prevPos = pos

        # Flush the final block. Whenever scores is non-empty the last parsed
        # line was appended to it, so prevChrom is that line's chromosome.
        if len(scores) > 0:
            processScores(scores, blockStartPos, prevChrom, fileStrand)
    finally:
        # Close the output even if parsing or the HMM raised.
        bedFile.close()
Exemple #6
0
def estimate_hmm_params_for_genome(codingExonsFileName, genomeLength):
    """
    codingExonsFileName is the name of a file containing information about every annotated
        coding exon in the genome. Each line contains five Tab-separated fields, namely
            chromosome, strand, frame, start, end (start <= end)
        for the annotated coding portion of every exon in the genome.
        Frame is the remainder mod 3 of the chromosomal coordinate of the first base of a
        codon, first being counted along the + strand even if the exon is on the - strand.
        Blank lines are ignored.
    genomeLength is the sum of the number of nucleotides of all chromosomes (and
        scaffolds) in the genome assembly.
        
    Return a 4-tuple containing the parameters needed by get_coding_hmm, namely:
        codingPrior: probability that a random codon is coding
        codingNumCodons: typical length of a coding region, in codons; the reciprocal of
            the probability of going from a coding state to a noncoding state.
        nonCodingWeights: a list representing the probability that a coding to noncoding
            transition will go into a particular noncoding state (sum is 1).
            len(nonCodingWeights) determines the number of noncoding states.
        nonCodingNumCodonsList: typical region length for each of the noncoding states, in
            codons; the reciprocal of the probability of going from that state to the
            coding state. Must have same length as nonCodingWeights.
            Priors for each noncoding state will be computed from the weights and lengths.

    Raises ZeroDivisionError if the file contains no exons.
    """
    exonList = []
    # Close the file deterministically instead of leaking the handle.
    with myopen(codingExonsFileName) as exonsFile:
        for line in exonsFile:
            line = line.strip()
            if not line:
                continue  # Tolerate blank lines (e.g., at end of file)
            words = line.split('\t')
            exonList.append((words[0], words[1], int(words[2]),
                             int(words[3]), int(words[4])))

    exonsByChrStrFr = {
    }  # {(chrom, strand, frame) : [(start1, end1), (start2, end2),...]
    # Sorting the full tuples leaves each per-key list sorted by (start, end).
    exonList.sort()
    for (chrom, strand, frame, start, end) in exonList:
        exonsByChrStrFr.setdefault((chrom, strand, frame), []).append(
            (start, end))
    gapsNT = [
    ]  # Gaps between consecutive non-overlapping coding exons in same frame
    # Note: ideally this should be adjusting e1 and s2 to codon boundaries,
    # since we are interested in the gap between in-frame codons (even if we
    # are reporting it in NT).
    numExons = 0
    totalCodingLengthNT = 0
    for pairsList in exonsByChrStrFr.values(
    ):  # Deleting from sublist is much faster
        # Resolve overlaps between neighbors by dropping the shorter exon
        # (the later one on ties), re-examining the same index afterwards.
        index = 0
        while index < len(pairsList) - 1:
            start1, end1 = pairsList[index]
            start2, end2 = pairsList[index + 1]
            if start2 <= end1:
                if end1 - start1 >= end2 - start2:
                    del pairsList[index + 1]
                else:
                    del pairsList[index]
            else:
                index += 1
        gapsNT.extend(start2 - end1 - 1
                      for (start1, end1), (start2,
                                           end2) in neighbors(pairsList)
                      if start2 > end1 + 1)  # Exclude 0 length
        numExons += len(pairsList)
        totalCodingLengthNT += sum(end - start + 1 for start, end in pairsList)

    # Estimate distribution of gaps between coding regions as a mixture of exponential distributions
    nonCodingLengthsNT, nonCodingWeights = estimate_gap_mixture_model(
        gapsNT, 3, numSteps=20)

    # Explicit float divisions so the results do not depend on whether the
    # enclosing module enables true division.
    nonCodingLengthsInCodons = [x / 3.0 for x in nonCodingLengthsNT]

    codingPrior = totalCodingLengthNT / float(
        genomeLength) / 6  # Prior for being coding in a particular frame
    codingLengthInCodons = totalCodingLengthNT / float(
        numExons) / 3  # Mean length in codons

    return codingPrior, codingLengthInCodons, nonCodingWeights, nonCodingLengthsInCodons
def classify_regions(homeDir, regionsDir, codingBedFileName,
                     pseudoBedFileName):
    """
    Inputs: - PhyloCSF Regions bed files in regionsDir, produced by the PhyloCSF HMM.
                  The file names should have the format:
                      {CHROMOSOME}.Strand{STRAND}.Frame{FRAME}.coding.bed.gz
                  where STRAND is + or - and FRAME is 0, 1, or 2.
                  Other files in regionsDir are ignored.
            - Coding and pseudogene annotated transcripts (bed files).
    Output: the file named by get_output_fileName(homeDir, 1)
                (presumably Regions.01.txt -- TODO confirm),
            plus the PhyloCSF input file written by _write_phyloCSF_in.
    Find overlapping transcripts, classify regions, and create extension regions.
    Also create input file for running PhyloCSF using the strategy=mle option.

    Every input region gets one record classified as PseudoOverlap,
    CodingOverlap, AntisenseOverlap or NoOverlap (checked in that order).
    For each region that overlaps something, the segments returned by
    subtract_trs are added as additional 'Extension' records whose Parent is
    the region's name. Records are written sorted by region type, chromosome,
    strand, start and end. The Rank field is not written here.
    """
    assure_dir(homeDir)
    outFileName = get_output_fileName(homeDir, 1)
    err_msg('Reading transcripts')
    # Use context managers so the input handles are closed promptly.
    with myopen(codingBedFileName) as codingBedFile:
        codingTrs = [bed_line_to_tr(line) for line in codingBedFile]
    with myopen(pseudoBedFileName) as pseudoBedFile:
        pseudoTrs = [bed_line_to_tr(line) for line in pseudoBedFile]

    err_msg('Creating overlap checkers')
    codingOverlapChecker = OverlapChecker(codingTrs, onlyCDS=True)
    pseudoOverlapChecker = OverlapChecker(pseudoTrs, onlyCDS=False)

    outRecs = []

    def get_rec_name(rec):
        return '%s:%d-%d%s' % (rec.Chrom, rec.Start, rec.End, rec.Strand)

    err_msg('Processing input PhyloCSF Regions.')
    for bedFileName in ls(regionsDir):
        if not bedFileName.endswith('.coding.bed.gz'):
            continue
        with myopen(pjoin(regionsDir, bedFileName)) as regionsFile:
            for line in regionsFile:
                dummyName, chrom, bedIntervals, strand = bed_line_to_intervals(
                    line)
                assert len(bedIntervals) == 1
                interval = bedIntervals[0]
                rec = DictClass()
                rec.Chrom = chrom
                rec.Start = interval[0]
                rec.End = interval[1]
                rec.NumCodons = (rec.End - rec.Start + 1) // 3
                rec.Strand = strand
                rec.Name = get_rec_name(rec)
                rec.Parent = 'NA'
                assert (rec.End - rec.Start + 1) % 3 == 0, rec.Name

                # Find overlaps (on either strand) and set up RegType
                codingOverlapTrs = codingOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                pseudoOverlapTrs = pseudoOverlapChecker.overlapping_trs(
                    rec.Chrom, '+-', interval)
                if len(pseudoOverlapTrs) > 0:
                    rec.RegType = PseudoOverlap
                elif any_same_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = CodingOverlap
                elif any_anti_frame(interval, rec.Strand, codingOverlapTrs):
                    rec.RegType = AntisenseOverlap
                else:
                    rec.RegType = NoOverlap
                outRecs.append(rec)

                # Subtract from intervals overlaps with pseudogenes in any frame or
                #     coding in same or antisense frame.
                # Make a new record for each resulting segment
                if rec.RegType != NoOverlap:
                    extIntervals = subtract_trs(interval, rec.Strand,
                                                pseudoOverlapTrs,
                                                codingOverlapTrs)
                    # Distinct loop name so the region's own interval is not
                    # shadowed.
                    for extInterval in extIntervals:
                        subRec = DictClass()
                        subRec.RegType = 'Extension'
                        subRec.Chrom = rec.Chrom
                        subRec.Start = extInterval[0]
                        subRec.End = extInterval[1]
                        subRec.NumCodons = (subRec.End - subRec.Start +
                                            1) // 3
                        subRec.Strand = rec.Strand
                        subRec.Name = get_rec_name(subRec)
                        subRec.Parent = rec.Name
                        assert (subRec.End - subRec.Start +
                                1) % 3 == 0, subRec.Name
                        outRecs.append(subRec)

    outRecs.sort(key=lambda rec: (RegTypes.index(rec.RegType), rec.Chrom, rec.
                                  Strand, rec.Start, rec.End))

    fields = list(Fields)
    fields.remove(Rank)  # We'll insert it at the beginning later
    outDFW = DFW(outFileName, fields)
    try:
        for rec in outRecs:
            outDFW.write_line(rec)
    finally:
        # Close the writer even if a record fails to serialize.
        outDFW.close()

    err_msg('Writing PhyloCSF input file.')
    _write_phyloCSF_in(homeDir)