Example #1
    def updateIndels(self, snp, is_negative_strand):

        contig = snp.chromosome
        lcontig = self.mFasta.getLength(contig)

        # get annotation codes around the insertion/deletion. The indel
        # lies after snp.pos, hence fetch codes at snp.pos and snp.pos + 1
        code = self.mAnnotations.getSequence(contig, "+", snp.pos, snp.pos + 2)
        self.mCode = code

        variants = snp.genotype.split("/")
        for variant in variants:

            if variant[0] == "*":
                self.mVariantType.append("W")

            elif variant[0] == "+":
                toinsert = variant[1:]
                self.mVariantType.append("I")

            elif variant[0] == "-":
                todelete = variant[1:]
                # deletions need to be looked at in a wider range
                self.mVariantType.append("D")

            else:
                raise ValueError("unknown variant sign '%s'" % variant[0])

        # ignore indels that do not touch coding sequence: keep only if at
        # least one of the two flanking positions is coding
        if code[0] not in 'abcABC' and code[1] not in 'abcABC':
            return

        if is_negative_strand:
            variants = [Genomics.complement(x) for x in variants]

        for reference_codon in self.mReferenceCodons:

            variants = snp.genotype.split("/")
            variants = [x[1:] for x in variants]

            for variant in variants:
                if len(variant) % 3 != 0:
                    self.mVariantCodons.append("!")
                else:
                    self.mVariantCodons.append(variant)

            self.mVariantAAs.extend(
                [Genomics.translate(x) for x in self.mVariantCodons])
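
The genotype strings consumed above follow samtools-pileup conventions ("*" = wild type, "+XX" = insertion, "-XX" = deletion); a minimal, self-contained sketch of the same classification (the format assumption is mine):

def classify_indel_genotype(genotype):
    # mirrors the W/I/D codes appended to self.mVariantType above
    codes = []
    for variant in genotype.split("/"):
        if variant[0] == "*":
            codes.append("W")
        elif variant[0] == "+":
            codes.append("I")
        elif variant[0] == "-":
            codes.append("D")
        else:
            raise ValueError("unknown variant sign '%s'" % variant[0])
    return codes

# classify_indel_genotype("*/+AG") == ["W", "I"]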
Example #2
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = it.next()
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:

            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
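
A hedged usage sketch: *motifs* must be a list of (name, compiled regex) pairs, as the loop above requires; the file name and motif list here are illustrative only.

import re
import sys

motifs = [("TATA-box", re.compile("TATA[AT]A[AT]")),
          ("GC-box", re.compile("GGGCGG"))]

for title, hits in countMotifs(open("sequences.fa"), motifs):
    for motif, strand, start, end in hits:
        sys.stdout.write("%s\t%s\t%s\t%i\t%i\n" %
                         (title, motif, strand, start, end))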
Example #3
    def updateSNPs(self, snp, is_negative_strand, pos):
        """update SNPs."""

        contig = snp.chromosome
        lcontig = self.mFasta.getLength(contig)
        reference_base = snp.reference_base

        if snp.genotype in "ACGTacgt":
            # homozygous substitution
            self.mVariantType.append("O")
        else:
            # heterozygous substitution
            self.mVariantType.append("E")

        # switch reference strand codon to correct strand
        if reference_base != "*" and is_negative_strand:
            reference_base = Genomics.complement(reference_base)

        # collect all possible variants of reference codons
        for reference_codon in self.mReferenceCodons:

            self.mReferenceAAs.append(Genomics.translate(reference_codon))

            # process single base changes
            variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)

            if reference_codon[pos] != reference_base:
                raise ValueError(
                    "base mismatch at %i (codon=%s,%i): codon:%s != genome:%s; `%s`"
                    % (snp.pos, reference_codon, pos, reference_codon[pos], reference_base, ";".join(map(str, snp)))
                )

            for variant_base in variant_bases:
                if is_negative_strand:
                    variant_base = Genomics.complement(variant_base)

        self.mVariantAAs.extend([Genomics.translate(x) for x in self.mVariantCodons])
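
The snippet leans on two Genomics helpers; the plain-dictionary stand-ins below are assumptions about their semantics, not the CGAT implementations:

COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A"}
CODON_TABLE = {"ATG": "M", "TGG": "W", "TAA": "*"}  # tiny excerpt only

def complement_base(base):
    # complement a single base, e.g. "a" -> "T"
    return COMPLEMENT[base.upper()]

def translate_codon(codon):
    # translate one codon, returning "X" for codons missing from this toy table
    return CODON_TABLE.get(codon.upper(), "X")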
Example #4
def GetMapSequences(sequences):

    map_cluster2sequence = {}
    map_sequence2cluster = {}
    # cluster sequences by identity, keyed on a hash of each sequence
    for key, sequence in sequences.items():
        h = Genomics.GetHID(sequence)

        if h not in map_cluster2sequence:
            map_cluster2sequence[h] = []

        map_sequence2cluster[key] = h
        map_cluster2sequence[h].append(key)

    return map_cluster2sequence, map_sequence2cluster
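
Genomics.GetHID presumably derives a hash identifier from the sequence, so that identical sequences share a cluster key; a stand-in built on hashlib (an assumption about GetHID's role, not its implementation):

import hashlib

def get_hid(sequence):
    # identical sequences yield identical digests and thus one cluster
    return hashlib.md5(sequence.upper().encode("ascii")).hexdigest()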
Example #5
def countSites(model):
    """count number of expected synonymous/nonsynonymous sites in a grammar.
    """

    ## number of synonymous/non-synonymous sites
    n, s = 0.0, 0.0

    xpi = model.evaluateTerminalFrequencies()[('COD0', 'COD1', 'COD2')]
    pi = {}
    for codon, f in xpi.items():
        pi["".join(codon).upper()] = f

    for codon, freq in pi.items():

        try:
            degeneracy = Genomics.GetDegeneracy(codon)
        except KeyError:
            continue

        for x in range(1, 4):
            d = (degeneracy[x] - 1.0) / 3.0
            s += freq * d
            n += freq * (1.0 - d)


    assert (float("%5.2f" % (n + s)) == 3.0)

    return n, s
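
Genomics.GetDegeneracy presumably reports, per codon position, how many of the four bases leave the encoded amino acid unchanged; a worked line for a four-fold degenerate third position (the codon choice is illustrative):

degeneracy = {1: 1, 2: 1, 3: 4}   # assumed values for GCT (alanine)
d = (degeneracy[3] - 1.0) / 3.0   # == 1.0
# a codon with frequency f then contributes f to s and nothing to n
# at its third position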
Example #6
    def getCopy(self):
        """return a new copy.
        """

        new_entry = Prediction()

        new_entry.mExpand = self.mExpand

        new_entry.mPredictionId = self.mPredictionId
        new_entry.mQueryToken = self.mQueryToken
        new_entry.mQueryFrom = self.mQueryFrom
        new_entry.mQueryTo = self.mQueryTo
        new_entry.mSbjctToken = self.mSbjctToken
        new_entry.mSbjctStrand = self.mSbjctStrand
        new_entry.mSbjctFrom = self.mSbjctFrom
        new_entry.mSbjctTo = self.mSbjctTo
        new_entry.mRank = self.mRank
        new_entry.score = self.score
        new_entry.mQueryLength = self.mQueryLength
        new_entry.mQueryCoverage = self.mQueryCoverage
        new_entry.mNGaps = self.mNGaps
        new_entry.mNFrameShifts = self.mNFrameShifts
        new_entry.mNIntrons = self.mNIntrons
        new_entry.mNSplits = self.mNSplits
        new_entry.mNStopCodons = self.mNStopCodons
        new_entry.mPercentIdentity = self.mPercentIdentity
        new_entry.mPercentSimilarity = self.mPercentSimilarity
        new_entry.mTranslation = self.mTranslation
        new_entry.mSbjctGenomeFrom = self.mSbjctGenomeFrom
        new_entry.mSbjctGenomeTo = self.mSbjctGenomeTo
        new_entry.mAlignmentString = self.mAlignmentString
        new_entry.mQueryAli = self.mQueryAli
        new_entry.mSbjctAli = self.mSbjctAli

        if self.mExpand:
            new_entry.mMapPeptide2Translation = \
                alignlib_lite.py_makeAlignmentVector()
            alignlib_lite.py_copyAlignment(new_entry.mMapPeptide2Translation,
                                           self.mMapPeptide2Translation)
            new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
                new_entry.mAlignmentString)
        else:
            new_entry.mMapPeptide2Translation = None
            new_entry.mMapPeptide2Genome = None

        return new_entry
Example #7
def ClusterPeptidesByHid(peptides):
    """cluster peptide sequences by hid."""
    map_cluster2peptide = {}
    map_peptide2cluster = {}

    # cluster peptides by identity, keyed on a hash of each sequence
    for key, sequence in peptides.items():
        h = Genomics.GetHID(sequence)

        if h not in map_cluster2peptide:
            map_cluster2peptide[h] = []

        map_peptide2cluster[key] = h
        map_cluster2peptide[h].append(key)

    return map_cluster2peptide, map_peptide2cluster
Example #8
def annotatePromoters( iterator, fasta, options ):
    """annotate promoters within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator( iterator )

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand( gene[0][0].strand )
        lcontig = fasta.getLength( gene[0][0].contig )
        promotors = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min( [x.start for x in transcript ] ), max( [x.end for x in transcript ] )
            transcript_ids.append( transcript[0].transcript_id )
            # if tss is directly at start/end of contig, the tss will be within an exon.
            # otherwise, it is outside an exon.
            if is_negative_strand:
                promotors.append( (min( lcontig-options.promotor, ma), min(lcontig, ma + options.promotor)) )
            else:
                promotors.append( (max(0,mi - options.promotor), max(options.promotor,mi)) )

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have changed)
            promotors = Intervals.combine( promotors )
            transcript_ids = ["%i" % (x+1) for x in range(len(promotors) )]
            
        gtf = GTF.Entry()
        gtf.fromGTF( gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id )
        gtf.source = "promotor"

        x = 0
        for start, end in promotors:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write( "%s\n" % str(gtf) )
            npromotors += 1
            x += 1

    E.info( "ngenes=%i, ntranscripts=%i, npromotors=%i" % (ngenes, ntranscripts, npromotors) )
Example #9
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end) 
    that are in third codon position.
    """
    intervals = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # to make life easier, convert to 0-based coordinates,
        # with zero starting at first position in window
        # re-arrange positions on negative strand
        if Genomics.IsNegativeStrand(gff.strand):
            # convert to negative strand coordinates counting from 0
            coordinate_offset = end
            reverse = True
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start
            reverse = False
            coordinate_offset = start

        # clamp istart to the window, then move it to the nearest
        # third codon position
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0
        if frame != 0:
            istart -= (3 - frame)
        istart += 2

        iend = min(iend, end - start)

        for x in range(istart, iend, 3):

            if reverse:
                c = coordinate_offset - x - 1
            else:
                c = coordinate_offset + x
            intervals.append((c, c + 1))

    return Intervals.combineIntervals(intervals)
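
The negative-strand branch recounts coordinates from the window's right edge so that stepping by three always walks codons in transcription order; a quick worked check with illustrative numbers:

start, end = 100, 130
istart, iend = 110, 122                   # feature on the negative strand
istart, iend = end - iend, end - istart
assert (istart, iend) == (8, 20)          # measured from the window's 3' end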
Example #10
    def getEntropy(self, usage=None):
        """return entropy of a source in terms of a reference usage.
        Also called cross entropy or encoding cost.

        Note that here I compute the sum over 20 entropies,
        one for each amino acid.

        If *usage* is not given, the entropy of the source's own codon
        usage is returned.
        """

        e = 0
        freqs = Genomics.CalculateCodonFrequenciesFromCounts(
            self.mCodonCounts, self.mPseudoCounts)
        if usage is None:
            usage = freqs
        for codon, count in list(self.mCodonCounts.items()):
            e -= freqs[codon] * math.log(usage[codon])
        return e
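
The loop computes a cross-entropy, H(p, q) = -sum over codons c of p(c) * log q(c), between the source's codon frequencies p and the reference usage q; a hedged standalone version over plain frequency dicts:

import math

def cross_entropy(p, q):
    # reduces to the ordinary entropy of p when q == p
    return -sum(p[c] * math.log(q[c]) for c in p)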
Example #11
    def fillFromTable(self, table_row):

        if len(table_row) == 25:
            (self.mPredictionId, self.mQueryToken, self.mSbjctToken,
             self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom,
             self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo,
             self.mSbjctAli, self.mQueryLength, self.mQueryCoverage,
             self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits,
             self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity,
             self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
             self.mAlignmentString) = table_row
        elif len(table_row) >= 26:
            (self.mPredictionId, self.mQueryToken, self.mSbjctToken,
             self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom,
             self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo,
             self.mSbjctAli, self.mQueryLength, self.mQueryCoverage,
             self.mNGaps, self.mNFrameShifts, self.mNIntrons, self.mNSplits,
             self.mNStopCodons, self.mPercentIdentity, self.mPercentSimilarity,
             self.mTranslation, self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
             self.mAlignmentString, self.mNAssembled) = table_row[:26]
        else:
            raise ValueError("unknown format: %i fields" % len(table_row))

        if self.mExpand:
            self.mMapPeptide2Translation = \
                alignlib_lite.py_makeAlignmentVector()

            if self.mQueryAli != "" and self.mSbjctAli != "":
                alignlib_lite.py_AlignmentFormatEmissions(
                    self.mQueryFrom, self.mQueryAli, self.mSbjctFrom,
                    self.mSbjctAli).copy(self.mMapPeptide2Translation)

            self.mMapPeptide2Genome = Genomics.String2Alignment(
                self.mAlignmentString)
Example #12
    def GetBlocks(self, s1, s2):
        """the strings have to be already aligned!!!"""

        handle_tmpfile, filename_tmpfile = tempfile.mkstemp()
        os.write(handle_tmpfile, ">s1\n%s\n" % (s1))
        os.write(handle_tmpfile, ">s2\n%s\n" % (s2))
        os.close(handle_tmpfile)

        statement = " ".join("(", self.mEnvironment,
                             self.mExecutable % filename_tmpfile,
                             self.mOptions, ")")

        p = subprocess.Popen(statement,
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (file_stdin, file_stdout, file_stderr) = (p.stdin, p.stdout, p.stderr)

        file_stdin.close()
        lines = file_stdout.readlines()
        lines_stderr = file_stderr.readlines()
        file_stdout.close()
        file_stderr.close()
        exit_code = p.wait()
        if exit_code:
            raise ValueError("Error while executing statement %s" % statement)

        if not os.path.exists(filename_tmpfile + "-gb"):
            os.remove(filename_tmpfile)
            return "", ""

        lines = open(filename_tmpfile + "-gb").readlines()
        r = Genomics.ParseFasta2Hash(lines)

        if not r:
            return "", ""

        os.remove(filename_tmpfile)
        os.remove(filename_tmpfile + "-gb")
        os.remove(filename_tmpfile + "-gb.htm")

        return r['s1'], r['s2']
Example #13
    def loadSequence(self, sequence, seqtype="na"):
        """load sequence properties from a sequence."""

        SequenceProperties.loadSequence(self, sequence, seqtype)

        if len(sequence) % 3:
            raise ValueError(
                '''sequence length is not a multiple of 3 (length=%i)''' %
                (len(sequence)))

        # counts of amino acids
        self.mCountsAA = {}

        for x in Bio.Alphabet.IUPAC.extended_protein.letters:
            self.mCountsAA[x] = 0

        for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):
            aa = Genomics.MapCodon2AA(codon)
            self.mCountsAA[aa] += 1
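
The generator expression above walks the coding sequence codon by codon; spelled out on a literal (independent of the Genomics helpers):

sequence = "ATGGCCTAA"
codons = [sequence[x:x + 3] for x in range(0, len(sequence), 3)]
assert codons == ["ATG", "GCC", "TAA"]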
Example #14
def filterMali(mali, method="3rd"):
    """build a new multiple alignment based on a filter.

    valid methods are
    3rd:        only third positions
    4d:         only four-fold degenerate sites
    """

    if method not in ("3rd", "4d"):
        raise ValueError("unknown method %s" % method)

    if method == "3rd":
        columns = range(2, mali.getWidth(), 3)

    elif method == "4d":
        # translate
        trans_mali = Mali.Mali()
        for id, seq in mali.items():
            s = []
            sequence = seq.mString
            l = len(sequence)
            for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                aa = Genomics.MapCodon2AA(codon)
                s.append(aa)

            trans_mali.addSequence(id, 0, l, "".join(s))

        # get four-fold (or higher) degenerate amino acids
        aa_columns = trans_mali.getColumns()
        columns = []
        for c in range(len(aa_columns)):
            chars = set(aa_columns[c])
            chars = chars.difference(set(mali.mGapChars))
            if len(chars) == 1:
                char = list(chars)[0].upper()
                try:
                    deg = Genomics.DegeneracyAA[char]
                except KeyError:
                    continue
                if deg >= 4:
                    # take the third (degenerate) position of the codon
                    columns.append(c * 3 + 2)

    mali.takeColumns(columns)
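
A hedged usage sketch; Mali is CGAT's multiple-alignment class, and the reader call and file name below are assumptions:

# mali = Mali.Mali()
# mali.readFromFile(open("alignment.fasta"))
# filterMali(mali, "3rd")   # keeps only third codon positions, in place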
Example #15
def WriteOverviewFrequencies(fields, table, options):

    WriteHeader(options)
    output = []

    for x in range(1, len(fields) - 1):
        for y in range(x + 1, len(fields)):
            frequencies = {}

            # collect frequencies per amino acid
            for c in table:
                codon = c[0]
                f1 = c[x]
                f2 = c[y]
                aa = Genomics.MapCodon2AA(codon)
                if aa not in frequencies:
                    frequencies[aa] = []

                frequencies[aa].append((codon, f1, f2))

            changed = {}

            # sort for both genomes, and check if preference has changed
            for aa, codons in frequencies.items():
                codons.sort(key=lambda c: c[1])
                pref_codon1 = codons[-1]
                codons.sort(key=lambda c: c[2])
                pref_codon2 = codons[-1]

                if pref_codon1 == pref_codon2:
                    continue
                else:
                    changed[aa] = [(True, pref_codon1[2], pref_codon1[0]),
                                   (False, pref_codon2[1], pref_codon2[0])]

            output += WriteChanges(fields[x], fields[y], changed, options)

    WriteOutput(output, options)
Example #16
def maskMali(mali, method="seg"):
    """mask multiple alignment according to an external masker.
    """

    if method == "seg":
        masker = Masker.MaskerSeg()
    elif method == "bias":
        masker = Masker.MaskerBias()
    elif method == "random":
        masker = Masker.MaskerRandom()

    if mali.getAlphabet() == "na" and method in ("seg", "bias"):
        for id, s in mali.items():
            ss = Genomics.TranslateDNA2Protein(s.mString)
            mss = masker(ss)
            columns = []
            for x in range(0, len(mss)):
                if mss[x] in ("X", "x"):
                    columns += range(x, x + 3)
            mali.getEntry(id).maskColumns(columns)
    else:
        for id, s in mali.items():
            mali[id].mString = masker(s.mString)
Example #17
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "apply",
                          "change-format",
                          "renumber-reads",
                          "sample",
                          "sort",
                          "trim3",
                          "trim5",
                          "unique",
                          "reverse-complement",
                          "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method is None:
        raise ValueError("no method specified, please use --method")

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
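
Everything dispatches on --method and streams FASTQ through stdin/stdout; hedged invocation sketches (the script name fastq2fastq.py is inferred from the option set and is an assumption):

# python fastq2fastq.py --method=sample --sample-size=0.1 < in.fastq > out.fastq
# python fastq2fastq.py --method=trim3 --num-bases=5 < in.fastq > out.fastq
# python fastq2fastq.py --method=renumber-reads --pattern-identifier="read_%010i" < in.fastq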
Example #18
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--read-length-mean", dest="read_length_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--read-length-std", dest="read_length_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-mean", dest="coverage_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-std", dest="coverage_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-mean", dest="ds_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-std", dest="ds_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-mean", dest="error_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-std", dest="error_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--min-read-length", dest="min_read_length", type="int",
                      help="minimum read length [default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="randomly sample from selected transcripts [default=%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test with # first entries [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("genes", "transcripts"),
                      help="use genes or transcripts [default=%default].")

    parser.set_defaults(
        genome_file=None,
        read_length_mean=200.0,
        read_length_stddev=20.0,
        coverage_mean=2.0,
        coverage_stddev=1.0,
        ds_mean=None,
        ds_stddev=None,
        error_mean=None,
        error_stddev=None,
        min_read_length=50,
        test=None,
        mode="transcripts",
        output_filename_pattern=None,
        output_format_id="%010i",
        sample_size=0,
    )

    (options, args) = E.Start(parser, argv)

    assert options.genome_file, "please supply an indexed genome."

    if options.output_filename_pattern:
        outfile_stats = IOTools.openFile(options.output_filename_pattern % "stats", "w")
        outfile_stats.write(
            "id\tlen\tnreads\tlen_mean\tlen_std\tcov_mean\tcov_std\n")
        outfile_map = IOTools.openFile(options.output_filename_pattern % "map", "w")
        outfile_map.write("id\ttranscript\n")
    else:
        outfile_stats = None
        outfile_map = None

    genome = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped = 0, 0, 0

    total_counts, total_read_lengths, total_len = [], [], 0
    total_pids = []
    total_error_pids = []

    if options.mode == "transcripts":
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.transcript_id
    elif options.mode == "genes":
        iterator = GTF.flat_gene_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin), feature="exon"))
        getId = lambda x: x.gene_id

    if options.sample_size:
        iterator = Iterators.sample(iterator)

    if options.ds_mean:
        do_mutate = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_mutate = False

    if options.error_mean:
        do_error = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_error = False

    for gtfs in iterator:

        id = getId(gtfs[0])

        try:
            sequence = GTF.toSequence(gtfs, genome)
        except KeyError as msg:
            if options.loglevel >= 2:
                options.stdlog.write("# skipping %s: %s\n" % (id, msg))
            nskipped += 1
            continue

        lsequence = len(sequence)

        if lsequence <= options.min_read_length * 2:
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# skipping %s - sequence is too short: %i\n" % (id, lsequence))
            nskipped += 1
            continue

        ninput += 1

        if do_mutate:
            new_sequence = getMutatedSequence(sequence, options.ds_mean)
            pid_calc.loadPair(sequence, new_sequence)
            pid = pid_calc.mPID
            total_pids.append(pid)
            sequence = new_sequence
        else:
            pid = 100.0

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing %s - len=%i\n" % (id, lsequence))
            options.stdlog.flush()

        total_len += lsequence
        lvsequence = lsequence * \
            random.gauss(options.coverage_mean, options.coverage_stddev)

        covered = 0
        counts = numpy.zeros(lsequence)
        nreads = 0

        error_pids, read_lengths = [], []

        while covered < lvsequence:

            read_length = int(
                random.gauss(options.read_length_mean, options.read_length_stddev))
            positive = random.randint(0, 1)

            if positive:
                start = random.randint(0, lsequence)
                end = min(lsequence, start + read_length)
            else:
                end = random.randint(0, lsequence)
                start = max(0, end - read_length)

            read_length = end - start
            if read_length < options.min_read_length:
                continue

            segment = sequence[start:end]

            if not positive:
                segment = Genomics.complement(segment)

            noutput += 1

            if do_error:
                new_segment = getMutatedSequence(segment, options.error_mean)
                pid_calc.loadPair(segment, new_segment)
                pid = pid_calc.mPID
                error_pids.append(pid)
                segment = new_segment
            else:
                pid = 100.0

            options.stdout.write(
                ">%s\n%s\n" % (options.output_format_id % noutput, segment))

            if outfile_map:
                outfile_map.write(
                    "%s\t%s\n" % (id, options.output_format_id % noutput))

            for x in range(start, end):
                counts[x] += 1

            nreads += 1

            covered += read_length
            read_lengths.append(read_length)

        if options.loglevel >= 2:
            options.stdlog.write(
                "# transcript %s: len=%i, nreads=%i, len_mean=%.2f, "
                "len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" %
                (id, lsequence, nreads,
                 numpy.mean(read_lengths), numpy.std(read_lengths),
                 numpy.mean(counts), numpy.std(counts)))
        if outfile_stats:
            outfile_stats.write(
                "%s\t%i\t%i\t%.2f\t%.2f\t%.2f\t%.2f\n" %
                (id, lsequence, nreads,
                 numpy.mean(read_lengths), numpy.std(read_lengths),
                 numpy.mean(counts), numpy.std(counts)))

        total_counts += list(counts)
        total_read_lengths += read_lengths
        total_error_pids += error_pids

        if options.test and ninput >= options.test:
            break

        if options.sample_size and ninput >= options.sample_size:
            break

    if options.loglevel >= 1:
        output = ["len=%i, nreads=%i" % (total_len,
                                         noutput)]
        output.append("len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f" % (
            numpy.mean(total_read_lengths),
            numpy.std(total_read_lengths),
            numpy.mean(total_counts),
            numpy.std(total_counts)))

        no_uncovered = [x for x in total_counts if x > 0]

        output.append("cov0_mean=%.2f, cov0_stddev=%.2f" % (numpy.mean(no_uncovered),
                                                            numpy.std(no_uncovered)))

        if do_mutate:
            output.append("pid_mean=%.2f, pid_std=%.2f" %
                          (numpy.mean(total_pids), numpy.std(total_pids)))

        if do_error:
            output.append("pid_error_mean=%.2f, pid_error_std=%.2f" %
                          (numpy.mean(total_error_pids), numpy.std(total_error_pids)))

        options.stdlog.write("# effective: %s\n" % ", ".join(output))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
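
Read placement above anchors forward reads at a random start and reverse reads at a random end, clamping at the transcript boundaries; a stripped-down sketch of that step:

import random

def place_read(lsequence, read_length):
    # forward reads anchor at a random start, reverse reads at a random end
    if random.randint(0, 1):
        start = random.randint(0, lsequence)
        end = min(lsequence, start + read_length)
    else:
        end = random.randint(0, lsequence)
        start = max(0, end - read_length)
    return start, end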
Example #19
def buildCompactVariantSequences(variants, sequences):
    '''build variant sequences by inserting ``variants`` into ``sequences``.

    The original frame of the sequence is maintained by
    converting the input sequence to a list. Each entry
    in the list corresponds to a position in a wild type.

    The wild type (WT) sequence is lower case

    SNP:                    variant (ambiguity codes for variants)
    homozygous insertion:   upper-case bases after lower-case (WT) base
    heterozygous insertion: lower-case bases after lower-case (WT) base
    homozygous deletion:    empty fields
    heterozygous deletion:  "-" after lower-case (WT) base

    returns a dictionary of lists.
    '''

    result = {}
    for key, sequence in sequences.iteritems():
        variant_seq = list(sequence.lower())
        start, end = key

        # get all variants that overlap with sequences
        for var_start, var_end, values in variants.find(start, end):
            reference, action, has_wildtype, variantseqs = values

            is_homozygous = len(variantseqs) == 1 and not has_wildtype

            rel_start, rel_end = var_start - start, var_end - start
            startoffset = max(0, start - var_start)
            endoffset = max(0, var_end - end)

            if action == "=":
                assert rel_start >= 0
                assert sequence[rel_start].upper() == reference, \
                    'reference base mismatch: expected %s, got %s at %i-%i' % \
                    (sequence[rel_start].upper(), reference,
                     var_start, var_end)

                if is_homozygous:
                    variant_seq[rel_start] = variantseqs[0]
                else:
                    variant_seq[rel_start] = Genomics.resolveReverseAmbiguousNA(
                        "".join(variantseqs))

            elif action == "-":

                xstart, xend = max(0, rel_start), min(len(sequence), rel_end)

                for variant in variantseqs:
                    # truncated for variants of unequal lengths (-AA/-AAA)
                    refseq = sequence[xstart:xend].upper()[:len(variant)]

                    assert refseq == variant[startoffset:len(variant) - endoffset], \
                        'reference base mismatch at deletion: expected %s %s %s, got %s[%i:%i] at %i-%i (%i-%i), action=%s' % \
                        (sequence[xstart - 10:xstart],
                         refseq,
                         sequence[xend:xend + 10],
                         variant, startoffset, len(variant) - endoffset,
                         var_start, var_end, start, end,
                         action)

                    l = len(variant) - startoffset - endoffset

                    if is_homozygous:
                        variant_seq[xstart:xend] = [""] * l
                    else:
                        for x in range(xstart, xend):
                            if variant_seq[x].endswith("-"):
                                assert not has_wildtype
                                variant_seq[x] = ""
                            else:
                                variant_seq[x] += "-"

            elif action == "+":

                if is_homozygous:
                    variant_seq[rel_start] += variantseqs[0].upper()
                else:
                    if has_wildtype:
                        variant_seq[rel_start] += variantseqs[0].upper()
                    else:
                        # merge indels like +AAA/+AA
                        a, b = variantseqs
                        if a.startswith(b):
                            variant_seq[
                                rel_start] += b.upper() + a[len(b):].lower()
                        elif b.startswith(a):
                            variant_seq[
                                rel_start] += a.upper() + b[len(a):].lower()
                        else:
                            raise ValueError(
                                "don't know how to encode variant: %s" % variantseqs)

        result[(start, end)] = variant_seq
    return result
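
A worked sketch of the encoding the docstring describes (positions and bases are illustrative):

variant_seq = list("acgt")
variant_seq[1] += "GG"   # homozygous insertion after the wild-type base
variant_seq[2] += "-"    # heterozygous deletion of the wild-type base
assert variant_seq == ["a", "cGG", "g-", "t"]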
Example #20
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-f", "--exons-file", dest="filename_exons", type="string",
                      help="filename with transcript model information (gtf formatted file)  [default=%default].")
    parser.add_option("-r", "--filename-reference", dest="filename_reference", type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default].")
    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option("-k", "--with-knockouts", dest="with_knockouts", action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from an SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id",
             "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print "# collected variants:", variants

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print "# merged variants:", variants

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print "exon", key
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print "intron", key
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] + variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(allele.exon_starts[1:],
                                                                allele.cds_starts[
                                                                    1:],
                                                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig - allele.reference_first_stop_start, )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
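# A minimal, self-contained sketch (not part of the script above) of the
# namedtuple reformatting used before tabular output: list-valued fields
# are collapsed into comma-separated strings via _replace(). The
# "AlleleSketch" tuple and its fields here are illustrative assumptions.
import collections

AlleleSketch = collections.namedtuple(
    "AlleleSketch", ("cds", "cds_starts", "exon_starts", "frames"))
sketch = AlleleSketch(cds="ATGAAA", cds_starts=[0, 3],
                      exon_starts=[100, 200], frames=[0, 0])
sketch = sketch._replace(
    cds_starts=",".join(map(str, sketch.cds_starts)),
    exon_starts=",".join(map(str, sketch.exon_starts)),
    frames=",".join(map(str, sketch.frames)))
assert "\t".join(map(str, sketch)) == "ATGAAA\t0,3\t100,200\t0,0"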
Ejemplo n.º 22
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t",
        "--tablename",
        dest="tablename",
        type="string",
        help=
        "tablename to get variants from (in samtools pileup format) [default=%default]."
    )
    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f",
        "--exons-file",
        dest="filename_exons",
        type="string",
        help=
        "filename with transcript model information (gtf formatted file)  [default=%default]."
    )
    parser.add_option(
        "-r",
        "--filename-reference",
        dest="filename_reference",
        type="string",
        help=
        "filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default]."
    )
    parser.add_option(
        "--vcf-file",
        dest="filename_vcf",
        type="string",
        help=
        "filename with variants in VCF format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--pileup-file",
        dest="filename_pileup",
        type="string",
        help=
        "filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--vcf-sample",
        dest="vcf_sample",
        type="string",
        help=
        "sample id for species of interest in vcf formatted file [default=%default]."
    )
    parser.add_option(
        "-s",
        "--seleno-tsv-file",
        dest="filename_seleno",
        type="string",
        help=
        "filename of a list of transcript ids that are selenoproteins [default=%default]."
    )
    parser.add_option("-m",
                      "--module",
                      dest="modules",
                      type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k",
        "--with-knockouts",
        dest="with_knockouts",
        action="store_true",
        help=
        "add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            ##############################################################
            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:], allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
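# Worked illustration of the strand flip used when writing GTF entries and
# stop-codon coordinates above: reverse-strand, 0-based half-open intervals
# on a contig of length lcontig map to positive-strand coordinates via
# (start, end) -> (lcontig - end, lcontig - start). A minimal sketch:
def to_positive_strand(start, end, lcontig):
    """Flip a 0-based half-open interval to the opposite strand."""
    return lcontig - end, lcontig - start

assert to_positive_strand(2, 4, 7) == (3, 5)
# the transformation is its own inverse:
assert to_positive_strand(3, 5, 7) == (2, 4)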
Ejemplo n.º 23
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/cds2codons.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="filename with mapping information.")
    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output file format [fasta-codons].")
    parser.add_option("-c",
                      "--codons",
                      dest="codons",
                      action="store_true",
                      help="print codons separated by spaces.")

    parser.set_defaults(
        filename_cds=None,
        codons=False,
        format="fasta",
        filename_map=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_map:
        raise "please supply filename with map between peptide to cds."

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.Read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.filename_cds:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_cds, "r"))
    else:
        sequences = Genomics.ReadPeptideSequences(sys.stdin)

    if options.loglevel >= 1:
        print "# read %i sequences" % len(sequences)
        sys.stdout.flush()

    ninput, nskipped, noutput, nerrors, nstops = 0, 0, 0, 0, 0

    for key, s in sequences.items():

        ninput += 1

        if key not in map_old2new:
            nskipped += 1
            continue

        out_seq = []

        m = map_old2new[key]
        m.Expand()
        mm = m.mMapOld2New

        if mm.getColTo() > len(s):
            options.stderr.write(
                "# error for %s: sequence shorter than alignment: %i < %i\n" %
                (key, len(s), mm.getColTo()))
            nerrors += 1
            continue

        for x in range(mm.getRowFrom(), mm.getRowTo() + 1):

            y = mm.mapRowToCol(x)
            if y > 0:
                out_seq.append(s[y - 1])

        m.Clear()

        out_seq = "".join(out_seq)
        translation = Genomics.TranslateDNA2Protein(out_seq)

        if "X" in translation:
            nstops += 1

        if options.codons:
            out_seq = " ".join(
                [out_seq[x:x + 3] for x in range(0, len(out_seq), 3)])

        noutput += 1
        options.stdout.write(">%s\n%s\n" % (key, out_seq))

    options.stderr.write("# input=%i, output=%i, errors=%i, stops=%i\n" %
                         (ninput, noutput, nerrors, nstops))

    E.Stop()
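# A small usage sketch of the --codons formatting applied above: the
# output CDS is split into space-separated triplets.
cds = "ATGGCCAAA"
assert " ".join(cds[x:x + 3] for x in range(0, len(cds), 3)) == "ATG GCC AAA"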
Ejemplo n.º 24
0
    def getSequence(self,
                    contig,
                    strand="+",
                    start=0,
                    end=0,
                    converter=None,
                    as_array=False):
        """get a genomic fragment.

        A genomic fragment is identified by the coordinates
        contig, strand, start, end.

        The converter function, if supplied, translates these coordinates
        into 0-based coordinates. By default, start and end are assumed
        to be pythonic (0-based, half-open) forward/reverse strand coordinates.

        If as_array is set to true, return the AString object. This might
        be beneficial for large sequence chunks. If as_array is set to False,
        return a python string.
        """

        contig = self.getToken(contig)

        data = self.mIndex[contig]
        # dummy is
        # -> pos_seq for seekable streams
        # -> block_size for unseekable streams
        try:
            pos_id, dummy, lsequence = struct.unpack("QQi", data)
        except (struct.error, TypeError):
            pos_id, dummy, lsequence, points = data

        pos_seq = dummy
        block_size = dummy

        if end == 0:
            end = lsequence

        if end > lsequence:
            raise ValueError("3' coordinate on %s out of bounds: %i > %i" %
                             (contig, end, lsequence))

        if start < 0:
            raise ValueError("5' coordinate on %s out of bounds: %i < 0" %
                             (contig, start))

        if converter:
            first_pos, last_pos = converter(start, end,
                                            str(strand) in ("+", "1"),
                                            lsequence)
        elif self.mConverter:
            first_pos, last_pos = self.mConverter(start, end,
                                                  str(strand) in ("+", "1"),
                                                  lsequence)
        else:
            first_pos, last_pos = start, end
            if str(strand) in ("-", "0", "-1"):
                first_pos, last_pos = lsequence - \
                    last_pos, lsequence - first_pos

        if first_pos == last_pos:
            return ""

        assert first_pos < last_pos, \
            "first position %i is larger than last position %i " % \
            (first_pos, last_pos)

        p = AString()

        if self.mNoSeek:
            # read directly from position
            p.fromstring(
                self.mDatabaseFile.read(block_size, data[3], first_pos,
                                        last_pos))
        else:
            first_pos += pos_seq
            last_pos += pos_seq

            self.mDatabaseFile.seek(first_pos)
            p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))

        if str(strand) in ("-", "0", "-1"):
            p = AString(Genomics.reverse_complement(str(p)))

        if self.mTranslator:
            return self.mTranslator.translate(p)
        elif as_array:
            return p
        else:
            return p.tostring().decode("ascii")
Ejemplo n.º 25
0
def buildCompactVariantSequences(variants, sequences):
    '''build variant sequences by inserting ``variants`` into ``sequences``.

    The original frame of the sequence is maintained by
    converting the input sequence to a list. Each entry
    in the list corresponds to a position in the wild-type sequence.

    The wild-type (WT) sequence is lower case.

    SNP:                    variant (ambiguity codes for variants)
    homozygous insertion:   upper-case bases after lower-case (WT) base
    heterozygous insertion: lower-case bases after lower-case (WT) base
    homozygous deletion:    empty fields
    heterozygous deletion:  "-" after lower-case (WT) base

    returns a dictionary of lists.
    '''

    result = {}
    for key, sequence in sequences.items():
        variant_seq = list(sequence.lower())
        start, end = key

        # get all variants that overlap with sequences
        for var_start, var_end, values in variants.find(start, end):
            reference, action, has_wildtype, variantseqs = values

            is_homozygous = len(variantseqs) == 1 and not has_wildtype

            rel_start, rel_end = var_start - start, var_end - start
            startoffset = max(0, start - var_start)
            endoffset = max(0, var_end - end)

            if action == "=":
                assert rel_start >= 0
                assert sequence[rel_start].upper() == reference, \
                    'reference base mismatch: expected %s, got %s at %i-%i' % \
                    (sequence[rel_start].upper(), reference,
                     var_start, var_end)

                if is_homozygous:
                    variant_seq[rel_start] = variantseqs[0]
                else:
                    variant_seq[
                        rel_start] = Genomics.resolveReverseAmbiguousNA(
                            "".join(variantseqs))

            elif action == "-":

                xstart, xend = max(0, rel_start), min(len(sequence), rel_end)

                for variant in variantseqs:
                    # truncated for variants of unequal lengths (-AA/-AAA)
                    refseq = sequence[xstart:xend].upper()[:len(variant)]

                    assert refseq == variant[startoffset:len(variant) - endoffset], \
                        'reference base mismatch at deletion: expected %s %s %s, got %s[%i:%i] at %i-%i (%i-%i), action=%s' % \
                        (sequence[xstart - 10:xstart],
                         refseq,
                         sequence[xend:xend + 10],
                         variant, startoffset, len(variant) - endoffset,
                         var_start, var_end, start, end,
                         action)

                    l = len(variant) - startoffset - endoffset

                    if is_homozygous:
                        variant_seq[xstart:xend] = [""] * l
                    else:
                        for x in range(xstart, xend):
                            if variant_seq[x].endswith("-"):
                                assert not has_wildtype
                                variant_seq[x] = ""
                            else:
                                variant_seq[x] += "-"

            elif action == "+":

                if is_homozygous:
                    variant_seq[rel_start] += variantseqs[0].upper()
                else:
                    if has_wildtype:
                        variant_seq[rel_start] += variantseqs[0].upper()
                    else:
                        # merge indels like +AAA/+AA
                        a, b = variantseqs
                        if a.startswith(b):
                            variant_seq[rel_start] += b.upper(
                            ) + a[len(b):].lower()
                        elif b.startswith(a):
                            variant_seq[rel_start] += a.upper(
                            ) + b[len(a):].lower()
                        else:
                            raise ValueError(
                                "don't know how to encode variant: %s" %
                                variantseqs)

        result[(start, end)] = variant_seq
    return result
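# Hand-built illustration, following the docstring's encoding, of the
# per-position list this function returns for a wild-type fragment "acgt"
# with a heterozygous SNP c/T at position 1 (IUPAC ambiguity code "Y"),
# a homozygous insertion of TT after position 2, and a homozygous
# deletion of position 3:
variant_seq = ["a", "Y", "gTT", ""]
assert "".join(variant_seq) == "aYgTT"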
Ejemplo n.º 26
0
def ProcessResult(result, options, mali=None, prefix=None, p_value=None):

    counts = None

    if options.method == "summary-slr":

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            map(lambda x: "%i" % result.mNPositiveSites[x][0], thresholds)))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            map(lambda x: "%i" % result.mNNegativeSites[x], thresholds)))
        options.stdout.write("\n")

    elif options.method in ("summary-filtered", "positive-site-table",
                            "negative-site-table", "neutral-site-table",
                            "positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        mali_length = mali.getLength()
        mali_width = mali.getWidth()
        column_data = [Mali.MaliData(x, gap_chars="Nn", mask_chars="-.")
                       for x in mali.getColumns()]

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            raise "mali (%i) and # of sites (%i) do not correspond." % (
                mali_width, len(result.mSites))

        if options.method == "summary-filtered":
            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nneutral = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for x in range(len(result.mSites)):
                site = result.mSites[x]
                column = column_data[x * 3]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength, result.mOmega, result.mKappa,
                 result.mLogLikelihood, len(result.mSites), nfiltered, ntotal,
                 nsynonymous, nnegative, npositive))
            counts = Result(nfiltered, ntotal, nsynonymous, nnegative,
                            npositive)

        elif options.method in (
                "positive-site-table",
                "negative-site-table",
                "neutral-site-table",
                "positive-site-list",
                "negative-site-list",
                "neutral-site-list",
        ):

            select_positive_sites = options.method in ("positive-site-table",
                                                       "positive-site-list")
            select_negative_sites = options.method in ("negative-site-table",
                                                       "negative-site-list")

            # iterate over sites and output those under xxx selection
            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for x in range(mali_length)]

            sites = []

            for col in range(len(result.mSites)):

                site = result.mSites[col]
                column = column_data[col * 3]

                if column.mNChars != mali_length:
                    continue

                keep = False

                if select_positive_sites and site.isPositive(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                elif select_negative_sites and site.isNegative(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                if not keep:
                    continue

                sites.append((col, site))

            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance
                sites.sort(key=lambda x: x[1].mPValue)
                sites = sites[:options.truncate_sites_list]

            for col, site in sites:

                site = result.mSites[col]
                xcol = col * 3

                for row in range(mali_length):
                    id = identifiers[row]
                    x = max(xcol - options.context_size * 3, 0)
                    y = min(xcol + 3 + options.context_size * 3, mali_width)
                    segment = mali[id][x:y]
                    codon = mali[id][xcol:xcol + 3]
                    pos = mali.getResidueNumber(id, xcol)
                    pos //= 3  # convert nucleotide position to codon number

                    # save as real-world coordinates
                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon), pos + 1, xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if options.method in ("positive-site-table", "negative-site-table",
                                  "neutral-site-table"):

                if options.context_size:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value, ";".join([
                                "%s%i in %s" %
                                (x.mAA, x.mSequencePosition, x.mContext)
                                for x in chars_per_row[row]
                            ])))
                else:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value, ";".join([
                                "%s%i" % (x.mAA, x.mSequencePosition)
                                for x in chars_per_row[row]
                            ])))

            elif options.method in ("positive-site-list", "negative-site-list",
                                    "neutral-site-list"):

                for row in range(mali_length):

                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])
                    x = 0
                    for chars in chars_per_row[row]:
                        x += 1
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, x, chars.mAA, chars.mSequencePosition,
                             chars.mMaliPosition, chars.mContext))

    options.stdout.flush()

    return counts
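# The site positions reported above come from alignment columns: a
# nucleotide column xcol maps to codon index xcol // 3, written out
# 1-based. A small sanity check of that arithmetic:
for xcol, codon_number in ((0, 1), (3, 2), (8, 3)):
    assert xcol // 3 + 1 == codon_number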
Ejemplo n.º 27
0
    def updateProperties(self):

        SequencePropertiesCodons.updateProperties(self)

        self.mCodonFrequencies = Genomics.CalculateCodonFrequenciesFromCounts(
            self.mCodonCounts)
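# A minimal sketch of deriving codon frequencies from raw counts,
# assuming simple normalization by the total; the Genomics helper used
# above may instead normalize within synonymous codon families.
def codon_frequencies_from_counts(counts):
    total = float(sum(counts.values()))
    return {codon: n / total for codon, n in counts.items()}

assert codon_frequencies_from_counts({"ATG": 2, "AAA": 2}) == \
    {"ATG": 0.5, "AAA": 0.5}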
Ejemplo n.º 28
0
File: Variants.py Project: SCV/cgat
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use the same coordinate
    system (and strand) as the transcript.

    fixes 1-ness of variants
    '''

    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position.
                # Use both flanking bases as the region so that negative
                # strand coordinates work.
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper()) for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype, variantseqs)))

    return new_variants
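# Worked illustration of the unequal-length deletion padding above:
# -AT/-ATG is filled with "-" to equal length so that complementing and
# strand-flipping keep the deleted bases aligned.
variantseqs = ["AT", "ATG"]
lvariant = max(len(x) for x in variantseqs)
assert [x + "-" * (lvariant - len(x)) for x in variantseqs] == ["AT-", "ATG"]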
Ejemplo n.º 29
0
    def buildSequenceVariants(self, seq, strand, pos, snp):
        '''build new sequence by modifying a sequence fragment in seq at
        pos with snp.

        It is assumed that seq is already oriented according to strand.
        The strand is used to revert the snp if necessary.

        Note that only sequences different from seq will be returned.

        returns is_homozygous, seqs
        '''
        is_negative_strand = Genomics.IsNegativeStrand(strand)
        reference_base = snp.reference_base

        if reference_base != "*" and is_negative_strand:
            reference_base = Genomics.complement(reference_base)

        new_sequences = []
        is_homozygous = True
        if reference_base != "*":
            if seq[pos].upper() != reference_base.upper():
                raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" %
                                 (snp.pos, reference_base, seq[pos], seq, pos,
                                  ";".join(map(str, snp))))

            # single base changes
            variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)
            if len(variant_bases) == 1:
                is_homozygous = True
            else:
                is_homozygous = False

            for variant_base in variant_bases:
                if is_negative_strand:
                    variant_base = Genomics.complement(variant_base)

                s = list(seq)
                s[pos] = variant_base
                s = "".join(s)
                if s != seq:
                    new_sequences.append(s)
        else:
            variants = snp.genotype.split("/")
            is_homozygous = False
            for variant in variants:

                s = list(seq)
                # samtools denotes insert/deletion after position
                # while python is before/at position, hence the pos+1
                if variant[0] == "+":
                    toinsert = variant[1:].upper()
                    if is_negative_strand:
                        toinsert = Genomics.complement(toinsert)
                        s.insert(pos, toinsert)
                    else:
                        s.insert(pos + 1, toinsert)

                elif variant[0] == "-":
                    # pos+1+len(x)-1 = pos+len(x)
                    todelete = variant[1:].upper()
                    l = len(todelete)
                    if is_negative_strand:
                        # delete left of pos
                        xstart = max(0, pos - l)
                        xend = pos
                        todelete = todelete[:min(l, pos)]
                    else:
                        # delete right of pos
                        xstart = pos + 1
                        xend = min(self.mSize, pos + 1 + l)
                        todelete = todelete[:self.mSize - (pos + 1)]

                    deleted = "".join(s[xstart:xend])

                    if is_negative_strand:
                        deleted = Genomics.complement(deleted)

                    if deleted != todelete:
                        raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" %
                                         (snp.pos, todelete, deleted, seq, pos, xstart, xend,
                                          is_negative_strand,
                                          ";".join(map(str, snp))))
                    del s[xstart:xend]

                elif variant[0] == "*":
                    is_homozygous = True
                else:
                    raise ValueError("unknown variant sign '%s'" % variant[0])

                s = "".join(s)
                if s != seq:
                    new_sequences.append(s)

        return is_homozygous, new_sequences
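# Small illustration of the samtools insertion convention handled above:
# an insertion is reported *after* snp.pos, so on a forward-oriented
# sequence the bases go in at index pos + 1 (on a reverse-oriented
# sequence the same event lands at pos, left of the base).
s = list("ACGT")
pos, toinsert = 1, "TT"
s.insert(pos + 1, toinsert)
assert "".join(s) == "ACTTGT"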
Ejemplo n.º 30
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option(
        "--merge-exons-distance",
        dest="merge_exons_distance",
        type="int",
        help="distance in nucleotides between " "exons to be merged [%default].",
    )

    parser.add_option(
        "--pattern-identifier",
        dest="pattern",
        type="string",
        help="pattern to use for renaming genes/transcripts. "
        "The pattern should contain a %i, for example "
        "--pattern-identifier=ENSG%010i [%default].",
    )

    parser.add_option(
        "--sort-order",
        dest="sort_order",
        type="choice",
        choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"),
        help="sort input data [%default].",
    )

    parser.add_option(
        "-u",
        "--with-utr",
        dest="with_utr",
        action="store_true",
        help="include utr in merged transcripts " "[%default].",
    )

    parser.add_option(
        "--filter-method",
        dest="filter_method",
        type="choice",
        choices=(
            "gene",
            "transcript",
            "longest-gene",
            "longest-transcript",
            "representative-transcript",
            "proteincoding",
            "lincrna",
        ),
        help="Filter method to apply. Available filters are: "
        "'gene': filter by gene_id given in ``--map-tsv-file``, "
        "'transcript': filter by transcript_id given in ``--map-tsv-file``, "
        "'longest-gene': output the longest gene for overlapping genes ,"
        "'longest-transcript': output the longest transcript per gene,"
        "'representative-transcript': output the representative transcript "
        "per gene. The representative transcript is the transcript "
        "that shares most exons with other transcripts in a gene. "
        "The input needs to be sorted by gene. "
        "'proteincoding': only output protein coding features. "
        "'lincrna': only output lincRNA features. "
        "[%default].",
    )

    parser.add_option(
        "-a",
        "--map-tsv-file",
        dest="filename_filter",
        type="string",
        metavar="tsv",
        help="filename of ids to map/filter [%default].",
    )

    parser.add_option(
        "--gff-file",
        dest="filename_gff",
        type="string",
        metavar="GFF",
        help="second filename of features (see --remove-overlapping) " "[%default]",
    )

    parser.add_option(
        "--invert-filter",
        dest="invert_filter",
        action="store_true",
        help="when using --filter, invert selection " "(like grep -v). " "[%default].",
    )

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="int",
        help="extract a random sample of size # if the option "
        "'--method=filter --filter-method' is set "
        "[%default].",
    )

    parser.add_option(
        "--intron-min-length",
        dest="intron_min_length",
        type="int",
        help="minimum length for introns (for --exons-file2introns) " "[%default].",
    )

    parser.add_option(
        "--min-exons-length",
        dest="min_exons_length",
        type="int",
        help="minimum length for gene (sum of exons) " "(--sam-fileple-size) [%default].",
    )

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end " "(--exons-file2introns) [%default].",
    )

    parser.add_option(
        "--ignore-strand",
        dest="ignore_strand",
        action="store_true",
        help="remove strandedness of features (set to '.') when "
        "using ``transcripts2genes`` or ``filter``"
        "[%default].",
    )

    parser.add_option(
        "--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[%default]"
    )

    parser.add_option(
        "--duplicate-feature",
        dest="duplicate_feature",
        type="choice",
        choices=("gene", "transcript", "both", "ucsc", "coordinates"),
        help="remove duplicates by gene/transcript. "
        "If ``ucsc`` is chosen, transcripts ending on _dup# are "
        "removed. This is necessary to remove duplicate entries "
        "that are next to each other in the sort order "
        "[%default]",
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        action="append",
        choices=(
            "add-protein-id",
            "exons2introns",
            "filter",
            "find-retained-introns",
            "genes-to-unique-chunks",
            "intersect-transcripts",
            "join-exons",
            "merge-exons",
            "merge-transcripts",
            "merge-genes",
            "merge-introns",
            "remove-overlapping",
            "remove-duplicates",
            "rename-genes",
            "rename-transcripts",
            "rename-duplicates",
            "renumber-genes",
            "renumber-transcripts",
            "set-transcript-to-gene",
            "set-gene-to-transcript",
            "set-protein-to-transcript",
            "set-score-to-distance",
            "set-gene_biotype-to-source",
            "sort",
            "transcript2genes",
            "unset-genes",
        ),
        help="Method to apply [%default]." "Please only select one.",
    )

    parser.set_defaults(
        sort_order="gene",
        filter_method="gene",
        pattern="%i",
        merge_exons_distance=0,
        filename_filter=None,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        ignore_strand=False,
        with_utr=False,
        invert_filter=False,
        duplicate_feature=None,
        strict=True,
        method=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.method is None:
        raise ValueError("please specify a --method")

    if len(options.method) > 1:
        raise ValueError("multiple --method arguements specified")
    else:
        options.method = options.method[0]

    if options.method == "set-transcript-to-gene":

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.method == "set-gene_biotype-to-source":

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            if "gene_biotype" not in gff:
                gff.setAttribute("gene_biotype", gff.source)

            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.method == "remove-duplicates":

        counts = collections.defaultdict(int)

        if options.duplicate_feature == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub(r"_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.duplicate_feature == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(["\n".join([str(y) for y in xx]) for xx in x])
            elif options.duplicate_feature == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.duplicate_feature == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.duplicate_feature == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" % (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" % (id, counts[id]))

    elif "sort" == options.method:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort_order):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-gene-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif "set-protein-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "add-protein-id" == options.method:

        transcript2protein = IOTools.readMap(IOTools.openFile(options.filename_filter, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to " "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" % len(missing))

    elif "join-exons" == options.method:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif "merge-genes" == options.method:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)), sort_by="contig-strand-start"
        )

        def iterate_chunks(gff_chunks):

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, (
                        "input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n"
                    ) % (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or gffs[0].strand != last[0].strand or d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.method == "renumber-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "unset-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "renumber-transcripts":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "transcripts2genes":

        transcripts = set()
        genes = set()
        ignore_strand = options.ignore_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if ignore_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes)))

    elif options.method in ("rename-genes", "rename-transcripts"):

        map_old2new = IOTools.readMap(IOTools.openFile(options.filename_filter, "r"))

        if options.method == "rename-transcripts":
            is_gene_id = False
        elif options.method == "rename-genes":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" % gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.method == "filter":

        keep_genes = set()
        if options.filter_method == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

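            # sweep over genes sorted by (contig, start): within each
            # run of overlapping genes keep only the longest one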
            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter_method in ("longest-transcript", "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                # sort on length only; comparing transcripts on ties
                # would fail
                r.sort(key=lambda x: x[0])
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                """select a representative transcript.

                The representative transcript is the one whose exons
                occur most often across all transcripts of the gene.
                """
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"])
                # groupby only groups consecutive identical items, so
                # sort first to count duplicate exons correctly
                all_exons.sort()
                exon_counts = {}
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon"])
                    # add the transcript id to the sort key so that
                    # the selection is stable and reproducible.
                    transcript_counts.append((count, transcript[0].transcript_id, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][-1]

            if options.filter_method == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter_method == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort so that the choice of transcript is
                # reproducible.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter_method in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(IOTools.openFile(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter_method == "gene"
                by_transcript = options.filter_method == "transcript"
                invert = options.invert_filter

                ignore_strand = options.ignore_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if ignore_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter_method == "gene":
                    iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
                elif options.filter_method == "transcript":
                    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator, min_length=options.min_exons_length, feature="exon"
                    )

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--map-tsv-file) or a sample-size."

        elif options.filter_method in ("proteincoding", "lincrna", "processed-pseudogene"):
            # extract entries by transcript/gene biotype.
            # This filter uses a test on the source field (ENSEMBL pre v78)
            # or a regular expression on the attributes (ENSEMBL >= v78).
            tag = {
                "proteincoding": "protein_coding",
                "processed-pseudogene": "processed_pseudogene",
                "lincrna": "lincRNA",
            }[options.filter_method]
            rx = re.compile('"%s"' % tag)
            if not options.invert_filter:
                f = lambda x: x.source == tag or rx.search(x.attributes)
            else:
                f = lambda x: x.source != tag and not rx.search(x.attributes)

            for gff in GTF.iterator(options.stdin):
                ninput += 1
                if f(gff):
                    options.stdout.write(str(gff) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1

    elif options.method == "exons2introns":

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [x for x in output_ranges if x[1] - x[0] > l]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.method == "set-score-to-distance":

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.method == "remove-overlapping":

        index = GTF.readAndIndex(GTF.iterator(IOTools.openFile(options.filename_gff, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.method == "intersect-transcripts":

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif "rename-duplicates" == options.method:
        # note: this will only rename entries with "CDS" in the feature column

        assert options.duplicate_feature in ["gene", "transcript", "both"], (
            "for renaming duplicates, --duplicate-feature must be set to one "
            "of 'gene', 'transcript' or 'both'"
        )

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids) if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if options.duplicate_feature in ["both", "gene"]:
                    if gtf.gene_id in dup_gene:
                        gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                        gtf.setAttribute("gene_id", gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if options.duplicate_feature in ["both", "transcript"]:
                    if gtf.transcript_id in dup_transcript:
                        transcript_dict[gtf.transcript_id] = transcript_dict[gtf.transcript_id] + 1
                        gtf.setAttribute(
                            "transcript_id", gtf.transcript_id + "." + str(transcript_dict[gtf.transcript_id])
                        )

            options.stdout.write("%s\n" % gtf)

    elif options.method in ("merge-exons", "merge-introns", "merge-transcripts"):
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin), strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError("can not merge gene '%s' on multiple strands: %s" % (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError("can not merge gene '%s' on multiple contigs: %s" % (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) // 2 + cds_start
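                # classify UTR fragments relative to the CDS midpoint:
                # lower coordinates are 5' on the forward strand and
                # 3' on the reverse strand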

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            try:
                biotypes = [x["gene_biotype"] for x in gffs]
                biotype = ":".join(set(biotypes))
            except (KeyError, AttributeError):
                biotype = None

            if options.method == "merge-exons":
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.method == "merge-transcripts":

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                if biotype:
                    entry.addAttribute("gene_biotype", biotype)
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.method == "merge-introns":

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    elif options.method == "find-retained-introns":

        for gene in GTF.gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found_any = False
            for intron in find_retained_introns(gene):
                found_any = True
                options.stdout.write("%s\n" % str(intron))
                nfeatures += 1
            if found_any:
                noutput += 1

    elif options.method == "genes-to-unique-chunks":

        for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            for exon in gene_to_blocks(gene):
                options.stdout.write("%s\n" % str(exon))
                nfeatures += 1
            noutput += 1

    else:
        raise ValueError("unknown method '%s'" % options.method)

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
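# A quick aside on the interval arithmetic that the merge-* and
# exons2introns methods above delegate to Intervals.combine (and its
# combineAtDistance variant): it reduces possibly overlapping
# (start, end) pairs to non-overlapping blocks. A minimal standalone
# sketch of that operation, assuming Intervals.combine behaves as
# sketched (the helper name combine_intervals is hypothetical, not
# part of CGAT):


def combine_intervals(intervals):
    """Merge overlapping or touching half-open (start, end) intervals."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # overlaps or touches the previous block: extend it
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


# exons of two overlapping transcripts collapse into two blocks
print(combine_intervals([(10, 20), (15, 30), (40, 50)]))
# -> [(10, 30), (40, 50)]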
Example No. 31
def _alignToProfile( infile, outfile, 
                     min_score = 0 ):
    '''align sequences in *infile* against the multiple alignment (mali).

    Only alignments with a score higher than *min_score* are accepted.

    Output the multiple alignment in fasta format to *outfile* and a
    table in :file:`outfile.info`.
    '''

    mali = Mali.Mali()
    mali.readFromFile( open("../data/mouse.fasta") )
    src_mali = Mali.convertMali2Alignlib( mali )
    
    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns() 
    for x in "ACGT": 
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )


    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile( profile_mali )
    
    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull( alignment_mode,
                                              -5.0,
                                              -0.5 )
    
    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, m )

    outf = open( outfile, "w" )
    outf_log = open( outfile + ".info", "w" )
    outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append( re.sub( "-", "", mali[pid] ) )
        ids.append( pid )

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator( open(infile)):

        E.debug("adding %s" % s.title )
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence( s.sequence )
        rseq = alignlib.makeSequence( rsequence )

        alignator.align( map_seq2profile, seq, profile )
        alignator.align( map_rseq2profile, rseq, profile )

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score: 
            c.skipped += 1
            continue

        r = getParts( m )

        covered = 0
        for mm in r:
            build_mali.add( mm )
            sequences.append( sequence )
            ids.append( s.title )
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write( "\t".join( map(str, (
                        s.title,
                        len(s.sequence),
                        m.getRowFrom(),
                        m.getRowTo(),
                        len(r),
                        covered,
                        "%5.2f" % (100.0 * covered / len(s.sequence) ),
                        m.getScore(),
                        m.getColFrom(),
                        m.getColTo(),
                        mali_covered,
                        "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())
                        ) ) ) + "\n" )

        c.output += 1

    #build_mali.expand( aa )
    result = str(alignlib.MultAlignmentFormatPlain( build_mali, 
                                                    sequences, 
                                                    alignlib.UnalignedStacked ))

    for pid, data in zip(ids, result.split("\n") ):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" % (pid, int(start)+1, int(end), sequence) )


    outf.close()
    outf_log.close()

    E.info( "%s\n" % str(c) )
Example No. 32
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c",
        "--codons",
        dest="codons",
        action="store_true",
        help="make sure that shuffled sequences only contain valid codons.")

    parser.add_option("-a",
                      "--conserve-aminos",
                      dest="conserve_aminos",
                      action="store_true",
                      help="conserve amino acids.")

    parser.add_option(
        "-b",
        "--bias",
        dest="bias",
        type="float",
        help=
        "introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0."
    )

    parser.add_option(
        "-i",
        "--biased-codon-usage",
        dest="filename_biased_codon_usage",
        type="string",
        help="Filename with reference codon usage table for biased codon usage."
    )

    parser.add_option(
        "-u",
        "--bulk-codon-usage",
        dest="filename_bulk_codon_usage",
        type="string",
        help=
        "Filename with reference codon usage table for unbiased codon usage.")

    parser.set_defaults(
        codons=False,
        conserve_aminos=False,
        bias=0.0,
        filename_biased_codon_usage=None,
        filename_bulk_codon_usage=None,
        stop_codons=("TAG", "TAA", "TGA"),
        precision=10000,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    iterator = FastaIterator.FastaIterator(sys.stdin)

    # get map of amino acids to codons
    map_aa2codons = Genomics.GetMapAA2Codons()

    # for codon-based shuffling: build cumulative ranges from the
    # strength of the bias and the reference codon usage. The bias
    # interpolates between completely biased and unbiased, where
    # unbiased means uniform (or bulk) usage.
    if options.filename_biased_codon_usage:

        map_codon2frequency = IOTools.ReadMap(open(
            options.filename_biased_codon_usage, "r"),
                                              map_functions=(str, float),
                                              has_header=True)

        if options.filename_bulk_codon_usage:
            map_codon2frequency_bulk = IOTools.ReadMap(
                open(options.filename_bulk_codon_usage, "r"),
                map_functions=(str, float),
                has_header=True)

        codon_ranges = {}
        for aa in map_aa2codons.keys():
            c = []
            x = 0
            for codon in map_aa2codons[aa]:

                if options.filename_bulk_codon_usage:
                    u = map_codon2frequency_bulk[codon]
                else:
                    # uniform usage
                    u = 1.0 / len(map_aa2codons[aa])

                g = map_codon2frequency[codon]
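                # interpolate linearly between the biased usage g
                # (bias=1.0) and the unbiased/bulk usage u (bias=0.0)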
                f = g + (u - g) * (1.0 - options.bias)
                x += f * options.precision
                c.append(x)
            codon_ranges[aa] = c

    while 1:
        cur_record = next(iterator, None)

        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if options.conserve_aminos:
            n = []
            for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                aa = Genomics.MapCodon2AA(codon)
                if aa not in map_aa2codons:
                    continue
                if options.bias or options.filename_biased_codon_usage:
                    # get random number from 0 to precision
                    v = random.randint(0, options.precision)
                    # find the corresponding interval (use a separate
                    # name so that the sequence length l is not
                    # shadowed):
                    ncodons = len(map_aa2codons[aa])
                    x = 0
                    while x < ncodons - 1:
                        if v < codon_ranges[aa][x]:
                            break
                        x += 1
                else:
                    x = random.randint(0, len(map_aa2codons[aa]) - 1)
                n.append(map_aa2codons[aa][x])
            sequence = "".join(n)
        else:
            sequence = list(sequence)
            if options.codons:
                # reshuffle until the sequence contains no stop codon;
                # the for/else only exits the while loop if no stop
                # codon was found
                while 1:
                    random.shuffle(sequence)
                    for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                        if codon in options.stop_codons:
                            break
                    else:
                        break
            else:
                random.shuffle(sequence)
            sequence = "".join(sequence)
        options.stdout.write(">%s\n%s\n" %
                             (cur_record.title, "".join(sequence)))

    E.Stop()
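# The cumulative codon_ranges built above implement inverse-CDF
# sampling over interpolated codon frequencies, f = g + (u - g) *
# (1 - bias): bias=1.0 reproduces the biased table and bias=0.0 falls
# back to uniform (or bulk) usage. A self-contained sketch with
# illustrative, made-up frequencies (sample_codon is hypothetical):

import random


def sample_codon(codons, biased_freqs, bias, precision=10000):
    """Draw a codon from frequencies interpolated by *bias*."""
    cumulative, x = [], 0.0
    for codon in codons:
        u = 1.0 / len(codons)          # unbiased (uniform) usage
        g = biased_freqs[codon]        # fully biased usage
        x += (g + (u - g) * (1.0 - bias)) * precision
        cumulative.append(x)
    v = random.uniform(0, cumulative[-1])
    for codon, limit in zip(codons, cumulative):
        if v < limit:
            return codon
    return codons[-1]


leucine = ["CTA", "CTC", "CTG", "CTT"]
usage = {"CTA": 0.1, "CTC": 0.1, "CTG": 0.7, "CTT": 0.1}
print(sample_codon(leucine, usage, bias=1.0))   # mostly CTG
print(sample_codon(leucine, usage, bias=0.0))   # roughly uniform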
Example No. 33
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # the iterator yields groups (lists) of features; in the case of
    # GTF input, the features of a group share the same transcript ID.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            # skip regardless of the log level
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # prepend to the first segment (the 5' end)
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
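# The negative-strand branch above maps forward-strand intervals to
# their reverse-strand equivalents: on a contig of length lcontig, a
# half-open interval (start, end) becomes (lcontig - end,
# lcontig - start), and the interval order reverses. A standalone
# sketch (flip_intervals is an illustrative name):


def flip_intervals(intervals, lcontig):
    """Map forward-strand half-open intervals to the reverse strand."""
    return [(lcontig - end, lcontig - start)
            for start, end in reversed(intervals)]


print(flip_intervals([(10, 20), (30, 40)], 100))
# -> [(60, 70), (80, 90)]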
Example No. 34
    def getSequence(self, contig, strand="+", start=0, end=0, converter=None, as_array=False):
        """get a genomic fragment.

        A genomic fragment is identified by the coordinates
        contig, strand, start, end.

        The converter function supplied translates these coordinates
        into 0-based coordinates. By default, start and end are assumed
        to be pythonic coordinates given on the forward/reverse strand.

        If as_array is set to True, return the AString object. This might
        be beneficial for large sequence chunks. If as_array is set to False,
        return a python string.
        """

        contig = self.getToken(contig)

        data = self.mIndex[contig]
        # dummy is
        # -> pos_seq for seekable streams
        # -> block_size for unseekable streams
        try:
            pos_id, dummy, lsequence = struct.unpack("QQi", data)
        except (struct.error, TypeError):
            pos_id, dummy, lsequence, points = data

        pos_seq = dummy
        block_size = dummy

        if end == 0:
            end = lsequence

        if end > lsequence:
            raise ValueError("3' coordinate on %s out of bounds: %i > %i" % (contig, end, lsequence))

        if start < 0:
            raise ValueError("5' coordinate on %s out of bounds: %i < 0" % (contig, start))

        if converter:
            first_pos, last_pos = converter(start, end, str(strand) in ("+", "1"), lsequence)
        elif self.mConverter:
            first_pos, last_pos = self.mConverter(start, end, str(strand) in ("+", "1"), lsequence)
        else:
            first_pos, last_pos = start, end
            if str(strand) in ("-", "0", "-1"):
                first_pos, last_pos = lsequence - last_pos, lsequence - first_pos

        if first_pos == last_pos:
            return ""

        assert first_pos < last_pos, "first position %i is larger than last position %i " % (first_pos, last_pos)

        p = AString()

        if self.mNoSeek:
            # read directly from position
            p.fromstring(self.mDatabaseFile.read(block_size, data[3], first_pos, last_pos))
        else:
            first_pos += pos_seq
            last_pos += pos_seq

            self.mDatabaseFile.seek(first_pos)
            p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))

        if str(strand) in ("-", "0", "-1"):
            p = AString(Genomics.complement(str(p)))

        if self.mTranslator:
            return self.mTranslator.translate(p)
        elif as_array:
            return p
        else:
            if IS_PY3:
                return p.tostring().decode("ascii")
            else:
                return p.tostring()
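# As the docstring above notes, a converter receives (start, end,
# is_positive_strand, lsequence) and must return 0-based, half-open
# positions on the forward strand. A hedged example converter for
# 1-based, inclusive input coordinates (the function name is
# illustrative only, not part of the class):


def one_based_closed_converter(start, end, is_positive, lsequence):
    first, last = start - 1, end         # to 0-based, half-open
    if not is_positive:
        # mirror onto the forward strand
        first, last = lsequence - last, lsequence - first
    return first, last


print(one_based_closed_converter(1, 10, True, 100))   # (0, 10)
print(one_based_closed_converter(1, 10, False, 100))  # (90, 100)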
Example No. 35
    def _buildAllele(allele_id,
                     transcript,
                     exons,
                     introns,
                     offsets,
                     virtual_coordinates=False,
                     reference_exons=None):
        def _getOffset(pos, offsets):
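            '''return the indel offset in effect at position *pos*;
            *offsets* is assumed to be a sorted list of
            (position, offset) pairs.'''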
            x = 0
            while x < len(offsets) and offsets[x][0] <= pos:
                x += 1
            x -= 1
            if x >= 0:
                return offsets[x][1]
            else:
                return 0

        def _sumIndels(ss):
            '''sum indels within ss'''
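            # each element of ss holds the bases emitted at one
            # reference position: "" = deletion (-1), a single base =
            # match (0), more than one base = insertion (+n)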
            c = 0
            for s in ss:
                c += len(s) - 1
            return c

        def _getEndOffsets(ss):
            '''get the offset at exons due to deletions at
            start/end of exon.'''
            l = len(ss)
            x = 0
            while x < l and ss[x] == "":
                x += 1
            start_offset = x

            x = l - 1
            while x >= 0 and ss[x] == "":
                x -= 1
            if x >= 0:
                return start_offset, (l - 1) - x
            else:
                return start_offset, 0

        def _addCds2Reference(map_cds2reference, cds_start, cds_seq,
                              reference_start):
            '''add cds to reference'''
            c, r = cds_start, reference_start
            for x in cds_seq:
                l = len(x)
                if l == 0:
                    r += 1
                else:
                    map_cds2reference.addPair(c, r)
                    c += l
                    r += 1

        # counts
        is_splice_truncated = False
        is_nmd_knockout = False
        is_stop_truncated = False
        nuncorrected_frameshifts = 0
        ncorrected_frameshifts = 0
        nframeshifts = 0
        nsplice_noncanonical = 0
        reference_first_stop_start = -1
        reference_first_stop_end = -1

        # map between the new cds sequence and the reference
        # sequence
        map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

        ###################################################
        # process first exon
        exon = transcript[0]
        transcript_id = exon.transcript_id

        # collect offset for exon.start
        genome_start = exon.start
        genome_start += _getOffset(genome_start, offsets)
        lcds, cds = 0, []
        cds_starts = [0]

        # still need to deal with deletions of first base:
        exon_starts = [genome_start]
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        exon_seq = "".join(exon_sequence)

        cds.append(exon_seq)
        _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)
        lcds = len(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        # add first exon to genome position
        genome_pos = genome_start + len(exon_seq)
        last_end = exon.end

        # correct for deletions at start/end of exon
        start_offset, end_offset = _getEndOffsets(exon_sequence)

        # length of original transcript
        loriginal = sum([x.end - x.start for x in transcript])

        if E.global_options.loglevel >= 8:
            print("%i: exon_indels (%i-%i):" %
                  (allele_id, exon.start, exon.end))
            for x, c in enumerate(exons[exon_key]):
                if len(c) != 1:
                    print(x + exon.start, ":%s:" % c)
            print()
            print(exons[exon_key])
            print("genome_pos=", genome_pos,
                  ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                  ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                  exon.end - exon.start,
                  ", offsets=%i,%i," % (start_offset, end_offset),
                  ", offset at start=", getOffset(exon.start,
                                                  offsets), ", offset at end=",
                  getOffset(exon.end, offsets))

        for exon in transcript[1:]:

            last_exon_sequence = exon_sequence
            last_start_offset, last_end_offset = start_offset, end_offset

            # get the next intron/exon parameters
            exon_key = (exon.start, exon.end)
            exon_sequence = exons[exon_key]
            start_offset, end_offset = _getEndOffsets(exon_sequence)
            intron_key = (last_end, exon.start)

            if last_end == exon.start:
                # catch empty introns
                intron_sequence = []
                intron_key = None
            else:
                intron_sequence = introns[intron_key]

            intron_seq = "".join(intron_sequence)

            ###################################################
            ###################################################
            ###################################################
            # add preceding intron
            new_exon = True

            if len(intron_seq) > frameshiftsize:

                intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                    intron_seq)
                if intron_name == "unknown":
                    if intron_seq[:2].islower() and intron_seq[-2:].islower():
                        E.debug(
                            "%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s"
                            % (transcript_id, intron_name, intron_seq5,
                               intron_seq3))
                        nsplice_noncanonical += 1
                    else:
                        is_splice_truncated = True
                        E.debug(
                            "%s: transcript has splice truncated allele: %s: %s:%s"
                            % (transcript_id, intron_name, intron_seq5,
                               intron_seq3))
                        break
                # start a new exon
                cds_starts.append(lcds)

            else:
                # treat as frameshifting intron
                #
                # frame-shifting introns are checked if they are
                # fixed by indels either in the intron itself or
                # the terminal exon sequence. To this end, the effective
                # size of the intron is computed:
                # effective size of intron =
                # indels at terminal x bases at previous exon
                # + size of intron
                # + indels at terminal x bases at next exon
                effective_intron_size = len(intron_seq)
                previous_indels = _sumIndels(
                    last_exon_sequence[max(0, -frameshiftsize):])
                next_indels = _sumIndels(exon_sequence[:frameshiftsize])
                effective_intron_size += previous_indels + next_indels

                if previous_indels + next_indels == 0 and len(
                        intron_seq) % 3 == 0:
                    has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                         is_seleno=is_seleno)
                else:
                    has_stop = False

                if effective_intron_size % 3 == 0 and not has_stop:
                    E.debug(
                        "%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)"
                        % (
                            transcript_id,
                            last_end,
                            exon.start,
                            effective_intron_size,
                            len(intron_seq),
                            previous_indels,
                            next_indels,
                        ))

                    # add to previous exon
                    cds.append(intron_seq)
                    lcds += len(intron_seq)
                    ncorrected_frameshifts += 1
                    new_exon = False
                else:
                    E.debug(
                        "%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)"
                        % (transcript_id, last_end, exon.start,
                           effective_intron_size, len(intron_seq),
                           previous_indels, next_indels, has_stop))

                    nuncorrected_frameshifts += 1
                    # start a new exon
                    cds_starts.append(lcds)

            if E.global_options.loglevel >= 8:
                print("%i: intron_indels (%i-%i):" %
                      (allele_id, last_end, exon.start))
                if intron_key:
                    for x, c in enumerate(introns[intron_key]):
                        if len(c) != 1:
                            print(x + last_end, ":%s:" % c)
                    print()
                    print(introns[intron_key])
                    print(
                        "genome_pos=", genome_pos, ",intron=%i-%i" %
                        (genome_pos, genome_pos + len(intron_seq)),
                        ", len(intron_seq)=", len(intron_seq),
                        ", len(intron)=",
                        exon.start - last_end, ", offset at start=",
                        _getOffset(last_end, offsets), ", offset at end=",
                        _getOffset(exon.start, offsets))
                else:
                    print("empty intron")

            genome_pos += len(intron_seq)

            # assertion - check if genomic coordinate of intron is consistent
            # with offset
            test_offset = _getOffset(exon.start, offsets)
            is_offset = genome_pos - exon.start
            assert is_offset == test_offset, "intron offset difference: %i != %i" % (
                is_offset, test_offset)

            ###################################################
            ###################################################
            ###################################################
            # add the exon
            exon_seq = "".join(exon_sequence)
            cds.append(exon_seq)

            if len(exon_seq) != exon.end - exon.start:
                nframeshifts += 1

            if new_exon:
                # assumption: reference (genomic) coordinates are
                # wanted unless *virtual_coordinates* is set
                if not virtual_coordinates:
                    exon_starts.append(exon.start + start_offset)
                else:
                    exon_starts.append(genome_pos)

            _addCds2Reference(map_cds2reference, lcds, exon_sequence,
                              exon.start)

            lcds += len(exon_seq)
            last_end = exon.end

            if E.global_options.loglevel >= 8:
                print("%i: exon_indels (%i-%i):" %
                      (allele_id, exon.start, exon.end))
                for x, c in enumerate(exons[exon_key]):
                    if len(c) != 1:
                        print(x + exon.start, ":%s:" % c)
                print()
                print(exons[exon_key])
                print("genome_pos=", genome_pos,
                      ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                      ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                      exon.end - exon.start, ", offsets=%i,%i," %
                      (start_offset, end_offset), ", offset at start=",
                      _getOffset(exon.start, offsets), ", offset at end=",
                      _getOffset(exon.end, offsets))

            genome_pos += len(exon_seq)

            test_offset = _getOffset(exon.end, offsets)
            is_offset = genome_pos - exon.end
            assert is_offset == test_offset, "exon offset difference: %i != %i" % (
                is_offset, test_offset)

        cds = "".join(cds)
        assert lcds == len(cds)

        # fix incomplete codons at the end of the sequence
        if lcds % 3 != 0:
            offset = lcds % 3
            cds = cds[:-offset]

        # add frame correction for transcripts that do not start at frame=0
        start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

        # n are ignored (? in sequence to deal with genes like Muc2)
        peptide = Genomics.translate("n" * start_frame + cds,
                                     is_seleno=is_seleno,
                                     prefer_lowercase=False,
                                     ignore_n=True)

        # find the first stop codon
        if start_frame != 0:
            # ignore first, potentially incomplete base
            pep_first_stop = peptide.upper().find("X", 1)
        else:
            pep_first_stop = peptide.upper().find("X")

        E.debug("%s: translated peptide = %s, first stop at %i" %
                (transcript_id, peptide, pep_first_stop))

        peptide = peptide.replace("?", "x")

        if E.global_options.loglevel >= 8:
            E.debug("peptide=%s" % peptide)
            E.debug("cds=%s" % cds)

        E.debug("%s: start_frame=%i, first stop at %i/%i" %
                (transcript_id, start_frame, pep_first_stop, len(peptide)))

        lpeptide, lcds = len(peptide), len(cds)

        # check for non-sense mediated decay
        if pep_first_stop != -1:
            cds_first_stop = pep_first_stop * 3 - start_frame
            if cds_first_stop < cds_starts[-1]:
                if ncorrected_frameshifts or nuncorrected_frameshifts:
                    E.warn(
                        "nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected"
                        % (transcript_id, ncorrected_frameshifts,
                           nuncorrected_frameshifts))
                is_nmd_knockout = True
                cds = peptide = ""
                lpeptide, lcds = 0, 0
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            elif pep_first_stop < len(peptide) - 1:
                is_stop_truncated = True
                cds = cds[:cds_first_stop]
                peptide = peptide[:pep_first_stop]
                lpeptide, lcds = len(peptide), len(cds)
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            else:
                E.warn(
                    "first stop at %i(cds=%i) ignored: last exon start at %i" %
                    (pep_first_stop, cds_first_stop, cds_starts[-1]))

        else:
            # -1 for no stop codon found
            pep_first_stop = -1
            cds_first_stop = -1
            lpeptide, lcds = len(peptide), len(cds)

        if peptide is None and nframeshifts == 0:
            E.warn(
                "transcript %s is knockout, though there are no indels - must be nonsense mutation"
                % (transcript_id))

        # build frames
        frames = [start_frame]
        start = start_frame
        l = 0
        for end in cds_starts[1:]:
            l += end - start
            frames.append((3 - l % 3) % 3)
            start = end

        return Allele._make((
            cds,
            peptide,
            len(cds_starts),
            cds_starts,
            exon_starts,
            frames,
            is_nmd_knockout,
            is_splice_truncated,
            is_stop_truncated,
            nframeshifts,
            ncorrected_frameshifts,
            nuncorrected_frameshifts,
            pep_first_stop,
            lpeptide,
            cds_first_stop,
            lcds,
            reference_first_stop_start,
            reference_first_stop_end,
            loriginal,
            nsplice_noncanonical,
        )), map_cds2reference
Ejemplo n.º 36
0
    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))
        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
                             (options.query_prefix,
                              options.output_format_id % id,
                              match.mQueryId, match.mQueryFrom, match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              options.output_format_id % id,
                              match.mSbjctId, match.strand,
                              match.mSbjctFrom, match.mSbjctTo,
                              sbjct_ali))
        id += 1

    E.Stop()
Ejemplo n.º 37
0
    def Align(self, method, anchor=0, loglevel=1):
        """align a pair of sequences.
        get rid of this and use a method class instead in the future
        """

        map_a2b = alignlib_lite.py_makeAlignmentVector()
        s1 = "A" * anchor + self.mSequence1 + "A" * anchor
        s2 = "A" * anchor + self.mSequence2 + "A" * anchor

        self.strand = "+"

        if method == "dialign":
            dialign = WrapperDialign.Dialign(self.mOptionsDialign)
            dialign.Align(s1, s2, map_a2b)
        elif method == "blastz":
            blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
            blastz.Align(s1, s2, map_a2b)
            if blastz.isReverseComplement():
                self.strand = "-"
                self.mSequence2 = Genomics.complement(self.mSequence2)

        elif method == "dialignlgs":
            dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
            dialignlgs.Align(s1, s2, map_a2b)
        elif method == "dba":
            dba = WrapperDBA.DBA()
            dba.Align(s1, s2, map_a2b)
        elif method == "clustal":
            raise NotImplementedError("clustal wrapper needs to be updated")
            # unreachable until the wrapper is updated:
            # clustal = WrapperClustal.Clustal()
            # clustal.Align(s1, s2, map_a2b)
        elif method == "nw":
            seq1 = alignlib_lite.py_makeSequence(s1)
            seq2 = alignlib_lite.py_makeSequence(s2)
            alignator = alignlib_lite.py_makeAlignatorDPFull(alignlib_lite.py_ALIGNMENT_GLOBAL,
                                                             gop=-12.0,
                                                             gep=-2.0)
            alignator.align(map_a2b, seq1, seq2)
        elif method == "sw":
            seq1 = alignlib_lite.py_makeSequence(s1)
            seq2 = alignlib_lite.py_makeSequence(s2)
            alignlib_lite.py_performIterativeAlignment(
                map_a2b, seq1, seq2, alignator_sw, min_score_sw)
        else:
            # use callback function
            method(s1, s2, map_a2b)

        if map_a2b.getLength() == 0:
            raise AlignmentError("empty alignment")

        if anchor:
            map_a2b.removeRowRegion(
                anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
            map_a2b.removeRowRegion(1, anchor)
            map_a2b.removeColRegion(
                anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
            map_a2b.removeColRegion(1, anchor)
            map_a2b.moveAlignment(-anchor, -anchor)

        f = alignlib_lite.py_AlignmentFormatExplicit(map_a2b,
                                                     alignlib_lite.py_makeSequence(
                                                         self.mSequence1),
                                                     alignlib_lite.py_makeSequence(self.mSequence2))

        self.mMethod = method
        self.mAlignment = map_a2b
        self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment
        f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
        self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
        self.mAlignmentFrom1 = map_a2b.getRowFrom()
        self.mAlignmentTo1 = map_a2b.getRowTo()
        self.mAlignmentFrom2 = map_a2b.getColFrom()
        self.mAlignmentTo2 = map_a2b.getColTo()
        self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
        self.mAligned = self.mLength - self.mNumGaps

        self.SetPercentIdentity()
        self.SetBlockSizes()
Ejemplo n.º 38
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply at least two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size + 1):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size + 1):
                # use a distinct name: ``c`` is the E.Counter() above
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal; do not shadow the builtin ``sorted``
            counts = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = counts[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
Ejemplo n.º 39
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-p",
                      "--peptides-fasta-file",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c",
                      "--cds-gtf-file",
                      "--cdnas",
                      dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="filename_map",
        type="string",
        help=
        "filename with map of peptide identifiers to cdna identifiers [%default]."
    )

    parser.add_option("--output-identifier",
                      dest="output_identifier",
                      type="choice",
                      choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f",
                      "--output-format=",
                      dest="output_format",
                      type="choice",
                      choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        filename_peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    use_cds_id = options.output_identifier == "cds"

    for cur_record in iterator:

        ninput += 1

        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = Peptides2Cds.getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            options.stdout.write("\t".join(
                map(str, (identifier,
                          alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                          len(cur_record.sequence),
                          len(cds_sequences[cds_identifier])))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
Ejemplo n.º 40
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query", type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target", type="string",
                      help="fasta filename with target.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "full", "pileup-query", "pileup-target", "gapless"),
                      help="method to use for constructing the alignment [%default].")

    parser.add_option("--forward-query", dest="forward_query", action="store_true",
                      help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix", type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix", type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    (options, args) = E.Start(parser)

    if options.filename_query:
        query = IndexedFasta.IndexedFasta(options.filename_query)

    if options.filename_target:
        target = IndexedFasta.IndexedFasta(options.filename_target)

    if options.method == "full":
        getAlignment = getAlignmentFull

    id = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts), -
                        min(match.mSbjctBlockStarts))
        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
                             (options.query_prefix,
                              options.output_format_id % id,
                              match.mQueryId, match.mQueryFrom, match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              options.output_format_id % id,
                              match.mSbjctId, match.strand,
                              match.mSbjctFrom, match.mSbjctTo,
                              sbjct_ali))
        id += 1

    E.Stop()
Ejemplo n.º 41
0
    def _buildAllele(allele_id,
                     transcript, exons,
                     introns, offsets,
                     reference_coordinates=False,
                     reference_exons=None):

        def _getOffset(pos, offsets):
            x = 0
            while x < len(offsets) and offsets[x][0] <= pos:
                x += 1
            x -= 1
            if x >= 0:
                return offsets[x][1]
            else:
                return 0

        def _sumIndels(ss):
            '''sum indels within ss'''
            c = 0
            for s in ss:
                c += len(s) - 1
            return c

        def _getEndOffsets(ss):
            '''get the offset at exons due to deletions at
            start/end of exon.'''
            l = len(ss)
            x = 0
            while x < l and ss[x] == "":
                x += 1
            start_offset = x

            x = l - 1
            while x >= 0 and ss[x] == "":
                x -= 1
            if x >= 0:
                return start_offset, (l - 1) - x
            else:
                return start_offset, 0

        def _addCds2Reference(map_cds2reference,
                              cds_start,
                              cds_seq,
                              reference_start):
            '''add cds to reference'''
            c, r = cds_start, reference_start
            for x in cds_seq:
                l = len(x)
                if l == 0:
                    r += 1
                else:
                    map_cds2reference.addPair(c, r)
                    c += l
                    r += 1
        # counts
        is_splice_truncated = False
        is_nmd_knockout = False
        is_stop_truncated = False
        nuncorrected_frameshifts = 0
        ncorrected_frameshifts = 0
        nframeshifts = 0
        nsplice_noncanonical = 0
        reference_first_stop_start = -1
        reference_first_stop_end = -1

        # map between the new cds sequence and the reference
        # sequence
        map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

        ###################################################
        # process first exon
        exon = transcript[0]
        transcript_id = exon.transcript_id

        # collect offset for exon.start
        genome_start = exon.start
        genome_start += _getOffset(genome_start, offsets)
        lcds, cds = 0, []
        cds_starts = [0]

        # still need to deal with deletions of first base:
        exon_starts = [genome_start]
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        exon_seq = "".join(exon_sequence)

        cds.append(exon_seq)
        _addCds2Reference(map_cds2reference,
                          lcds,
                          exon_sequence,
                          exon.start)
        lcds = len(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        # add first exon to genome position
        genome_pos = genome_start + len(exon_seq)
        last_end = exon.end

        # correct for deletions at start/end of exon
        start_offset, end_offset = _getEndOffsets(exon_sequence)

        # length of original transcript
        loriginal = sum([x.end - x.start for x in transcript])

        if E.global_options.loglevel >= 8:
            print("%i: exon_indels (%i-%i):" %
                  (allele_id, exon.start, exon.end))
            for x, c in enumerate(exons[exon_key]):
                if len(c) != 1:
                    print(x + exon.start, ":%s:" % c)
            print()
            print(exons[exon_key])
            print("genome_pos=", genome_pos,
                  ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                  ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                  exon.end - exon.start, ", offsets=%i,%i," %
                  (start_offset, end_offset), ", offset at start=",
                  _getOffset(exon.start, offsets), ", offset at end=",
                  _getOffset(exon.end, offsets))

        for exon in transcript[1:]:

            last_exon_sequence = exon_sequence
            last_start_offset, last_end_offset = start_offset, end_offset

            # get the next intron/exon parameters
            exon_key = (exon.start, exon.end)
            exon_sequence = exons[exon_key]
            start_offset, end_offset = _getEndOffsets(exon_sequence)
            intron_key = (last_end, exon.start)

            if last_end == exon.start:
                # catch empty introns
                intron_sequence = []
                intron_key = None
            else:
                intron_sequence = introns[intron_key]

            intron_seq = "".join(intron_sequence)

            ###################################################
            ###################################################
            ###################################################
            # add preceding intron
            new_exon = True

            if len(intron_seq) > frameshiftsize:

                intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                    intron_seq)
                if intron_name == "unknown":
                    if intron_seq[:2].islower() and intron_seq[-2:].islower():
                        E.debug("%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s" %
                                (transcript_id, intron_name, intron_seq5, intron_seq3))
                        nsplice_noncanonical += 1
                    else:
                        is_splice_truncated = True
                        E.debug("%s: transcript has splice truncated allele: %s: %s:%s" %
                                (transcript_id, intron_name, intron_seq5, intron_seq3))
                        break
                # start a new exon
                cds_starts.append(lcds)

            else:
                # treat as frameshifting intron
                #
                # frame-shifting introns are checked if they are
                # fixed by indels either in the intron itself or
                # the terminal exon sequence. To this end, the effective
                # size of the intron is computed:
                # effective size of intron =
                # indels at terminal x bases at previous exon
                # + size of intron
                # + indels at terminal x bases at next exon
                effective_intron_size = len(intron_seq)
                previous_indels = _sumIndels(
                    last_exon_sequence[-frameshiftsize:])
                next_indels = _sumIndels(exon_sequence[:frameshiftsize])
                effective_intron_size += previous_indels + next_indels

                if previous_indels + next_indels == 0 and len(intron_seq) % 3 == 0:
                    has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                         is_seleno=is_seleno)
                else:
                    has_stop = False

                if effective_intron_size % 3 == 0 and not has_stop:
                    E.debug("%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)" %
                            (transcript_id, last_end, exon.start,
                             effective_intron_size,
                             len(intron_seq),
                             previous_indels, next_indels,))

                    # add to previous exon
                    cds.append(intron_seq)
                    lcds += len(intron_seq)
                    ncorrected_frameshifts += 1
                    new_exon = False
                else:
                    E.debug("%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)" %
                            (transcript_id, last_end, exon.start,
                             effective_intron_size,
                             len(intron_seq),
                             previous_indels, next_indels,
                             has_stop))

                    nuncorrected_frameshifts += 1
                    # start a new exon
                    cds_starts.append(lcds)

            if E.global_options.loglevel >= 8:
                print("%i: intron_indels (%i-%i):" %
                      (allele_id, last_end, exon.start))
                if intron_key:
                    for x, c in enumerate(introns[intron_key]):
                        if len(c) != 1:
                            print(x + last_end, ":%s:" % c)
                    print()
                    print(introns[intron_key])
                    print("genome_pos=", genome_pos,
                          ",intron=%i-%i" % (genome_pos,
                                             genome_pos + len(intron_seq)),
                          ", len(intron_seq)=", len(intron_seq),
                          ", len(intron)=", exon.start - last_end,
                          ", offset at start=", _getOffset(last_end, offsets),
                          ", offset at end=", _getOffset(exon.start, offsets))
                else:
                    print("empty intron")

            genome_pos += len(intron_seq)

            # assertion - check if genomic coordinate of intron is consistent
            # with offset
            test_offset = _getOffset(exon.start, offsets)
            is_offset = genome_pos - exon.start
            assert is_offset == test_offset, "intron offset difference: %i != %i" % (
                is_offset, test_offset)

            ###################################################
            ###################################################
            ###################################################
            # add the exon
            exon_seq = "".join(exon_sequence)
            cds.append(exon_seq)

            if len(exon_seq) != exon.end - exon.start:
                nframeshifts += 1

            if new_exon:
                if reference_coordinates:
                    exon_starts.append(exon.start + start_offset)
                else:
                    exon_starts.append(genome_pos)

            _addCds2Reference(map_cds2reference,
                              lcds,
                              exon_sequence,
                              exon.start)

            lcds += len(exon_seq)
            last_end = exon.end

            if E.global_options.loglevel >= 8:
                print("%i: exon_indels (%i-%i):" %
                      (allele_id, exon.start, exon.end))
                for x, c in enumerate(exons[exon_key]):
                    if len(c) != 1:
                        print(x + exon.start, ":%s:" % c)
                print()
                print(exons[exon_key])
                print("genome_pos=", genome_pos,
                      ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                      ", len(exon_seq)=", len(exon_seq), ", len(exon)=",
                      exon.end - exon.start, ", offsets=%i,%i," %
                      (start_offset, end_offset), ", offset at start=",
                      _getOffset(exon.start, offsets), ", offset at end=",
                      _getOffset(exon.end, offsets))

            genome_pos += len(exon_seq)

            test_offset = _getOffset(exon.end, offsets)
            is_offset = genome_pos - exon.end
            assert is_offset == test_offset, "exon offset difference: %i != %i" % (
                is_offset, test_offset)

        cds = "".join(cds)
        assert lcds == len(cds)

        # fix incomplete codons at the end of the sequence
        if lcds % 3 != 0:
            offset = lcds % 3
            cds = cds[:-offset]

        # add frame correction for transcripts that do not start at frame=0
        start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

        # n are ignored (? in sequence to deal with genes like Muc2)
        peptide = Genomics.translate("n" * start_frame + cds,
                                     is_seleno=is_seleno,
                                     prefer_lowercase=False,
                                     ignore_n=True)

        # find the first stop codon
        if start_frame != 0:
            # ignore first, potentially incomplete base
            pep_first_stop = peptide.upper().find("X", 1)
        else:
            pep_first_stop = peptide.upper().find("X")

        E.debug("%s: translated peptide = %s, first stop at %i" %
                (transcript_id, peptide, pep_first_stop))

        peptide = peptide.replace("?", "x")

        if E.global_options.loglevel >= 8:
            E.debug("peptide=%s" % peptide)
            E.debug("cds=%s" % cds)

        E.debug("%s: start_frame=%i, first stop at %i/%i" % (transcript_id,
                                                             start_frame,
                                                             pep_first_stop,
                                                             len(peptide)))

        lpeptide, lcds = len(peptide), len(cds)

        # check for non-sense mediated decay
        if pep_first_stop != -1:
            cds_first_stop = pep_first_stop * 3 - start_frame
            if cds_first_stop < cds_starts[-1]:
                if ncorrected_frameshifts or nuncorrected_frameshifts:
                    E.warn("nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected" %
                           (transcript_id,
                            ncorrected_frameshifts,
                            nuncorrected_frameshifts))
                is_nmd_knockout = True
                cds = peptide = ""
                lpeptide, lcds = 0, 0
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            elif pep_first_stop < len(peptide) - 1:
                is_stop_truncated = True
                cds = cds[:cds_first_stop]
                peptide = peptide[:pep_first_stop]
                lpeptide, lcds = len(peptide), len(cds)
                reference_first_stop_start, reference_first_stop_end = \
                    (map_cds2reference.mapRowToCol(cds_first_stop),
                     map_cds2reference.mapRowToCol(cds_first_stop + 3))
            else:
                E.warn("first stop at %i(cds=%i) ignored: last exon start at %i" %
                       (pep_first_stop,
                        cds_first_stop,
                        cds_starts[-1]))

        else:
            # -1 for no stop codon found
            pep_first_stop = -1
            cds_first_stop = -1
            lpeptide, lcds = len(peptide), len(cds)

        if peptide is None and nframeshifts == 0:
            E.warn(
                "transcript %s is knockout, though there are no indels - must be nonsense mutation" % (transcript_id))

        # build frames
        frames = [start_frame]
        start = start_frame
        l = 0
        for end in cds_starts[1:]:
            l += end - start
            frames.append((3 - l % 3) % 3)
            start = end

        return Allele._make((cds,
                             peptide,
                             len(cds_starts),
                             cds_starts,
                             exon_starts,
                             frames,
                             is_nmd_knockout,
                             is_splice_truncated,
                             is_stop_truncated,
                             nframeshifts,
                             ncorrected_frameshifts,
                             nuncorrected_frameshifts,
                             pep_first_stop,
                             lpeptide,
                             cds_first_stop,
                             lcds,
                             reference_first_stop_start,
                             reference_first_stop_end,
                             loriginal,
                             nsplice_noncanonical,
                             )), map_cds2reference
Ejemplo n.º 42
0
    genome = IndexedFasta.IndexedFasta(options.genome_file)

    assert options.filename_regions is not None, \
        "please supply a gff formatted filename with regions"

    regions = GTF.readAsIntervals(
        GFF.iterator(IOTools.openFile(options.filename_regions, "r")))

    # build pairs for complement
    reverse_splice_pairs = []
    forward_splice_pairs = options.splice_pairs
    left_tokens, right_tokens = {}, {}
    x = 0
    for a, b in forward_splice_pairs:
        assert len(a) == 2, "only two-residue patterns allowed"
        assert len(b) == 2, "only two-residue patterns allowed"

        ca, cb = Genomics.complement(a), Genomics.complement(b)
        reverse_splice_pairs.append((b, a))
        left_tokens[a] = x
        left_tokens[cb] = x + 1
        right_tokens[b] = x
        right_tokens[ca] = x + 1
        x += 2

    search_area = options.search_area
    read_length = options.read_length
    joined = options.joined

    ninput, noutput = 0, 0

    if joined:
        outfile_coordinates = IOTools.openFile(
            options.output_filename_pattern % "coords", "w")
Ejemplo n.º 43
0
        if param_filename_benchmark_synonyms:
            infile = open(param_filename_benchmark_synonyms, "r")
            param_benchmark_synonyms = {}
            for line in infile:
                if line[0] == "#": continue
                value, key = line[:-1].split("\t")
                param_benchmark_synonyms[key] = value
        else:
            param_benchmark_synonyms = {}
    else:
        param_benchmarks = {}
        param_benchmark_synonyms = {}

    # read peptide sequences
    if param_filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(param_filename_peptides, "r"))
    else:
        peptide_sequences = {}

    if param_conserve_memory:
        old_predictions, filename_old_predictions = tempfile.mkstemp()
        os.close(old_predictions)
        old_predictions = PredictionFile.PredictionFile()
        old_predictions.open(filename_old_predictions, "w")
    else:
        # array with final predictions
        old_predictions = []

    if param_loglevel >= 1:
        print("# reading predictions.")
        sys.stdout.flush()
Ejemplo n.º 44
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t", "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes", dest="merge_genes", action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j", "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes", dest="unset_genes", type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene",
                               "gene+transcript",
                               "transcript",
                               "position",
                               "contig+gene",
                               "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u", "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i", "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g", "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G", "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f", "--filter", dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r", "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes", dest="renumber_genes", type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a", "--apply", dest="filename_filter", type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length", type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option("--intron-border",
                      dest="intron_border",
                      type="int",
                      help="number of residues to exclude at intron at either end "
                      "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping", dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates", dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates", dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(
                        ("removing transcript '%s' due to "
                         "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute(
                "protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start = min(x.start for x in exons)
            all_end = max(x.end for x in exons)
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute(
                        "transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript", "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript is the transcript that
                shares the largest number of exons with the other
                transcripts of the gene.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript if x.feature == "exon"])
                exon_counts = {}
                # groupby only groups consecutive items, so sort first
                for key, exons in itertools.groupby(sorted(all_exons)):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript if x.feature == "exon"])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                transcript = _select(gene)
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--apply) or a sample-size."

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    min_length = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges
                        if x[1] - x[0] > min_length]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), max(
                [x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids)
                          if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute('gene_id',
                                     gtf.gene_id + "." +
                                     str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute('transcript_id',
                                     gtf.transcript_id + "." +
                                     str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(
                GTF.iterator(options.stdin),
                strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "cannot merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "cannot merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
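
Both the --merge-genes and --exons2introns branches above lean on Intervals.combine to collapse exon coordinates into non-overlapping ranges. A minimal sketch of the assumed behaviour (merging overlapping or book-ended half-open intervals), together with the intron derivation used above:

def combine(intervals):
    """Merge overlapping or adjacent (start, end) intervals."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # overlap or adjacency: extend the previous interval
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

exons = [(10, 20), (15, 30), (40, 50)]
combined = combine(exons)            # [(10, 30), (40, 50)]
introns = [(combined[i][1], combined[i + 1][0])
           for i in range(len(combined) - 1)]
print(introns)                       # [(30, 40)]
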
Example #45
0
    def loadSequence(self, sequence):
        """load sequence properties from a sequence."""

        SequenceProperties.loadSequence(self, sequence)

        # uppercase all letters
        sequence = sequence.upper()

        self.mNStopCodons = 0

        # setup counting arrays
        # nucleotide counts for each position (this is not the sum of the
        # counts per degenerate site, as a codon might contain ambiguity
        # codes, e.g. GNN).
        self.mCounts = [{
            'A': 0,
            'C': 0,
            'G': 0,
            'T': 0,
            'X': 0,
            'N': 0
        }, {
            'A': 0,
            'C': 0,
            'G': 0,
            'T': 0,
            'X': 0,
            'N': 0
        }, {
            'A': 0,
            'C': 0,
            'G': 0,
            'T': 0,
            'X': 0,
            'N': 0
        }]

        # nucleotide counts for each position per degeneracy
        self.mCountsDegeneracy = []

        for x in (0, 1, 2):
            xx = []
            for y in range(5):
                yy = {}
                for z in Bio.Alphabet.IUPAC.extended_dna.letters:
                    yy[z] = 0
                xx.append(yy)
            self.mCountsDegeneracy.append(xx)

        for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:

            for x in (0, 1, 2):
                self.mCounts[x][codon[x]] += 1

            if Genomics.IsStopCodon(codon):
                self.mNStopCodons += 1
                continue

            try:
                aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
                degrees = (deg1, deg2, deg3)
                for x in range(len(degrees)):
                    self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1

            except KeyError:
                pass
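
The codon loop above tallies nucleotides per codon position before consulting Genomics.GetDegeneracy (assumed to return the encoded amino acid plus one degeneracy class per position). A small self-contained sketch of just the per-position counting:

import collections

def count_codon_positions(sequence):
    """Count nucleotides at each of the three codon positions."""
    counts = [collections.Counter() for _ in range(3)]
    sequence = sequence.upper()
    # iterate over complete codons only
    for i in range(0, len(sequence) - len(sequence) % 3, 3):
        codon = sequence[i:i + 3]
        for pos in range(3):
            counts[pos][codon[pos]] += 1
    return counts

# ATG GCC TAA -> first positions A, G, T; second positions T, C, A; ...
print(count_codon_positions("ATGGCCTAA"))
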
Example #46
0
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--is-cds", dest="is_cds", action="store_true",
                      help="input are cds (nucleotide) sequences [%default]")

    parser.set_defaults(is_cds=False)

    (options, args) = E.Start(parser, argv=argv)

    options.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(options.stdin):
        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(codon[:x] + na +
                                                 codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                options.stdout.write("%s\n" % "\t".join((
                    "%010i" % snpid,
                    identifier,
                    str(pos + 1),
                    ref,
                    variant,
                    "%i" % w[variant],
                    "%6.4f" % (w[variant] / t),
                )))

    E.Stop()
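
For a single codon, the weight table above counts how many single-base changes yield each alternative amino acid. A sketch of that computation, using Biopython's Seq.translate as a stand-in for Genomics.translate (an assumption for illustration; the real helper may differ in details such as stop-codon handling):

import collections
from Bio.Seq import Seq

def snp_weights(codon):
    """Count, per alternative amino acid, the single-base
    changes of *codon* that produce it."""
    counts = collections.defaultdict(int)
    for x in range(3):
        for na in "ACGT":
            if na == codon[x]:
                continue
            # Seq.translate used as a stand-in for Genomics.translate
            taa = str(Seq(codon[:x] + na + codon[x + 1:]).translate())
            counts[taa] += 1
    return counts

# the nine single-base neighbours of ATG:
# {'L': 2, 'V': 1, 'K': 1, 'T': 1, 'R': 1, 'I': 3}
print(dict(snp_weights("ATG")))
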
Example #47
0
def AlignCodonBased(seq_wobble,
                    seq_cds,
                    seq_peptide,
                    map_p2c,
                    options,
                    diag_width=2,
                    max_advance=2):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Due to alignlib this is all in one-based coordinates.
    Takes care of frameshifts.
    """

    map_p2c.clear()

    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y +
                                                       1) + seq_cds.asChar(y +
                                                                           2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr,
                 seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack a few codons and re-align locally, so that double
            # frameshifts spanning two codons (two X's and six W's in the
            # wobble sequence) can be resolved.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0,
                          map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c,
                            options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(
                         alignlib_lite.py_AlignmentFormatExplicit(
                             tmp_map_p2c, wobble_fragment, cds_fragment))))

                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)
            ngap = 0
            last_x, last_y = None, None
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                               xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat the special case of double frameshifts. They might
                # cause a peptide/wobble residue to be eliminated and thus
                # the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (last_x, seq_wobble.asChar(last_x), last_y,
                               seq_cds.asChar(last_y), xr,
                               seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            s = 0

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
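
The back-translation matrix from alignlib_lite decides whether a wobble position is compatible with a CDS nucleotide. A minimal sketch of the idea it is assumed to encode: an IUPAC ambiguity code at a wobble position matches any nucleotide it stands for.

# mapping from IUPAC ambiguity codes to the nucleotides they denote
IUPAC = {
    "A": "A", "C": "C", "G": "G", "T": "T",
    "R": "AG", "Y": "CT", "S": "CG", "W": "AT",
    "K": "GT", "M": "AC", "B": "CGT", "D": "AGT",
    "H": "ACT", "V": "ACG", "N": "ACGT",
}

def wobble_matches(wobble_base, cds_base):
    """True if the CDS nucleotide is compatible with the wobble code."""
    return cds_base in IUPAC.get(wobble_base, "")

assert wobble_matches("N", "G")
assert wobble_matches("R", "A") and not wobble_matches("R", "C")
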
Example #48
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] )

    parser.add_option("-i", "--iterations", dest="iterations", type="int",
                      help="number of iterations for sampling [default=%default]."  )

    parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float",
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("--without-combine", dest="combine", action = "store_false",
                      help="combine overlapping motifs [default=%default]."  )

    parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice",
                      choices = ("per-sequence", "all", "xall"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-m", "--motif", dest="motif", type="choice",
                      choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"),
                      help="qvalue threshold [default=%default]."  )

    parser.add_option("-a", "--arrangements", dest="arrangements", type="string",
                      help ="',' separated list of repeat arrangements [default=%default]")

    parser.add_option("-x", "--mask", dest="mask", type="choice",
                      choices=("dust","repeatmasker"),
                      help ="mask sequences before scanning [default=%default]")

    parser.add_option("--output-stats", dest="output_stats", action = "store_true",
                      help="output stats [default=%default]."  )

    parser.add_option("--add-sequence", dest="add_sequence", action = "store_true",
                      help="add sequence information [default=%default]."  )

    parser.set_defaults(
        iterations = 100,
        qvalue_threshold = 0.05,
        motif = "rxrvdr",
        fdr_control = "all",
        combine = True,
        arrangements = None,
        mask = None,
        output_stats = False,
        add_sequence = False,
        )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.arrangements is None:
        options.arrangements = \
            ["DR%s" % x for x in range(0, 15)] + \
            ["ER%s" % x for x in range(0, 15)]
    else:
        options.arrangements = options.arrangements.split(",")

    options.stdout.write("%s" % "\t".join(Nubiscan.NubiscanMatch._fields))
    if options.add_sequence:
        options.stdout.write("\tsequence")
    options.stdout.write("\n")

    if options.motif == "nr":
        sense_matrix = NR
    elif options.motif == "rxrvdr":
        sense_matrix = RXRVDR
    elif options.motif == "rxrvdr1":
        sense_matrix = RXRVDR1
    elif options.motif == "rxrvdr2":
        sense_matrix = RXRVDR2
    else:
        raise ValueError("unknown matrix %s" % options.motif)

    if options.fdr_control == "all":

        seqs = list(FastaIterator.iterate(options.stdin))

        if options.mask:
            masked_seqs = maskSequences(
                [x.sequence for x in seqs], options.mask)
        else:
            masked_seqs = [x.sequence for x in seqs]

        ninput = len(seqs)
        map_id2title = dict(
            enumerate([re.sub(r"\s.*", "", x.title) for x in seqs]))
        matcher = Nubiscan.MatcherRandomisationSequences(
            sense_matrix, samples=options.iterations)

        results = matcher.run(masked_seqs,
                              options.arrangements,
                              qvalue_threshold=options.qvalue_threshold)

        if options.combine:
            results = Nubiscan.combineMotifs(results)
        
        for r in results:

            if r.alternatives:
                alternatives = ",".join(
                    [x.arrangement for x in r.alternatives])
            else:
                alternatives = ""

            options.stdout.write("\t".join((
                map_id2title[r.id],
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue,
                alternatives)))

            if options.add_sequence:
                s = masked_seqs[int(r.id)][r.start:r.end]
                if r.strand == "-":
                    s = Genomics.complement(s)
                s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper()
                options.stdout.write("\t%s" % s)

            options.stdout.write("\n")
            noutput += 1

        # output stats
        if options.output_stats:
            outfile = E.openOutputFile("fdr")
            outfile.write("bin\thist\tnobserved\n")
            for bin, hist, nobs in zip(
                    matcher.bin_edges, matcher.hist, matcher.nobservations):
                outfile.write("%f\t%f\t%f\n" % (bin, hist, nobs))
            outfile.close()


    elif options.fdr_control == "xall":

        matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix,
                                                         samples = options.iterations )
    

        # collect all results
        matches = []
        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            mm = matcher.run( seq.sequence,
                              options.arrangements,
                              qvalue_threshold = None )
            for m in mm:
                matches.append( m._replace( sequence = seq.title ) )

        # estimate qvalues for all matches across all sequences
        pvalues = [ x.pvalue for x in matches ]
        fdr = Stats.doFDR( pvalues )
        qvalues = fdr.mQValues
        results = []
        for m, qvalue in zip(matches, qvalues):
            if qvalue > options.qvalue_threshold: continue
            results.append( m._replace( qvalue = qvalue ) )

        if options.combine:            
            results =  Nubiscan.combineMotifs( results )

        # output
        for r in results:
            options.stdout.write("\t".join((
                r.id,
                "%i" % r.start,
                "%i" % r.end,
                r.strand,
                r.arrangement,
                "%6.4f" % r.score,
                "%6.4f" % r.zscore,
                "%6.4e" % r.pvalue,
                "%6.4e" % r.qvalue)) + "\n")

            noutput += 1

    elif options.fdr_control == "per-sequence":
        matcher = Nubiscan.MatcherRandomisationSequence(
            sense_matrix, samples=options.iterations)

        for seq in FastaIterator.iterate(options.stdin):
            ninput += 1
            result = matcher.run(seq.sequence,
                                 options.arrangements,
                                 qvalue_threshold=options.qvalue_threshold)

            if options.combine:
                result = Nubiscan.combineMotifs(result)

            t = re.sub(" .*", "", seq.title)
            for r in result:
                options.stdout.write("\t".join((
                    t,
                    "%i" % r.start,
                    "%i" % r.end,
                    r.strand,
                    r.arrangement,
                    "%6.4f" % r.score,
                    "%6.4f" % r.zscore,
                    "%f" % r.pvalue,
                    "%f" % r.qvalue)) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
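
Stats.doFDR above turns the pooled p-values into q-values. A minimal Benjamini-Hochberg sketch of that step (assuming doFDR implements standard BH; the actual routine may differ in details such as pi0 estimation):

def bh_qvalues(pvalues):
    """Benjamini-Hochberg q-values for a list of p-values."""
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i])
    qvalues = [0.0] * n
    prev = 1.0
    # walk from the largest p-value down, enforcing monotonicity
    for rank, i in reversed(list(enumerate(order, start=1))):
        prev = min(prev, pvalues[i] * n / rank)
        qvalues[i] = prev
    return qvalues

print(bh_qvalues([0.01, 0.04, 0.03, 0.20]))
# -> [0.04, 0.0533..., 0.0533..., 0.2]
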
Example #49
0
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use the same coordinate
    system (and strand) as the transcript.

    fixes 1-ness of variants
    '''

    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [
                    x + "-" * (lvariant - len(x)) for x in variantseqs
                ]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper()) for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(
            ExtendedVariant._make((start, end, reference.upper(), action,
                                   has_wildtype, variantseqs)))

    return new_variants
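
The strand flip at the end mirrors half-open intervals through the contig length and complements the allele sequences (Genomics.complement is assumed to return the reverse complement, as its use elsewhere suggests). A toy version:

# complement table; "-" padding characters are preserved
COMP = {"A": "T", "C": "G", "G": "C", "T": "A", "-": "-"}

def flip(start, end, seq, lcontig):
    """Map a half-open interval and its sequence to the reverse strand."""
    flipped = "".join(COMP[c] for c in reversed(seq))
    return lcontig - end, lcontig - start, flipped

print(flip(2, 4, "AG", 7))   # -> (3, 5, 'CT')
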
Example #50
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
                      help="tracks (tablenames) to use in sqlite database [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress", action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
                      help="regular expression pattern for track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        reference_name="reference",
        pattern_track="(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    # GTF and plain GFF input are read with the same iterator here
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    statement = '''SELECT pos, reference, genotype 
                   FROM %(track)s
                   WHERE contig = '%(contig)s' AND 
                   pos BETWEEN %(extended_start)s and %(extended_end)s
                '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0] for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." % options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % locals())
            all_variants.append(map(Variants.Variant._make, cc.fetchall()))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" % (contig,
                                                                   region_start, region_end,
                                                                   sum([
                                                                       len(x) for x in all_variants]),
                                                                   len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per mali
        # column
        colcounts = numpy.ones(lseq)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq,
                                      variants,
                                      reference_start=region_start)

            alleles[track] = a
            for allele in a:
                for pos, c in enumerate(allele):
                    colcounts[pos] = max(colcounts[pos], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

        def __addGaps(sequence, colcounts):
            '''output gapped sequence.'''
            r = []
            for x, c in enumerate(sequence):
                r.append(c + "-" * (colcounts[x] - len(c)))
            return "".join(r)

        name = ".".join((options.reference, contig))
        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start

        size = lseq
        seq = __addGaps(reference_seq, colcounts)
        outfile.write(maf_format % (locals()))

        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = __addGaps(allele, colcounts)
                if not is_positive:
                    seq = Genomics.complement(seq)
                size = len(seq) - seq.count("-")
                name = ".".join((track + "-%i" % aid, contig))
                outfile.write(maf_format % (locals()))

        outfile.write("\n")

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()
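
__addGaps pads every alignment column to the widest allele at that position, so that insertions stay aligned across tracks in the MAF output. A toy version of the padding:

def add_gaps(sequence, colcounts):
    """Pad each per-position string to its column width with '-'."""
    return "".join(c + "-" * (w - len(c))
                   for c, w in zip(sequence, colcounts))

reference = ["A", "C", "G"]
allele = ["A", "CTT", "G"]          # insertion after the second base
widths = [max(len(a), len(b)) for a, b in zip(reference, allele)]
print(add_gaps(reference, widths))  # AC--G
print(add_gaps(allele, widths))     # ACTTG
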
Example #52
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "add-flank",
                          "add-upstream-flank",
                          "add-downstream-flank",
                          "crop",
                          "crop-unique",
                          "complement-groups",
                          "combine-groups",
                          "filter-range",
                          "join-features",
                          "merge-features",
                          "sanitize",
                          "to-forward-coordinates",
                          "to-forward-strand"),
                      help="method to apply [%default]")

    parser.add_option(
        "--ignore-strand", dest="ignore_strand",
        help="ignore strand information.", action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option(
        "-c", "--contigs-tsv-file", dest="input_filename_contigs",
        type="string",
        help="filename with contig lengths.")

    parser.add_option(
        "--agp-file", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome.")

    parser.add_option(
        "--crop-gff-file", dest="filename_crop_gff", type="string",
        help="GFF/GTF file to crop against.")

    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="""gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].""")

    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_option(
        "--sanitize-method", dest="sanitize_method", type="choice",
        choices=("ucsc", "ensembl", "genome"),
        help="method to use for sanitizing chromosome names. "
        "[%default].")

    parser.add_option(
        "--flank-method", dest="flank_method", type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option(
        "--skip-missing", dest="skip_missing", action="store_true",
        help="skip entries on missing contigs. Otherwise an "
        "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern", dest="contig_pattern", type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--extension-upstream", dest="extension_upstream", type="float",
        help="extension for upstream end [%default].")

    parser.add_option(
        "--extension-downstream", dest="extension_downstream", type="float",
        help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance", dest="min_distance", type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance", dest="max_distance", type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features", dest="min_features", type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features", dest="max_features", type="int",
        help="maximum number of features to merge/join [%default].")

    parser.set_defaults(
        input_filename_contigs=False,
        filename_crop_gff=None,
        input_filename_agp=False,
        genome_file=None,
        add_up_flank=None,
        add_down_flank=None,
        complement_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        min_distance=0,
        max_distance=0,
        min_features=1,
        max_features=0,
        extension_upstream=1000,
        extension_downstream=1000,
        sanitize_method="ucsc",
        flank_method="add",
        output_format="%06i",
        skip_missing=False,
        is_gtf=False,
        group_field=None,
        contig_pattern=None,
    )

    (options, args) = E.Start(parser, argv=argv)

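    # Contig sizes are needed to invert coordinates and to clip flanks
    # at contig boundaries; a genome file, if supplied, takes
    # precedence over a contigs TSV file.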
    contigs = None
    genome_fasta = None
    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.method in ("forward_coordinates", "forward_strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError(
            "method '%s' requires contig sizes, please supply "
            "--genome-file or --contigs-tsv-file" % options.method)

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank",
                          "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"
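
        # Example: with --extension-upstream=1000, flank_method="extend"
        # moves the start of a positive-strand gene from 5000 to 4000;
        # flank_method="add" instead prepends a separate 1000 bp
        # "5-Flank" feature and leaves the gene itself untouched.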

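        # GTF input is grouped into one chunk per gene
        # (flat_gene_iterator); plain GFF is grouped on the attribute
        # named by --group-field.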
        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

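            # "upstream" is strand-aware: before the first feature on
            # the positive strand, after the last feature on the
            # negative strand (and vice versa for "downstream").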
            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig,
                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

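            # for negative-strand groups, restore 5'->3' (transcription)
            # order before writing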
            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

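        # emit the gaps between consecutive features of each group as
        # "intron" records, i.e. the per-group complement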
        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)

        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

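        # collapse each group into a single "segment" record spanning
        # the first start to the last end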
        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
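        # join-features and merge-features both delegate to combineGFF()
        # and differ only in the merge flag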
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

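        # three range syntaxes are tried in turn:
        # 'contig:strand:from..to', 'contig:from..to' and plain
        # 'from..to' (the separator may also be '-', or ',' in the
        # contig-less form)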
        contig, strand, start, interval = None, None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)", options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match(
                    r"(\d+)(\.\.|,|-)(\d+)", options.filter_range).groups()
            except AttributeError:
                raise ValueError(
                    "cannot parse range %s" % options.filter_range)

        if start is not None:
            interval = (int(start), int(end))

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

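        # normalise contig names: "ucsc" ensures a "chr" prefix,
        # "ensembl" strips "chr"/"contig" prefixes, and "genome" maps
        # each name onto the identifier used by the indexed genome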
        def toUCSC(id):
            if not id.startswith("contig") and not id.startswith("chr"):
                id = "chr%s" % id
            return id

        def toEnsembl(id):
            if id.startswith("contig"):
                return id[len("contig"):]
            if id.startswith("chr"):
                return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError(
                    "please specify --genome-file= when using "
                    "--sanitize-method=genome")
            f = genome_fasta.getToken
        elif options.sanitize_method == "ucsc":
            f = toUCSC
        elif options.sanitize_method == "ensembl":
            f = toEnsembl

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        # compile removal patterns once instead of per record
        to_remove = []
        if options.contig_pattern:
            to_remove = [re.compile(x)
                         for x in options.contig_pattern.split(",")]

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise
                    
            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if any(x.search(gff.contig) for x in to_remove):
                filtered_contigs[gff.contig] += 1
                continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys(
                    ))),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are out of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(list(outofrange_contigs.keys())),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    else:

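        # the remaining methods work record by record:
        # forward_coordinates/forward_strand map features onto forward
        # strand coordinates; AGP mapping, if given, then lifts contig
        # coordinates onto scaffolds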
        for gff in gffs:

            if options.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.Stop()