Example #1
def addIntergenicSegment(last, this, fasta, options):
    """add an intergenic segment between last and this.

    At telomeres, either can be None.
    """
    if not this and not last:
        return 0

    nadded = 0
    if not this:
        # last telomere
        try:
            lcontig = fasta.getLength(last.contig)
        except KeyError as msg:
            if options.ignore_missing:
                return nadded
            else:
                raise
        flank = min(last.end + options.flank, lcontig)
        nadded += addFlank(last.end, flank, last, options)
        nadded += addSegment("telomeric", flank, lcontig, last, options)
    elif not last:
        # first telomere
        flank = max(0, this.start - options.flank)
        nadded += addSegment("telomeric", 0, flank, this, options)
        nadded += addFlank(flank, this.start, this, options)
    else:
        # intergenic region
        d = this.start - last.end
        flank = options.flank
        if d > flank * 2:
            nadded += addFlank(last.end, last.end + flank, last, options)
            nadded += addSegment("intergenic", last.end + flank,
                                 this.start - flank, (last, this), options)
            nadded += addFlank(this.start - flank, this.start, this, options)
        else:
            # add a short flank between two genes. If they cannot agree
            # on the directionality, "flank" is used.
            is_positive1 = Genomics.IsPositiveStrand(last.strand)
            is_positive2 = Genomics.IsPositiveStrand(this.strand)
            if is_positive1 and not is_positive2:
                key = "3flank"
            elif not is_positive1 and is_positive2:
                key = "5flank"
            else:
                key = "flank"
            nadded += addSegment(key, last.end, this.start, (last, this),
                                 options)

    return nadded
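The branching above is easier to follow with concrete numbers. A minimal standalone sketch (hypothetical, plain coordinates instead of GTF entries) of the decision made for the gap between two genes:

# Hypothetical sketch of the branch logic above; not part of the module.
def classify_gap(last_end, this_start, flank, last_strand, this_strand):
    d = this_start - last_end
    if d > flank * 2:
        # room for two full flanks plus a proper intergenic segment
        return [("flank", last_end, last_end + flank),
                ("intergenic", last_end + flank, this_start - flank),
                ("flank", this_start - flank, this_start)]
    # short gap: one segment whose label depends on strand agreement
    if last_strand == "+" and this_strand == "-":
        return [("3flank", last_end, this_start)]   # both 3' ends meet here
    if last_strand == "-" and this_strand == "+":
        return [("5flank", last_end, this_start)]   # both 5' ends meet here
    return [("flank", last_end, this_start)]

print(classify_gap(1000, 1100, 100, "+", "-"))  # [('3flank', 1000, 1100)]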
Example #2
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    This function ensures correct in-order concatenation on
    positive/negative strand. Overlapping regions are merged.
    """
    if len(chunk) == 0:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)
    positive = Genomics.IsPositiveStrand(strand)

    if not positive:
        intervals = [(lcontig - end, lcontig - start)
                     for start, end in intervals]
        intervals.reverse()

    s = [
        fasta.getSequence(contig, strand, start, end)
        for start, end in intervals
    ]

    return "".join(s)
Example #3
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    for seq in it:
        if not seq:
            break

        rseq = Genomics.reverse_complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:

            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
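Coordinates of matches found on the reverse complement are mapped back to forward-strand positions via lsequence - x.end() and lsequence - x.start(). A self-contained check using only the standard library (the reverse complement is done inline here, since Genomics is external):

import re

seq = "AACGTTTT"
rc = seq[::-1].translate(str.maketrans("ACGT", "TGCA"))   # "AAAACGTT"
pattern = re.compile("ACGT")

print([(m.start(), m.end()) for m in pattern.finditer(seq)])  # [(1, 5)]
L = len(seq)
print([(L - m.end(), L - m.start())
       for m in pattern.finditer(rc)])                        # [(1, 5)]
# ACGT is its own reverse complement, so both strands hit (1, 5)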
Example #4
    def getKL(self, usage):
        """return Kullback-Leibler divergence (relative entropy) of
        sequences with respect to reference codon usage.
        """
        e = 0
        freqs = Genomics.CalculateCodonFrequenciesFromCounts(
            self.mCodonCounts, self.mPseudoCounts)
        for codon in self.mCodonCounts:
            e += usage[codon] * math.log(usage[codon] / freqs[codon])
        return e
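The value returned is the relative entropy D(usage || freqs) = sum over codons of usage[c] * log(usage[c] / freqs[c]), in nats. A toy two-codon example, assuming nothing beyond the arithmetic:

import math

usage = {"GCA": 0.75, "GCC": 0.25}   # reference codon usage
freqs = {"GCA": 0.50, "GCC": 0.50}   # frequencies observed in the sequence
kl = sum(usage[c] * math.log(usage[c] / freqs[c]) for c in usage)
print(round(kl, 4))                  # 0.1308; zero iff the two agree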
Example #5
    def loadSequence(self, sequence, seqtype="na"):
        """load sequence properties from a sequence."""

        if len(sequence) % 3:
            raise ValueError('''sequence length is not a multiple of 3
                (length=%i)''' % (len(sequence)))

        SequencePropertiesLength.loadSequence(self, sequence, seqtype)

        # uppercase all letters and count codons
        self.mCodonCounts = Genomics.CountCodons(sequence.upper())
Example #6
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.items():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, noverlapping=%i\n" %
                             (ninput, noutput, noverlapping))
Example #7
def addFlank(start, end, template, options):
    """add a flank.
    """
    is_positive = Genomics.IsPositiveStrand(template.strand)
    is_before = end <= template.start
    if (is_before and is_positive) or (not is_before and not is_positive):
        name = "5flank"
    else:
        name = "3flank"

    return addSegment(name, start, end, template, options)
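The two-clause test collapses to a small truth table: a flank that lies before a positive-strand feature, or after a negative-strand one, is its 5' flank. All four cases, with plain values instead of a GTF template:

# Enumerate the four strand/position combinations handled above.
def flank_name(is_positive, is_before):
    if (is_before and is_positive) or (not is_before and not is_positive):
        return "5flank"
    return "3flank"

for is_positive in (True, False):
    for is_before in (True, False):
        print("+" if is_positive else "-",
              "before" if is_before else "after",
              flank_name(is_positive, is_before))
# + before 5flank, + after 3flank, - before 3flank, - after 5flank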
Example #8
    def __call__(self, sequence):
        """mask a sequence."""

        sequence = re.sub("\s", "", sequence)

        a = self.getAlphabet(sequence)

        seq = list(sequence)

        if len(seq) < 5:
            # do not mask empty/short sequences
            pass

        elif a == "aa" and self.mHasPeptideMasking:

            m = self.maskSequence(sequence)
            if self.soft_mask:
                m = re.sub("[a-z]", "x", m)
            for c, (p, masked_char) in enumerate(zip(sequence, m)):
                if masked_char in "Xx":
                    if p.isupper():
                        seq[c] = "X"
                    else:
                        seq[c] = "x"

        elif a == "codons" and self.mHasPeptideMasking:

            peptide_sequence = Genomics.TranslateDNA2Protein(sequence)
            masked_sequence = self.maskSequence(peptide_sequence)
            if self.soft_mask:
                masked_sequence = re.sub("[a-z]", "x", masked_sequence)

            c = 0
            for p, m in zip(peptide_sequence, masked_sequence):
                if m in "Xx":
                    if p.isupper():
                        seq[c:c + 3] = ["N"] * 3
                    else:
                        seq[c:c + 3] = ["n"] * 3
                c += 3

        elif a in ("na", "codons") and self.mHasNucleicAcidMasking:
            masked_sequence = self.maskSequence(sequence)
            if self.soft_mask:
                masked_sequence = re.sub("[a-z]", "N", masked_sequence)
            return masked_sequence
        else:
            raise ValueError(
                "masking of sequence type %s not implemented." % a)

        return "".join(seq)
Example #9
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript
                          ]), max([x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if the tts is directly at the start/end of the contig, the
            # tts will be within an exon; otherwise it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append((min(ma, lcontig - options.promotor),
                            min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
                             (ngenes, ntranscripts, npromotors))
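The interval arithmetic reduces to a window of size options.promotor on the 3' side of the transcript, clamped to the contig. Both branches with bare numbers (promotor 10, contig length 100):

lcontig, promotor = 100, 10
mi, ma = 5, 95                    # transcript extent

# negative strand: the termination site is the low coordinate
print((max(0, mi - promotor), max(promotor, mi)))                  # (0, 10)
# positive strand: the termination site is the high coordinate
print((min(ma, lcontig - promotor), min(lcontig, ma + promotor)))  # (90, 100)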
Example #10
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end)
    that are in third codon position.
    """
    intervals = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # to make life easier, convert to 0-based coordinates,
        # with zero starting at first position in window
        # re-arrange positions on negative strand
        if Genomics.IsNegativeStrand(gff.strand):
            # convert to negative strand coordinates counting from 0
            coordinate_offset = end
            reverse = True
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start
            reverse = False
            coordinate_offset = start

        # make sure to start on a third codon position and within the window
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0
        if frame != 0:
            istart -= (3 - frame)
        istart += 2

        iend = min(iend, end - start)

        for x in range(istart, iend, 3):

            if reverse:
                c = coordinate_offset - x - 1
            else:
                c = coordinate_offset + x
            intervals.append((c, c + 1))

    return Intervals.combineIntervals(intervals)
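For the forward strand the core arithmetic is: shift into window coordinates, roll back to the enclosing codon start using the frame, then step in threes starting from offset 2. A simplified sketch for one interval (hypothetical coordinates, window-clipping branch omitted):

wstart, wend = 100, 130                 # window
istart, iend, frame = 103, 118, 0       # interval with frame 0
istart, iend = istart - wstart, iend - wstart   # window coordinates
if frame != 0:
    istart -= (3 - frame)               # back to the enclosing codon start
istart += 2                             # offset 2 = third codon position
positions = [wstart + x
             for x in range(istart, min(iend, wend - wstart), 3)]
print(positions)                        # [105, 108, 111, 114, 117]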
Example #11
    def getEntropy(self, usage=None):
        """return entropy of a source in terms of a reference usage.
        Also called conditional entropy or encoding cost.

        Note that here I compute the sum over 20 entropies,
        one for each amino acid.

        If no usage is given, the sequence's own codon frequencies
        are used, yielding the plain entropy.
        """

        e = 0
        freqs = Genomics.CalculateCodonFrequenciesFromCounts(
            self.mCodonCounts, self.mPseudoCounts)
        if usage is None:
            usage = freqs
        for codon, count in list(self.mCodonCounts.items()):
            e -= freqs[codon] * math.log(usage[codon])
        return e
Example #12
    def loadSequence(self, sequence, seqtype="na"):
        """load sequence properties from a sequence."""

        SequenceProperties.loadSequence(self, sequence, seqtype)

        if len(sequence) % 3:
            raise ValueError(
                '''sequence length is not a multiple of 3 (length=%i)''' %
                (len(sequence)))

        # counts of amino acids
        self.mCountsAA = {}

        for x in Bio.Alphabet.IUPAC.extended_protein.letters:
            self.mCountsAA[x] = 0

        for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):
            aa = Genomics.MapCodon2AA(codon)
            self.mCountsAA[aa] += 1
Example #13
    def getSequence(self,
                    contig,
                    strand="+",
                    start=0,
                    end=0,
                    converter=None,
                    as_array=False):
        """get a genomic fragment.

        A genomic fragment is identified by the coordinates
        contig, strand, start, end.

        The supplied converter function translates these coordinates
        into 0-based coordinates. By default, start and end are taken
        as pythonic (0-based, half-open) coordinates on the given strand.

        If as_array is set to true, return the AString object. This might
        be beneficial for large sequence chunks. If as_array is set to False,
        return a python string.
        """

        contig = self.getToken(contig)

        data = self.mIndex[contig]
        # dummy is
        # -> pos_seq for seekable streams
        # -> block_size for unseekable streams
        try:
            pos_id, dummy, lsequence = struct.unpack("QQi", data)
        except (struct.error, TypeError):
            pos_id, dummy, lsequence, points = data

        pos_seq = dummy
        block_size = dummy

        if end == 0:
            end = lsequence

        if end > lsequence:
            raise ValueError("3' coordinate on %s out of bounds: %i > %i" %
                             (contig, end, lsequence))

        if start < 0:
            raise ValueError("5' coordinate on %s out of bounds: %i < 0" %
                             (contig, start))

        if converter:
            first_pos, last_pos = converter(start, end,
                                            str(strand) in ("+", "1"),
                                            lsequence)
        elif self.mConverter:
            first_pos, last_pos = self.mConverter(start, end,
                                                  str(strand) in ("+", "1"),
                                                  lsequence)
        else:
            first_pos, last_pos = start, end
            if str(strand) in ("-", "0", "-1"):
                first_pos, last_pos = lsequence - \
                    last_pos, lsequence - first_pos

        if first_pos == last_pos:
            return ""

        assert first_pos < last_pos, \
            "first position %i is larger than last position %i " % \
            (first_pos, last_pos)

        p = AString()

        if self.mNoSeek:
            # read directly from position
            p.fromstring(
                self.mDatabaseFile.read(block_size, data[3], first_pos,
                                        last_pos))
        else:
            first_pos += pos_seq
            last_pos += pos_seq

            self.mDatabaseFile.seek(first_pos)
            p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))

        if str(strand) in ("-", "0", "-1"):
            p = AString(Genomics.reverse_complement(str(p)))

        if self.mTranslator:
            return self.mTranslator.translate(p)
        elif as_array:
            return p
        else:
            return p.tostring().decode("ascii")
Example #14
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and 
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        # annotations[contig] = array.array("", default_code * size)
        # Go to list for py3 compatibility, patch
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")

            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")

            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    end,
                    start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                end = ender(c)

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.open_output_file("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        # options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring()))
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
Example #15
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants so that they use the same coordinate
    system (and strand) as the transcript.

    Fixes the 1-based coordinates of the variants.
    '''

    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:
                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [
                    x + "-" * (lvariant - len(x)) for x in variantseqs
                ]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.reverse_complement(reference)
            variantseqs = [
                Genomics.reverse_complement(x.upper()) for x in variantseqs
            ]
            start, end = lcontig - end, lcontig - start

        new_variants.append(
            ExtendedVariant._make((start, end, reference.upper(), action,
                                   has_wildtype, variantseqs)))

    return new_variants
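The final strand flip uses the same mirror arithmetic as the deletion comment above ('deletion of 2+3 = (2,4); on reverse: (7-4, 7-2) = (3,5)'). Checking that comment in isolation:

# Mirror a half-open interval through the contig length (lcontig = 7).
lcontig = 7
start, end = 2, 4                       # deletion on the forward strand
start, end = lcontig - end, lcontig - start
print(start, end)                       # 3 5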
Example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes",
                      dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand, ";".join(
                    ["%i-%i" % x
                     for x in out]), chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand, ";".join(["%i-%i" % x
                                                 for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()
                        "utron_size"))
for utron in bedfile:
    
    ss5_sequence = genome.getSequence(utron.contig, "+", utron.start, utron.start+2)
    ss3_sequence = genome.getSequence(utron.contig, "+", utron.end-2, utron.end)
    if utron.strand == "+":
        splice_site_dict[utron.name] = (ss5_sequence, ss3_sequence)
        if ":" in utron.name:
            transcript_id = utron.name.split(":")[0]
            match_transcript_id = utron.name.split(":")[1]
            outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript_id, utron.strand, ss5_sequence, ss3_sequence, utron.contig, utron.start, utron.end, utron.end-utron.start))
        else:
            outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (utron.name, utron.strand, ss5_sequence, ss3_sequence, utron.contig, utron.start, utron.end, utron.end-utron.start))

    elif utron.strand == "-":
        ss5_sequence = Genomics.reverse_complement(ss5_sequence)
        ss3_sequence = Genomics.reverse_complement(ss3_sequence)
        splice_site_dict[utron.name] = (ss3_sequence, ss5_sequence)
        if ":" in utron.name:
            transcript_id = utron.name.split(":")[0]
            match_transcript_id = utron.name.split(":")[1]
            outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (transcript_id, utron.strand, ss3_sequence, ss5_sequence, utron.contig, utron.end, utron.start, utron.end-utron.start))
        else:
            outfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (utron.name, utron.strand, ss3_sequence, ss5_sequence, utron.contig, utron.end, utron.start, utron.end-utron.start))

outfile.close()

from collections import defaultdict

counts = defaultdict(int)
for name, ss in splice_site_dict.items():
Example #18
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript
                          ]), max([x.end for x in transcript])
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % (x + 1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        x = 0
        for start, end in regulons:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1
            x += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
Example #19
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type=str,
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        nargs="*",
                        type=str,
                        choices=("length", "sequence", "hid", "na", "aa",
                                 "cpg", "dn", "degeneracy", "gaps", "codons",
                                 "codon-usage", "codon-translator",
                                 "codon-bias"),
                        help="which sections to output ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type=str,
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids .")

    parser.add_argument(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type=str,
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_argument(
        "--split-fasta-identifier",
        dest="split_id",
        action="store_true",
        help="split fasta description line (starting >) and use "
        "only text before first space")

    parser.add_argument(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    args = E.start(parser, argv=argv)

    rx = re.compile(args.regex_identifier)

    reference_codons = []
    if args.filename_weights:
        args.filename_weights = args.filename_weights.split(",")
        for filename in args.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    iotools.ReadMap(iotools.open_file(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        args.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in list(a.items()):
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                args.stdlog.write(
                    "# tablediff\t%s\t%s\t%f\n" %
                    (args.filename_weights[x], args.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(args.stdin)

    def getCounter(section):

        if args.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(args.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections require sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif args.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in args.sections:
        totals[section] = getCounter(section)

    args.stdout.write("id")
    for section in args.sections:
        args.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    args.stdout.write("\n")
    args.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if args.split_id is True:
            args.stdout.write("%s" % id.split()[0])
        else:
            args.stdout.write("%s" % id)
        args.stdout.flush()

        for section in args.sections:
            s = getCounter(section)
            s.loadSequence(sequence, args.seqtype)
            totals[section].addProperties(s)

            args.stdout.write("\t" + "\t".join(s.getFields()))

        args.stdout.write("\n")

    if args.add_total:
        args.stdout.write("total")
        for section in args.sections:
            args.stdout.write("\t" + "\t".join(totals[section].getFields()))
        args.stdout.write("\n")

    E.stop()
Example #20
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    extend from TSS a basal region.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = min([x.start for x in transcript
                          ]), max([x.end for x in transcript])
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    regions.sort(key=lambda x: (x.contig, x.start))

    outf = iotools.open_file("test.gff", "w")
    for x in regions:
        outf.write(str(x) + "\n")
    outf.close()

    ####################################################################
    # extend basal regions
    regions.sort(key=lambda x: (x.contig, x.start))

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = basal extension of previous group
        # next_start = basal_extension of next group

        # extend each region towards the previous/next group: always
        # extend downstream, but extend upstream only if the basal
        # region of an interval does not already overlap another basal
        # region within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
Example #21
def process_cgat(options):

    c = E.Counter()

    assert options.input_fastq_file == "-"

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.reverse_complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(iotools.read_list(iotools.open_file(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(iotools.open_file(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = iotools.open_file(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0], entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0], entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq, record.quals))
    return c
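For the reverse-complement method the qualities must be reversed along with the bases, since base i of the output derives from base len-1-i of the input. A standard-library-only sketch of the transformation (Genomics.reverse_complement is assumed to behave like the translate/reverse below):

seq, quals = "ACGTT", "IIIH#"
rc = seq[::-1].translate(str.maketrans("ACGTN", "TGCAN"))
print(rc, quals[::-1])                  # AACGT #HIII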
Example #22
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end) for x in transcript
                     if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1
                continue
            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
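            # build flanking bins of width ``increment`` out to
            # ``options.flank``; e.g. flank=1000 with increment=200
            # yields five 200-bp bins on each side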
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

            results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
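A minimal driver sketch for the function above (an illustration, not part of the original example): it assumes the cgat-style GTF, IndexedFasta and iotools modules used throughout these examples, and an ``options`` object carrying ``stdout``, ``flank``, ``increment`` and ``detail``.

def run_annotateGenes(gtf_path, genome_prefix, options):
    # hypothetical helper wiring a GTF stream and an indexed genome
    # into annotateGenes()
    fasta = IndexedFasta.IndexedFasta(genome_prefix)
    with iotools.open_file(gtf_path) as inf:
        annotateGenes(GTF.iterator(inf), fasta, options)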
Exemple #23
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="method",
        type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand",
                        dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c",
                        "--contigs-tsv-file",
                        dest="input_filename_contigs",
                        type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file",
        dest="input_filename_agp",
        type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file",
                        dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    parser.add_argument(
        "--group-field",
        dest="group_field",
        type=str,
        help="""gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .""")

    parser.add_argument(
        "--filter-range",
        dest="filter_range",
        type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method",
                        dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method",
        dest="flank_method",
        type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing",
                        dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern",
        dest="contig_pattern",
        type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report",
        dest="assembly_report",
        type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type=int,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras",
        dest="assembly_extras",
        type=str,
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream",
                        dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream",
                        type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance",
                        dest="min_distance",
                        type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance",
                        dest="max_distance",
                        type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features",
                        dest="min_features",
                        type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features",
                        dest="max_features",
                        type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if len(chr_map) == 0:
            raise ValueError("Empty mapping dictionary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if args.assembly_extras is not None:
            assembly_extras = args.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if args.method in ("forward_coordinates", "forward_strand",
                       "add-flank", "add-upstream-flank",
                       "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in ("add-upstream-flank", "add-downstream-flank",
                       "add-flank"):

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
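                # on the negative strand, "upstream" lies to the right of
                # the feature, so the end coordinate is extended instead
                # of the start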
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match(r"(\d+)(\.\.|,|-)(\d+)",
                                           args.filter_range).groups()
            except AttributeError:
                raise ValueError(
                    "can not parse range %s" % args.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict:
                id = assembly_dict[id]
            # if the contig is not in the dict, its name is forced into
            # the desired convention; this is helpful for user-modified
            # gff files that contain additional contigs
            elif args.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif args.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id
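        # e.g. with sanitize_method="ucsc" an unmapped contig "1" becomes
        # "chr1"; with "ensembl", "chr1" becomes "1"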

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if args.contig_pattern:
                to_remove = [
                    re.compile(x) for x in args.contig_pattern.split(",")
                ]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(
                       outofrange_contigs.keys())), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())), str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if args.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if args.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
Exemple #24
0
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-c", "--is-cds", dest="is_cds", action="store_true",
                        help="input are cds (nucleotide) sequences ")

    parser.set_defaults(
        is_cds=False,
    )

    (args) = E.start(parser, argv=argv)

    args.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(args.stdin):
        identifier = entry.title

        if args.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(
                            codon[:x] + na + codon[x + 1:])
                        counts[taa] += 1
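                # counts[aa] = number of single-nucleotide changes to
                # this codon that yield amino acid aa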
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                args.stdout.write(
                    "%s\n" % "\t".join(
                        ("%010i" % snpid,
                         identifier,
                         str(pos + 1),
                         ref,
                         variant,
                         "%i" % w[variant],
                         "%6.4f" % (w[variant] / t),
                         )))

    E.stop()
Exemple #25
0
    def loadSequence(self, sequence, seqtype="na"):
        """load sequence properties from a sequence."""

        SequencePropertiesLength.loadSequence(self, sequence, seqtype)
        if len(sequence) % 3:
            raise ValueError(
                '''sequence length is not a multiple of 3 (length=%i)''' %
                (len(sequence)))

        # uppercase all letters
        sequence = sequence.upper()

        self.mNStopCodons = 0

        # setup counting arrays
        # nucleotide counts for each position (is not a sum of the counts
        # per degenerate site, as the codon might be ambiguous, e.g. GNN).
        self.mCounts = [{
            'A': 0,
            'C': 0,
            'G': 0,
            'T': 0,
            'X': 0,
            'N': 0
        }, {
            'A': 0,
            'C': 0,
            'G': 0,
            'T': 0,
            'X': 0,
            'N': 0
        }, {
            'A': 0,
            'C': 0,
            'G': 0,
            'T': 0,
            'X': 0,
            'N': 0
        }]

        # nucleotide counts for each position per degeneracy
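        # indexed as mCountsDegeneracy[codon position][degeneracy 0-4][nucleotide]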
        self.mCountsDegeneracy = []

        for x in (0, 1, 2):
            xx = []
            for y in range(5):
                yy = {}
                # Bio.Alphabet was removed in Biopython 1.78; these are
                # the letters of the old IUPAC.extended_dna alphabet
                for z in "GATCBDSW":
                    yy[z] = 0
                xx.append(yy)
            self.mCountsDegeneracy.append(xx)

        # use generator rather than list to save memory
        for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):

            for x in (0, 1, 2):
                self.mCounts[x][codon[x]] += 1

            if Genomics.IsStopCodon(codon):
                self.mNStopCodons += 1
                continue

            try:
                aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
                degrees = (deg1, deg2, deg3)
                for x in range(len(degrees)):
                    self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1

            except KeyError:
                pass
Exemple #26
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="methods",
        type=str,
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "map-identifier", "nop", "remove-stops", "upper", "lower",
                 "reverse-complement", "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_argument("-p",
                        "--parameters",
                        dest="parameters",
                        type=str,
                        help="parameter stack for methods that require one ")

    parser.add_argument("-x",
                        "--ignore-errors",
                        dest="ignore_errors",
                        action="store_true",
                        help="ignore errors.")

    parser.add_argument("--sample-proportion",
                        dest="sample_proportion",
                        type=float,
                        help="sample proportion.")

    parser.add_argument(
        "--exclude-pattern",
        dest="exclude_pattern",
        type=str,
        help="exclude all sequences with ids matching pattern ")

    parser.add_argument(
        "--include-pattern",
        dest="include_pattern",
        type=str,
        help="include only sequences with ids matching pattern ")

    parser.add_argument("--filter-method",
                        dest="filter_methods",
                        type=str,
                        action="append",
                        help="filtering methods to apply ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="type",
        type=str,
        choices=("aa", "na"),
        help="sequence type (aa or na) . This option determines "
        "which characters to use for masking.")

    parser.add_argument(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type=str,
        help="template for numerical identifier"
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.add_argument(
        "--map-tsv-file",
        dest="map_tsv_file",
        type=str,
        help=
        "input filename with map for identifiers. The first row is a header")

    parser.add_argument("--fold-width",
                        dest="fold_width",
                        type=int,
                        help="fold width for sequence output. 0 is unfolded ")

    parser.set_defaults(methods=[],
                        parameters="",
                        type="na",
                        aa_mask_chars="xX",
                        aa_mask_char="x",
                        na_mask_chars="nN",
                        na_mask_char="n",
                        gap_chars="-.",
                        gap_char="-",
                        template_identifier="ID%06i",
                        ignore_errors=False,
                        exclude_pattern=None,
                        include_pattern=None,
                        sample_proportion=None,
                        filter_methods=[],
                        input_filename_fasta="-",
                        input_filename_map=None,
                        fold_width=80)

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    args.parameters = args.parameters.split(",")

    rx_include, rx_exclude = None, None
    if args.include_pattern:
        rx_include = re.compile(args.include_pattern)
    if args.exclude_pattern:
        rx_exclude = re.compile(args.exclude_pattern)

    iterator = FastaIterator.FastaIterator(args.stdin)

    nseq = 0

    map_seq2nid = {}

    map_identifier = ("apply-map" in args.methods
                      or "map-identifier" in args.methods)
    if map_identifier:
        if args.input_filename_map is None:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with iotools.open_file(args.input_filename_map) as infile:
            # store into map_seq2nid so that method=apply-map can use it
            map_seq2nid = iotools.read_map(infile, has_header=True)

    if args.type == "na":
        mask_chars = args.na_mask_chars
        mask_char = args.na_mask_char
    else:
        mask_chars = args.aa_mask_chars
        mask_char = args.aa_mask_char

    if "map-codons" in args.methods:
        map_codon2code = iotools.read_map(open(args.parameters[0], "r"))
        del args.parameters[0]

    if "mask-soft" in args.methods:
        f = args.parameters[0]
        del args.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in args.methods or "back-translate" in args.methods:

        # open a second stream to read sequences from
        f = args.parameters[0]
        del args.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in args.methods:
        if not args.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = args.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in args.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [
                line[:-1] for line in iotools.open_file(f.split("=")[1])
            ]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    iterator = pysam.FastxFile(args.input_filename_fasta)

    c = E.Counter()

    fold_width = args.fold_width

    def fold(s, w):
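        """wrap sequence *s* into lines of at most *w* characters."""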
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.nseq += 1
        c.input += 1

        sequence = re.sub(" ", "", record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or record.name in filter_id_list):
            c.skipped += 1
            continue

        for method in args.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % args.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if args.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("ran out of sequences")

                if record.name != other_record.title:
                    raise ValueError(
                        "sequence titles don't match: %s %s" %
                        (record.name, other_record.title))

                other_sequence = re.sub("[ %s]" % args.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % args.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in args.gap_chars:
                        # use a separate name - ``c`` is the global counter
                        pad = args.gap_char * 3
                    else:
                        pad = other_sequence[x:x + 3]
                        x += 3
                    seq.append(pad)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = args.na_mask_char
                elif method == "remove-stops":
                    char = args.gap_char

                for x in sequence:

                    if x not in args.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for x in c:
                                if x in args.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (record.name))

                # Check if hard masked seq contains repeats (N); if so,
                # replace N stretches with lowercase sequence from the
                # unmasked version
                if sequence != hm_sequence:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in args.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match(r"^(\S+)", record.name).groups()[0]
                if id in map_seq2nid:
                    rest = record.name[len(id):]
                    record.name = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", record.name).groups()[0]
                new_id = args.template_identifier % nseq
                if id in map_seq2nid:
                    raise "duplicate fasta entries - can't map those: %s" % id
                map_seq2nid[id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([x for x in seq[x:x + 3] if x in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        %
                        (record.name, len(other_sequence) * 3, len(sequence)))

                seq = list(sequence)
                # use a separate index - ``c`` is the global counter
                pos = 0
                for x in other_sequence:
                    if x in args.aa_mask_chars:
                        if x.isupper():
                            seq[pos:pos + 3] = [args.na_mask_char.upper()] * 3
                        else:
                            seq[pos:pos + 3] = [args.na_mask_char.lower()] * 3
                    pos += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            c.skipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        # fold_width == 0 means unfolded output
        if fold_width > 0:
            if record.comment:
                args.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                args.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            args.stdout.write(str(record) + "\n")

        c.output += 1

    if "build-map" in args.methods:
        p = args.parameters[0]
        if p:
            outfile = iotools.open_file(p, "w")
        else:
            outfile = args.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info(c)
    E.stop()
Exemple #27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(method="join", )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(iotools.open_file(fn1))
        iter2 = Fastq.iterate(iotools.open_file(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals
            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            s2 = Genomics.reverse_complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
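            # offset (x - y) is the diagonal of a dotplot between read1
            # and the reverse-complemented read2; the most frequent
            # offset locates the overlap between the two reads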
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal
            sorted_offsets = sorted((y, x) for x, y in offsets.items())
            max_count, max_offset = sorted_offsets[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier, max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write("%s\n" % new_entry)
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
Exemple #28
0
    def updateProperties(self):

        SequencePropertiesCodons.updateProperties(self)

        self.mCodonFrequencies = Genomics.CalculateCodonFrequenciesFromCounts(
            self.mCodonCounts)