def addIntergenicSegment(last, this, fasta, options):
    """add an intergenic segment between last and this.

    At telomeres, either can be None.

    :param last: gff feature upstream of the gap (None at the left telomere).
    :param this: gff feature downstream of the gap (None at the right telomere).
    :param fasta: indexed genome; used only to look up contig lengths.
    :param options: must provide ``flank`` (flank size in bases) and
        ``ignore_missing`` (skip contigs absent from *fasta*).
    :return: number of segments added (as reported by addFlank/addSegment).
    """
    if not this and not last:
        return 0

    nadded = 0
    if not this:
        # last telomere: segment runs from the end of *last* to the contig end
        try:
            lcontig = fasta.getLength(last.contig)
        except KeyError as msg:
            if options.ignore_missing:
                return nadded
            else:
                raise KeyError(msg)
        # flank is clipped at the contig boundary
        flank = min(last.end + options.flank, lcontig)
        nadded += addFlank(last.end, flank, last, options)
        nadded += addSegment("telomeric", flank, lcontig, last, options)
    elif not last:
        # first telomere: segment runs from position 0 up to *this*
        flank = max(0, this.start - options.flank)
        nadded += addSegment("telomeric", 0, flank, this, options)
        nadded += addFlank(flank, this.start, this, options)
    else:
        # intergenic region between two features
        d = this.start - last.end
        flank = options.flank
        if d > flank * 2:
            # gap is wide enough for two full flanks plus an intergenic core
            nadded += addFlank(last.end, last.end + flank, last, options)
            nadded += addSegment("intergenic", last.end + flank,
                                 this.start - flank, (last, this), options)
            nadded += addFlank(this.start - flank, this.start, this, options)
        else:
            # add short flank between two genes. If they can not agree
            # on the directionality, "flank" is used.
            is_positive1 = Genomics.IsPositiveStrand(last.strand)
            is_positive2 = Genomics.IsPositiveStrand(this.strand)
            if is_positive1 and not is_positive2:
                # both genes point away from the gap: 3' of both
                key = "3flank"
            elif not is_positive1 and is_positive2:
                # both genes point towards the gap: 5' of both
                key = "5flank"
            else:
                key = "flank"
            nadded += addSegment(key, last.end, this.start, (last, this),
                                 options)
    return nadded
def toSequence(chunk, fasta):
    """Concatenate the genomic sequence covered by a list of gff features.

    All features must share one contig and one strand. Overlapping
    intervals are merged before extraction. On the negative strand the
    merged intervals are mirrored into reverse-strand coordinates and
    traversed last-to-first, so the pieces concatenate in transcript
    order. Returns "" for an empty chunk.
    """
    if not chunk:
        return ""

    contig = chunk[0].contig
    strand = chunk[0].strand
    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    merged = Intervals.combine([(feature.start, feature.end)
                                for feature in chunk])
    contig_length = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # mirror the coordinates and walk the intervals back-to-front
        merged = [(contig_length - hi, contig_length - lo)
                  for lo, hi in merged]
        merged.reverse()

    pieces = []
    for lo, hi in merged:
        pieces.append(fasta.getSequence(contig, strand, lo, hi))
    return "".join(pieces)
def addFlank(start, end, template, options):
    """Label the segment [start, end) as a 5' or 3' flank and record it.

    The segment is "5flank" when it sits on the upstream side of
    *template* once strand is taken into account (before the feature on
    the positive strand, after it on the negative strand), otherwise
    "3flank". Actual bookkeeping is delegated to addSegment.
    """
    upstream = end <= template.start
    on_positive = bool(Genomics.IsPositiveStrand(template.strand))
    # upstream on '+', or downstream on '-', is the 5' side
    label = "5flank" if upstream == on_positive else "3flank"
    return addSegment(label, start, end, template, options)
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use same coordinate
    system (and strand) as the transcript

    fixes 1-ness of variants

    :param variants: iterable of variant records with ``pos``,
        ``genotype`` and ``reference`` attributes (samtools pileup style).
    :param lcontig: contig length, used to mirror coordinates onto the
        negative strand.
    :param strand: strand of the transcript; on the negative strand all
        coordinates and sequences are reverse-complemented.
    :param phased: if False, empty allele strings are dropped from
        unphased indel genotypes.
    :return: list of ExtendedVariant tuples
        (start, end, reference, action, has_wildtype, variantseqs).

    NOTE(review): genotypes/references are wrapped in ``bytes()`` but then
    used with str operations (``split("/")``, ``in``) — this looks
    Python-2 era; verify behaviour under Python 3.
    '''
    new_variants = []

    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            # single-base genotype code: a substitution (action "=")
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:
            # indel genotype such as "+a/-at": strip the leading +/- marker
            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:

                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper())
                           for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype,
            variantseqs)))

    return new_variants
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene models from stdin (gtf), fetches variants from a
    sqlite/pileup/vcf source, builds per-transcript alleles and writes
    the requested output sections (cds/peptide/table/gtf/map).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t", "--tablename", dest="tablename", type="string",
        help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f", "--exons-file", dest="filename_exons", type="string",
        help="filename with transcript model information (gtf formatted file) [default=%default].")
    parser.add_option(
        "-r", "--filename-reference", dest="filename_reference",
        type="string",
        help="filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")
    parser.add_option(
        "--vcf-file", dest="filename_vcf", type="string",
        help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")
    parser.add_option(
        "--pileup-file", dest="filename_pileup", type="string",
        help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")
    parser.add_option(
        "--vcf-sample", dest="vcf_sample", type="string",
        help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option(
        "-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
        help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf",
                               "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k", "--with-knockouts", dest="with_knockouts", action="store_true",
        help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        # BUGFIX: was misspelled "filename_referenec", so the intended
        # default never matched the option's dest
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        # BUGFIX: was "{}" (an empty dict); a set matches the branch above
        # and is what membership tests below expect
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separatar
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id
        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(
            transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants,
                                                  all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = \
                                    lcontig - gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = \
                                lcontig - gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand
                # coordinates
                if allele.reference_first_stop_start >= 0 and \
                        not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id, transcript_id, allele_id,
                                   contig, strand, "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Extracts fasta sequences for gff/gtf intervals from an indexed
    genome, with optional masking, flank extension, length filtering
    and line folding.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")
    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")
    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")
    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")
    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")
    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")
    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")
    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")
    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")
    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")
    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")
    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")
    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # BUGFIX: was "s[1] = extension + s[1]", which raised an
                # IndexError for single-interval chunks and prepended the
                # 5' extension to the second segment instead of the first
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" % (name,
                                    contig,
                                    strand,
                                    ";".join(["%i-%i" % x for x in out]),
                                    seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.

    Allocates one per-base code array per contig, fills it from the gtf
    stream (non-coding sources via MAP_ENSEMBL, protein_coding via
    UTR/CDS/intron segment writers) and writes splice junctions to the
    "junctions" output file.

    :param options: must provide ``report_step`` and
        ``max_frameshift_length``.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    # NOTE(review): array.array("c") is a Python-2-only typecode (removed
    # in Python 3); left unchanged because the element type is shared
    # with addSegments/addCDS/addIntrons outside this view — confirm
    # before porting.
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("c").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        annotations[contig] = array.array("c", default_code * size)

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        # BUGFIX: was Python-2-only "except KeyError, msg:" — a syntax
        # error under Python 3; the rest of the file already uses the
        # "as" form.
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" %
                   gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")
            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")
            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                                        (contig,
                                         out_positive,
                                         end,
                                         start,
                                         c.frame,
                                         c.gene_id,
                                         c.transcript_id,
                                         ))
                end = ender(c)
def process(matches):
    """Chain a list of psl matches into one alignment and write it out.

    Builds a DAG over the matches (edges only between pairs that are
    colinear on both query and target and do not overlap on the query),
    finds the shortest source-to-target path with Dijkstra to pick a
    consistent subset, merges the surviving matches into a single
    query-to-target map and writes the combined match to
    ``options.stdout``.

    Relies on the module-level ``options`` object for logging streams
    and verbosity. Returns 1.
    """
    new = matches[0].copy()

    map_query2target = alignlib_lite.py_makeAlignmentBlocks()

    graph = networkx.DiGraph()
    graph.add_nodes_from(range(len(matches) + 2))

    matches.sort(key=lambda x: x.mQueryFrom)

    if Genomics.IsPositiveStrand(matches[0].strand):
        # target coordinates must increase with query coordinates
        f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
    else:
        # on the negative strand the target runs backwards
        f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

    for x in range(0, len(matches)):
        xx = matches[x]
        if options.loglevel >= 6:
            options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

        for y in range(x + 1, len(matches)):
            yy = matches[y]
            d = min(xx.mQueryTo, yy.mQueryTo) - \
                max(xx.mQueryFrom, yy.mQueryFrom)
            if d > 0 or not f(xx, yy):
                continue
            else:
                # BUGFIX: positional attribute dicts were removed in
                # networkx 2.0; the keyword form behaves identically on
                # networkx 1.x as well
                graph.add_edge(x, y, weight=-d)

    source = len(matches)
    target = len(matches) + 1
    for x in range(len(matches)):
        xx = matches[x]
        graph.add_edge(source, x, weight=xx.mQueryFrom)
        graph.add_edge(x, target, weight=xx.mQueryLength - xx.mQueryTo)

    if options.loglevel >= 6:
        networkx.write_edgelist(graph, options.stdlog)

    path = networkx.dijkstra_path(graph, source, target)

    if options.loglevel >= 6:
        options.stdlog.write("# path: %s\n" % (str(path)))

    new_matches = [matches[x] for x in path[1:-1]]

    if len(matches) != len(new_matches):
        E.warn(("query=%s, target=%s, strand=%s: "
                "removed overlapping/out-of-order segments: "
                "before=%i, after=%i") %
               (matches[0].mQueryId,
                matches[0].mSbjctId,
                matches[0].strand,
                len(matches),
                len(new_matches)))

    matches = new_matches

    for match in matches:
        m = match.getMapQuery2Target()
        alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

    new.fromMap(map_query2target, use_strand=True)

    options.stdout.write(str(new) + "\n")
    options.stdout.flush()

    return 1
def main(argv=None):
    """script main for gff2gff.

    Applies the transformation selected by ``--method`` to a GFF/GTF
    stream read from stdin and writes the transformed records to stdout.
    Most methods need contig sizes, supplied either via
    ``--contigs-tsv-file`` or ``--genome-file``.

    :param argv: command line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features",
                 "sanitize", "to-forward-coordinates", "to-forward-strand"),
        help="method to apply [%default]")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.", action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option("-c", "--contigs-tsv-file",
                      dest="input_filename_contigs", type="string",
                      help="filename with contig lengths.")

    parser.add_option(
        "--agp-file", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string", help="filename with genome.")

    parser.add_option("--crop-gff-file", dest="filename_crop_gff",
                      type="string", help="GFF/GTF file to crop against.")

    # BUG FIX: the original help string contained stray quote characters
    # inside the triple-quoted literal that were shown verbatim in --help.
    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].")

    # typo fix in help text: "eithor" -> "either"
    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_option("--sanitize-method", dest="sanitize_method",
                      type="choice",
                      choices=("ucsc", "ensembl", "genome"),
                      help="method to use for sanitizing chromosome names. "
                      "[%default].")

    parser.add_option(
        "--flank-method", dest="flank_method", type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option("--skip-missing", dest="skip_missing",
                      action="store_true",
                      help="skip entries on missing contigs. Otherwise an "
                      "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern", dest="contig_pattern", type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report", dest="assembly_report", type="string",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    # help text fixed: this option is a flag, not a path (original help
    # was copy-pasted from --assembly-report).
    parser.add_option(
        "--assembly-report-hasids", dest="assembly_report_hasIDs",
        type="int",
        help="set to 1 if identifiers in the assembly report need "
        "remapping for assembled molecules [%default].")

    parser.add_option(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type="int",
        help="column in the assembly report containing ucsc contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type="int",
        help="column in the assembly report containing ensembl contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-extras", dest="assembly_extras", type="str",
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome [%default].")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="float",
                      help="extension for upstream end [%default].")

    parser.add_option("--extension-downstream", dest="extension_downstream",
                      type="float",
                      help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance", dest="min_distance", type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance", dest="max_distance", type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features", dest="min_features", type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features", dest="max_features", type="int",
        help="maximum number of features to merge/join [%default].")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (options, args) = E.Start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.assembly_report:
        df = pd.read_csv(options.assembly_report, comment="#",
                         header=None, sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in columnn 0, ensembl unassigned contigs found in
        # column 4.
        if options.assembly_report_hasIDs == 1:
            ucsccol = options.assembly_report_ucsccol
            ensemblcol = options.assembly_report_ensemblcol
            # BUG FIX: DataFrame.ix was removed in pandas 1.0;
            # use .loc with an explicit boolean mask.
            assembled = df[1] == "assembled-molecule"
            df.loc[assembled, ensemblcol] = df.loc[assembled, 0]
            if options.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif options.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if options.assembly_extras is not None:
            assembly_extras = options.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    # BUG FIX: the original tested for "forward_coordinates" /
    # "forward_strand", which are not valid --method values (the declared
    # choices are "to-forward-coordinates"/"to-forward-strand"), so the
    # guard could never fire for those methods.
    if options.method in ("to-forward-coordinates", "to-forward-strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank",
                          "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # grow the terminal feature in place, clamped to the
                # contig boundaries
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                # emit flanks as separate features
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # BUG FIX: the pattern has three groups (start, separator,
                # end) but the original unpacked only two names, raising
                # ValueError on every valid 'from,to' range.
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                # BUG FIX: raising a plain string is a TypeError in
                # Python 3; raise a proper exception instead.
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def assemblyReport(id):
            # translate via the assembly report mapping when available
            if id in assembly_dict.keys():
                id = assembly_dict[id]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful user
            # modified gff files that contain additional contigs
            elif options.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif options.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if options.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                # drop entries extending past the contig end
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [
                    re.compile(x)
                    for x in options.contig_pattern.split(",")]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are "
                   "out of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(list(outofrange_contigs.keys())),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    else:

        for gff in gffs:
            # BUG FIX: compare against the declared choice names; the
            # original compared to "forward_coordinates"/"forward_strand"
            # which never matched, making both methods silent no-ops.
            if options.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.Stop()
def main(argv=None):
    """script main for the legacy flag-based gff2gff interface.

    Each transformation is selected by its own boolean/string option
    (e.g. ``--forward-coordinates``, ``--sanitize``) rather than by a
    single ``--method`` switch. Reads GFF/GTF from stdin, writes to
    stdout.

    :param argv: command line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2gff.py 2868 2010-03-03 10:19:52Z andreas $")

    parser.add_option("-f", "--forward-coordinates",
                      dest="forward_coordinates",
                      help="translate to forward coordinates.",
                      action="store_true")

    parser.add_option("--forward-strand", dest="forward_strand",
                      help="convert to forward strand.",
                      action="store_true")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.",
                      action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option(
        "--add-up-flank", dest="add_up_flank", type="int",
        help="add an upstream flanking segment to first exon of a group.")

    parser.add_option(
        "--add-down-flank", dest="add_down_flank", type="int",
        help="add a downstream flanking segment to last segment of a group.")

    parser.add_option("--extend", dest="extend",
                      help="extend the existing features.",
                      action="store_true")

    # typo fix in help text: "lenghts" -> "lengths"
    parser.add_option("-c", "--contigs", dest="input_filename_contigs",
                      type="string", help="filename with contig lengths.")

    parser.add_option(
        "--filename-agp", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string", help="filename with genome.")

    parser.add_option(
        "--complement-groups", dest="complement_groups",
        action="store_true",
        help="complement groups. Will write introns from exons [%default].")

    # typo fix in help text: "transrcipt_id" -> "transcript_id"
    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].")

    parser.add_option("--combine-groups", dest="combine_groups",
                      action="store_true",
                      help="combine groups.")

    # typo fix in help text: "eithor" -> "either"
    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    # typo fix in help text: "metdo ... of for" -> "method ... of four"
    parser.add_option(
        "--join-features", dest="join_features", type="string",
        help="join features into a single transcript. Consecutive "
        "features are grouped into the same transcript/gene. This method "
        "expects a string of four numbers ``a,b,c,d`` as input with: "
        "a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--merge-features", dest="merge_features", type="string",
        help="merge features. Consecutive features are merged into a "
        "single feature. "
        "This method expects a string of four numbers ``a,b,c,d`` as "
        "input; a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--crop-unique", dest="crop_unique", action="store_true",
        help="crop overlapping intervals, keeping only intervals that "
        "are unique [default=%default]")

    parser.add_option(
        "--crop", dest="crop", type="string",
        help="crop features in gff file with features in another file. "
        "If a feature falls in the middle of another, two entries will "
        "be output.")

    parser.add_option(
        "--sanitize", dest="sanitize", type="choice",
        choices=("ucsc", "ensembl", "genome"),
        help="sanitize chr names for ucsc or ensembl or use the genome "
        "translator [%default].")

    parser.add_option(
        "--skip-missing", dest="skip_missing", action="store_true",
        help="skip entries on missing contigs. Otherwise an exception "
        "is raised [%default].")

    # typo fix in help text: "runnnig" -> "running"
    parser.add_option(
        "--remove-contigs", dest="remove_contigs", type="string",
        action="store",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running sanitize [%default].")

    parser.set_defaults(
        forward_coordinates=False,
        forward_strand=False,
        input_filename_contigs=False,
        input_filename_agp=False,
        genome_file=None,
        sanitize=None,
        add_up_flank=None,
        add_down_flank=None,
        extend=False,
        complement_groups=False,
        combine_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        join_features=None,
        merge_features=None,
        output_format="%06i",
        skip_missing=False,
        remove_contigs=None,
        is_gtf=False,
        group_field=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    # BUG FIX: contigs was referenced below even when neither a contig
    # file nor a genome file was given, raising NameError instead of the
    # intended ValueError.
    contigs = None
    if options.input_filename_contigs:
        contigs = Genomics.ReadContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()
    else:
        genome_fasta = None

    if (options.forward_coordinates or options.forward_strand) \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.add_up_flank or options.add_down_flank:

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            # BUG FIX: Python 2 cmp-style sort
            # (chunk.sort(lambda x, y: cmp(x.start, y.start))) is invalid
            # in Python 3; use a key function.
            chunk.sort(key=lambda x: x.start)
            lcontig = contigs[chunk[0].contig]

            if options.extend:
                if options.add_up_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_up_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + options.add_up_flank)
                if options.add_down_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig,
                            chunk[-1].end + options.add_down_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_down_flank)
            else:
                if options.add_up_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(
                            0, gff.start - options.add_up_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(
                            lcontig, gff.end + options.add_up_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if options.add_down_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        # BUG FIX: copy-paste error - the original
                        # extended by add_up_flank here.
                        gff.end = min(
                            lcontig, gff.end + options.add_down_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        # BUG FIX: copy-paste error - the original
                        # extended by add_up_flank here.
                        gff.start = max(
                            0, gff.start - options.add_down_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.complement_groups:

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            # BUG FIX/consistency: bare chunk.sort() relied on GTF.Entry
            # comparison (Python 2 __cmp__); sort explicitly by position
            # as the newer main() does.
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.combine_groups:

        iterator = GTF.joined_iterator(gffs)
        for chunk in iterator:
            # BUG FIX/consistency: explicit positional sort (see above).
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.join_features:
        combineGFF(gffs, options, merge=False)

    elif options.merge_features:
        combineGFF(gffs, options, merge=True)

    elif options.crop:
        cropGFF(gffs, options)

    elif options.crop_unique:
        cropGFFUnique(gffs, options)

    elif options.filter_range:

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # BUG FIX: three groups (start, separator, end) were
                # unpacked into two names, raising ValueError on every
                # valid 'from,to' range.
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                # BUG FIX: raising a plain string is a TypeError in
                # Python 3.
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        if options.loglevel >= 2:
            options.stdlog.write(
                "# filter: contig=%s, strand=%s, interval=%s\n" %
                (str(contig), str(strand), str(interval)))
            options.stdlog.flush()

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.sanitize:

        def toUCSC(id):
            # force names into UCSC convention (chrN)
            if not id.startswith("contig") and not id.startswith("chr"):
                id = "chr%s" % id
            return id

        def toEnsembl(id):
            # strip UCSC-style prefixes to get ensembl-style names
            if id.startswith("contig"):
                return id[len("contig"):]
            if id.startswith("chr"):
                return id[len("chr"):]
            return id

        if options.sanitize == "genome":
            if genome_fasta is None:
                raise ValueError(
                    "please specify --genome-file= when using --sanitize=genome")
            f = genome_fasta.getToken
        elif options.sanitize == "ucsc":
            f = toUCSC
        elif options.sanitize == "ensembl":
            f = toEnsembl

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            # BUG FIX: "except KeyError, msg:" is Python 2 syntax and a
            # SyntaxError under Python 3.
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                # drop entries extending past the contig end
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.remove_contigs:
                to_remove = [
                    re.compile(x)
                    for x in options.remove_contigs.split(",")]
                if any([x.match(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(skipped_contigs.keys()),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are "
                   "out of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(outofrange_contigs.keys()),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(filtered_contigs.keys()),
                    str(filtered_contigs)))

    # NOTE(review): unlike the newer main(), this version does not call
    # E.Stop() at the end - confirm whether that is intentional.
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GTF/GFF intervals from stdin, fetches SNP genotypes for each
    interval from the sqlite tables named by ``--tracks``, builds
    per-track allele sequences against the reference genome and writes
    them as a MAF-formatted multiple alignment to stdout.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string",
                      action="append",
                      help="tracks (tablenames) to use in sqlite database "
                      "[default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the "
                      "alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress",
                      action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track",
                      type="string",
                      help="regular expression pattern for track "
                      "[default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        reference_name="reference",
        # BUG FIX: the code below reads options.reference, but only
        # reference_name had a default, so the reference row was
        # labelled "None.<contig>" unless -r was given explicitly.
        reference="reference",
        pattern_track="(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        # NOTE(review): the per-interval loop below requires fasta
        # (fasta.getLength); without --genome-file it will fail with
        # AttributeError on the first interval - confirm intended.
        fasta = None

    # NOTE(review): the original if/else had two identical branches for
    # is_gtf; presumably the non-gtf branch was meant to use a plain GFF
    # iterator - confirm against GTF/GFF module.
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    # NOTE(review): track and contig names are %-interpolated into the
    # SQL (sqlite3 cannot bind table names); safe only while tracks and
    # input come from a trusted source.
    statement = '''SELECT pos, reference, genotype
    FROM %(track)s
    WHERE contig = '%(contig)s' AND
    pos BETWEEN %(extended_start)s and %(extended_end)s
    '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0]
            for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." %
            options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % locals())
            all_variants.append(
                list(map(Variants.Variant._make, cc.fetchall())))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" %
                (contig, region_start, region_end,
                 sum([len(x) for x in all_variants]),
                 len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per
        # mali column
        # BUG FIX: the counts are string lengths; with the float default
        # dtype of numpy.ones, "-" * (colcounts[x] - len(c)) in
        # __addGaps raises TypeError. Use an integer dtype.
        colcounts = numpy.ones(lseq, dtype=numpy.int64)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq, variants,
                                      reference_start=region_start)
            alleles[track] = a
            for allele in a:
                for pos, c in enumerate(allele):
                    colcounts[pos] = max(colcounts[pos], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

        def __addGaps(sequence, colcounts):
            '''output gapped sequence.'''
            r = []
            for x, c in enumerate(sequence):
                r.append(c + "-" * (colcounts[x] - len(c)))
            return "".join(r)

        name = ".".join((options.reference, contig))
        if is_positive:
            pos = region_start
        else:
            # NOTE(review): maps the start, not the end, into reverse
            # coordinates - confirm this matches MAF conventions.
            pos = lcontig - region_start
        size = lseq
        seq = __addGaps(reference_seq, colcounts)
        outfile.write(maf_format % (locals()))

        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = __addGaps(allele, colcounts)
                if not is_positive:
                    # BUG FIX: the original discarded the return value
                    # (strings are immutable), so negative-strand
                    # alleles were emitted uncomplemented.
                    seq = Genomics.complement(seq)
                size = len(seq) - seq.count("-")
                name = ".".join((track + "-%i" % aid, contig))
                outfile.write(maf_format % (locals()))
        outfile.write("\n")

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf from stdin, extracts the corresponding sequences from
    an indexed genome and writes them as fasta to stdout.  Features can
    be filtered, merged, masked, length-filtered and extended at either
    end before extraction.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes. "
        "[default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'. If "
        "set to the empty string, all entries are output [%default].")

    parser.add_option(
        "-f", "--filename-masks", dest="filename_masks", type="string",
        metavar="gff",
        help="mask sequences with regions given in gff file [%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-length", dest="min_length", type="int",
        help="set minimum length for sequences output [%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        masker=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
    else:
        # A genome is required to extract any sequence - fail early with
        # a clear message instead of a NameError further down.
        raise ValueError("please supply a genome via --genome-file")

    # Build the iterator over input chunks: one transcript per chunk for
    # gtf input; otherwise merged or plain chunks of gff lines.  Each
    # chunk is a list of features sharing (for gtf) one transcript id.
    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(sys.stdin))
    else:
        gffs = GTF.iterator(sys.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GFF.iterator(infile))

        # convert intervals to intersectors for fast overlap lookup
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    for ichunk in iterator:

        ninput += 1

        if feature:
            # NOTE: use a list comprehension, not filter() - under
            # python3 a filter object has no len() and the check below
            # would raise a TypeError.
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    # String exceptions are illegal in python3 - raise a
                    # proper exception type instead.
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # `out` keeps the (possibly extension-mutated, see below)
        # genomic coordinates for the fasta header.
        out = intervals

        if options.extend_at:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to coordinates on the reverse strand and keep the
            # extraction order 5' to 3'
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        length = sum([len(x) for x in s])

        if (length < options.min_length or
                (options.max_length and length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# skipped because length out of bounds "
                    "%s: regions=%s len=%i\n" % (name, str(intervals), length))
            continue

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand,
             ";".join(["%i-%i" % x for x in out]),
             "\n".join(s)))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
flank = max(0, this.start - options.flank) nadded += addSegment("telomeric", 0, flank, this, options) nadded += addFlank(flank, this.start, this, options) else: # intergenic region d = this.start - last.end flank = options.flank if d > flank * 2: nadded += addFlank(last.end, last.end + flank, last, options) nadded += addSegment("intergenic", last.end + flank, this.start - flank, (last, this), options) nadded += addFlank(this.start - flank, this.start, this, options) else: # add short flank between two genes. If they can not agree # on the directionality, "flank" is used. is_positive1 = Genomics.IsPositiveStrand(last.strand) is_positive2 = Genomics.IsPositiveStrand(this.strand) if is_positive1 and not is_positive2: key = "3flank" elif not is_positive1 and is_positive2: key = "5flank" else: key = "flank" nadded += addSegment(key, last.end, this.start, (last, this), options) return nadded # -----------------------------------------------------------------------------