예제 #1
0
파일: gtf2alleles.py 프로젝트: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Gene models (GTF) are read from ``options.stdin``. For every gene the
    variants overlapping the gene region (extended by ``options.border``
    bases on both sides) are fetched from an sqlite table, a pileup file
    or a VCF file. Alleles are then built for every transcript and written
    to the selected output sections (cds, peptide, map, table, gtf).

    Raises:
        ValueError: if no genome file or no source of variants was
            supplied, or if a tablename was given without a database.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-f", "--exons-file", dest="filename_exons", type="string",
                      help="filename with transcript model information (gtf formatted file)  [default=%default].")
    parser.add_option("-r", "--filename-reference", dest="filename_reference", type="string",
                      help="filename with transcript models of a reference gene set. Stop codons that do not"
                      " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default].")
    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default].")
    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
                      help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice", action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option("-k", "--with-knockouts", dest="with_knockouts", action="store_true",
                      help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    # ``border`` and ``separator`` have no command line option; they are
    # only configurable through these defaults.
    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    # the genome is needed for contig lengths and exon/intron sequences;
    # fail early with a clear message instead of an AttributeError on None.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file)")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    if options.filename_seleno:
        with open(options.filename_seleno, "r") as inf:
            seleno = set(IOTools.readList(inf))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # select the source of variants: sqlite table, pileup file or VCF file
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no explicit section means: output everything
    output_all = len(options.output) == 0 or "all" in options.output

    def open_section(section, filename):
        '''open the output file of *section* if requested, else None.'''
        if output_all or section in options.output:
            return E.openOutputFile(filename)
        return None

    outfile_cds = open_section("cds", "cds.fasta")
    outfile_map = open_section("map", "map.psl")
    outfile_peptides = open_section("peptide", "peptides.fasta")
    outfile_alleles = open_section("table", "table")
    if outfile_alleles:
        outfile_alleles.write("\t".join(
            ("gene_id",
             "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    outfile_gtf = open_section("gtf", "gtf")

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        # extend the gene region by the border, clipped to the contig
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants: %s" % str(variants))

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants: %s" % str(variants))

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon %s" % str(key))
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron %s" % str(key))
                    # only the first and last 30 bases of an intron are shown
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] + variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"

                        # the CDS part of exon i ends where the CDS part of
                        # exon i+1 starts; the last one ends with the CDS.
                        cds_ends = list(allele.cds_starts[1:]) + \
                            [len(allele.cds)]
                        for exon_start, cds_start, cds_end, frame in zip(
                                allele.exon_starts, allele.cds_starts,
                                cds_ends, allele.frames):
                            interval_start = exon_start
                            interval_end = exon_start + (cds_end - cds_start)
                            if not is_positive_strand:
                                # convert to positive strand coordinates
                                interval_start, interval_end = \
                                    lcontig - interval_end, \
                                    lcontig - interval_start
                            gtf.frame = frame
                            gtf.start, gtf.end = interval_start, interval_end
                            outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig - allele.reference_first_stop_start, )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1

    # release the section files; stdout is left open in case a section
    # was mapped onto it, as it is still needed by E.Stop().
    for handle in (outfile_cds, outfile_map, outfile_peptides,
                   outfile_alleles, outfile_gtf):
        if handle is not None and handle is not options.stdout:
            handle.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
예제 #2
0
파일: snp2maf.py 프로젝트: Q-KIM/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intervals (GFF/GTF) are read from ``options.stdin``. For every interval
    the variants of each track (a table in the sqlite database) within the
    interval, extended by ``options.border``, are collected and turned into
    allele sequences. One MAF alignment block per interval is written to
    ``options.stdout``: the reference sequence first, then the alleles of
    every track.

    Raises:
        ValueError: if database, tracks or genome file are missing.
        AttributeError: if the track pattern does not match a track name.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
                      help="tracks (tablenames) to use in sqlite database [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress", action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
                      help="regular expression pattern for track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        # --reference stores into ``reference``; without this default the
        # reference name would be None and building the MAF name would fail.
        reference="reference",
        # kept for backwards compatibility with the previous defaults
        reference_name="reference",
        pattern_track=r"(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    # the genome is needed for contig lengths and the reference sequence;
    # fail early with a clear message instead of an AttributeError on None.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file)")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # the same iterator is used no matter whether --is-gtf is set
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    # the table name can not be bound as a parameter and is interpolated;
    # contig and positions are passed as bound parameters.
    statement = '''SELECT pos, reference, genotype 
                   FROM %(track)s
                   WHERE contig = ? AND 
                   pos BETWEEN ? and ?
                '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0] for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." % options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

    def _add_gaps(sequence, colcounts):
        '''output gapped sequence.

        Every column of *sequence* is padded with gap characters up to
        the width given in *colcounts*.
        '''
        # colcounts holds floats (numpy.ones), a repeat count must be int
        return "".join(
            [c + "-" * (int(colcounts[x]) - len(c))
             for x, c in enumerate(sequence)])

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants, one list per track
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % {"track": track},
                       (contig, extended_start, extended_end))
            # build a real list: its length is needed below
            all_variants.append(
                [Variants.Variant._make(row) for row in cc.fetchall()])
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" % (contig,
                                                                   region_start, region_end,
                                                                   sum([
                                                                       len(x) for x in all_variants]),
                                                                   len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per mali
        # column
        colcounts = numpy.ones(lseq)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq,
                                      variants,
                                      reference_start=region_start)

            alleles[track] = a
            for allele in a:
                for pos, c in enumerate(allele):
                    colcounts[pos] = max(colcounts[pos], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        # the start position is shared by all rows of this alignment block
        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start

        outfile.write(maf_format % {
            "name": ".".join((options.reference, contig)),
            "pos": pos,
            "size": lseq,
            "strand": strand,
            "lcontig": lcontig,
            "seq": _add_gaps(reference_seq, colcounts)})

        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = _add_gaps(allele, colcounts)
                if not is_positive:
                    # NOTE(review): the result is discarded, so the written
                    # sequence is NOT complemented. Left unchanged because
                    # the reference row above is not complemented either -
                    # confirm the intended negative strand output.
                    Genomics.complement(seq)
                outfile.write(maf_format % {
                    "name": ".".join((track + "-%i" % aid, contig)),
                    "pos": pos,
                    "size": len(seq) - seq.count("-"),
                    "strand": strand,
                    "lcontig": lcontig,
                    "seq": seq})

        outfile.write("\n")

    if options.compress:
        # closing the GzipFile writes the gzip trailer; the wrapped
        # stream itself is not closed by this call.
        outfile.close()

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()
예제 #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Gene models (GTF) are read from ``options.stdin``. For every gene the
    variants overlapping the gene region (extended by ``options.border``
    bases on both sides) are fetched from an sqlite table, a pileup file
    or a VCF file. Alleles are then built for every transcript and written
    to the selected output sections (cds, peptide, map, table, gtf).

    Raises:
        ValueError: if no genome file or no source of variants was
            supplied, or if a tablename was given without a database.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome [default=%default].")
    parser.add_option(
        "-t", "--tablename", dest="tablename", type="string",
        help=
        "tablename to get variants from (in samtools pileup format) [default=%default]."
    )
    parser.add_option(
        "-d", "--database", dest="database", type="string",
        help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f", "--exons-file", dest="filename_exons", type="string",
        help=
        "filename with transcript model information (gtf formatted file)  [default=%default]."
    )
    parser.add_option(
        "-r", "--filename-reference", dest="filename_reference",
        type="string",
        help=
        "filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file)  [default=%default]."
    )
    parser.add_option(
        "--vcf-file", dest="filename_vcf", type="string",
        help=
        "filename with variants in VCF format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--pileup-file", dest="filename_pileup", type="string",
        help=
        "filename with variants in samtools pileup format. Should be indexed by tabix  [default=%default]."
    )
    parser.add_option(
        "--vcf-sample", dest="vcf_sample", type="string",
        help=
        "sample id for species of interest in vcf formatted file [default=%default]."
    )
    parser.add_option(
        "-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
        help=
        "filename of a list of transcript ids that are selenoproteins [default=%default]."
    )
    parser.add_option(
        "-m", "--module", dest="modules", type="choice", action="append",
        choices=("gene-counts", "transcript-effects"),
        help="modules to apply [default=%default].")
    parser.add_option(
        "-o", "--output-section", dest="output", type="choice",
        action="append",
        choices=("all", "peptide", "cds", "table", "gtf", "map"),
        help="sections to output [default=%default].")
    parser.add_option(
        "-k", "--with-knockouts", dest="with_knockouts", action="store_true",
        help=
        "add alleles that are knocked out to fasta and gtf files [default=%default]."
    )

    # ``border`` and ``separator`` have no command line option; they are
    # only configurable through these defaults.
    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        filename_pileup=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    # the genome is needed for contig lengths and exon/intron sequences;
    # fail early with a clear message instead of an AttributeError on None.
    if not options.genome_file:
        raise ValueError("please supply a genome file (--genome-file)")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    if options.filename_seleno:
        with open(options.filename_seleno, "r") as inf:
            seleno = set(IOTools.readList(inf))
    else:
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # select the source of variants: sqlite table, pileup file or VCF file
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no explicit section means: output everything
    output_all = len(options.output) == 0 or "all" in options.output

    def open_section(section, filename):
        '''open the output file of *section* if requested, else None.'''
        if output_all or section in options.output:
            return E.openOutputFile(filename)
        return None

    outfile_cds = open_section("cds", "cds.fasta")
    outfile_map = open_section("map", "map.psl")
    outfile_peptides = open_section("peptide", "peptides.fasta")
    outfile_alleles = open_section("table", "table")
    if outfile_alleles:
        outfile_alleles.write("\t".join(("gene_id", "transcript_id",
                                         "allele_id", "contig", "strand",
                                         "is_wildtype",
                                         ("\t".join(Allele._fields)))) + "\n")
    outfile_gtf = open_section("gtf", "gtf")

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        # extend the gene region by the border, clipped to the contig
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    # only the first and last 30 bases of an intron are shown
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"

                        # the CDS part of exon i ends where the CDS part of
                        # exon i+1 starts; the last one ends with the CDS.
                        cds_ends = list(allele.cds_starts[1:]) + \
                            [len(allele.cds)]
                        for exon_start, cds_start, cds_end, frame in zip(
                                allele.exon_starts, allele.cds_starts,
                                cds_ends, allele.frames):
                            interval_start = exon_start
                            interval_end = exon_start + (cds_end - cds_start)
                            if not is_positive_strand:
                                # convert to positive strand coordinates
                                interval_start, interval_end = \
                                    lcontig - interval_end, \
                                    lcontig - interval_start
                            gtf.frame = frame
                            gtf.start, gtf.end = interval_start, interval_end
                            outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % ("\t".join(
                        (gene_id, transcript_id, allele_id, contig, strand,
                         "%i" % is_wildtype)), "\t".join(map(str, allele))))

                noutput += 1

    # release the section files; stdout is left open in case a section
    # was mapped onto it, as it is still needed by E.Stop().
    for handle in (outfile_cds, outfile_map, outfile_peptides,
                   outfile_alleles, outfile_gtf):
        if handle is not None and handle is not options.stdout:
            handle.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
예제 #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads intervals (GTF/GFF) from ``options.stdin``. For every interval
    the variants of each track (a table in the sqlite database) are
    collected, alleles are built with ``Variants.buildAlleles`` and one
    MAF alignment block is written to ``options.stdout``: the reference
    row first, followed by one row per allele and track.

    Raises:
        ValueError: if database, tracks or the genome file are missing.
        AttributeError: if ``--pattern-identifier`` does not match a track.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string", action="append",
                      help="tracks (tablenames) to use in sqlite database [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress", action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track", type="string",
                      help="regular expression pattern for track [default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        # ``--reference`` stores into ``reference``; without this default the
        # reference row name below would be built from ``None``.
        reference="reference",
        # kept for backward compatibility with code reading the old name
        reference_name="reference",
        pattern_track=r"(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    # the genome is needed for contig lengths and the reference sequence
    if not options.genome_file:
        raise ValueError("please supply a genome file")
    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # the same iterator is used whether or not --is-gtf is set; the flag
    # only controls the alignment header written further below.
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    # The table name cannot be a bound parameter and is interpolated;
    # contig and coordinates are bound via sqlite3 ``?`` placeholders.
    statement = '''SELECT pos, reference, genotype
                   FROM %(track)s
                   WHERE contig = ? AND
                   pos BETWEEN ? AND ?
                '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0] for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." % options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

    def _add_gaps(sequence, colcounts):
        '''output gapped sequence.

        Each column of *sequence* is right-padded with "-" up to the
        width recorded for it in *colcounts*.
        '''
        # colcounts is a float array (numpy.ones); cast so that the
        # string repetition receives an integer count.
        return "".join(
            c + "-" * (int(colcounts[x]) - len(c))
            for x, c in enumerate(sequence))

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        # contig names are used without the "chr" prefix from here on
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants, one list per track
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            try:
                cc.execute(statement % {"track": track},
                           (contig, extended_start, extended_end))
                all_variants.append(
                    list(map(Variants.Variant._make, cc.fetchall())))
            finally:
                cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" % (
            contig,
            region_start, region_end,
            sum(len(x) for x in all_variants),
            len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per mali
        # column
        colcounts = numpy.ones(lseq)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            track_alleles = Variants.buildAlleles(
                reference_seq,
                variants,
                reference_start=region_start)

            alleles[track] = track_alleles
            for allele in track_alleles:
                for column, chars in enumerate(allele):
                    colcounts[column] = max(colcounts[column], len(chars))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start

        # reference row
        outfile.write(maf_format % {
            "name": ".".join((options.reference, contig)),
            "pos": pos,
            "size": lseq,
            "strand": strand,
            "lcontig": lcontig,
            "seq": _add_gaps(reference_seq, colcounts),
        })

        # one row per allele of each track
        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = _add_gaps(allele, colcounts)
                if not is_positive:
                    # NOTE(review): the return value is discarded, so this
                    # call does not change ``seq``. Left unchanged because
                    # the reference row above is also written in forward
                    # orientation - confirm the intended output before
                    # assigning the result.
                    Genomics.complement(seq)
                outfile.write(maf_format % {
                    "name": ".".join((track + "-%i" % aid, contig)),
                    "pos": pos,
                    "size": len(seq) - seq.count("-"),
                    "strand": strand,
                    "lcontig": lcontig,
                    "seq": seq,
                })

        outfile.write("\n")
        counts.output += 1

    # closing the gzip wrapper writes the gzip trailer; the underlying
    # stream (options.stdout) stays open.
    if options.compress:
        outfile.close()

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.stop()