def annotateCpGIslands( infiles, outfile ):
    """Annotate transcripts by absence/presence of CpG islands.

    ``infiles`` is a pair ``(cpgfile, tssfile)``: a bed file of CpG
    islands and a bed file of transcription start sites.  For each TSS a
    strand-aware window (``cpg_search_upstream`` / ``cpg_search_downstream``
    from PARAMS) is searched for overlapping CpG intervals; every match is
    written to ``outfile`` with genomic and TSS-relative coordinates.  A
    count summary is written to ``outfile + ".summary"``.
    """
    cpgfile, tssfile = infiles
    # index the CpG islands by contig for fast interval lookup
    index = Bed.readAndIndex(IOTools.openFile(cpgfile))

    upstream = PARAMS["cpg_search_upstream"]
    downstream = PARAMS["cpg_search_downstream"]

    counter = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write("transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        counter.tss_total += 1

        # the search window flips orientation with the strand
        if tss.strand == "+":
            start = tss.start - upstream
            end = tss.start + downstream
        else:
            start = tss.end - downstream
            end = tss.end + upstream

        try:
            hits = list(index[tss.contig].find(start, end))
        except KeyError:
            # contig absent from the CpG index: treat as no matches
            hits = []

        if not hits:
            counter.promotor_without_matches += 1
            continue

        counter.promotor_output += 1
        for genome_start, genome_end, _ in hits:
            counter.matches_total += 1
            length = genome_end - genome_start

            # express the match position relative to the TSS, strand-aware
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + length

            row = (tss.name, tss.strand,
                   genome_start, genome_end,
                   relative_start, relative_end)
            outf.write("\t".join(map(str, row)) + "\n")
            counter.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.info(counter)
Exemple #2
0
def annotateCpGIslands(infiles, outfile):
    """Annotate transcripts by absence/presence of CpG islands.

    For every TSS in the second input bed file, search a window around it
    (sized by ``cpg_search_upstream``/``cpg_search_downstream`` in PARAMS)
    for CpG islands from the first input bed file and report each overlap
    with TSS-relative coordinates.  Writes a per-category count summary to
    ``outfile + ".summary"``.
    """
    cpgfile, tssfile = infiles
    cpg_index = Bed.readAndIndex(IOTools.openFile(cpgfile))

    ext_up = PARAMS["cpg_search_upstream"]
    ext_down = PARAMS["cpg_search_downstream"]

    stats = E.Counter()
    results = IOTools.openFile(outfile, "w")
    results.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        stats.tss_total += 1

        forward = tss.strand == "+"
        # strand-aware search window around the TSS
        if forward:
            window = (tss.start - ext_up, tss.start + ext_down)
        else:
            window = (tss.end - ext_down, tss.end + ext_up)

        try:
            tree = cpg_index[tss.contig]
        except KeyError:
            # no CpG intervals on this contig at all
            stats.promotor_without_matches += 1
            continue

        matches = list(tree.find(*window))
        if not matches:
            stats.promotor_without_matches += 1
            continue

        stats.promotor_output += 1
        for match in matches:
            stats.matches_total += 1
            cpg_start, cpg_end = match[0], match[1]
            size = cpg_end - cpg_start

            # report match coordinates relative to the TSS
            rel_start = (cpg_start - tss.start) if forward else (tss.end - cpg_end)
            rel_end = rel_start + size

            fields = (tss.name, tss.strand, cpg_start, cpg_end,
                      rel_start, rel_end)
            results.write("\t".join(map(str, fields)) + "\n")
            stats.matches_output += 1

    results.close()

    with IOTools.openFile(outfile + ".summary", "w") as summary:
        summary.write("category\tcounts\n")
        summary.write(stats.asTable() + "\n")

    E.info(stats)
Exemple #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two bed files, indexes the second, and writes one line per
    overlap between an interval of the first file and the index --
    either the full bed entries or just their names (``--output-section``).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(output="full")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    # "-" on the command line means: read the first file from stdin
    infile1 = options.stdin if args[0] == "-" else IOTools.openFile(args[0], "r")
    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    outfile = options.stdout

    if options.output == "name":
        outfile.write("name1\tname2\n")

        def formatter(x):
            # first bed field only (the interval name)
            return x.fields[0]
    else:
        formatter = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for hit in overlaps:
            outfile.write("\t".join((formatter(bed), formatter(hit[2]))) + "\n")

    E.Stop()
Exemple #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intersects two bed files and writes either the full overlapping
    entries or only their names, depending on ``--output``.
    """

    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(output="full")

    # add common options (-h/--help, ...) and parse command line
    options, args = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    # first file may come from stdin when given as "-"
    if args[0] == "-":
        first = options.stdin
    else:
        first = IOTools.openFile(args[0], "r")
    second = IOTools.openFile(args[1], "r")

    index = Bed.readAndIndex(second, with_values=True)

    out = options.stdout

    if options.output == "name":
        out.write("name1\tname2\n")
        render = lambda x: x.fields[0]
    else:
        render = str

    for bed in Bed.iterator(first):
        try:
            found = index[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in found:
            out.write("\t".join((render(bed), render(o[2]))) + "\n")

    E.Stop()
Exemple #5
0
    def __init__(self, filename, *args, **kwargs):
        """Initialise the overlap counter from a bed file of intervals.

        :param filename: path to a bed file; its intervals are indexed
            once per track so overlaps can be counted quickly later.

        Extra positional/keyword arguments are forwarded to
        ``Counter.__init__``.
        """
        # identity test instead of ``!= None`` (PEP 8 E711): equality can
        # be overridden by arbitrary objects, ``is not None`` cannot
        assert filename is not None, "please supply filename for CounterOverlap"

        Counter.__init__(self, *args, **kwargs)

        self.filename = filename

        E.info("reading intervals from %s" % self.filename)

        # one interval index per track in the bed file
        self.index = Bed.readAndIndex(IOTools.openFile(self.filename, "r"),
                                      per_track=True)

        E.info("read intervals for %s tracks" % len(self.index))

        # materialise as a list so the attribute is a stable, reusable
        # sequence under both Python 2 (list) and Python 3 (dict view)
        self.tracks = list(self.index.keys())
        self.headers = []
        for track in self.tracks:
            # two output columns per track: overlap count and base count
            self.headers.extend(["%s_nover" % track, "%s_bases" % track])
Exemple #6
0
    def __init__(self, filename, *args, **kwargs):
        """Read and index a bed file of intervals, one index per track."""
        assert filename != None, "please supply filename for CounterOverlap"

        Counter.__init__(self, *args, **kwargs)

        self.filename = filename
        E.info("reading intervals from %s" % self.filename)

        # build a per-track interval index from the bed file
        self.index = Bed.readAndIndex(
            IOTools.openFile(self.filename, "r"), per_track=True)
        E.info("read intervals for %s tracks" % len(self.index))

        self.tracks = self.index.keys()

        # two output columns per track: overlap count and base count
        self.headers = []
        for name in self.tracks:
            self.headers.append("%s_nover" % name)
            self.headers.append("%s_bases" % name)
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    For every exon in a GTF stream on stdin, computes a distance between
    a tag profile from a BAM file and a second profile (BAM or bed), then
    a z-score of that distance against randomised profiles.  One
    tab-separated line is written per exon.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=["ave_dist", "min_dist", "corr"],
                      default="min_dist",
                      help="Method for calcuating similarity between profiles")
    parser.add_option("-s",
                      "--spread",
                      dest="spread",
                      type="int",
                      default=10,
                      help="Amount to spread each tag by")
    parser.add_option("-k",
                      "--keep-dist",
                      dest="keep_dist",
                      action="store_true",
                      help="Keep the distribution of tag depths")
    # BUG FIX: option was untyped, so a value passed on the command line
    # arrived as a string and broke the randomisation count downstream;
    # only the integer default worked.
    parser.add_option("-r",
                      "--rands",
                      dest="rands",
                      type="int",
                      default=100,
                      help="Number of randomisations to use for calculating"
                      " mean and stdev of distance")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # fail with a clear message instead of an unpacking error
    if len(args) != 2:
        raise ValueError("two arguments required: profile1 and profile2")

    profile1_file, profile2_file = args
    profile1_file = pysam.AlignmentFile(profile1_file)

    # the second profile may be a bed interval file or another BAM file
    if profile2_file.endswith(("bed", "bed.gz")):
        profile2_file = Bed.readAndIndex(profile2_file, with_values=True)
        profile2_counter = bed_counter
    else:
        profile2_file = pysam.AlignmentFile(profile2_file)
        profile2_counter = iCLIP.count_intervals

    # select the distance function according to --method
    if options.method == "min_dist":
        distance_func = iCLIP.findMinDistance
    elif options.method == "ave_dist":
        distance_func = iCLIP.calcAverageDistance
    else:

        def distance_func(profile1, profile2):
            # correlation of spread profiles; profile2 is spread once per
            # exon below, hence profile2_ready=True
            return iCLIP.corr_profile(profile1,
                                      profile2,
                                      options.spread,
                                      profile2_ready=True)

    for exon in GTF.iterator(options.stdin):
        if exon.feature != "exon":
            continue

        contig = exon.contig
        strand = exon.strand
        transcript_id = exon.transcript_id
        start = exon.start
        end = exon.end

        profile1 = iCLIP.count_intervals(profile1_file, [(start, end)],
                                         contig=contig,
                                         strand=strand)

        profile2 = profile2_counter(profile2_file, [(start, end)],
                                    contig=contig,
                                    strand=strand)

        # no signal in either profile: report NA rather than divide by
        # a zero standard deviation below
        if profile1.sum() == 0 or profile2.sum() == 0:
            z = "NA"
            distance = "NA"
            options.stdout.write(
                "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t%(strand)s\t%(distance)s\t%(z)s\n"
                % locals())
            continue

        if options.method == "corr":
            profile2 = iCLIP.spread(profile2, options.spread)

        distance = distance_func(profile1, profile2)

        # empirical null distribution: distances of randomised profiles
        rands = iCLIP.rand_apply(profile=profile1,
                                 exon=exon,
                                 n=options.rands,
                                 func=distance_func,
                                 keep_dist=options.keep_dist,
                                 profile2=profile2)

        z = (distance - rands.mean()) / rands.std()

        options.stdout.write(
            "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t%(strand)s\t%(distance).3f\t%(z).2f\n"
            % locals())
    # write footer and output benchmark information.
    E.Stop()
Exemple #8
0
 def buildIndex(self, filename):
     """Return a contig-keyed interval index built from the bed file *filename*."""
     handle = IOTools.openFile(filename, "r")
     return Bed.readAndIndex(handle)
Exemple #9
0
 def __init__(self, filename):
     """Index the bed file at *filename*, keeping one index per track."""
     bedfile = IOTools.openFile(filename, "r")
     self.mIndices = Bed.readAndIndex(bedfile, per_track=True)
Exemple #10
0
 def buildIndex(self, filename):
     """Build and return an interval index from the bed file *filename*."""
     return Bed.readAndIndex(
         IOTools.openFile(filename, "r"))
Exemple #11
0
 def __init__(self, filename):
     """Read *filename* (bed format) and store one interval index per track."""
     # indexed once up front so later lookups are cheap
     indices = Bed.readAndIndex(
         IOTools.openFile(filename, "r"), per_track=True)
     self.mIndices = indices
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    For every gene in a GTF stream on stdin, tallies per-base mismatch
    statistics from a BAM file while filtering out donor SNPs (VCF),
    likely indels (local window comparison) and known RNA-editing
    positions (REDIportal bed).  One tab-separated row per gene is
    written to stdout.

    NOTE(review): this function is Python 2 only -- it uses ``print``
    statements, ``dict.has_key`` and ``vcffile.next()``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-b",
        "--bamfile",
        dest="bam",
        type="string",
        help=
        "BAM formated alignment file to test. Should have MD and NH tags set")
    parser.add_option(
        "-t",
        "--quality-threshold",
        dest="threshold",
        type="int",
        default=30,
        help="minimum quality threshold for a mismatched base to count")
    parser.add_option("-f",
                      "--fasta-path",
                      dest="fastapath",
                      type="string",
                      help="path to indexed fasta file for genome of choice")
    parser.add_option("-p",
                      "--vcf-path",
                      dest="vcfpath",
                      type="string",
                      help="path to indexed vcf file for dataset  of choice")
    parser.add_option(
        "-d",
        "--sample",
        dest="samppattern",
        type="string",
        help=
        "pattern to match and extract the donor name from the bam file, for use in parsing the vcf file"
    )
    parser.add_option(
        "-n",
        "--REDI-path",
        dest="redipath",
        type="string",
        help=
        "path to Bed format REDIportal table containing RNA editing positions")

    (options, args) = E.Start(parser, argv=argv)

    # open all inputs: BAM alignments, genome fasta, donor VCF, and the
    # REDIportal editing-position index (keyed by contig)
    bamfile = pysam.AlignmentFile(options.bam)
    fastafile = IndexedFasta(options.fastapath)
    vcffile = vcf.Reader(open(options.vcfpath, "r"))
    BEDREDI = Bed.readAndIndex(IOTools.openFile(options.redipath),
                               with_values=True)
    options.stdout.write("\t".join([
        "gene_id", "strand", "mismatches", "bases", "low_qual", "a", "t", "c",
        "g", "a_to_t", "a_to_g", "a_to_c", "t_to_a", "t_to_g", "t_to_c",
        "g_to_a", "g_to_t", "g_to_c", "c_to_a", "c_to_t", "c_to_g",
        "indel_count", "RNA_editing_events"
    ]) + "\n")

    samplepattern = options.samppattern
    # NOTE(review): despite the names, these count g->a events on reverse
    # strand genes (see the transition loop below) -- confirm naming intent
    (not_reverse_g_to_t, reverse_g_to_t) = 0, 0
    # extract the donor name from the BAM filename via the user pattern;
    # the pattern must contain one capture group
    donorfrombam = re.search(r"%s" % (samplepattern), options.bam,
                             flags=0).group(1)

    # find the donor id among the VCF sample names (Python 2 iterator API)
    vcf_record = vcffile.next()
    samples = vcf_record.samples
    donors = [dnr.sample for dnr in samples]
    donorid = None
    for samp in donors:
        if donorfrombam in samp:
            donorid = samp

    if donorid is None:
        raise ValueError("Donor %s not found in VCF" % donorfrombam)

    reversecomplement = {"a": "t", "t": "a", "c": "g", "g": "c"}

    # main loop: one pass over the BAM reads per gene
    for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

        exontuple = GTF.asRanges(gene, "exon")
        start = min(e.start for e in gene)
        end = max(e.end for e in gene)
        strand = gene[0].strand

        # reference sequence for the whole gene span, always + strand
        seq = fastafile.getSequence(gene[0].contig, "+", start, end)
        thischr = gene[0].contig.replace("chr", "")
        reads = bamfile.fetch(gene[0].contig, start, end)

        # reconcile "chr" prefix conventions between the GTF and the VCF
        if all("chr" in c for c in vcffile.contigs.keys()) == False:
            contig = (gene[0].contig).replace("chr", "")
            if contig == "M":
                contig = contig + "T"
        else:
            contig = gene[0].contig

        vcfregion = vcffile.fetch(contig, start, end)

        regionchecker = list(vcfregion)

        BEDREDIregion = BEDREDI[gene[0].contig].find(start, end + 1)

        # editing positions on the same strand as the gene, keyed by position
        editpositions = {
            edit_pos: edit_pos_field
            for edit_pos, edit_pos_plus, edit_pos_field in BEDREDIregion
            if edit_pos_field.fields[2] == strand
        }

        # per-gene accumulators
        gene_id = gene[0].gene_id
        mm_count = 0
        base_count = 0
        skipped = 0
        indel_count = 0
        RNA_edits = 0
        matched_bases = defaultdict(int)
        transition = {
            "a_to_t": 0,
            "a_to_g": 0,
            "a_to_c": 0,
            "t_to_a": 0,
            "t_to_g": 0,
            "t_to_c": 0,
            "g_to_a": 0,
            "g_to_t": 0,
            "g_to_c": 0,
            "c_to_a": 0,
            "c_to_t": 0,
            "c_to_g": 0
        }

        # donor SNP positions (0-based) within the gene span
        snp_dict = {}
        for snp in regionchecker:
            if snp.genotype(donorid)["GT"] != "0/0":
                snp_dict[snp.POS - 1] = snp.ALT

        for read in reads:

            # keep only uniquely-mapped, properly paired, non-duplicate reads
            if read.is_unmapped:
                continue
            if read.is_duplicate:
                continue
            if read.mate_is_unmapped:
                continue
            if read.get_tag("NH") > 1:
                continue
            qualities = read.query_qualities

            alignmentcigar = read.cigarstring

            indel_count += (alignmentcigar.count("I") +
                            alignmentcigar.count("D"))

            # (read_pos, ref_pos, ref_base) triples; ref_base is lower-case
            # at mismatches, which is relied on throughout below
            alignment = read.get_aligned_pairs(with_seq=True)

            # keep an unfiltered copy of the aligned pairs for the
            # debugging output further down
            testalignment = alignment[:]

            def _is_exon_range(base):
                # NOTE(review): the return statements sit INSIDE the loop,
                # so only the first exon range is ever tested -- this looks
                # like a bug (should return after checking all ranges);
                # confirm against the original intent before relying on it.
                result_ranges = []
                for exonrange in exontuple:
                    result_ranges.append(
                        exonrange[0] <= base[1] < exonrange[1])
                    if True in result_ranges:
                        return True
                    else:
                        return False

            # keep only bases aligned to both read and reference, within exons
            alignment = [
                base for base in alignment if not base[0] is None
                and not base[1] is None and _is_exon_range(base)
            ]

            #            base_count += sum(1 for base in alignment
            #                          if start <= base[1] < end and
            #                          base[2].lower() != "n")

            # bases within the gene span with a called reference base
            total_alignment = [
                base for base in alignment
                if start <= base[1] < end and base[2].lower() != "n"
            ]

            base_count += len(total_alignment)

            # sanity check: the aligned reference base must agree with the
            # fasta sequence; dump extensive debugging info otherwise
            for base in total_alignment:
                if seq[(base[1]) - start].lower() != base[2].lower():
                    if (testalignment[0][1] is None) or (testalignment[-1][1]
                                                         is None):
                        E.debug("first or last base of read is None")
                        raise ValueError
                    else:
                        E.debug(
                            "identity of error causing base from read sequence: %s"
                            % (read.query_alignment_sequence)[base[0]].lower())
                        E.debug("read sequence: %s" %
                                (read.query_alignment_sequence))
                        E.debug(
                            "identity start and end of read as calculated from start and end as described in gtffile and extracted from fasta: %s"
                            % (seq[(testalignment[0][1] -
                                    start):(testalignment[-1][1] - start)]))
                        E.debug(
                            "section of the read 10 bp downstream and upstream of the sequence containing the error extracted from the fasta: %s"
                            % (seq[((base[1] - 10) - start):(
                                (base[1] + 10) - start)].lower()))
                        E.debug("filename?: %s" % (read.tostring(bamfile)))
                        E.debug(
                            "positions of start and end of the gene based on the gtf: %s,%s"
                            % (start, end))
                        E.debug(
                            "identity of start of gene extratced from gtf: %s"
                            % (seq[(base[1]) - start]))
                        E.debug(
                            "identity of error causing base from reference genome: %s"
                            % base[2])
                        E.debug("position of base in read: %s" % base[0])
                        E.debug("position of base in genome: %s" % base[1])
                        E.debug(
                            "position of base in read as calculated from position of base in genome and and start from gtf: %s"
                            % (base[1] - start))
                        E.debug(
                            "identity of error causing base (reference), calculated from fasta and testalignment info: %s"
                            % (seq[(testalignment[0][1] -
                                    start):(testalignment[-1][1] -
                                            start)].upper()[base[0]]))
                        E.debug(
                            "position of base in read from first alignment genome base minus start plus position of base in in read, should equal position of base in read: %s"
                            % ((testalignment[0][1] - start) + base[0]))
                        E.debug(
                            "identity of error causing base (reference), calculated from fasta and position of base in genome from aligned pairs: %s"
                            % (seq[(base[1]) - start]))
                        E.debug(
                            "position of start base in genome from the alignment minus position of start base in genome from the gtf, should be zero: %s"
                            % (alignment[0][1] - start))
                        E.debug("complete aligned pairs, unfiltered: %s" %
                                (testalignment))
                        E.debug("full fasta sequence of read: %s" %
                                (textwrap.fill(seq, 50)))
                        raise ValueError

                else:
                    matched_bases[base[2].lower()] += 1
            # NM == 0: read has no mismatches at all, nothing more to count
            if read.get_tag("NM") == 0:
                continue

            # mismatches

            readseq = read.query_sequence

            def _is_snp(base):
                # returns False when the mismatch is explained by a donor
                # SNP (read base equals the ALT allele); True otherwise.
                # NOTE(review): ``got_snp_pos`` and ``wrong_base`` (and the
                # edit counters below) are declared global but never
                # initialised in this function -- presumably module-level
                # counters elsewhere in the file; verify they exist or the
                # first increment raises NameError.
                global got_snp_pos
                global wrong_base
                if snp_dict.has_key(base[1]):
                    read_base = readseq[base[0]].lower()
                    alt_base = snp_dict[base[1]][0].sequence.lower()
                    got_snp_pos += 1
                    if read_base != alt_base:
                        wrong_base += 1
                        return True
                    else:
                        return False
                else:
                    return True

            def _is_indel(base):
                # heuristic: compare a 5 bp window of read vs reference
                # around the mismatch; 4+ disagreements in the window are
                # taken as evidence of a misaligned indel (returns False).
                # The window is shifted near read/reference boundaries.
                if (len(readseq) >= (base[0] + 5)):
                    if (len(seq) < (((base[1]) - start) + 5)):
                        upperrange = len(seq) - (base[1] - start)
                        lowerrange = 5 - upperrange
                        readindelwindow = readseq[(base[0] -
                                                   lowerrange):(base[0] +
                                                                upperrange)]
                        seqindelwindow = seq[(
                            ((base[1]) - start) -
                            lowerrange):(((base[1]) - start) + upperrange)]
                        matchwindows = []
                        for i in range(len(readindelwindow)):
                            try:
                                matchwindows.append(
                                    (readindelwindow[i].lower() ==
                                     seqindelwindow[i].lower()))
                            except IndexError:
                                # diagnostic dump before re-raising
                                print i
                                print readindelwindow
                                print seqindelwindow
                                print start
                                print lowerrange
                                print upperrange
                                print base[0]
                                print(base[0] - lowerrange)
                                print(base[0] + upperrange)
                                print base[1]
                                print(base[1] - start)
                                print((((base[1]) - start) - lowerrange) - 1)
                                print((((base[1]) - start) + upperrange) - 1)
                                print readseq
                                print seq
                                print gene_id
                                print gene[0].contig
                                raise
                    elif (len(seq) >= (((base[1]) - start) + 5)):
                        readindelwindow = readseq[base[0]:(base[0] + 5)]
                        seqindelwindow = seq[((base[1]) -
                                              start):(((base[1]) - start) + 5)]
                        matchwindows = []
                        for i in range(len(readindelwindow)):
                            try:
                                matchwindows.append(readindelwindow[i].lower(
                                ) == seqindelwindow[i].lower())
                            except IndexError:
                                # diagnostic dump before re-raising
                                print i
                                print readindelwindow
                                print seqindelwindow
                                print start
                                print base[0]
                                print base[1]
                                print(base[1] - start) - 1
                                print((base[1] - start) + 5) - 1
                                print readseq
                                print seq
                                print gene_id
                                print gene[0].contig
                                raise
                    if matchwindows.count(False) >= 4:
                        return False
                    else:
                        return True
                elif (len(readseq) < (base[0] + 5)):
                    if len(seq) < (((base[1]) - start) + 5):
                        # both read and reference end within the window:
                        # use the tighter of the two boundaries
                        readsequpperrange = len(readseq) - base[0]
                        readseqlowerrange = 5 - readsequpperrange
                        sequpperrange = len(seq) - (base[1] - start)
                        seqlowerrange = 5 - sequpperrange
                        if readsequpperrange < sequpperrange:
                            upperrange = readsequpperrange
                            lowerrange = readseqlowerrange
                        elif sequpperrange < readsequpperrange:
                            upperrange = sequpperrange
                            lowerrange = seqlowerrange
                        elif sequpperrange == readsequpperrange:
                            upperrange = sequpperrange
                            lowerrange = seqlowerrange
                    elif ((base[1] - start) - 4) < 0:
                        # too close to the gene start to form a window
                        return True
                    else:
                        upperrange = len(readseq) - base[0]
                        lowerrange = 5 - upperrange
                    readindelwindow = readseq[(base[0] -
                                               lowerrange):(base[0] +
                                                            upperrange)]
                    seqindelwindow = seq[(((base[1]) - start) -
                                          lowerrange):(((base[1]) - start) +
                                                       upperrange)]
                    matchwindows = []
                    for i in range(len(readindelwindow)):
                        try:
                            matchwindows.append((readindelwindow[i].lower() ==
                                                 seqindelwindow[i].lower()))
                        except IndexError:
                            # diagnostic dump before re-raising
                            print i
                            print readindelwindow
                            print seqindelwindow
                            print start
                            print lowerrange
                            print upperrange
                            print base[0]
                            print(base[0] - lowerrange)
                            print(base[0] + upperrange)
                            print base[1]
                            print(base[1] - start)
                            print((((base[1]) - start) - lowerrange))
                            print((((base[1]) - start) + upperrange))
                            print readseq
                            print seq
                            print gene_id
                            print gene[0].contig

                            raise
                    if matchwindows.count(False) >= 4:
                        return False
                    else:
                        return True

            def _is_RNA_edit(base, editpositions):
                # returns False when the mismatch matches a known RNA
                # editing event (reference and read base agree with the
                # REDIportal entry at this position); True otherwise
                global got_edit_pos
                global wrong_edit_base
                genomebase = base[2]
                readbase = readseq[base[0]].lower()

                if not base[1] in editpositions.keys() or \
                   genomebase == "n" or \
                   readbase == "n" or \
                   not genomebase.islower():
                    return True
                else:
                    got_edit_pos += 1
                    if genomebase == editpositions[base[1]].fields[0].lower() and \
                       readbase == editpositions[base[1]].fields[1].lower():
                        return False
                    else:
                        wrong_edit_base += 1
                        return True

            # count RNA editing events separately from mismatches
            for base in total_alignment:
                if _is_RNA_edit(base, editpositions) == False:
                    RNA_edits += 1

            # high-quality mismatches: lower-case reference base marks a
            # mismatch; must pass quality, SNP, indel and editing filters
            mismatches = [
                base for base in total_alignment if base[2].islower()
                and qualities[base[0]] >= options.threshold and _is_snp(base)
                and _is_indel(base) and _is_RNA_edit(base, editpositions)
                and readseq[base[0]].lower() != "n"
            ]

            # all non-SNP mismatches regardless of quality, for the
            # "low_qual" (skipped) column
            total_mm = sum(1 for base in total_alignment
                           if base[2].islower() and _is_snp(base)
                           and readseq[base[0]].lower() != "n")

            hq_mm = len(mismatches)

            # tally the transition type of each mismatch; on - strand
            # genes both bases are reverse-complemented first
            for base in mismatches:
                genomebase = base[2].lower()
                readbase = readseq[base[0]].lower()
                try:
                    if strand == "-":
                        revgenomebase = reversecomplement[genomebase]
                        revreadbase = reversecomplement[readbase]
                        if revgenomebase == "g" and revreadbase == "a":
                            if read.is_reverse:
                                reverse_g_to_t += 1
                            else:
                                not_reverse_g_to_t += 1
                        transition["%s_to_%s" %
                                   (revgenomebase, revreadbase)] += 1
                    else:
                        transition["%s_to_%s" % (genomebase, readbase)] += 1
                except KeyError:
                    # diagnostic dump before re-raising
                    print transition
                    print read.query_alignment_sequence.upper()
                    print seq[(alignment[0][1] - start):(alignment[-1][1] -
                                                         start)].upper()
                    print read.tostring(bamfile)
                    raise

            mm_count += hq_mm
            skipped += total_mm - hq_mm

        # one output row per gene
        outline = "\t".join(
            map(str, [
                gene_id, strand, mm_count, base_count, skipped,
                matched_bases['a'], matched_bases['t'], matched_bases['c'],
                matched_bases['g'], transition['a_to_t'], transition['a_to_g'],
                transition['a_to_c'], transition['t_to_a'],
                transition['t_to_g'], transition['t_to_c'],
                transition['g_to_a'], transition['g_to_t'],
                transition['g_to_c'], transition['c_to_a'],
                transition['c_to_t'], transition['c_to_g'], indel_count,
                RNA_edits
            ]))
        options.stdout.write(outline + "\n")

    # write footer and output benchmark information.
    # NOTE(review): the last message says "g_to_c" but the counters were
    # incremented for g->a transitions and are named *_g_to_t -- the three
    # disagree; confirm which was intended.
    E.info("Out of %i mismatches at snp positions %i were the wrong base" %
           (got_snp_pos, wrong_base))
    E.info(
        "Out of %i mismatches at RNA edit positions %i were the wrong base" %
        (got_edit_pos, wrong_edit_base))
    E.info(
        "Out of %i g_to_c transitions on - strand genes, the read was on the + strand %i times"
        % (not_reverse_g_to_t, reverse_g_to_t))
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compares a read profile (BAM) against a second profile (BAM or bed)
    exon-by-exon over the GTF on stdin, writing one tab-separated line
    per exon: contig, start, end, transcript_id, strand, distance, z.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=["ave_dist", "min_dist", "corr"],
                      default="min_dist",
                      help="Method for calcuating similarity between profiles")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=10,
                      help="Amount to spread each tag by")
    parser.add_option("-k", "--keep-dist", dest="keep_dist",
                      action="store_true",
                      help="Keep the distribution of tag depths")
    # type="int" is required: without it a value given on the command
    # line arrives as a string and breaks rand_apply(n=options.rands).
    parser.add_option("-r", "--rands", dest="rands", type="int",
                      default=100,
                      help="Number of randomisations to use for calculating"
                           " mean and stdev of distance")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    profile1_file, profile2_file = args
    profile1_file = pysam.AlignmentFile(profile1_file)

    # the second profile may be a bed file (indexed intervals) or a BAM
    if profile2_file.endswith("bed") or profile2_file.endswith("bed.gz"):
        profile2_file = Bed.readAndIndex(profile2_file, with_values=True)
        profile2_counter = bed_counter
    else:
        profile2_file = pysam.AlignmentFile(profile2_file)
        profile2_counter = iCLIP.count_intervals

    if options.method == "min_dist":
        distance_func = iCLIP.findMinDistance
    elif options.method == "ave_dist":
        distance_func = iCLIP.calcAverageDistance
    else:
        # "corr": profile2 is spread once, below, before the comparison,
        # so tell corr_profile not to spread it again.
        def distance_func(profile1, profile2):
            return iCLIP.corr_profile(profile1, profile2, options.spread,
                                      profile2_ready=True)

    for exon in GTF.iterator(options.stdin):
        if exon.feature != "exon":
            continue

        contig = exon.contig
        strand = exon.strand
        transcript_id = exon.transcript_id
        start = exon.start
        end = exon.end

        profile1 = iCLIP.count_intervals(profile1_file,
                                         [(start, end)],
                                         contig=contig,
                                         strand=strand)

        profile2 = profile2_counter(profile2_file,
                                    [(start, end)],
                                    contig=contig,
                                    strand=strand)

        # exons with no signal in either profile cannot be scored
        if profile1.sum() == 0 or profile2.sum() == 0:
            z = "NA"
            distance = "NA"
            options.stdout.write(
                "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t%(strand)s\t%(distance)s\t%(z)s\n" % locals())
            continue

        if options.method == "corr":
            profile2 = iCLIP.spread(profile2, options.spread)

        distance = distance_func(profile1, profile2)

        # empirical null: distances from randomised versions of profile1
        rands = iCLIP.rand_apply(profile=profile1,
                                 exon=exon,
                                 n=options.rands,
                                 func=distance_func,
                                 keep_dist=options.keep_dist,
                                 profile2=profile2)

        z = (distance - rands.mean()) / rands.std()

        options.stdout.write(
            "%(contig)s\t%(start)i\t%(end)i\t%(transcript_id)s\t%(strand)s\t%(distance).3f\t%(z).2f\n" % locals())

    # write footer and output benchmark information.
    E.Stop()
Exemple #14
0
def countInteractions(reads, digest, probe_fragments, outfile, metrics_file):
    '''Count interactions between restriction fragments for read bundles
    that overlap at least one probe fragment.

    `reads` is a BAM file; `digest` and `probe_fragments` are bed files
    indexed by contig with the fragment name as value.  Fragment-pair
    counts are written to `outfile` (tab-separated, symmetric: each
    cross-fragment pair is counted in both orders), reads whose primary
    alignments hit no probe fragment go to `outfile + ".rejects.bam"`,
    and run counters go to `metrics_file`.

    NOTE(review): assumes readBundle yields all alignments sharing one
    read name as a list -- confirm against its definition.  Python 2
    only (uses dict.iteritems()).
    '''

    reads = pysam.AlignmentFile(reads)
    digest_intervals = Bed.readAndIndex(IOTools.openFile(digest),
                                        with_values="name")
    probe_fragments = Bed.readAndIndex(IOTools.openFile(probe_fragments),
                                       with_values="name")
    # c: running metrics; results: (frag1, frag2) -> interaction count
    c = collections.Counter()
    results = collections.defaultdict(int)
    contigs = digest_intervals.keys()
    rejects = pysam.AlignmentFile(outfile + ".rejects.bam",
                                  "wb",
                                  template=reads)

    E.debug("Starting Counting Run...")
    for fragment in readBundle(reads):

        # progress report every 1000 bundles (checked before increment,
        # so it also fires on the very first bundle)
        if c["fragments"] % 1000 == 0:
            E.debug("\t".join(["%s:\t%s" % x for x in c.iteritems()]))

        c["fragments"] += 1
        c["reads"] += len(fragment)

        def _get_read_position(aln):
            # offset of the first aligned (CIGAR op 0 == M) base from the
            # read's 5' end; cigar is reversed for reverse-strand
            # alignments so the offset is always in read coordinates

            if aln.is_reverse:
                c = aln.cigartuples[::-1]
            else:
                c = aln.cigartuples

            pos = 0
            for op, length in c:
                if op == 0:
                    return pos
                else:
                    pos += length

            return pos

        def _get_first(alignments):
            '''find alignment that maps earliest part of read'''
            slist = sorted(alignments, key=lambda x: _get_read_position(x))
            return (slist[0], slist[1:])

        def _get_probes(read):
            # probe fragments overlapping this alignment; empty list if
            # the alignment's contig is not in the probe index
            try:
                m = list(probe_fragments[reads.getrname(
                    read.reference_id)].find(read.pos, read.aend))
            except KeyError:
                return []
            return m

        first_read = [aln for aln in fragment if aln.is_read1]
        second_read = [aln for aln in fragment if aln.is_read2]

        if len(first_read) == 0 or len(second_read) == 0:
            c["Unpaired"] += 1
            continue

        assert len(first_read) + len(second_read) == len(fragment)

        # primary_aln: the alignment of each mate that covers the
        # earliest part of its read; other_aln: all remaining alignments
        primary_aln, other_aln = zip(_get_first(first_read),
                                     _get_first(second_read))
        other_aln = sum(other_aln, [])

        probe_matches = [_get_probes(read) for read in primary_aln]

        # bundles whose primary alignments hit no probe are rejected
        if len(sum(probe_matches, [])) == 0:
            c["no_probe"] += 1
            for read in fragment:
                rejects.write(read)
            continue

        # digest fragments overlapped by the secondary alignments
        other_matches = set(
            sum([
                list(digest_intervals[reads.getrname(read.reference_id)].find(
                    read.pos, read.aend)) for read in other_aln
                if reads.getrname(read.reference_id) in contigs
            ], []))

        # digest fragments overlapped by the primary alignments
        primary_matches = set(
            sum([
                list(digest_intervals[reads.getrname(read.reference_id)].find(
                    read.pos, read.aend)) for read in primary_aln
                if reads.getrname(read.reference_id) in contigs
            ], []))

        if len(primary_matches) > 2:
            c["multi-hit"] += 1
            continue

        # secondary alignments must fall within the primary fragments
        if not all([match in primary_matches for match in other_matches]):
            c["multi-hit"] += 1
            continue

        # index [2] is the fragment name stored via with_values="name"
        primary_matches = list(primary_matches)
        if len(primary_matches) == 2:
            results[(primary_matches[0][2], primary_matches[1][2])] += 1
            results[(primary_matches[1][2], primary_matches[0][2])] += 1
        elif len(primary_matches) == 1:
            results[(primary_matches[0][2], primary_matches[0][2])] += 1
        else:
            raise ValueError("Matches not found")

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("Frag1\tFrag2\tCount\n")
        for pair in results:
            outf.write("%s\t%s\t%s\n" % (pair + (results[pair], )))

    with IOTools.openFile(metrics_file, "w") as outf:

        outf.write("\t".join(c.keys()) + "\n")
        outf.write("\t".join(map(str, c.values())) + "\n")

    rejects.close()
Exemple #15
0
def countInteractions(reads, digest, probe_fragments, outfile, metrics_file):
    '''Count interactions between restriction-digest fragments.

    Read bundles from the BAM `reads` whose primary alignments overlap a
    probe fragment contribute one count per fragment pair (symmetric),
    written to `outfile`; rejected bundles go to a companion rejects BAM
    and run counters to `metrics_file`.
    '''

    bam = pysam.AlignmentFile(reads)
    digest_idx = Bed.readAndIndex(
        IOTools.openFile(digest), with_values="name")
    probe_idx = Bed.readAndIndex(
        IOTools.openFile(probe_fragments), with_values="name")
    stats = collections.Counter()
    pair_counts = collections.defaultdict(int)
    known_contigs = digest_idx.keys()
    rejects = pysam.AlignmentFile(outfile + ".rejects.bam", "wb",
                                  template=bam)

    def _aligned_offset(aln):
        # offset of the first aligned (CIGAR op 0) base from the read's
        # 5' end; reverse-strand CIGARs are flipped into read order
        cigar = aln.cigartuples[::-1] if aln.is_reverse else aln.cigartuples
        offset = 0
        for op, length in cigar:
            if op == 0:
                return offset
            offset += length
        return offset

    def _split_first(alignments):
        '''find alignment that maps earliest part of read'''
        ordered = sorted(alignments, key=_aligned_offset)
        return ordered[0], ordered[1:]

    def _probe_hits(aln):
        # probe fragments overlapping this alignment, [] for unknown contigs
        try:
            hits = list(probe_idx[bam.getrname(aln.reference_id)].find(
                aln.pos, aln.aend))
        except KeyError:
            hits = []
        return hits

    def _digest_hits(alignments):
        # set of digest fragments overlapped by any of the alignments
        found = set()
        for aln in alignments:
            contig = bam.getrname(aln.reference_id)
            if contig in known_contigs:
                found.update(digest_idx[contig].find(aln.pos, aln.aend))
        return found

    E.debug("Starting Counting Run...")
    for bundle in readBundle(bam):

        # periodic progress report (checked before the increment, so it
        # also fires on the very first bundle)
        if stats["fragments"] % 1000 == 0:
            E.debug("\t".join("%s:\t%s" % item for item in stats.iteritems()))

        stats["fragments"] += 1
        stats["reads"] += len(bundle)

        mate1 = [aln for aln in bundle if aln.is_read1]
        mate2 = [aln for aln in bundle if aln.is_read2]

        if not mate1 or not mate2:
            stats["Unpaired"] += 1
            continue

        assert len(mate1) + len(mate2) == len(bundle)

        # for each mate, the alignment covering the earliest read bases
        # is "primary"; everything else is "secondary"
        first1, rest1 = _split_first(mate1)
        first2, rest2 = _split_first(mate2)
        primary = (first1, first2)
        secondary = rest1 + rest2

        if not any(_probe_hits(aln) for aln in primary):
            stats["no_probe"] += 1
            for aln in bundle:
                rejects.write(aln)
            continue

        secondary_frags = _digest_hits(secondary)
        primary_frags = _digest_hits(primary)

        if len(primary_frags) > 2:
            stats["multi-hit"] += 1
            continue

        # every secondary hit must lie within the primary fragments
        if not secondary_frags.issubset(primary_frags):
            stats["multi-hit"] += 1
            continue

        # element [2] is the fragment name (with_values="name")
        primary_frags = list(primary_frags)
        if len(primary_frags) == 2:
            name_a = primary_frags[0][2]
            name_b = primary_frags[1][2]
            pair_counts[(name_a, name_b)] += 1
            pair_counts[(name_b, name_a)] += 1
        elif len(primary_frags) == 1:
            name_a = primary_frags[0][2]
            pair_counts[(name_a, name_a)] += 1
        else:
            raise ValueError("Matches not found")

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("Frag1\tFrag2\tCount\n")
        for pair in pair_counts:
            outf.write("%s\t%s\t%s\n" % (pair + (pair_counts[pair],)))

    with IOTools.openFile(metrics_file, "w") as outf:
        outf.write("\t".join(stats.keys()) + "\n")
        outf.write("\t".join(map(str, stats.values())) + "\n")

    rejects.close()