Example #1
    def testIteratorUncompressed(self):
        '''test iteration from uncompressed file.'''
        tmpfilename = 'tmp_testIteratorUncompressed'
        infile = gzip.open(self.filename, "rb")
        outfile = open(tmpfilename, "wb")
        outfile.write(infile.read())
        outfile.close()
        infile.close()

        with open(tmpfilename) as infile:
            for x, r in enumerate(pysam.tabix_iterator(infile,
                                                       pysam.asTuple())):
                self.assertEqual(self.compare[x], list(r))
                self.assertEqual(len(self.compare[x]), len(r))

                # test indexing
                for c in range(0, len(r)):
                    self.assertEqual(self.compare[x][c], r[c])

                # test slicing access
                for c in range(0, len(r) - 1):
                    for cc in range(c + 1, len(r)):
                        self.assertEqual(self.compare[x][c:cc], r[c:cc])

        os.unlink(tmpfilename)
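All of these examples share one core pattern: hand pysam.tabix_iterator an open file object plus a row parser (pysam.asTuple, asBed, asGTF, asGFF3 or asVCF). A minimal self-contained sketch of that pattern, reusing the windows_small.bed file name from the later tests:

import pysam

with open("windows_small.bed") as f:
    for row in pysam.tabix_iterator(f, parser=pysam.asTuple()):
        # each row behaves like a tuple of the tab-separated fields
        print(row[0], row[1], len(row))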
Example #2
def filter_bam(args, bcd):
    with open(args.output, 'w') as o:
        with gzip.open(args.fragments) as f:
            tbx = pysam.tabix_iterator(f, pysam.asBed())
            for line in tbx:
                if line.name in bcd:
                    o.write("{}\n".format(str(line)))
    return 0
Example #3
def filter_bam(args, bcd):
    reads = {}
    with gzip.open(args.fragments) as f:
        tbx = pysam.tabix_iterator(f, pysam.asBed())
        for line in tbx:
            if line.name in bcd:
                # group fragments by sample label plus the replicate number
                # embedded in the fragment name
                key = bcd[line.name] + "_rep" + line.name.split("_")[-2]
                reads.setdefault(key, []).append(str(line))
    return reads
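Both variants expect bcd to map fragment names (line.name) to sample labels. A hypothetical driver showing the wiring (the file names and the barcode map are made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--fragments", default="fragments.bed.gz")  # hypothetical
parser.add_argument("--output", default="filtered.bed")
args = parser.parse_args()

bcd = {"AAACGG_1_frag1": "sampleA"}  # made-up fragment-name -> sample map
reads = filter_bam(args, bcd)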
Example #4
    def testIteratorCompressed(self):
        """test iteration from compressed file."""
        with gzip.open(self.filename) as infile:
            for x, r in enumerate(pysam.tabix_iterator(infile, pysam.asTuple())):
                self.assertEqual(self.compare[x], list(r))
                self.assertEqual(len(self.compare[x]), len(r))

                # test indexing
                for c in range(0, len(r)):
                    self.assertEqual(self.compare[x][c], r[c])

                # test slicing access
                for c in range(0, len(r) - 1):
                    for cc in range(c + 1, len(r)):
                        self.assertEqual(self.compare[x][c:cc], r[c:cc])
Example #5
def GetSumOfDifferencesFromTheReference(vcfpath):
    from subprocess import check_call
    from utilBMF.HTSUtils import TrimExt
    import pysam
    import numpy as np
    from sys import stderr
    from itertools import chain

    bgvcfpath = TrimExt(vcfpath) + ".gz"
    check_call("bgzip -c %s > %s" % (vcfpath, bgvcfpath), shell=True)
    stderr.write("bgvcf now at %s\n" % bgvcfpath)
    stderr.write("Now calling 'tabix %s'\n" % bgvcfpath)
    check_call("tabix %s" % bgvcfpath, shell=True)
    with open(bgvcfpath, "rb") as infh:
        tabixhandle = pysam.tabix_iterator(infh, pysam.asVCF())
        # for every non-INDEL record, parse the INFO column into a dict and
        # pull fields 3 and 4 of the I16 tag (alt depth on each strand)
        alt_depths = chain.from_iterable(
            dict(tup.split("=") for tup in rec.info.split(";"))["I16"]
            .split(",")[2:4]
            for rec in tabixhandle if "INDEL" not in rec.info)
        return np.sum(np.array(list(alt_depths), dtype=np.int64))
Example #6
def main():
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('--frac', type=float, default=0.0)
    parser.add_argument('gtf_file')
    args = parser.parse_args()

    all_t_ids = set()
    t_ids = set()
    for f in pysam.tabix_iterator(open(args.gtf_file), pysam.asGTF()):
        if f.feature == 'transcript':
            t_id = f.transcript_id
            frac = float(f.frac)
            keep = (frac >= args.frac)
            all_t_ids.add(t_id)
            if keep:
                t_ids.add(t_id)
                print(str(f))
        elif f.feature == 'exon':
            t_id = f.transcript_id
            assert t_id in all_t_ids
            if t_id in t_ids:
                print(str(f))
Example #7
def iterator(infile):
    """return a simple iterator over all entries in a file."""
    return pysam.tabix_iterator(infile, pysam.asGTF())
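A hypothetical use of iterator(), tallying feature types in a GTF file (the file name is made up):

import collections

counts = collections.Counter()
with open("annotation.gtf") as infile:
    for gff in iterator(infile):
        counts[gff.feature] += 1
print(counts.most_common())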
Example #8
def main(argv=sys.argv):

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-s",
                        "--session",
                        dest="session",
                        type=str,
                        help="load session before creating plots ")

    parser.add_argument("-d",
                        "--snapshot-dir",
                        dest="snapshotdir",
                        type=str,
                        help="directory to save snapshots in ")

    parser.add_argument("-f",
                        "--format",
                        dest="format",
                        type=str,
                        choices=("png", "eps", "svg"),
                        help="output file format ")

    parser.add_argument("-o",
                        "--host",
                        dest="host",
                        type=str,
                        help="host that IGV is running on ")

    parser.add_argument("-p",
                        "--port",
                        dest="port",
                        type=int,
                        help="port that IGV listens at ")

    parser.add_argument("-e",
                        "--extend",
                        dest="extend",
                        type=int,
                        help="extend each interval by a number of bases ")

    parser.add_argument("-x",
                        "--expand",
                        dest="expand",
                        type=float,
                        help="expand each region by a certain factor ")

    parser.add_argument("--session-only",
                        dest="session_only",
                        action="store_true",
                        help="plot session after opening, "
                        "ignore intervals ")

    parser.add_argument("-n",
                        "--name",
                        dest="name",
                        type=str,
                        choices=("bed-name", "increment"),
                        help="name to use for snapshot ")

    parser.add_argument("--new-instance",
                        dest="new_instance",
                        action="store_true",
                        help="start a new IGV instance before plotting ")

    parser.add_argument("--keep-open",
                        dest="keep_open",
                        action="store_true",
                        help="keep a newly started IGV instance open "
                        "after the script finishes ")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        new_instance=False,
        keep_open=False,
        name="bed-name",
    )

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if args.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=args.command, port=args.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (args.host, args.port))
    E.info("saving images in %s" % args.snapshotdir)
    igv = IGV(host=args.host,
              port=args.port,
              snapshot_dir=os.path.abspath(args.snapshotdir))

    if args.session:
        E.info('loading session from %s' % args.session)
        igv.load(args.session)
        E.info('loaded session')

    if args.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(args.session), args.format)
        E.info("writing snapshot to '%s'" % os.path.join(args.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(args.stdin, parser=pysam.asBed()):

            c.input += 1

            # IGV cannot deal with white-space in filenames
            if args.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif args.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = args.extend
            if args.expand:
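                # growing an interval of size d by a factor `expand`
                # adds (expand * d - d) / 2 bases on each side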
                d = end - start
                extend = max(extend, (args.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, args.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not args.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
Example #9
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--reference-bed-file",
                      dest="reference_bed_file",
                      type="string",
                      help="reference bed file "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("lvc-comparison", ),
                      help="methods to apply [%default]")

    parser.set_defaults(method="lvc-comparison",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        size_bins=(1000, 10000, 100000),
                        output_sets=True,
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))
    with IOTools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            mm = reference_set[record.contig]
            mm.add(record.start, record.end)
    E.info("read reference intervals on {} contigs: {}".format(
        len(list(reference_set.keys())), ",".join(list(reference_set.keys()))))

    if options.output_sets:
        output_tp = E.open_output_file("tp")
        output_fp = E.open_output_file("fp")
        output_fn = E.open_output_file("fn")
    else:
        output_tp = None
        output_fp = None
        output_fn = None

    if options.method == "lvc-comparison":
        c = E.Counter()

        found = set()
        counts = {}
        names = set()
        nsize_bins = len(options.size_bins)
        for bin in range(nsize_bins + 1):
            counts[bin] = dict([(x, collections.defaultdict(int))
                                for x in ("tp", "fn", "fp", "test", "truth")])

        for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
            if record.contig not in reference_set:
                c.ignored_no_contig += 1
                continue

            c.test += 1
            matches = reference_set[record.contig].search(
                record.start, record.end)
            size = record.end - record.start
            bin = get_size_bin(size, options.size_bins)

            if len(matches) == 0:
                c.fp += 1
                status = "fp"
                if output_fp:
                    output_fp.write(str(record) + "\n")
            elif len(matches) >= 1:
                c.tp += 1
                status = "tp"
                if output_tp:
                    output_tp.write(str(record) + "\n")
                # todo: overlap criteria

                # record found
                for match in matches:
                    found.add((record.contig, match.start, match.end))

            name = record.name.split(",")[0]
            names.add(name)
            counts[bin]["test"][name] += 1
            counts[bin][status][name] += 1

        outf = options.stdout

        with IOTools.open_file(options.reference_bed_file) as inf:
            for record in pysam.tabix_iterator(inf, pysam.asBed()):
                c.truth += 1
                bin = get_size_bin(record.end - record.start,
                                   options.size_bins)
                counts[bin]["truth"]["all"] += 1

                key = (record.contig, record.start, record.end)
                if key not in found:
                    c.fn += 1
                    counts[bin]["fn"]["all"] += 1
                    if output_fn:
                        output_fn.write(str(record) + "\n")

        outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth",
                              "fn")) + "\n")

        for name in sorted(names):
            for bin in range(len(options.size_bins) + 1):
                if bin == len(options.size_bins):
                    size_bin = ">={}".format(options.size_bins[-1])
                else:
                    size_bin = "<{}".format(options.size_bins[bin])
                outf.write("\t".join(
                    map(str, (
                        name,
                        size_bin,
                        counts[bin]["test"][name],
                        counts[bin]["tp"][name],
                        counts[bin]["fp"][name],
                        counts[bin]["truth"]["all"],
                        counts[bin]["fn"]["all"],
                    ))) + "\n")

    E.info(str(c))
    E.stop()
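The comparison above leans on quicksect interval trees for overlap lookup; a small sketch of just that behavior, with made-up coordinates:

import quicksect

tree = quicksect.IntervalTree()
tree.add(100, 200)
print(tree.search(150, 250))  # overlaps -> list with one interval
print(tree.search(300, 400))  # no overlap -> empty list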
Example #10
def _aggregate_gtf(gtf_file, sample_id, gtf_expr_attr, output_fh, stats_fh,
                   is_ref=False):
    def _init_t_dict():
        return {'_id': None, 'num_exons': 0, 'length': 0}

    t_dict = collections.defaultdict(_init_t_dict)
    cur_t_id = 1
    exprs = []
    for f in pysam.tabix_iterator(open(gtf_file), pysam.asGTF()):
        if f.feature == 'transcript':
            t_id = f.transcript_id
            if t_id in t_dict:
                m = 'GTF "%s" transcript_id "%s" not unique' % (gtf_file, t_id)
                raise GTFError(m)
            t_item = t_dict[t_id]
            # rename transcript id
            new_t_id = "%s.T%d" % (sample_id, cur_t_id)
            cur_t_id += 1
            t_item['_id'] = new_t_id
            if is_ref:
                expr = 0.0
            else:
                expr = float(f[gtf_expr_attr])
            exprs.append(expr)
            # prepare attributes
            attrs = {GTF.Attr.TRANSCRIPT_ID: new_t_id,
                     GTF.Attr.SAMPLE_ID: sample_id,
                     GTF.Attr.REF: str(int(is_ref)),
                     GTF.Attr.EXPR: str(expr)}
            # save attributes
            f.fromDict(attrs)
            print(str(f), file=output_fh)
        elif f.feature == 'exon':
            t_id = f.transcript_id
            t_item = t_dict[t_id]
            # update statistics
            t_item['num_exons'] += 1
            t_item['length'] += (f.end - f.start)
            # replace transcript id
            f.fromDict({GTF.Attr.TRANSCRIPT_ID: t_item['_id']})
            print(str(f), file=output_fh)

    # process statistics
    num_exons = []
    lengths = []
    for t_item in t_dict.values():
        lengths.append(t_item['length'])
        num_exons.append(t_item['num_exons'])

    # compute and write stats
    quantiles = range(0, 101)
    expr_qs = (scoreatpercentile(exprs, q) for q in quantiles)
    expr_qs = ','.join(map(str, expr_qs))
    length_qs = (int(round(scoreatpercentile(lengths, q)))
                 for q in quantiles)
    length_qs = ','.join(map(str, length_qs))
    num_exon_qs = (int(round(scoreatpercentile(num_exons, q)))
                   for q in quantiles)
    num_exon_qs = ','.join(map(str, num_exon_qs))
    fields = [sample_id, len(t_dict), expr_qs, length_qs, num_exon_qs]
    print('\t'.join(map(str, fields)), file=stats_fh)
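The quantile summaries assume scoreatpercentile is scipy.stats.scoreatpercentile (consistent with the calls above); in isolation, with made-up lengths:

from scipy.stats import scoreatpercentile

lengths = [480, 950, 1500, 3200]  # hypothetical transcript lengths
qs = [int(round(scoreatpercentile(lengths, q))) for q in (0, 25, 50, 75, 100)]
print(",".join(map(str, qs)))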
Example #11
def iterate_parsed_uncompressed(fn):
    with open(fn) as f:
        return len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
Example #12
def test_iterator_parsed_compressed():
    f = gzip.open(fn_compressed)
    l = len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
Example #13
def test_iterator_parsed_uncompressed():
    f = open("windows_small.bed")
    l = len(list(pysam.tabix_iterator(f, parser=pysam.asBed())))
Example #14
def readFromFile(infile):
    """read records from file and return as list."""
    result = []
    for gff in pysam.tabix_iterator(infile, pysam.asGTF()):
        result.append(gff)
    return result
Example #15
File: GTF.py Project: yangjl/cgat
def readFromFile(infile):
    """read gtf from file."""
    result = []
    for gff in pysam.tabix_iterator(infile, pysam.asGTF()):
        result.append(gff)
    return result
Example #16
def main(argv=None):
    '''
    main function
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--is-gff3",
                      dest="gff3_input",
                      action="store_true",
                      help="filename in gff3 format"
                      "[default=%default].")

    parser.add_option("-o",
                      "--output-only-attributes",
                      dest="only_attributes",
                      action="store_true",
                      help="output only attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-f",
                      "--attributes-as-columns",
                      dest="output_full",
                      action="store_true",
                      help="output attributes as separate columns "
                      "[default=%default].")

    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="convert tab-separated table back to gtf "
                      "[default=%default].")

    parser.add_option("-m",
                      "--output-map",
                      dest="output_map",
                      type="choice",
                      choices=("transcript2gene", "peptide2gene",
                               "peptide2transcript"),
                      help="output a map mapping transcripts to genes "
                      "[default=%default].")

    parser.set_defaults(only_attributes=False,
                        output_full=False,
                        invert=False,
                        output_map=None,
                        gff3_input=False)

    (options, args) = E.Start(parser, argv=argv)

    if options.output_full:
        # output full table with column for each attribute

        # to specify gff3 format
        if options.gff3_input is True:
            gff = pysam.tabix_iterator(options.stdin, parser=pysam.asGFF3())
            attributes = set()
            data = []
            for line in gff:
                # get keys to write out to header
                data.append(line)
                attributes = attributes.union(set(line.keys()))

            attributes = sorted(list(attributes))

            header = [
                "contig", "source", "feature", "start", "end", "score",
                "strand", "frame"
            ] + attributes

            options.stdout.write("\t".join(header) + "\n")

            for gff3 in data:
                values = [getattr(gff3, a) for a in header]
                options.stdout.write("\t".join(map(str, values)) + "\n")

        else:

            attributes = set()
            data = []
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes = attributes.union(set(gtf.keys()))

            # remove gene_id and transcript_id, as they are used
            # explicitly later
            attributes.difference_update(["gene_id", "transcript_id"])

            attributes = sorted(list(attributes))

            if options.only_attributes:
                header = ["gene_id", "transcript_id"] + attributes
            else:
                header = [
                    "contig",
                    "source",
                    "feature",
                    "start",
                    "end",
                    "score",
                    "strand",
                    "frame",
                    "gene_id",
                    "transcript_id",
                ] + attributes

            options.stdout.write("\t".join(header) + "\n")

            if options.only_attributes:
                for gtf in data:
                    options.stdout.write("\t".join(
                        map(str, (
                            gtf.gene_id,
                            gtf.transcript_id,
                        ))))
                    for a in attributes:
                        if a in ("gene_id", "transcript_id"):
                            continue
                        try:
                            val = getattr(gtf, a)
                        except (AttributeError, KeyError):
                            val = ""
                        options.stdout.write("\t%s" % val)

                    options.stdout.write("\n")
            else:
                for gtf in data:
                    options.stdout.write("\t".join(
                        map(str, (
                            gtf.contig,
                            gtf.source,
                            gtf.feature,
                            gtf.start,
                            gtf.end,
                            gtf.score,
                            gtf.strand,
                            gtf.frame,
                            gtf.gene_id,
                            gtf.transcript_id,
                        ))))
                    for a in attributes:
                        try:
                            val = getattr(gtf, a)
                        except (AttributeError, KeyError):
                            val = ""
                        options.stdout.write("\t%s" % val)
                    options.stdout.write("\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict([(y, x)
                                          for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # start stays 0-based: the tab-separated dump above
                # already wrote 0-based coordinates
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        if options.output_map == "transcript2gene":
            fr = lambda x: x.transcript_id
            to = lambda x: x.gene_id
            options.stdout.write("transcript_id\tgene_id\n")
        elif options.output_map == "peptide2gene":
            fr = lambda x: x.protein_id
            to = lambda x: x.gene_id
            options.stdout.write("peptide_id\tgene_id\n")
        elif options.output_map == "peptide2transcript":
            fr = lambda x: x.protein_id
            to = lambda x: x.transcript_id
            options.stdout.write("peptide_id\ttranscript_id\n")

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[fr(gtf)] = to(gtf)
            except AttributeError:
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))
    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):

            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            options.stdout.write("\t".join(
                map(str, (
                    gtf.contig,
                    gtf.source,
                    gtf.feature,
                    gtf.start,
                    gtf.end,
                    GTF.toDot(gtf.score),
                    gtf.strand,
                    gtf.frame,
                    gtf.gene_id,
                    gtf.transcript_id,
                    attributes,
                ))) + "\n")
    E.Stop()
Example #17
    def __init__(self, infile, *args, **kwargs):
        self.gff = pysam.tabix_iterator(iotools.open_file(infile),
                                        parser=pysam.asGFF3())
Example #18
def get_bed_dict(refdict, bedfh):
    # one boolean mask per chromosome, initially all False
    beddict = {chrom: np.zeros(len(refdict[chrom]), dtype=bool)
               for chrom in refdict}
    for bedline in pysam.tabix_iterator(bedfh, parser=pysam.asBed()):
        # BED ends are exclusive, so this slice covers exactly the interval
        beddict[bedline.contig][bedline.start:bedline.end] = True
    return beddict
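Only len() of each refdict value is used above, so a chromosome-name -> sequence mapping works. A hypothetical way to build it with pysam.FastaFile (file names are made up):

import numpy as np  # needed by get_bed_dict above
import pysam

with pysam.FastaFile("reference.fa") as fasta:
    refdict = {c: fasta.fetch(c) for c in fasta.references}
with open("regions.bed") as bedfh:
    mask = get_bed_dict(refdict, bedfh)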
Example #19
    def __init__(self, infile, *args, **kwargs):
        self.gff = pysam.tabix_iterator(IOTools.openFile(infile),
                                        parser=pysam.asGFF3())