Example #1
def _schedualJobs(self, jobs):
    '''Run each job in turn, adaptively throttling the delay between attempts.'''
    time_interval = self.time_interval
    with open(self.log_file, 'a') as logFile:
        n_err = 0
        eta = ETA(len(jobs))
        for i in xrange(len(jobs)):
            eta.print_status(i)
            time1 = time.time()
            result = self._doJob(jobs[i], logFile)
            if result is not None:
                self._saveLog(logFile, jobs[i], success=True)
                self._collect(result)
                if self.result_file is not None:
                    self._saveResult()
                # back off toward the baseline interval after a success
                time_interval = time_interval / 4
                if time_interval < self.time_interval:
                    time_interval = self.time_interval
            else:
                self._saveLog(logFile, jobs[i], success=False)
                n_err += 1
                # double the delay after a failure
                time_interval = time_interval * 2
            if n_err == self.error_times:
                break
            time_span = time.time() - time1
            if time_span < time_interval:
                # sleep out the remainder of the (adaptive) interval
                time.sleep(time_interval - time_span)
        eta.done()
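
For reference, the count-based pattern above reduces to a minimal sketch (the `from eta import ETA` import appears verbatim in a later example; process() is a hypothetical stand-in for the per-item work):

import time
from eta import ETA

def process(job):
    # hypothetical stand-in for real per-item work
    time.sleep(0.01)

jobs = list(range(100))
eta = ETA(len(jobs))          # total number of items
for i, job in enumerate(jobs):
    eta.print_status(i)       # i = items completed so far
    process(job)
eta.done()                    # finish and clear the status line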
Example #2
def gzip_reader(fname, quiet=False, callback=None, done_callback=None):
    if fname == '-':
        f = sys.stdin
    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
        f = gzip.open(os.path.expanduser(fname))
    else:
        f = open(os.path.expanduser(fname))

    if quiet or fname == '-':
        eta = None
    else:
        eta = ETA(os.stat(fname).st_size, fileobj=f)

    for line in f:
        if eta:
            if callback:
                extra = callback()
            else:
                extra = ''

            eta.print_status(extra=extra)
        yield line

        if done_callback and done_callback():
            break

    if f != sys.stdin:
        f.close()

    if eta:
        eta.done()
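
This example also shows the file-driven pattern: the ETA is built from the file's byte size with fileobj=f, so no explicit position needs to be passed to print_status(); presumably the ETA reads the file position itself. A minimal sketch, assuming a plain-text file at a hypothetical path:

import os
from eta import ETA

fname = 'data.txt'                                # hypothetical input file
with open(fname) as f:
    eta = ETA(os.stat(fname).st_size, fileobj=f)  # progress inferred from f
    for line in f:
        eta.print_status(extra=line[:20])         # optional status suffix
    eta.done()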
Example #3
        def _gen1():
            if not self.quiet:
                eta = ETA(self.regions.total)
            else:
                eta = None

            count = 0
            for region in self.regions:
                working_chrom = None
                if region.chrom in self.bam.references:
                    working_chrom = region.chrom
                elif region.chrom[0:3] == 'chr':
                    if region.chrom[3:] in self.bam.references:
                        working_chrom = region.chrom[3:]

                if not working_chrom:
                    continue

                # for troubleshooting
                self.cur_chrom = region.chrom
                self.cur_start = region.start
                self.cur_end = region.end

                laststart = 0
                for read in self.bam.fetch(working_chrom, region.start, region.end):
                    if read.pos != laststart:
                        count += 1
                        laststart = read.pos

                    if eta:
                        eta.print_status(count, extra='%s/%s %s:%s' % (count, self.regions.total, self.bam.references[read.tid], read.pos))

                    yield read
            if eta:
                eta.done()
Example #4
def bam_extract(inbam, outbam, bedfile, nostrand=False, quiet=False):
    bed = BedFile(bedfile)
    if not quiet:
        eta = ETA(os.stat(bedfile).st_size, fileobj=bed)
    else:
        eta = None

    passed = 0

    for region in bed:
        if eta:
            eta.print_status(extra="extracted:%s" % (passed))

        if not region.chrom in inbam.references:
            continue

        if not nostrand:
            strand = region.strand
        else:
            strand = None

        for read in bam_extract_reads(inbam, region.chrom, region.start, region.end, strand):
            outbam.write(read)
            passed += 1

    if not quiet:
        eta.done()
        sys.stderr.write("%s extracted\n" % (passed,))
Example #5
def fastq_sort(fastq, byname=True, bysequence=False, tmpdir=None, chunksize=100000, out=sys.stdout, quiet=False):
    tmpfiles = []

    chunk = []
    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1 
        if byname:
            chunk.append((read.name, read))
        if bysequence:
            chunk.append((read.seq, read))

        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk))
            chunk = []

    if chunk:
        tmpfiles.append(_write_tmp(chunk))

    sys.stderr.write('Merging chunks...\n')
    buf = [None, ] * len(tmpfiles)
    skip = [False, ] * len(tmpfiles)

    eta = ETA(count)

    j = 0
    writing = True

    while writing:
        j += 1
        eta.print_status(j)
        for i, fobj in enumerate(tmpfiles):
            if not buf[i] and not skip[i]:
                try:
                    read = fastq_read_file(fobj)
                    if byname:
                        buf[i] = (read.name, i, read)
                    if bysequence:
                        buf[i] = (read.seq, i, read)
                except:
                    buf[i] = None
                    skip[i] = True
        
        sorted_list = buf[:]
        sorted_list.sort()
        writing = False

        for tup in sorted_list:
            if not tup:
                continue

            sorter, i, read = tup
            read.write(out)
            buf[i] = None
            writing = True
            break
    eta.done()
Example #6
class Callback(object):
    def __init__(self, total):
        self.i = 0
        self.eta = ETA(total)

    def __call__(self, result=None):
        self.i += 1
        self.eta.print_status(self.i, extra=result if result else '')

    def done(self):
        self.eta.done()
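
The __call__(self, result=None) signature matches what multiprocessing.Pool passes to its callback argument, so a natural (though assumed, not shown in the original) use is tracking asynchronous jobs:

import multiprocessing

def work(x):
    return 'job %d done' % x

if __name__ == '__main__':
    items = range(100)
    cb = Callback(len(items))       # Callback as defined above
    pool = multiprocessing.Pool()
    for x in items:
        pool.apply_async(work, (x,), callback=cb)  # cb fires once per result
    pool.close()
    pool.join()
    cb.done()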
Example #7
def gtf_junctions(gtf, refname, fragment_size, min_size, max_exons=5, known=False, out=sys.stdout, quiet=False, scramble=False, retain_introns=False):
    ref = pysam.Fastafile(refname)

    references = []
    with open('%s.fai' % refname) as f:
        for line in f:
            cols = line.split('\t')
            references.append(cols[0])

    if not quiet:
        eta = ETA(gtf.fsize(), fileobj=gtf)
    else:
        eta = None

    exporter = JunctionExporter(ref, fragment_size, min_size, max_exons, out, scramble)

    for gene in gtf.genes:
        if not gene.chrom in references:
            continue

        if eta:
            eta.print_status(extra='%s:%s %s' % (gene.chrom, gene.start, gene.gene_name))

        if known:
            for txpt in gene.transcripts:
                last = None
                for exon in txpt.exons:
                    if last:
                        exporter.export(gene.chrom, [last, exon])
                    last = exon
        else:
            exons = set()
            for txpt in gene.transcripts:
                for exon in txpt.exons:
                    exons.add(exon)

            exons = list(exons)
            exons.sort()

            if retain_introns:
                exporter.export_retained_introns(gene.chrom, exons, gene.strand)

            if scramble:
                # We can just pretend the transcript is repeated
                # and then let the set take care of removing the duplicates
                exons = exons * 2

            exporter.export(gene.chrom, exons)

    if eta:
        eta.done()
    ref.close()
Example #8
def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int):
    """Computes optimal sequence of player opcodes to reproduce audio data."""

    dlen = len(data)
    # TODO: avoid temporarily doubling memory footprint to concatenate
    data = numpy.concatenate(
        [data, numpy.zeros(lookahead_steps, dtype=numpy.float32)])

    voltage = -1.0
    position = -1.0

    # Pre-warm cache so we don't skew ETA during encoding
    for i in range(2048):
        _, _ = opcodes.candidate_opcodes(frame_horizon(i, lookahead_steps),
                                         lookahead_steps)

    total_err = 0.0
    frame_offset = 0
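    # Progress is tracked in 1/1000ths of the input: ETA(total=1000), with one
    # print_status() call each time another dlen/1000 samples are consumed.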
    eta = ETA(total=1000)
    i = 0
    last_updated = 0
    opcode_counts = collections.defaultdict(int)

    while i < dlen:
        if (i - last_updated) > int((dlen / 1000)):
            eta.print_status()
            last_updated = i

        candidate_opcodes, voltages = opcodes.candidate_opcodes(
            frame_horizon(frame_offset, lookahead_steps), lookahead_steps)
        opcode_idx = lookahead(step, position, data, i, voltage * voltages)
        opcode = candidate_opcodes[opcode_idx][0]
        opcode_counts[opcode] += 1
        yield opcode

        position, voltage, new_error, i = evolve(opcode, position, voltage,
                                                 step, data, i)

        total_err += new_error
        frame_offset = (frame_offset + 1) % 2048

    for _ in range(frame_offset % 2048, 2047):
        yield opcodes.Opcode.NOTICK_6
    yield opcodes.Opcode.EXIT
    eta.done()
    print("Total error %f" % total_err)

    print("Opcodes used:")
    for v, k in sorted(list(opcode_counts.items()),
                       key=lambda kv: kv[1],
                       reverse=True):
        print("%s: %d" % (v, k))
Example #9
    def get_regions(self):
        total = 0
        for chrom, chrom_len in self.chrom_lens:
            total += (chrom_len / self.binsize)
            if chrom_len % self.binsize != 0:
                total += 1

        eta = ETA(total)
        pos_acc = 0
        for chrom, chrom_len in self.chrom_lens:
            pos = -1
            for bin in xrange(0, chrom_len, self.binsize):
                if pos > -1:
                    eta.print_status(pos_acc, extra='%s:%s[+]' % (chrom, bin))
                    yield (chrom, [pos], [bin], '+', [chrom, pos, bin,
                                                      '+'], None)
                    if self.stranded:
                        eta.print_status(pos_acc,
                                         extra='%s:%s[-]' % (chrom, bin))
                        yield (chrom, [pos], [bin], '-',
                               [chrom, pos, bin, '-'], None)
                pos = bin
                pos_acc += 1

            eta.print_status(pos_acc, extra='%s:%s[+]' % (chrom, bin))
            yield (chrom, [pos], [chrom_len], '+',
                   [chrom, pos, chrom_len, '+'], None)
            if self.stranded:
                eta.print_status(pos_acc, extra='%s:%s[-]' % (chrom, bin))
                yield (chrom, [pos], [chrom_len], '-',
                       [chrom, pos, chrom_len, '-'], None)

        eta.done()
Example #10
def bam_iter(bam, quiet=False, show_ref_pos=False, callback=None):
    '''
    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
    '''
    if not quiet and bam.filename:
        eta = ETA(os.stat(bam.filename).st_size)
    else:
        eta = None

    if os.path.exists('%s.bai' % bam.filename):
        # This is an indexed file, so it is ref sorted...
        # Meaning that we should show chrom:pos, instead of read names
        show_ref_pos = True

    for read in bam:
        pos = bam.tell()
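        # bam.tell() returns a BGZF virtual offset: the compressed on-disk
        # offset in the high bits, the within-block offset in the low 16 bits.
        # Shifting right by 16 recovers the on-disk byte position, matching
        # the os.stat() file size the ETA was built from.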
        bgz_offset = pos >> 16

        if not quiet and eta:
            if callback:
                eta.print_status(bgz_offset, extra=callback(read))
            elif show_ref_pos:
                if read.tid > -1:
                    eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
                else:
                    eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
            else:
                eta.print_status(bgz_offset, extra='%s' % read.qname)
        yield read

    if eta:
        eta.done()
Example #11
 def output(self, partitions):
     for pn, part in enumerate(partitions):
         d = os.path.abspath(os.path.join(self.dst, self.name % pn))
         if os.path.isfile(d):
             logging.warning('Archive already exists, overwriting: ' + d)
         logging.info('Creating archive %s...' % (self.name % pn))
         eta = ETA(part.size, min_ms_between_updates=500)
         with tarfile.open(d, self.mode) as tar:
             for fn, size, estsize in part.filelist:
                 try:
                     tar.add(os.path.join(self.srcbase, fn), fn)
                 except Exception as ex:
                     logging.error(ex)
                 eta.print_status(estsize)
         eta.done()
Example #12
 def output(self, partitions):
     for pn, part in enumerate(partitions):
         d = os.path.abspath(os.path.join(self.dst, self.name % pn))
         if os.path.isfile(d):
             logging.warning('Archive already exists, overwriting: ' + d)
         logging.info('Creating archive %s...' % (self.name % pn))
         eta = ETA(part.size, min_ms_between_updates=500)
         with zipfile.ZipFile(d, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
             for fn, size, estsize in part.filelist:
                 try:
                     zipf.write(os.path.join(self.srcbase, fn), fn)
                 except Exception as ex:
                     logging.error(ex)
                 eta.print_status(estsize)
         eta.done()
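
Both archive writers above construct the ETA with min_ms_between_updates=500, which by its name limits how often the status line is redrawn. A minimal sketch of the same throttled, byte-driven pattern (the loop body stands in for real archiving work):

from eta import ETA

total_bytes = 10 ** 6
eta = ETA(total_bytes, min_ms_between_updates=500)  # redraw at most ~2x/sec
done = 0
while done < total_bytes:
    done += 4096             # pretend another block was written
    eta.print_status(done)   # absolute progress so far, not a delta
eta.done()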
Example #13
def gzip_aware_reader(fname, callback=None):
    if fname[-3:] == '.gz':
        f = gzip.open(fname)
    else:
        f = open(fname)

    eta = ETA(os.stat(fname).st_size, fileobj=f)
    for line in f:
        extra = None
        if callback:
            extra = callback()
        eta.print_status(extra=extra)
        yield line

    f.close()
    eta.done()
Example #14
    def fetch(self, quiet=False):
        if self.fname and not quiet:
            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
        else:
            eta = None

        while True:
            try:
                read = fastq_read_file(self.fileobj)
                if eta:
                    eta.print_status(extra=read.name)
                yield read

            except:
                break

        if eta:
            eta.done()
Example #15
    def fetch(self, quiet=False):
        name = ''
        comment = ''
        seq = ''

        if not quiet and self.fname and self.fname != '-':
            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
        else:
            eta = None

        for line in self.fileobj:
            line = line.strip()
            if not line:
                continue
            if line[0] == '#':
                continue

            if line[0] == '>':
                if name and seq:
                    if eta:
                        eta.print_status(extra=name)
                    yield FASTARead(name, comment, seq)

                spl = re.split(r'[ \t]', line[1:], maxsplit=1)
                name = spl[0]
                if len(spl) > 1:
                    comment = spl[1]
                else:
                    comment = ''
                seq = ''

            else:
                if self.qual:
                    seq = seq + ' ' + line
                else:
                    seq += line

        if name and seq:
            if eta:
                eta.print_status(extra=name)
            yield FASTARead(name, comment, seq)

        if eta:
            eta.done()
Example #16
 def output(self, partitions):
     for pn, part in enumerate(partitions):
         d = os.path.abspath(os.path.join(self.dst, self.name % pn))
         logging.info('Copying to %s' % d)
         eta = ETA(part.size, min_ms_between_updates=500)
         for fn, size, estsize in part.filelist:
             src = os.path.join(self.srcbase, fn)
             dst = os.path.join(d, fn)
             try:
                 if os.path.isdir(src):
                     os.makedirs(dst, exist_ok=True)
                 else:
                     os.makedirs(os.path.dirname(dst), exist_ok=True)
                     shutil.copy2(src, dst)
             except Exception as ex:
                 logging.error(ex)
                 continue
             eta.print_status(estsize)
         eta.done()
Example #17
def _repeatreader(fname):
    with ngsutils.support.ngs_utils.gzip_opener(fname) as repeat_f:
        eta = ETA(os.stat(fname).st_size, fileobj=repeat_f)
        # skip the header lines
        next(repeat_f)
        next(repeat_f)
        next(repeat_f)

        for line in repeat_f:
            cols = line.strip().split()
            chrom = cols[4]
            start = int(cols[5]) - 1
            end = int(cols[6])
            strand = '+' if cols[8] == '+' else '-'
            family = cols[10]
            member = cols[9]

            eta.print_status(extra='%s|%s %s:%s-%s[%s]' % (family, member, chrom, start, end, strand))
            yield (family, member, chrom, start, end, strand)
        eta.done()
Example #18
def gtf_junctions(gtf, refname, fragment_size, min_size, max_exons=5, known=False, out=sys.stdout, quiet=False):
    ref = pysam.Fastafile(refname)

    references = []
    with open("%s.fai" % refname) as f:
        for line in f:
            cols = line.split("\t")
            references.append(cols[0])

    if not quiet:
        eta = ETA(gtf.fsize(), fileobj=gtf)
    else:
        eta = None

    exporter = JunctionExporter(ref, fragment_size, min_size, max_exons, out)

    for gene in gtf.genes:
        if not gene.chrom in references:
            continue

        if eta:
            eta.print_status(extra="%s:%s %s" % (gene.chrom, gene.start, gene.gene_name))

        if known:
            for txpt in gene.transcripts:
                last = None
                for exon in txpt.exons:
                    if last:
                        exporter.export(gene.chrom, [last, exon])
                    last = exon
        else:
            exons = set()
            for txpt in gene.transcripts:
                for exon in txpt.exons:
                    exons.add(exon)
            exons = list(exons)
            exons.sort()
            exporter.export(gene.chrom, exons)

    if eta:
        eta.done()
    ref.close()
Example #19
def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
    if not quiet and bam.filename:
        eta = ETA(os.stat(bam.filename).st_size)
    else:
        eta = None

    for pileup in bam.pileup(mask=mask):
        pos = bam.tell()
        bgz_offset = pos >> 16

        if eta:
            if callback:
                eta.print_status(bgz_offset, extra=callback(pileup))
            else:
                eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))

        yield pileup

    if eta:
        eta.done()
Example #20
    def fetch(self, quiet=False, callback=None):
        if self.fname and not quiet:
            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
        else:
            eta = None

        while True:
            try:
                read = fastq_read_file(self.fileobj)
                if eta:
                    if callback:
                        eta.print_status(extra=callback())
                    else:
                        eta.print_status(extra=read.name)
                yield read

            except StopIteration:
                break

        if eta:
            eta.done()
Example #21
    def get_regions(self):
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []

            # just include all regions - don't worry about transcripts and exons
            # the regions encompass all exons anyway...
            for num, start, end, const, names in gene.regions:
                starts.append(start)
                ends.append(end)

            out = [gene.gene_id, gene.gene_name, ]
            if self.has_isoform:
                out.append(gene.attributes['isoform_id'] if 'isoform_id' in gene.attributes else '')
            if self.has_biotype:
                out.append(gene.attributes['gene_biotype'] if 'gene_biotype' in gene.attributes else '')
            out.extend([gene.chrom, gene.strand, gene.start, gene.end])

            yield (gene.chrom, starts, ends, gene.strand, out, None)
        eta.done()
Example #22
def gzip_reader(fname,
                quiet=False,
                callback=None,
                done_callback=None,
                fileobj=None):
    if fileobj:
        f = fileobj
    elif fname == '-':
        f = sys.stdin
    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
        f = gzip.open(os.path.expanduser(fname))
    else:
        f = open(os.path.expanduser(fname))

    if quiet or fname == '-':
        eta = None
    else:
        eta = ETA(os.stat(fname).st_size, fileobj=f)

    for line in f:
        if eta:
            if callback:
                extra = callback()
            else:
                extra = ''

            eta.print_status(extra=extra)
        yield line

        if done_callback and done_callback():
            break

    if f != sys.stdin:
        f.close()

    if eta:
        eta.done()
Example #23
def run(predict, test_size, n_iter=100, n_burnin=10, resample=None):
    p_tot = LogR(0)
    eta = ETA(n_iter)
    eta.print_status(0, extra='starting...')
    for i in range(n_iter):
        print polya.timestamp(), "iteration %u/%u" % (i+1, n_iter)
        polya.resample()
        if resample: resample()
        
        p = predict()
        pplx = float(p ** (-1./test_size))
        print polya.timestamp(), 'perplexity =', pplx
        if i < n_burnin:
            eta.print_status(i+1, extra="burning in (%.1f)..." % pplx)
        else:
            p_tot += p
            pplx = float((p_tot / (i+1 - n_burnin)) ** (-1./test_size))
            eta.print_status(i+1, extra="perplexity %.1f" % pplx)
    eta.done()
    p_avg = p_tot / (n_iter - n_burnin)
    pplx = float(p_avg ** (-1./test_size))
    print '---\nfinal perplexity =', pplx
    print>>sys.stderr, 'Perplexity:', pplx
    return p_avg
Example #24
    def __init__(self, filename=None, cache_enabled=True, quiet=False, fileobj=None):
        if not filename and not fileobj:
            raise ValueError('Must pass either a filename or a fileobj')

        if fileobj:
            fobj = fileobj
            cache_enabled = False
            eta = None
        else:
            fobj = gzip_aware_open(filename)
            eta = ETA(os.stat(filename).st_size, fileobj=fobj)
            cachefile = os.path.join(os.path.dirname(filename), '.%s.cache' % os.path.basename(filename))

        self._genes = {}
        self._pos = 0
        self._gene_bins = {}
        self._gene_names = {}
        self._gene_ids = {}
        warned = False

        if cache_enabled and os.path.exists(cachefile):
            self._load_cache(cachefile)

        if not self._genes:
            if not quiet:
                sys.stderr.write('Reading GTF file... (%s) \n' % filename)

            for linenum, line in enumerate(fobj):
                try:
                    idx = line.find('#')
                    if idx > -1:
                        if idx == 0:
                            continue
                        line = line[:idx]  # strip trailing comment
                    chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip().split('\t')
                    source = symbols[source]
                    start = int(start) - 1  # Note: 1-based
                    end = int(end)
                    attributes = {}

                    for key, val in [x.split(' ', 1) for x in [x.strip() for x in quoted_split(attrs, ';')] if x and ' ' in x]:
                        if val[0] == '"' and val[-1] == '"':
                            val = val[1:-1]
                        attributes[key] = val

                    gid = None

                    if 'isoform_id' in attributes:
                        gid = attributes['isoform_id']

                    elif 'gene_name' in attributes:  # use gene_name if we have it.
                        gid = attributes['gene_name']

                    # elif 'tss_id' in attributes:  # iGenomes GTF files... are strange. use gene_name first.
                    #     gid = attributes['tss_id']

                    elif 'gene_id' in attributes:
                        gid = attributes['gene_id']
                        if not warned and not quiet:
                            sys.stderr.write('\nGTF file potentially missing isoform annotation! Each transcript may be treated separately. (%s)\n' % gid)
                            sys.stderr.write('%s\n\n' % (str(attributes)))
                            warned = True
                    else:
                        if not warned and not quiet:
                            sys.stderr.write('\nNot a valid GTF file! Maybe GFF?\n')
                            sys.stderr.write('%s\n\n' % (str(attributes)))
                            warned = True

                        first_key = None
                        attributes = {}
                        for key, val in [x.split('=', 1) for x in [x.strip() for x in quoted_split(attrs, ';')] if x and '=' in x]:
                            if not first_key:
                                first_key = key
                            if val[0] == '"' and val[-1] == '"':
                                val = val[1:-1]
                            attributes[key] = val

                        if not attributes:
                            gid = 'id_%s' % linenum
                            if not warned and not quiet:
                                sys.stderr.write('\nGTF file missing annotations! Using line numbers as IDs\n')
                                warned = True
                        else:
                            gid = attributes[first_key]
                            if not warned and not quiet:
                                sys.stderr.write('\nGTF file missing annotations (gene_id, transcript_id)! Assuming GFF? Taking first attribute as ID (%s=%s)\n' % (first_key, gid))
                                sys.stderr.write('%s\n\n' % (str(attributes)))
                                warned = True

                    if eta:
                        eta.print_status(extra=gid)
                except:
                    import traceback
                    sys.stderr.write('Error parsing line:\n%s\n' % line)
                    traceback.print_exc()
                    sys.exit(1)

                if not gid in self._genes or chrom != self._genes[gid].chrom:
                    self._genes[gid] = _GTFGene(gid, chrom, source, **attributes)
                    if 'gene_name' in attributes:
                        gene_name = attributes['gene_name']
                        if not gene_name in self._gene_names:
                            self._gene_names[gene_name] = [gid]
                        else:
                            self._gene_names[gene_name].append(gid)

                        if gid != attributes['gene_id']:
                            self._gene_ids[attributes['gene_id']] = gid

                self._genes[gid].add_feature(attributes['transcript_id'] if 'transcript_id' in attributes else gid, feature, start, end, strand)

            if eta:
                eta.done()

            if filename and fobj != sys.stdin:
                fobj.close()

            for gid in self._genes:
                gene = self._genes[gid]

                start_bin = gene.start / GTF.__binsize
                end_bin = gene.end / GTF.__binsize

                for bin in xrange(start_bin, end_bin+1):
                    if not (gene.chrom, bin) in self._gene_bins:
                        self._gene_bins[(gene.chrom, bin)] = [gid]
                    else:
                        self._gene_bins[(gene.chrom, bin)].append(gid)

            if cache_enabled:
                try:
                    self._write_cache(cachefile)
                except Exception, e:
                    sys.stderr.write("Error saving cache: %s!\n" % str(e))
                    pass  # do nothing if we can't write the cache.
Example #25
 def get_regions(self):
     eta = ETA(self.bed.length, fileobj=self.bed)
     for region in self.bed:
         eta.print_status(extra='%s:%s-%s[%s]' % (region.chrom, region.start, region.end, region.strand))
         yield (region.chrom, [region.start], [region.end], region.strand, [region.chrom, region.start, region.end, region.name, region.score, region.strand], None)
     eta.done()
Example #26
    def get_regions(self):
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            geneout = [
                gene.gene_id,
                gene.gene_name,
            ]
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id'] if 'isoform_id' in
                               gene.attributes else '')
            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')
            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            was_last_const = False
            for num, start, end, const, names in gene.regions:
                starts.append(start)
                ends.append(end)

                # assemble a list of lists with contiguous spans of constant regions
                # this will let us count junction-spanning reads that cover two
                # constant exons/regions

                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            def callback(bam, common_count, common_reads, common_cols):
                # gather constant reads
                const_count = 0
                for span in const_spans:
                    starts = []
                    ends = []

                    for start, end in span:
                        starts.append(start)
                        ends.append(end)

                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, starts, ends,
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    const_count += count

                # find counts for each region
                for num, start, end, const, names in gene.regions:
                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, [start], [end],
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, start, end,
                        self.multiple, self.whitelist, self.blacklist,
                        self.library_type)

                    # remove reads that exclude this region
                    for read in excl_reads:
                        if read in reads:
                            reads.remove(read)
                            count = count - 1

                    # find reads that *aren't* in this region
                    other_reads = 0
                    for read in common_reads:
                        if not read in reads and not read in excl_reads:
                            other_reads += 1

                    if other_reads > 0:
                        altindex = float(count - excl_count) / other_reads
                    else:
                        altindex = ''

                    if len(common_reads) > 0:
                        incl_pct = float(count) / len(common_reads)
                        excl_pct = float(excl_count) / len(common_reads)
                    else:
                        incl_pct = ''
                        excl_pct = ''

                    cols = common_cols[:]
                    cols.append(start)
                    cols.append(end)
                    cols.append(const_count)
                    cols.append(num)
                    cols.append('const' if const else 'alt')
                    cols.append(count)
                    cols.append(excl_count)
                    cols.append(incl_pct)
                    cols.append(excl_pct)
                    cols.append(altindex)
                    yield cols

            yield (gene.chrom, starts, ends, gene.strand, geneout, callback)
        eta.done()
Example #27
def fastq_sort(fastq,
               bysequence=False,
               tmpdir=None,
               tmpprefix='.tmp',
               chunksize=100000,
               nogz=False,
               out=sys.stdout,
               quiet=False):
    tmpfiles = []
    chunk = []

    sys.stderr.write('Sorting FASTQ file into chunks...\n')
    count = 0
    for read in fastq.fetch(quiet):
        count += 1
        if bysequence:
            chunk.append((read.seq, read))
        else:
            chunk.append((read.name, read))

        if len(chunk) >= chunksize:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
            chunk = []

    if chunk:
        tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

    sys.stderr.write('\nMerging chunks...\n')
    sys.stderr.flush()
    buf = [None] * len(tmpfiles)
    skip = [False] * len(tmpfiles)

    eta = ETA(count)

    j = 0
    writing = True

    if nogz:
        tmpfobjs = [open(x) for x in tmpfiles]
    else:
        tmpfobjs = [gzip.open(x) for x in tmpfiles]

    while writing:
        j += 1
        eta.print_status(j)
        for i, fobj in enumerate(tmpfobjs):
            if not buf[i] and not skip[i]:
                try:
                    read = fastq_read_file(fobj)
                    if bysequence:
                        buf[i] = (read.seq, i, read)
                    else:
                        buf[i] = (read.name, i, read)
                except:
                    buf[i] = None
                    skip[i] = True

        sorted_list = buf[:]
        sorted_list.sort()
        writing = False

        for tup in sorted_list:
            if not tup:
                continue

            sorter, i, read = tup
            read.write(out)
            buf[i] = None
            writing = True
            break
    eta.done()

    for fobj in tmpfobjs:
        fobj.close()

    for tmpfile in tmpfiles:
        os.unlink(tmpfile)
Example #28
def contexts(text, M, verbose=None):
    if verbose: eta = ETA(len(text))
    for i in range(len(text)):
        yield tuple(text[max(0,i-M):i]), text[i]
        if verbose: eta.print_status(i, extra=verbose)
    if verbose: eta.done()
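
A possible invocation of the generator above; note that the string passed as verbose doubles as the extra label shown beside the progress indicator:

text = 'the quick brown fox'
for ctx, ch in contexts(text, M=3, verbose='ngrams'):
    pass  # consume the (context, symbol) pairs; progress prints as we go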
Example #29
import os
import json
import pandas as pd
import numpy as np
import csv
from eta import ETA

sub_dir = '/tudelft.net/staff-bulk/ewi/insy/mmc/nathan/nu/crawler/json_dumps/crawls_2014/'
list_dir = '/tudelft.net/staff-bulk/ewi/insy/mmc/nathan/nu/crawler/article_list_2014/'
article_date_df = pd.DataFrame({'article' : [], 'date' : [] , 'section' : [], 'comment_count' : []})

#sections = {''}

eta = ETA(20)

for i in range(20):
	eta.print_status()
	with open(list_dir + 'list_' + str(i)) as list_file:
		list_reader = csv.reader(list_file, delimiter='\t')
		for row in list_reader:
			article_id = str(row[0])
			article_dir = sub_dir + row[0] + '.article'
			with open(article_dir, 'r') as article_file:
				article = json.load(article_file)
				article_id = article['id']
				if article["social_counts"]["nujij_comments"] > 0:
					comment_dir = sub_dir + row[0] + '.comment'
					with open(comment_dir, 'r') as comment_file:
						comment_dict = json.load(comment_file)
						comment_count = len(comment_dict['comments'])
				else:
					comment_count = 0
Example #30
def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
    """
    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
    """

    if os.path.exists("%s.bai" % bam.filename):
        # This is an indexed file, so it is ref sorted...
        # Meaning that we should show chrom:pos, instead of read names
        show_ref_pos = True

    eta = None

    if not ref:
        if not quiet and bam.filename:
            eta = ETA(os.stat(bam.filename).st_size)

        for read in bam:
            pos = bam.tell()
            bgz_offset = pos >> 16

            if not quiet and eta:
                if callback:
                    eta.print_status(bgz_offset, extra=callback(read))
                elif show_ref_pos:
                    if read.tid > -1:
                        eta.print_status(bgz_offset, extra="%s:%s %s" % (bam.getrname(read.tid), read.pos, read.qname))
                    else:
                        eta.print_status(bgz_offset, extra="unmapped %s" % (read.qname))
                else:
                    eta.print_status(bgz_offset, extra="%s" % read.qname)

            yield read

    else:
        working_chrom = None
        if ref in bam.references:
            working_chrom = ref
        elif ref[0:3] == "chr":
            # compensate for Ensembl vs UCSC ref naming
            if ref[3:] in bam.references:
                working_chrom = ref[3:]

        if not working_chrom:
            raise ValueError("Missing reference: %s" % ref)

        tid = bam.gettid(working_chrom)

        if not start:
            start = 0
        if not end:
            end = bam.lengths[tid]

        if not quiet and bam.filename:
            eta = ETA(end - start)

        for read in bam.fetch(working_chrom, start, end):
            if not quiet and eta:
                if callback:
                    eta.print_status(read.pos - start, extra=callback(read))
                else:
                    eta.print_status(
                        read.pos - start, extra="%s:%s %s" % (bam.getrname(read.tid), read.pos, read.qname)
                    )

            yield read

    if eta:
        eta.done()
Example #31
    def __init__(self, filename=None, cache_enabled=True, quiet=False, fileobj=None):
        if not filename and not fileobj:
            raise ValueError('Must pass either a filename or a fileobj')

        if fileobj:
            fobj = fileobj
            cache_enabled = False
            eta = None
        else:
            fobj = gzip_aware_open(filename)
            eta = ETA(os.stat(filename).st_size, fileobj=fobj)
            cachefile = os.path.join(os.path.dirname(filename), '.%s.cache' % os.path.basename(filename))

        self._genes = {}
        self._pos = 0
        self._gene_bins = {}
        self._gene_names = {}
        self._gene_ids = {}
        warned = False

        if cache_enabled and os.path.exists(cachefile):
            self._load_cache(cachefile)

        if not self._genes:
            if not quiet:
                sys.stderr.write('Reading GTF file... (%s) \n' % filename)

            for line in fobj:
                try:
                    idx = line.find('#')
                    if idx > -1:
                        if idx == 0:
                            continue
                        line = line[:idx]  # strip trailing comment
                    chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip().split('\t')
                    source = symbols[source]
                    start = int(start) - 1  # Note: 1-based
                    end = int(end)
                    attributes = {}
                    for key, val in [x.split(' ', 1) for x in [x.strip() for x in attrs.split(';')] if x]:
                        if val[0] == '"' and val[-1] == '"':
                            val = val[1:-1]
                        attributes[key] = val

                    gid = None
                    if 'isoform_id' in attributes:
                        gid = attributes['isoform_id']

                    elif 'gene_name' in attributes:  # use gene_name if we have it.
                        gid = attributes['gene_name']

                    # elif 'tss_id' in attributes:  # iGenomes GTF files... are strange. use gene_name first.
                    #     gid = attributes['tss_id']

                    else:
                        gid = attributes['gene_id']
                        if not warned and not quiet:
                            sys.stderr.write('\nGTF file potentially missing isoform annotation! Each transcript may be treated separately. (%s)\n' % gid)
                            sys.stderr.write('%s\n\n' % (str(attributes)))
                            warned = True
                    if eta:
                        eta.print_status(extra=gid)
                except:
                    import traceback
                    sys.stderr.write('Error parsing line:\n%s\n' % line)
                    traceback.print_exc()
                    sys.exit(1)

                if not gid in self._genes or chrom != self._genes[gid].chrom:
                    self._genes[gid] = _GTFGene(gid, chrom, source, **attributes)
                    if 'gene_name' in attributes:
                        gene_name = attributes['gene_name']
                        if not gene_name in self._gene_names:
                            self._gene_names[gene_name] = [gid]
                        else:
                            self._gene_names[gene_name].append(gid)

                        if gid != attributes['gene_id']:
                            self._gene_ids[attributes['gene_id']] = gid

                self._genes[gid].add_feature(attributes['transcript_id'], feature, start, end, strand)

            if eta:
                eta.done()

            if filename and fobj != sys.stdin:
                fobj.close()

            for gid in self._genes:
                gene = self._genes[gid]

                start_bin = gene.start / GTF.__binsize
                end_bin = gene.end / GTF.__binsize

                for bin in xrange(start_bin, end_bin+1):
                    if not (gene.chrom, bin) in self._gene_bins:
                        self._gene_bins[(gene.chrom, bin)] = [gid]
                    else:
                        self._gene_bins[(gene.chrom, bin)].append(gid)

            if cache_enabled:
                try:
                    self._write_cache(cachefile)
                except Exception, e:
                    sys.stderr.write("Error saving cache: %s!\n" % str(e))
                    pass  # do nothing if we can't write the cache.
Example #32
def refiso_junctions(fname, refname, fragment_size=46, min_size=50, out=sys.stdout, max_exons=3):
    '''
    Given a refiso file and a reference genome, produce a FASTA file
    representing all possible unique splice junctions within an isoform.

    fragment_size  - the maximum amount from each side of a splice to include
    min_size       - the minimum length of a junction
    max_exons      - the maximum number of exons to include in a junction (for small IG exons)
    '''

    refiso = RefIso(fname)
    ref = pysam.Fastafile(refname)
    
    references = []
    with open('%s.fai' % refname) as f:
        for line in f:
            cols = line.split('\t')
            references.append(cols[0])

    def _extend_junction(seq,name,chrom,exons,counter=1):
        if counter >= max_exons:
            return
        start,end = exons[0]
        frag_end = end
        if end-start > fragment_size:
            frag_end = start+fragment_size

        seq5 = ref.fetch(chrom,start,frag_end)
        newname = '%s,%s-%s' % (name,start,frag_end)
        newseq = seq + seq5
        if len(newseq) >= min_size:
            yield newname,newseq
            return
        elif len(exons) > 1 and counter+1 < max_exons:
            for i in xrange(1,len(exons)):
                for nn_name,nn_seq in _extend_junction(newseq,newname,chrom,exons[i:],counter+1):
                    yield nn_name,nn_seq

    
    eta = ETA(refiso.fsize(), fileobj=refiso)
    junctions = set()
    for gene in refiso.genes:
        if not gene.chrom in references:
            continue
        for txpt in gene.transcripts:
            exons = zip(txpt.exon_starts,txpt.exon_ends)
            # print exons
            if len(exons) > 1000 or gene.name == 'abParts':
                # skip IG hyper / Ab regions
                continue
            for i,(start,end) in enumerate(exons):
                eta.print_status(extra='%s:%s %s #%s' % (gene.chrom,gene.tx_start,gene.name,i))
                if i == len(exons)-1:
                    # can't splice the last exon
                    continue
                frag_start = start
                
                if end-start > fragment_size:
                    frag_start = end-fragment_size
                
                # print '[%s] %s:%s-%s' % (i,gene.chrom,frag_start,end)
                seq3 = ref.fetch(gene.chrom,frag_start,end)
                for j in xrange(len(exons)-i-1):
                    # print '   [%s]' % (j+i+1),
                    # print '%s-%s' % exons[j+i+1]
                    for name,seq in _extend_junction(seq3,'%s:%s-%s' % (gene.chrom,frag_start,end),gene.chrom,exons[j+i+1:]):
                        if not name in junctions:
                            junctions.add(name)
                            out.write('>%s\n%s\n' % (name,seq))
                
    eta.done()
Example #33
 def scanpaths(self, paths, prefix=None):
     prefix = prefix or os.path.join(*os.path.commonprefix(tuple(map(splitpath, map(os.path.abspath, paths)))))
     fl = []
     estsize = 0
     ignored = []
     logging.info("Scanning files...")
     for path in paths:
         if os.path.isfile(path):
             try:
                 relfn = os.path.relpath(path, prefix)
                 if self.ffilter(relfn, prefix):
                     filesize = os.path.getsize(path)
                     fl.append((os.path.relpath(path, prefix), filesize, filesize))
                     estsize += min(self.samplesize, filesize)
                 else:
                     ignored.append(path)
             except Exception as ex:
                 logging.error(ex)
         else:
             for root, dirs, files in os.walk(path):
                 for name in files:
                     fn = os.path.join(root, name)
                     relfn = os.path.relpath(fn, prefix)
                     try:
                         if self.ffilter(relfn, prefix):
                             filesize = os.path.getsize(fn)
                             fl.append((relfn, filesize, filesize))
                             estsize += min(self.samplesize, filesize)
                         else:
                             ignored.append((relfn, os.path.getsize(fn)))
                     except Exception as ex:
                         logging.error(ex)
                         # file access error -> ignore
                         ignored.append((relfn, 0))
                 for name in dirs:
                     fn = os.path.join(root, name)
                     # not ignoring empty dirs
                     if not os.listdir(fn):
                         fl.append((os.path.relpath(fn + '/', prefix), 0, 0))
     # estimate compressed size
     if callable(self.compressfunc):
         logging.info("Calculating estimated compressed size...")
         eta = ETA(estsize, min_ms_between_updates=500)
         estcurrent = 0
         for k, v in enumerate(fl):
             filename, size, size2 = v
             fn = os.path.join(prefix, filename)
             try:
                 fl[k] = (filename, size, self.estcompresssize(fn, size))
             except Exception as ex:
                 logging.exception("Can't access " + fn)
             estcurrent += min(self.samplesize, size)
             eta.print_status(estcurrent)
         eta.done()
     if self.totalsizelim:
         filtered = []
         sizesum = 0
         maxfilesize = 0
         for k, v in sorted(enumerate(fl), key=lambda x: x[1][2]):
             filename, origsize, size = v
             if sizesum + size > self.totalsizelim:
                 ignored.append(fl[k][:2])
                 if not maxfilesize:
                     maxfilesize = origsize
             else:
                 filtered.append(fl[k])
                 sizesum += size
         fl = filtered
         if maxfilesize:
             logging.info("Max file size is " + sizeof_fmt(maxfilesize))
     return fl, ignored