コード例 #1
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
    def get_regions(self):
        """Yield fixed-width bins tiling every chromosome in ``self.chrom_lens``.

        Each chromosome is cut into windows of ``self.binsize`` starting at 0;
        the last window ends at the chromosome length.  Every window is
        yielded as a 6-tuple::

            (chrom, [start], [end], strand, [chrom, start, end, strand], None)

        once for the '+' strand and, when ``self.stranded`` is true, once more
        for the '-' strand.  Progress is reported through an ``ETA`` sized to
        the total number of bins.  Chromosomes with a non-positive length are
        skipped.
        """
        total = 0
        for _chrom, chrom_len in self.chrom_lens:
            # Ceiling division: a trailing partial bin still counts as a bin.
            # '//' keeps the total an integer under either division semantics.
            total += chrom_len // self.binsize
            if chrom_len % self.binsize != 0:
                total += 1

        eta = ETA(total)
        pos_acc = 0
        for chrom, chrom_len in self.chrom_lens:
            if chrom_len <= 0:
                # Nothing to tile; without this guard the trailing yield below
                # would emit a bogus [-1, chrom_len] region.
                continue

            pos = -1
            bin_start = 0
            for bin_start in xrange(0, chrom_len, self.binsize):
                if pos > -1:
                    # [pos, bin_start) is a complete, full-width bin.
                    eta.print_status(pos_acc,
                                     extra='%s:%s[+]' % (chrom, bin_start))
                    yield (chrom, [pos], [bin_start], '+',
                           [chrom, pos, bin_start, '+'], None)
                    if self.stranded:
                        eta.print_status(pos_acc,
                                         extra='%s:%s[-]' % (chrom, bin_start))
                        yield (chrom, [pos], [bin_start], '-',
                               [chrom, pos, bin_start, '-'], None)
                pos = bin_start
                pos_acc += 1

            # The last bin runs from the final bin start to the chromosome end.
            eta.print_status(pos_acc, extra='%s:%s[+]' % (chrom, bin_start))
            yield (chrom, [pos], [chrom_len], '+',
                   [chrom, pos, chrom_len, '+'], None)
            if self.stranded:
                eta.print_status(pos_acc,
                                 extra='%s:%s[-]' % (chrom, bin_start))
                # The strand used to be emitted as '- ' (trailing space),
                # disagreeing with every other '-' record.
                yield (chrom, [pos], [chrom_len], '-',
                       [chrom, pos, chrom_len, '-'], None)

        eta.done()
コード例 #2
0
ファイル: basecall.py プロジェクト: xuwei684/ngsutils
        def _gen1():
            """Yield every read that ``self.bam.fetch`` returns for each region.

            A region whose chromosome is not among ``self.bam.references`` is
            retried without a leading 'chr'; if that is not a reference
            either, the region is skipped.  ``count`` advances whenever a
            read's position differs from the previous read's position and is
            what gets reported to the ETA (unless ``self.quiet``).
            """
            if not self.quiet:
                eta = ETA(self.regions.total)
            else:
                eta = None

            count = 0
            for region in self.regions:
                working_chrom = None
                if region.chrom in self.bam.references:
                    working_chrom = region.chrom
                elif region.chrom[0:3] == 'chr':
                    # Previously tested an undefined name `chrom`; the check
                    # is about this region's chromosome name.
                    if region.chrom[3:] in self.bam.references:
                        working_chrom = region.chrom[3:]

                if not working_chrom:
                    continue

                # for troubleshooting
                self.cur_chrom = region.chrom
                self.cur_start = region.start
                self.cur_end = region.end

                laststart = 0
                for read in self.bam.fetch(working_chrom, region.start, region.end):
                    if read.pos != laststart:
                        count += 1
                        laststart = read.pos

                    if eta:
                        eta.print_status(count, extra='%s/%s %s:%s' % (count, self.regions.total, self.bam.references[read.tid], read.pos))

                    yield read
            if eta:
                eta.done()
コード例 #3
0
ファイル: idrip.py プロジェクト: git4unrealnondev/E621-IDRIP
def Loop(TotalPosts):
    """Fetch result pages, then write one post URL per line to a text file.

    Pages are requested with ``APIConnection(page)`` until it returns a value
    that does not compare equal to ``True`` or the running post counter (which
    advances by 320 per page) has passed ``TotalPosts``.  Every response in
    the module-level ``XMList`` is then parsed and each ``post`` element's
    ``id`` is written as an e621 URL.  A summary line with the percentage of
    ``TotalPosts`` covered is printed at the end.
    """
    cnt = 0
    Bal = True
    PageNum = 0

    MakeURL()
    eta = ETA(TotalPosts / 320)

    # The context manager closes the output file even when a request or the
    # XML parsing raises.
    with open(str(total[0]) + str(total[1] + '.txt'), 'w+') as out_file:
        # Explicit comparison kept on purpose: APIConnection's return type is
        # not visible here, so truthiness might not be equivalent.
        while Bal == True:  # noqa: E712
            Bal = APIConnection(PageNum + 1)
            time.sleep(RateLimit / 1000)
            if not cnt <= TotalPosts:
                Bal = False
            cnt += 320
            eta.print_status()
            PageNum += 1
        # Finish the progress display exactly once (it used to be finished a
        # second time after parsing).
        eta.done()

        print('Parsing XML')
        for each in XMList:
            root = ET.fromstring(each.content)
            for e in root.findall('post'):
                out_file.write('https://e621.net/post/show/' +
                               e.find('id').text + '\n')

            print('    ' + str(cnt) + '  ' + str(TotalPosts))

    # Guard against ZeroDivisionError when there was nothing to download.
    if TotalPosts:
        cntttl = cnt / TotalPosts
        cntttl = cntttl * 100
    else:
        cntttl = 0
    print('DL ' + str(cnt) + ' Posts or ' + str(cntttl) + '%')
コード例 #4
0
ファイル: extract.py プロジェクト: xuwei684/ngsutils
def bam_extract(inbam, outbam, bedfile, nostrand=False, quiet=False):
    """Copy reads that fall in the BED regions of ``bedfile`` to ``outbam``.

    Regions whose chromosome is not in ``inbam.references`` are ignored.
    The region's strand is passed to ``bam_extract_reads`` unless
    ``nostrand`` is set, in which case ``None`` is passed.  Unless ``quiet``,
    progress is shown and the number of written reads is reported on stderr.
    """
    bed = BedFile(bedfile)
    eta = None if quiet else ETA(os.stat(bedfile).st_size, fileobj=bed)

    passed = 0
    for region in bed:
        if eta:
            eta.print_status(extra="extracted:%s" % (passed))

        if region.chrom in inbam.references:
            strand = None if nostrand else region.strand
            matching_reads = bam_extract_reads(inbam, region.chrom,
                                               region.start, region.end,
                                               strand)
            for read in matching_reads:
                outbam.write(read)
                passed += 1

    if not quiet:
        eta.done()
        sys.stderr.write("%s extracted\n" % (passed, ))
コード例 #5
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
    def get_regions(self):
        """Yield one record per gene in ``self.gtf``, covering all its regions.

        Each item is ``(chrom, starts, ends, strand, columns, None)`` where
        ``starts``/``ends`` list the bounds of every entry of
        ``gene.regions`` and ``columns`` holds the gene id, gene name,
        optionally the ``isoform_id`` and ``gene_biotype`` attributes (empty
        string when absent), then chrom, strand, start and end.
        """
        progress = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            progress.print_status(extra=gene.gene_name)

            # All regions are included; transcripts and exons are not treated
            # separately because the regions encompass all exons anyway.
            region_starts = []
            region_ends = []
            for _num, region_start, region_end, _const, _names in gene.regions:
                region_starts.append(region_start)
                region_ends.append(region_end)

            columns = [gene.gene_id, gene.gene_name]
            if self.has_isoform:
                if 'isoform_id' in gene.attributes:
                    columns.append(gene.attributes['isoform_id'])
                else:
                    columns.append('')
            if self.has_biotype:
                if 'gene_biotype' in gene.attributes:
                    columns.append(gene.attributes['gene_biotype'])
                else:
                    columns.append('')
            columns += [gene.chrom, gene.strand, gene.start, gene.end]

            yield (gene.chrom, region_starts, region_ends, gene.strand,
                   columns, None)
        progress.done()
コード例 #6
0
def download(folder, file_name, user, pwd):
    """Download every granule listed in ``file_name``, last line first.

    ``file_name`` holds one URL per line.  After each download returns, the
    matching (last) line is dropped and the file is rewritten with the
    remaining URLs - presumably so an interrupted run can be resumed.

    ``user`` and ``pwd`` are passed straight to ``Granule.download``.
    """
    # Context managers make sure no file handle is left open.
    with open(file_name) as url_file:
        urls = url_file.readlines()

    eta = ETA(n_tot=len(urls))
    # Iterate over a reversed copy so `urls` itself can shrink in the loop.
    for url in urls[::-1]:
        url = url.strip()
        granule = Granule(url=url, folder=folder)
        eta.display(step='Downloading {name}'.format(name=granule.file_name))
        granule.download(user, pwd)
        del urls[-1]
        with open(file_name, 'w') as url_file:
            url_file.writelines(urls)
コード例 #7
0
ファイル: googletest.py プロジェクト: foogle/googletest
def batchcheck(gm, count=10000, workers=100):
    """Run ``gm.checkoneip`` over a batch of IPs on a thread pool.

    When ``count`` is smaller than ``gm.count`` the batch is
    ``gm.randomips(count)``; otherwise it is all of ``gm.ips()``.  Progress is
    exposed on ``gm.progress``.  A ``KeyboardInterrupt`` or
    ``BrokenPipeError`` stops the pool early; in every case the progress
    display is finished and ``gm.outputip()`` is returned.
    """
    # Clamp to at least one worker so a zero count cannot divide by zero.
    workers = max(1, min(workers, count))
    chunksize = max(count // workers, 1)

    if count < gm.count:
        gm.progress = ETA(count)
        ips = gm.randomips(count)
    else:
        gm.progress = ETA(gm.count)
        ips = gm.ips()

    # Created before the try block so the handlers below can always refer to
    # it, and with gm.progress already set so `finally` cannot fail on it.
    pool = multiprocessing.dummy.Pool(workers)
    try:
        for _ in pool.imap_unordered(gm.checkoneip, ips, chunksize):
            pass
    except (KeyboardInterrupt, BrokenPipeError):
        pool.terminate()
    else:
        # Normal completion: release the worker threads.
        pool.close()
        pool.join()
    finally:
        gm.progress.done()
    return gm.outputip()
コード例 #8
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
 def get_regions(self):
     """Yield one single-interval record for every region in ``self.bed``.

     Each item is ``(chrom, [start], [end], strand, columns, None)`` with
     ``columns`` being chrom, start, end, name, score and strand.
     """
     progress = ETA(self.bed.length, fileobj=self.bed)
     for region in self.bed:
         label = '%s:%s-%s[%s]' % (region.chrom, region.start, region.end,
                                   region.strand)
         progress.print_status(extra=label)

         columns = [
             region.chrom, region.start, region.end, region.name,
             region.score, region.strand
         ]
         yield (region.chrom, [region.start], [region.end], region.strand,
                columns, None)
     progress.done()
コード例 #9
0
def audio_bytestream(data: numpy.ndarray, step: int, lookahead_steps: int):
    """Computes optimal sequence of player opcodes to reproduce audio data.

    This is a generator.  For each position it asks ``lookahead`` to choose
    among the candidate opcodes for the current frame offset, yields the
    chosen opcode and advances the state with ``evolve``.  Once the input is
    consumed it yields ``NOTICK_6`` opcodes up to frame offset 2047 followed
    by ``EXIT``, then prints the accumulated error and per-opcode usage.
    """
    dlen = len(data)
    # TODO: avoid temporarily doubling memory footprint to concatenate
    padding = numpy.zeros(lookahead_steps, dtype=numpy.float32)
    data = numpy.concatenate([data, padding])

    voltage = -1.0
    position = -1.0

    # Pre-warm cache so we don't skew ETA during encoding
    for warm_offset in range(2048):
        _, _ = opcodes.candidate_opcodes(
            frame_horizon(warm_offset, lookahead_steps), lookahead_steps)

    total_err = 0.0
    frame_offset = 0
    eta = ETA(total=1000)
    i = 0
    last_updated = 0
    opcode_counts = collections.defaultdict(int)
    # Number of samples between progress updates; constant for the whole run.
    update_interval = int(dlen / 1000)

    while i < dlen:
        if i - last_updated > update_interval:
            eta.print_status()
            last_updated = i

        horizon = frame_horizon(frame_offset, lookahead_steps)
        candidate_opcodes, voltages = opcodes.candidate_opcodes(
            horizon, lookahead_steps)
        opcode_idx = lookahead(step, position, data, i, voltage * voltages)
        opcode = candidate_opcodes[opcode_idx][0]
        opcode_counts[opcode] += 1
        yield opcode

        position, voltage, new_error, i = evolve(opcode, position, voltage,
                                                 step, data, i)

        total_err += new_error
        frame_offset = (frame_offset + 1) % 2048

    for _ in range(frame_offset % 2048, 2047):
        yield opcodes.Opcode.NOTICK_6
    yield opcodes.Opcode.EXIT
    eta.done()
    print("Total error %f" % total_err)

    print("Opcodes used:")
    by_usage = sorted(opcode_counts.items(),
                      key=lambda item: item[1],
                      reverse=True)
    for used_opcode, uses in by_usage:
        print("%s: %d" % (used_opcode, uses))
コード例 #10
0
def download(folder, file_list):
    """Download every file listed in ``file_list``, last row first.

    Each row is comma separated: the first field is used as the file id and
    the second as the URL.  After a row's download has been verified the row
    is dropped and ``file_list`` is rewritten with the remaining rows -
    presumably so an interrupted run can be resumed.
    """
    # Context managers make sure no file handle is left open.
    with open(file_list) as list_file:
        files = list_file.readlines()

    eta = ETA(n_tot=len(files))
    # Iterate over a reversed copy so `files` itself can shrink in the loop.
    for row in files[::-1]:
        fields = row.split(',')
        # NOTE(review): the row is not stripped, so when the URL is the last
        # column it still carries the line terminator - confirm LadswebFile
        # copes with that.
        ladsweb_file = LadswebFile(file_id=fields[0], url=fields[1])
        ladsweb_file.get_file_name()
        eta.display(step='Downloading {name}'.format(
            name=ladsweb_file.file_name))
        ladsweb_file.get_properties()
        ladsweb_file.verified_download(folder)
        del files[-1]
        with open(file_list, 'w') as list_file:
            list_file.writelines(files)
コード例 #11
0
ファイル: __init__.py プロジェクト: vonovvonov/tools-iuc
    def fetch(self, quiet=False):
        """Yield a ``FASTARead`` for each record read from ``self.fileobj``.

        Blank lines and lines starting with '#' are skipped.  A '>' line
        starts a record: the text up to the first space or tab is the name,
        the remainder (if any) the comment.  Following lines are concatenated
        into the sequence; when ``self.qual`` is set each line is prefixed
        with a single space instead.  Records lacking a name or a sequence
        are not yielded.  Progress is shown unless ``quiet`` is set or there
        is no real file name (empty or '-').
        """
        if not quiet and self.fname and self.fname != '-':
            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)
        else:
            eta = None

        name = ''
        comment = ''
        seq = ''

        for raw_line in self.fileobj:
            line = raw_line.strip()
            if not line or line[0] == '#':
                continue

            if line[0] != '>':
                # Sequence (or quality) data belonging to the current record.
                if self.qual:
                    seq = seq + ' ' + line
                else:
                    seq += line
                continue

            # A new header: emit the record collected so far, if complete.
            if name and seq:
                if eta:
                    eta.print_status(extra=name)
                yield FASTARead(name, comment, seq)

            header = re.split(r'[ \t]', line[1:], maxsplit=1)
            name = header[0]
            comment = header[1] if len(header) > 1 else ''
            seq = ''

        # Emit the final record, which has no header after it.
        if name and seq:
            if eta:
                eta.print_status(extra=name)
            yield FASTARead(name, comment, seq)

        if eta:
            eta.done()
コード例 #12
0
def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
    """Yield every pileup column from ``bam.pileup(mask=mask)``.

    Progress is reported only when ``quiet`` is false *and* ``bam.filename``
    is set (the ETA is sized from that file).  The status text comes from
    ``callback(pileup)`` when a callback is given, otherwise it is
    ``'<reference name>:<pos>'``.
    """
    if not quiet and bam.filename:
        eta = ETA(os.stat(bam.filename).st_size)
    else:
        eta = None

    for pileup in bam.pileup(mask=mask):
        # Test `eta`, not `quiet`: with quiet=False but no filename there is
        # no ETA object and the old code crashed on None.print_status.
        if eta:
            # presumably the upper bits of bam.tell() are the offset into the
            # compressed file, matching the file-size total - TODO confirm
            bgz_offset = bam.tell() >> 16
            if callback:
                extra = callback(pileup)
            else:
                extra = '%s:%s' % (bam.getrname(pileup.tid), pileup.pos)
            eta.print_status(bgz_offset, extra=extra)

        yield pileup

    if eta:
        eta.done()
コード例 #13
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
def _repeatreader(fname):
    """Yield ``(family, member, chrom, start, end, strand)`` per data row.

    The first three lines of ``fname`` are skipped as a header and the rest
    is split on whitespace (presumably RepeatMasker output - TODO confirm).
    ``start`` is column 6 minus one and ``end`` is column 7 unchanged; any
    strand value other than '+' is reported as '-'.
    """
    with ngsutils.support.ngs_utils.gzip_opener(fname) as repeat_f:
        eta = ETA(os.stat(fname).st_size, fileobj=repeat_f)
        # Skip the three header lines.  The builtin next() with a default
        # works on both Python 2 and 3 and tolerates files shorter than the
        # header.
        for _ in range(3):
            next(repeat_f, None)

        for line in repeat_f:
            cols = line.strip().split()
            if not cols:
                # A blank line carries no record.
                continue
            chrom = cols[4]
            start = int(cols[5]) - 1
            end = int(cols[6])
            strand = '+' if cols[8] == '+' else '-'
            family = cols[10]
            member = cols[9]

            eta.print_status(extra='%s|%s %s:%s-%s[%s]' %
                             (family, member, chrom, start, end, strand))
            yield (family, member, chrom, start, end, strand)
        eta.done()
コード例 #14
0
    def fetch(self, quiet=False, callback=None):
        """Yield reads from ``self.fileobj`` until ``StopIteration`` is raised.

        Reads come from ``fastq_read_file``.  When a file name is known and
        ``quiet`` is false, progress is shown with either ``callback()`` or
        the read name as the status text.
        """
        eta = None
        if self.fname and not quiet:
            eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj)

        reading = True
        while reading:
            try:
                read = fastq_read_file(self.fileobj)
                if eta:
                    status = callback() if callback else read.name
                    eta.print_status(extra=status)
                yield read

            except StopIteration:
                reading = False

        if eta:
            eta.done()
コード例 #15
0
ファイル: __init__.py プロジェクト: vonovvonov/tools-iuc
def gzip_reader(fname,
                quiet=False,
                callback=None,
                done_callback=None,
                fileobj=None):
    """Yield lines from a plain or gzip-compressed file, '-' or ``fileobj``.

    Source selection, in order: ``fileobj`` if given; standard input when
    ``fname`` is '-'; ``gzip.open`` for names ending in '.gz' or '.bgz';
    plain ``open`` otherwise ('~' is expanded).

    Unless ``quiet`` is set (or there is no real file name), progress is
    shown with ``callback()`` as the status text.  After each yielded line,
    ``done_callback()`` returning a true value stops the iteration.

    The source is closed (unless it is ``sys.stdin``) when iteration ends,
    including when the generator is closed early or an error occurs.
    """
    if fileobj:
        f = fileobj
    elif fname == '-':
        f = sys.stdin
    elif fname[-3:] == '.gz' or fname[-4:] == '.bgz':
        f = gzip.open(os.path.expanduser(fname))
    else:
        f = open(os.path.expanduser(fname))

    eta = None
    try:
        if not quiet and fname and fname != '-':
            # Stat the same expanded path that was opened above; the raw
            # name fails for paths such as '~/reads.txt'.
            eta = ETA(os.stat(os.path.expanduser(fname)).st_size, fileobj=f)

        for line in f:
            if eta:
                if callback:
                    extra = callback()
                else:
                    extra = ''

                eta.print_status(extra=extra)
            yield line

            if done_callback and done_callback():
                break
    finally:
        # Runs on exhaustion, on error and when the consumer abandons the
        # generator, so the handle is never leaked.
        if f is not sys.stdin:
            f.close()

        if eta:
            eta.done()
コード例 #16
0
def gtf_junctions(gtf,
                  refname,
                  fragment_size,
                  min_size,
                  max_exons=5,
                  known=False,
                  out=sys.stdout,
                  quiet=False,
                  scramble=False,
                  retain_introns=False):
    """Export junctions for every gene in ``gtf`` via a ``JunctionExporter``.

    Only genes whose chromosome is listed in ``<refname>.fai`` are handled.
    With ``known`` set, each pair of consecutive exons of every transcript
    is exported.  Otherwise the sorted set of all exons of the gene is
    exported in one call; before that, retained introns are exported when
    ``retain_introns`` is set, and the exon list is repeated twice when
    ``scramble`` is set.  Progress is shown unless ``quiet``.
    """
    ref = pysam.Fastafile(refname)

    # The first column of each .fai line is a reference name.
    with open('%s.fai' % refname) as fai:
        references = [line.split('\t')[0] for line in fai]

    eta = None if quiet else ETA(gtf.fsize(), fileobj=gtf)

    exporter = JunctionExporter(ref, fragment_size, min_size, max_exons, out,
                                scramble)

    for gene in gtf.genes:
        if gene.chrom not in references:
            continue

        if eta:
            eta.print_status(extra='%s:%s %s' %
                             (gene.chrom, gene.start, gene.gene_name))

        if known:
            # Only junctions between neighbouring exons of real transcripts.
            for txpt in gene.transcripts:
                previous = None
                for exon in txpt.exons:
                    if previous:
                        exporter.export(gene.chrom, [previous, exon])
                    previous = exon
            continue

        exons = sorted(
            set(exon for txpt in gene.transcripts for exon in txpt.exons))

        if retain_introns:
            exporter.export_retained_introns(gene.chrom, exons, gene.strand)

        if scramble:
            # We can just pretend the transcript is repeated
            # and then let the set take care of removing the duplicates
            exons = exons * 2

        exporter.export(gene.chrom, exons)

    if eta:
        eta.done()
    ref.close()
コード例 #17
0
def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
    '''
    Yield reads from `bam`, either all of them or only those in a region.

    Without `ref` the whole file is iterated.  With `ref` only
    `bam.fetch(ref, start, end)` is iterated; `start` defaults to 0 and `end`
    to the reference length.  A `ref` starting with 'chr' that is not a known
    reference is retried without that prefix, and a ValueError is raised when
    neither form is known.  Progress is shown unless `quiet` is set or
    `bam.filename` is empty; `callback(read)` supplies the status text when
    given.

    >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
    ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
    '''

    if os.path.exists('%s.bai' % bam.filename):
        # This is an indexed file, so it is ref sorted...
        # Meaning that we should show chrom:pos, instead of read names
        show_ref_pos = True

    eta = None

    if not ref:
        if not quiet and bam.filename:
            eta = ETA(os.stat(bam.filename).st_size)

        for read in bam:
            bgz_offset = bam.tell() >> 16

            if not quiet and eta:
                if callback:
                    status = callback(read)
                elif not show_ref_pos:
                    status = '%s' % read.qname
                elif read.tid > -1:
                    status = '%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)
                else:
                    status = 'unmapped %s' % (read.qname)
                eta.print_status(bgz_offset, extra=status)

            yield read

    else:
        working_chrom = None
        if ref in bam.references:
            working_chrom = ref
        elif ref[0:3] == 'chr' and ref[3:] in bam.references:
            # compensate for Ensembl vs UCSC ref naming
            working_chrom = ref[3:]

        if not working_chrom:
            raise ValueError('Missing reference: %s' % ref)

        tid = bam.gettid(working_chrom)

        start = start or 0
        end = end or bam.lengths[tid]

        if not quiet and bam.filename:
            eta = ETA(end - start)

        for read in bam.fetch(working_chrom, start, end):
            if not quiet and eta:
                if callback:
                    status = callback(read)
                else:
                    status = '%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)
                eta.print_status(read.pos - start, extra=status)

            yield read

    if eta:
        eta.done()
コード例 #18
0
def refiso_junctions(fname,refname,fragment_size=46,min_size=50,out=sys.stdout,max_exons=3):
    '''
    Given a refiso file and a reference genome, write FASTA records
    representing all possible unique splice junctions within an isoform.

    fname          - path handed to RefIso

    refname        - reference FASTA opened with pysam; '<refname>.fai' is
                     read to learn which chromosomes exist

    fragment_size  - the maximum amount from each side of a splice to include

    min_size       - the minimum length of a junction; extension stops as soon
                     as the accumulated sequence reaches this length

    out            - object whose write() receives '>name' / sequence lines

    max_exons      - the maximum number of exons to include in a junction (for small IG exons)

    Genes on chromosomes missing from the .fai are skipped, as are
    transcripts with more than 1000 exons and genes named 'abParts'.  Each
    junction name is written only once.
    '''

    refiso = RefIso(fname)
    ref = pysam.Fastafile(refname)

    # The first column of every .fai line is a reference (chromosome) name.
    references = []
    with open('%s.fai' % refname) as f:
        for line in f:
            cols = line.split('\t')
            references.append(cols[0])

    # Recursive generator: appends up to fragment_size bases from the start of
    # the first exon in `exons` to `seq`.  If the result is at least min_size
    # long it is yielded as (name, sequence) and extension stops; otherwise it
    # recurses into each later suffix of `exons`, as long as the exon budget
    # (`counter` versus max_exons) allows.
    def _extend_junction(seq,name,chrom,exons,counter=1):
        if counter >= max_exons:
            return
        start,end = exons[0]
        frag_end = end
        if end-start > fragment_size:
            frag_end = start+fragment_size

        seq5 = ref.fetch(chrom,start,frag_end)
        newname = '%s,%s-%s' % (name,start,frag_end)
        newseq = seq + seq5
        if len(newseq) >= min_size:
            yield newname,newseq
            return
        elif len(exons) > 1 and counter+1 < max_exons:
            for i in xrange(1,len(exons)):
                for nn_name,nn_seq in _extend_junction(newseq,newname,chrom,exons[i:],counter+1):
                    yield nn_name,nn_seq


    eta=ETA(refiso.fsize(),fileobj=refiso)
    # Names already written, so each junction appears once in the output.
    junctions = set()
    for gene in refiso.genes:
        if not gene.chrom in references:
            continue
        for txpt in gene.transcripts:
            # NOTE(review): len() and slicing below need zip() to return a
            # list, which is the Python 2 behaviour.
            exons = zip(txpt.exon_starts,txpt.exon_ends)
            # print exons
            if len(exons) > 1000 or gene.name == 'abParts':
                # skip IG hyper / Ab regions
                continue
            for i,(start,end) in enumerate(exons):
                eta.print_status(extra='%s:%s %s #%s' % (gene.chrom,gene.tx_start,gene.name,i))
                if i == len(exons)-1:
                    # don't splice the last exon
                    continue
                frag_start = start

                # Keep at most fragment_size bases from the end of this exon.
                if end-start > fragment_size:
                    frag_start = end-fragment_size

                # print '[%s] %s:%s-%s' % (i,gene.chrom,frag_start,end)
                seq3 = ref.fetch(gene.chrom,frag_start,end)
                # Try every downstream exon as the first partner.
                for j in xrange(len(exons)-i-1):
                    # print '   [%s]' % (j+i+1),
                    # print '%s-%s' % exons[j+i+1]
                    for name,seq in _extend_junction(seq3,'%s:%s-%s' % (gene.chrom,frag_start,end),gene.chrom,exons[j+i+1:]):
                        if not name in junctions:
                            junctions.add(name)
                            out.write('>%s\n%s\n' % (name,seq))

    eta.done()
コード例 #19
0
ファイル: __init__.py プロジェクト: xuwei684/ngsutils
    def __init__(self, filename=None, cache_enabled=True, quiet=False, fileobj=None):
        """Load gene annotations from a GTF file or an open file object.

        Either ``filename`` or ``fileobj`` must be given, otherwise a
        ValueError is raised.  With ``fileobj`` the on-disk cache is disabled.
        With ``filename`` and ``cache_enabled``, a '.<basename>.cache' file
        next to the input is loaded if present and written after parsing.

        The id used to group features is taken, in order of preference, from
        the ``isoform_id``, ``gene_name`` or ``gene_id`` attribute.  If none
        is present the attributes are re-read as ``key=value`` pairs and the
        first value is used, or ``id_<line number>`` when there are none.
        ``quiet`` suppresses the informational messages on stderr.

        A line that cannot be parsed is reported on stderr with a traceback
        and the process exits with status 1.
        """
        if not filename and not fileobj:
            raise ValueError('Must pass either a filename or a fileobj')

        if fileobj:
            fobj = fileobj
            cache_enabled = False
            eta = None
        else:
            fobj = gzip_aware_open(filename)
            eta = ETA(os.stat(filename).st_size, fileobj=fobj)
            cachefile = os.path.join(os.path.dirname(filename), '.%s.cache' % os.path.basename(filename))

        self._genes = {}
        self._pos = 0
        self._gene_bins = {}
        self._gene_names = {}
        self._gene_ids = {}
        warned = False

        if cache_enabled and os.path.exists(cachefile):
            self._load_cache(cachefile)

        if not self._genes:
            if not quiet:
                sys.stderr.write('Reading GTF file... (%s) \n' % filename)

            for linenum, line in enumerate(fobj):
                try:
                    idx = line.find('#')
                    if idx > -1:
                        if idx == 0:
                            continue
                        # Keep only the text before the comment marker.  This
                        # used to be line[:-idx], which instead chopped idx
                        # characters off the end of the line.
                        line = line[:idx]
                    chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip().split('\t')
                    source = symbols[source]
                    start = int(start) - 1  # Note: 1-based
                    end = int(end)
                    attributes = {}

                    for key, val in [x.split(' ', 1) for x in [x.strip() for x in quoted_split(attrs, ';')] if x and ' ' in x]:
                        if val[0] == '"' and val[-1] == '"':
                            val = val[1:-1]
                        attributes[key] = val

                    gid = None

                    if 'isoform_id' in attributes:
                        gid = attributes['isoform_id']

                    elif 'gene_name' in attributes:  # use gene_name if we have it.
                        gid = attributes['gene_name']

                    # elif 'tss_id' in attributes:  # iGenomes GTF files... are strange. use gene_name first.
                    #     gid = attributes['tss_id']

                    elif 'gene_id' in attributes:
                        gid = attributes['gene_id']
                        if not warned and not quiet:
                            sys.stderr.write('\nGTF file potentially missing isoform annotation! Each transcript may be treated separately. (%s)\n' % gid)
                            sys.stderr.write('%s\n\n' % (str(attributes)))
                            warned = True
                    else:
                        if not warned and not quiet:
                            sys.stderr.write('\nNot a valid GTF file! Maybe GFF?\n')
                            sys.stderr.write('%s\n\n' % (str(attributes)))
                            warned = True

                        # Fall back to GFF-style key=value attributes.
                        first_key = None
                        attributes = {}
                        for key, val in [x.split('=', 1) for x in [x.strip() for x in quoted_split(attrs, ';')] if x and '=' in x]:
                            if not first_key:
                                first_key = key
                            if val[0] == '"' and val[-1] == '"':
                                val = val[1:-1]
                            attributes[key] = val

                        if not attributes:
                            gid = 'id_%s' % linenum
                            if not warned and not quiet:
                                sys.stderr.write('\nGTF file missing annotations! Using line numbers as IDs\n')
                                warned = True
                        else:
                            gid = attributes[first_key]
                            if not warned and not quiet:
                                sys.stderr.write('\nGTF file missing annotations (gene_id, transcript_id)! Assuming GFF? Taking first attribute as ID (%s=%s)\n' % (first_key, gid))
                                sys.stderr.write('%s\n\n' % (str(attributes)))
                                warned = True

                    if eta:
                        eta.print_status(extra=gid)
                except Exception:
                    # Any parse failure is fatal, but Ctrl-C and SystemExit
                    # are no longer intercepted here.
                    import traceback
                    sys.stderr.write('Error parsing line:\n%s\n' % line)
                    traceback.print_exc()
                    sys.exit(1)

                if not gid in self._genes or chrom != self._genes[gid].chrom:
                    self._genes[gid] = _GTFGene(gid, chrom, source, **attributes)
                    if 'gene_name' in attributes:
                        gene_name = attributes['gene_name']
                        if not gene_name in self._gene_names:
                            self._gene_names[gene_name] = [gid]
                        else:
                            self._gene_names[gene_name].append(gid)

                        # A record may carry gene_name without gene_id; do
                        # not fail with KeyError in that case.
                        if 'gene_id' in attributes and gid != attributes['gene_id']:
                            self._gene_ids[attributes['gene_id']] = gid

                self._genes[gid].add_feature(attributes['transcript_id'] if 'transcript_id' in attributes else gid, feature, start, end, strand)

            if eta:
                eta.done()

            if filename and fobj != sys.stdin:
                fobj.close()

            # Index every gene under each bin its span touches.
            for gid in self._genes:
                gene = self._genes[gid]

                # '//' keeps the bin numbers integral.
                start_bin = gene.start // GTF.__binsize
                end_bin = gene.end // GTF.__binsize

                for bin_idx in xrange(start_bin, end_bin+1):
                    if not (gene.chrom, bin_idx) in self._gene_bins:
                        self._gene_bins[(gene.chrom, bin_idx)] = [gid]
                    else:
                        self._gene_bins[(gene.chrom, bin_idx)].append(gid)

            if cache_enabled:
                try:
                    self._write_cache(cachefile)
                except Exception as e:
                    sys.stderr.write("Error saving cache: %s!\n" % str(e))
                    pass  # do nothing if we can't write the cache.
        elif fobj != sys.stdin:
            # Genes came from the cache, so the input opened above was never
            # read; close it instead of leaking the handle.
            fobj.close()
コード例 #20
0
ファイル: models.py プロジェクト: xuwei684/ngsutils
    def get_regions(self):
        """Yield one record per gene, with a callback giving per-region counts.

        Each item is ``(chrom, starts, ends, strand, geneout, callback)``.
        ``starts``/``ends`` list the bounds of every entry of
        ``gene.regions``; ``geneout`` holds the gene id, gene name,
        optionally the ``isoform_id`` and ``gene_biotype`` attributes (empty
        string when absent), then chrom, strand, start and end.

        ``callback(bam, common_count, common_reads, common_cols)`` is itself
        a generator yielding one row per region: a copy of ``common_cols``
        followed by start, end, the constant-span read count, the region
        number, 'const' or 'alt', the inclusion count, the exclusion count,
        the inclusion and exclusion fractions, and the alt index.
        """
        eta = ETA(self.gtf.fsize(), fileobj=self.gtf)

        for gene in self.gtf.genes:
            eta.print_status(extra=gene.gene_name)
            starts = []
            ends = []
            const_spans = []

            geneout = [
                gene.gene_id,
                gene.gene_name,
            ]
            if self.has_isoform:
                geneout.append(gene.attributes['isoform_id'] if 'isoform_id' in
                               gene.attributes else '')
            if self.has_biotype:
                geneout.append(gene.attributes['gene_biotype']
                               if 'gene_biotype' in gene.attributes else '')
            geneout.extend([gene.chrom, gene.strand, gene.start, gene.end])

            was_last_const = False
            for num, start, end, const, names in gene.regions:
                starts.append(start)
                ends.append(end)

                # Assemble a list of lists with contiguous spans of constant
                # regions.  This lets us count junction-spanning reads that
                # cover two constant exons/regions.

                if const:
                    if not was_last_const:
                        const_spans.append([])
                    const_spans[-1].append((start, end))
                    was_last_const = True
                else:
                    was_last_const = False

            # NOTE(review): callback closes over the loop variables `gene` and
            # `const_spans`; it sees this gene's values only if it is consumed
            # before the next item is requested from this generator - confirm
            # against the caller.
            def callback(bam, common_count, common_reads, common_cols):
                """Yield one output row per region of the enclosing gene."""
                # gather constant reads
                const_count = 0
                for span in const_spans:
                    starts = []
                    ends = []

                    for start, end in span:
                        starts.append(start)
                        ends.append(end)

                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, starts, ends,
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    const_count += count

                # find counts for each region
                for num, start, end, const, names in gene.regions:
                    count, reads = _fetch_reads(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, [start], [end],
                        self.multiple, False, self.whitelist, self.blacklist,
                        self.uniq_only, self.library_type)
                    excl_count, excl_reads = _fetch_reads_excluding(
                        bam, gene.chrom,
                        gene.strand if self.stranded else None, start, end,
                        self.multiple, self.whitelist, self.blacklist,
                        self.library_type)

                    # remove reads that exclude this region
                    for read in excl_reads:
                        if read in reads:
                            reads.remove(read)
                            count = count - 1

                    # find reads that *aren't* in this region
                    other_reads = 0
                    for read in common_reads:
                        if not read in reads and not read in excl_reads:
                            other_reads += 1

                    # An empty string marks a ratio that cannot be computed
                    # because its denominator is zero.
                    if other_reads > 0:
                        altindex = float(count - excl_count) / other_reads
                    else:
                        altindex = ''

                    if len(common_reads) > 0:
                        incl_pct = float(count) / len(common_reads)
                        excl_pct = float(excl_count) / len(common_reads)
                    else:
                        incl_pct = ''
                        excl_pct = ''

                    # Copy so the caller's shared column list is not mutated.
                    cols = common_cols[:]
                    cols.append(start)
                    cols.append(end)
                    cols.append(const_count)
                    cols.append(num)
                    cols.append('const' if const else 'alt')
                    cols.append(count)
                    cols.append(excl_count)
                    cols.append(incl_pct)
                    cols.append(excl_pct)
                    cols.append(altindex)
                    yield cols

            yield (gene.chrom, starts, ends, gene.strand, geneout, callback)
        eta.done()
コード例 #21
0
ファイル: sort.py プロジェクト: xuwei684/ngsutils
def fastq_sort(fastq,
               bysequence=False,
               tmpdir=None,
               tmpprefix='.tmp',
               chunksize=100000,
               nogz=False,
               out=sys.stdout,
               quiet=False):
    """Sort the reads of ``fastq`` by name (or by sequence) and write to ``out``.

    This is an external merge sort: reads are collected into chunks of at
    most ``chunksize`` and each chunk is handed to ``_write_tmp``, which
    returns a temporary file name.  The temporary files are then merged by
    repeatedly writing the smallest pending read.  They are opened with
    ``gzip.open`` unless ``nogz`` is set, and are closed and deleted when the
    function finishes, including when an error occurs.
    """
    tmpfiles = []
    tmpfobjs = []
    try:
        chunk = []

        sys.stderr.write('Sorting FASTQ file into chunks...\n')
        count = 0
        for read in fastq.fetch(quiet):
            count += 1
            key = read.seq if bysequence else read.name
            chunk.append((key, read))

            if len(chunk) >= chunksize:
                tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))
                chunk = []

        if chunk:
            tmpfiles.append(_write_tmp(chunk, tmpdir, tmpprefix, nogz))

        sys.stderr.write('\nMerging chunks...\n')
        sys.stderr.flush()

        # buf[i] holds the next unwritten read of chunk i; skip[i] is set
        # once chunk i has nothing left to read.
        buf = [None] * len(tmpfiles)
        skip = [False] * len(tmpfiles)

        eta = ETA(count)

        opener = open if nogz else gzip.open
        for tmpfile in tmpfiles:
            tmpfobjs.append(opener(tmpfile))

        j = 0
        writing = True
        while writing:
            j += 1
            eta.print_status(j)
            for i, fobj in enumerate(tmpfobjs):
                if not buf[i] and not skip[i]:
                    try:
                        read = fastq_read_file(fobj)
                    except Exception:
                        # Any failure to read another record ends this chunk
                        # (as before), but Ctrl-C is no longer swallowed.
                        skip[i] = True
                    else:
                        key = read.seq if bysequence else read.name
                        # The chunk index breaks ties, so read objects are
                        # never compared with each other.
                        buf[i] = (key, i, read)

            # Take the smallest pending entry directly rather than sorting a
            # list that mixes None and tuples, which only orders on Python 2.
            pending = [entry for entry in buf if entry]
            writing = bool(pending)
            if pending:
                _, i, read = min(pending)
                read.write(out)
                buf[i] = None
        eta.done()
    finally:
        for fobj in tmpfobjs:
            fobj.close()

        for tmpfile in tmpfiles:
            os.unlink(tmpfile)