def bit_clone(bits):
    """Return a new BitSet that is an exact copy of *bits*."""
    duplicate = BitSet(bits.size)
    duplicate.ior(bits)
    return duplicate
def throw_random_bits(lengths, mask, allow_overlap=False):
    """Randomly place segments of the given lengths into the gaps of *mask*.

    Returns a BitSet (same size as *mask*) with the thrown segments set.
    When overlaps are disallowed, sanity-check that the total number of set
    bits equals the sum of the requested lengths.
    """
    result = BitSet(mask.size)

    def mark(start, end):
        # Record one thrown segment [start, end) in the result bitset.
        result.set_range(start, end - start)

    throw_random_gap_list(lengths, mask, mark, allow_overlap)
    if not allow_overlap:
        assert result.count_range(0, result.size) == sum(lengths)
    return result
def count_overlap(bits1, bits2):
    """Count the number of bits that overlap between two sets."""
    shared = BitSet(bits1.size)
    shared.ior(bits1)
    shared.iand(bits2)
    return shared.count_range(0, shared.size)
def as_bits(region_start, region_length, intervals):
    """Convert a set of intervals overlapping a region of a chromosome into
    a bitset for just that region with the bits covered by the intervals set.

    :param region_start: absolute start coordinate of the region; interval
        starts are translated to region-relative positions by subtracting it
    :param region_length: size of the returned BitSet
    :param intervals: iterable of (chrom, start, stop) tuples
    :return: BitSet with one bit per region position, set where covered
    """
    bits = BitSet(region_length)
    # Renamed the first tuple element from `chr` (shadowed the builtin) to
    # `_chrom`; it is unpacked but never used here.
    for _chrom, start, stop in intervals:
        bits.set_range(start - region_start, stop - start)
    return bits
def process_gene_model(gene, upstream_promoter_flank=2000, promoter_inset_flank=1000, downstream_flank=1000):
    """Classify the bases of a gene model (plus flanks) and print BED6 lines.

    Builds strand-aware promoter and exon bitmasks over the gene span
    (extended by the upstream/downstream flanks), partitions the span into
    PROMOTER / EXONIC / INTRONIC segments via three_states_two_masks, relabels
    the final 3' segment as DOWNSTREAM_FLANK when it is intronic, and emits
    one BED6 line per segment via print_bed6.

    :param gene: gene model object exposing isforward(), start, end, strand,
        seqname, gene_id and transcripts (project type — see callers)
    :param upstream_promoter_flank: bases of promoter extension upstream of
        the TSS (clamped at the chromosome start)
    :param promoter_inset_flank: bases of promoter extension into the gene
    :param downstream_flank: bases appended past the 3' end
    """
    if gene.isforward():
        # Clamp so the upstream flank never runs off the left end of the chromosome.
        if (gene.start - upstream_promoter_flank) < 1:
            upstream_promoter_flank = gene.start - 1
        gene_start, gene_end = to0baseEx(gene.start - upstream_promoter_flank, gene.end + downstream_flank)
    else:
        # Reverse strand: the downstream flank sits on the left, clamp it instead.
        if (gene.start - downstream_flank) < 1:
            downstream_flank = gene.start - 1
        gene_start, gene_end = to0baseEx(gene.start - downstream_flank, gene.end + upstream_promoter_flank)
    base = gene_start
    size = gene_end - gene_start
    exon_mask = BitSet(size)
    promoter_mask = BitSet(size)
    # Mark all exonic regions in all transcripts, and per transcript mark the
    # promoter window anchored at the strand-appropriate terminal exon.
    for t in gene.transcripts:
        exons = list(t.exons())
        for i_e, ex in enumerate(exons):
            # Convert coordinates to 0-based, end-exclusive.
            e_start, e_end = to0baseEx(ex['start'], ex['end'])
            pos = e_start - base
            l = e_end - e_start
            exon_mask.set_range(pos, l)
        if gene.isforward():
            promoter_exon = exons[0]
            p_start, p_end = to0baseEx(promoter_exon['start'], promoter_exon['end'])
            pos = p_start - base
            p_len = min(size, upstream_promoter_flank + promoter_inset_flank)
            promoter_mask.set_range(pos - upstream_promoter_flank, p_len)
        else:
            promoter_exon = exons[-1]
            p_start, p_end = to0baseEx(promoter_exon['start'], promoter_exon['end'])
            pos = p_end - base
            p_len = min(size, upstream_promoter_flank + promoter_inset_flank)
            try:
                promoter_mask.set_range(max(0, pos - promoter_inset_flank), p_len)
            except Exception:
                # Report the offending gene before re-raising so failures are
                # debuggable.  (Was a bare `except:` with a Python-2 print.)
                print("failed on gene", gene.gene_id, gene.start, gene.end, gene.strand, gene_start, gene_end, base, size, file=sys.stderr)
                raise
    things = three_states_two_masks(PROMOTER, promoter_mask, EXONIC, exon_mask, INTRONIC)
    # Relabel the final 3' segment to be the downstream flank when intronic.
    threeprime_i = -1 if gene.isforward() else 0
    s, e, state = things[threeprime_i]
    if state == INTRONIC:
        things[threeprime_i] = (s, e, DOWNSTREAM_FLANK)
    for thing in things:
        s, e, current_state = thing
        print_bed6("chr" + gene.seqname, s + base, e + base, gene.strand, current_state, gene.gene_id)
def __main__():
    """Command-line entry point: compute per-species MAF alignment coverage
    over a set of intervals.

    Reads positional argv: MAF source type (popped first), then MAF filename,
    interval filename, output filename, dbkey, 1-based chrom/start/end column
    numbers, a summary flag, the maf_index.loc directory, and an optional MAF
    index filename.  Writes either per-interval coverage rows or a per-species
    summary table to the output file.
    """
    maf_source_type = sys.argv.pop(1)
    input_maf_filename = sys.argv[1].strip()
    input_interval_filename = sys.argv[2].strip()
    output_filename = sys.argv[3].strip()
    dbkey = sys.argv[4].strip()
    try:
        # Columns arrive 1-based on the command line; convert to 0-based.
        chr_col = int(sys.argv[5].strip()) - 1
        start_col = int(sys.argv[6].strip()) - 1
        end_col = int(sys.argv[7].strip()) - 1
    except Exception:
        print(
            "You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file.",
            file=sys.stderr)
        sys.exit()
    summary = sys.argv[8].strip()
    # Coerce the textual flag to a bool (anything other than "true" is False).
    if summary.lower() == "true":
        summary = True
    else:
        summary = False
    mafIndexFile = "%s/maf_index.loc" % sys.argv[9]
    try:
        # Optional tenth argument: a pre-built MAF index filename.
        maf_index_filename = sys.argv[10].strip()
    except Exception:
        maf_index_filename = None
    index = index_filename = None
    if maf_source_type == "user":
        # index maf for use here
        index, index_filename = maf_utilities.open_or_build_maf_index(
            input_maf_filename, maf_index_filename, species=[dbkey])
        if index is None:
            print("Your MAF file appears to be malformed.", file=sys.stderr)
            sys.exit()
    elif maf_source_type == "cached":
        # access existing indexes
        index = maf_utilities.maf_index_by_uid(input_maf_filename, mafIndexFile)
        if index is None:
            print("The MAF source specified (%s) appears to be invalid."
                  % (input_maf_filename), file=sys.stderr)
            sys.exit()
    else:
        # NOTE(review): this error goes to stdout, unlike the others — confirm
        # whether downstream tooling depends on that before changing it.
        print('Invalid source type specified: %s' % maf_source_type, file=sys.stdout)
        sys.exit()
    out = open(output_filename, 'w')
    num_region = None
    num_bad_region = 0
    species_summary = {}
    total_length = 0
    # loop through interval file
    for num_region, region in enumerate(
            bx.intervals.io.NiceReaderWrapper(
                open(input_interval_filename, 'r'),
                chrom_col=chr_col,
                start_col=start_col,
                end_col=end_col,
                fix_strand=True,
                return_header=False,
                return_comments=False)):  # noqa: B007
        src = "%s.%s" % (dbkey, region.chrom)
        region_length = region.end - region.start
        if region_length < 1:
            # Zero- or negative-length intervals are counted and skipped.
            num_bad_region += 1
            continue
        total_length += region_length
        # One coverage bitset per species; the primary genome is always present.
        coverage = {dbkey: BitSet(region_length)}
        for block in index.get_as_iterator(src, region.start, region.end):
            for spec in maf_utilities.get_species_in_block(block):
                if spec not in coverage:
                    coverage[spec] = BitSet(region_length)
            for block in maf_utilities.iter_blocks_split_by_species(block):
                if maf_utilities.component_overlaps_region(
                        block.get_component_by_src(src), region):
                    # need to chop and orient the block
                    block = maf_utilities.orient_block_by_region(
                        maf_utilities.chop_block_by_region(block, src, region),
                        src, region, force_strand='+')
                    start_offset, alignment = maf_utilities.reduce_block_by_primary_genome(
                        block, dbkey, region.chrom, region.start)
                    # Set a coverage bit for every non-gap alignment column.
                    for i in range(len(alignment[dbkey])):
                        for spec, text in alignment.items():
                            if text[i] != '-':
                                coverage[spec].set(start_offset + i)
        if summary:
            # record summary
            for key in coverage.keys():
                if key not in species_summary:
                    species_summary[key] = 0
                species_summary[
                    key] = species_summary[key] + coverage[key].count_range()
        else:
            # print coverage for interval
            coverage_sum = coverage[dbkey].count_range()
            out.write("%s\t%s\t%s\t%s\n"
                      % ("\t".join(region.fields), dbkey, coverage_sum,
                         region_length - coverage_sum))
            # Emit the remaining species in sorted order after the primary.
            keys = list(coverage.keys())
            keys.remove(dbkey)
            keys.sort()
            for key in keys:
                coverage_sum = coverage[key].count_range()
                out.write("%s\t%s\t%s\t%s\n"
                          % ("\t".join(region.fields), key, coverage_sum,
                             region_length - coverage_sum))
    if summary:
        out.write("#species\tnucleotides\tcoverage\n")
        for spec in species_summary:
            out.write("%s\t%s\t%.4f\n"
                      % (spec, species_summary[spec],
                         float(species_summary[spec]) / total_length))
    out.close()
    if num_region is not None:
        print("%i regions were processed with a total length of %i."
              % (num_region + 1, total_length))
    if num_bad_region:
        print("%i regions were invalid." % (num_bad_region))
    maf_utilities.remove_temp_index_file(index_filename)
"%s.sql" % os.path.join(input_dir, base_filename)) if not os.listdir(table_out_dir): print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir) os.rmdir(table_out_dir) continue #build bitsets from table bitset_dict = {} for line in gzip.open('%s.txt.gz' % os.path.join(input_dir, base_filename)): fields = line.strip().split('\t') chrom = fields[chrom_col] start = int(fields[start_col]) end = int(fields[end_col]) if chrom not in bitset_dict: bitset_dict[chrom] = BitSet( chrom_lengths.get(chrom, options.bitset_size)) bitset_dict[chrom].set_range(start, end - start) #write bitsets as profiled annotations for chrom_name, chrom_bits in bitset_dict.iteritems(): out = open( os.path.join(table_out_dir, '%s.covered' % chrom_name), 'wb') end = 0 total_regions = 0 total_coverage = 0 max_size = chrom_lengths.get(chrom_name, options.bitset_size) while True: start = chrom_bits.next_set(end) if start >= max_size: break end = chrom_bits.next_clear(start)