def bit_clone(bits):
    """
    Return a new bitset of the same size as `bits` with the same bits set.

    The copy is built by OR-ing `bits` into a freshly constructed BitSet,
    so the input is not modified.
    """
    duplicate = BitSet(bits.size)
    duplicate.ior(bits)
    return duplicate
Beispiel #2
0
def throw_random_bits(lengths, mask, allow_overlap=False):
    """
    Run `throw_random_gap_list` and collect its results in a new bitset.

    A BitSet of `mask.size` bits is created, and every (start, end) pair
    reported by `throw_random_gap_list` is marked in it. When overlaps are
    not allowed, the number of set bits is asserted to equal
    `sum(lengths)`.

    Returns the populated bitset.
    """
    placed = BitSet(mask.size)

    def record_interval(start, end):
        # set_range is called with (start, count), hence the subtraction.
        return placed.set_range(start, end - start)

    throw_random_gap_list(lengths, mask, record_interval, allow_overlap)

    if not allow_overlap:
        # Sanity check: without overlap every requested base is set once.
        assert placed.count_range(0, placed.size) == sum(lengths)
    return placed
def count_overlap(bits1, bits2):
    """
    Return how many bit positions are set in both `bits1` and `bits2`.

    The intersection is computed in a scratch bitset sized like `bits1`,
    so neither argument is modified.
    """
    shared = BitSet(bits1.size)
    shared |= bits1  # start from a copy of bits1 ...
    shared &= bits2  # ... and keep only the positions also set in bits2
    return shared.count_range(0, shared.size)
def as_bits(region_start, region_length, intervals):
    """
    Convert a set of intervals overlapping a region of a chromosome into
    a bitset for just that region with the bits covered by the intervals
    set.

    `intervals` is an iterable of 3-tuples whose second and third items are
    the interval start and stop (half-open, in the same coordinate system
    as `region_start`); the first item is ignored.

    Intervals are clipped to [region_start, region_start + region_length)
    before being marked, so an interval that only partially overlaps the
    region contributes just its overlapping part and one that lies entirely
    outside contributes nothing. Without clipping such intervals would
    produce a negative or too-large offset for `set_range`.
    NOTE(review): bx-python's BitSet is expected to raise IndexError for
    out-of-range offsets -- confirm against the BitSet in use.

    Returns a BitSet of `region_length` bits.
    """
    bits = BitSet(region_length)
    region_end = region_start + region_length
    for _chrom, start, stop in intervals:
        clipped_start = max(start, region_start)
        clipped_stop = min(stop, region_end)
        if clipped_stop > clipped_start:
            bits.set_range(clipped_start - region_start,
                           clipped_stop - clipped_start)
    return bits
def process_gene_model(gene, upstream_promoter_flank=2000, promoter_inset_flank=1000, downstream_flank=1000):
    """
    Segment a gene (plus flanks) into labelled states and print them as BED6.

    The gene span is extended by `upstream_promoter_flank` on the 5' side
    and `downstream_flank` on the 3' side (sides swap for reverse-strand
    genes). The flank on the low-coordinate side is shrunk when it would
    reach below position 1. Two bitsets over the extended span are filled:

    * an exon mask covering every exon of every transcript, and
    * a promoter mask of length
      min(span, upstream_promoter_flank + promoter_inset_flank) per
      transcript, anchored at the first exon (forward strand) or at the end
      of the last exon (reverse strand).

    `three_states_two_masks` turns the masks into (start, end, state)
    segments; if the 3'-most segment is INTRONIC it is relabelled
    DOWNSTREAM_FLANK. Each segment is then emitted with `print_bed6`.

    NOTE(review): presumably `to0baseEx` converts 1-based inclusive
    coordinates to 0-based half-open ones, and exons are yielded in
    ascending coordinate order -- confirm against their definitions.

    Returns None; the result is the printed output.
    """
    forward = gene.isforward()

    if forward:
        # Clamp the upstream flank so it cannot run off the left end of
        # the chromosome.
        if (gene.start - upstream_promoter_flank) < 1:
            upstream_promoter_flank = gene.start - 1
        gene_start, gene_end = to0baseEx(gene.start - upstream_promoter_flank,
                                         gene.end + downstream_flank)
    else:
        # On the reverse strand the downstream flank is on the left side.
        if (gene.start - downstream_flank) < 1:
            downstream_flank = gene.start - 1
        gene_start, gene_end = to0baseEx(gene.start - downstream_flank,
                                         gene.end + upstream_promoter_flank)

    base = gene_start
    size = gene_end - gene_start
    exon_mask = BitSet(size)
    promoter_mask = BitSet(size)

    # The promoter length does not depend on the transcript, so compute it
    # once (after any flank clamping above).
    p_len = min(size, upstream_promoter_flank + promoter_inset_flank)

    # mark all exonic regions in all transcripts
    for t in gene.transcripts:
        exons = list(t.exons())
        for ex in exons:
            # convert coordinates
            e_start, e_end = to0baseEx(ex['start'], ex['end'])
            exon_mask.set_range(e_start - base, e_end - e_start)

        if forward:
            promoter_exon = exons[0]
            p_start, _p_end = to0baseEx(promoter_exon['start'], promoter_exon['end'])
            pos = p_start - base
            promoter_mask.set_range(pos - upstream_promoter_flank, p_len)
        else:
            promoter_exon = exons[-1]
            _p_start, p_end = to0baseEx(promoter_exon['start'], promoter_exon['end'])
            pos = p_end - base
            try:
                promoter_mask.set_range(max(0, pos - promoter_inset_flank), p_len)
            except Exception:
                # Report which gene failed, then let the error propagate.
                # sys.stderr.write works on both Python 2 and 3, unlike the
                # `print >>` statement.
                details = ("failed on gene", gene.gene_id, gene.start, gene.end,
                           gene.strand, gene_start, gene_end, base, size)
                sys.stderr.write(" ".join(str(item) for item in details) + "\n")
                raise

    things = three_states_two_masks(PROMOTER, promoter_mask, EXONIC, exon_mask, INTRONIC)

    # set the final 3' thing to be downstream flank
    threeprime_i = -1 if forward else 0
    s, e, state = things[threeprime_i]
    if state == INTRONIC:
        things[threeprime_i] = (s, e, DOWNSTREAM_FLANK)

    for s, e, current_state in things:
        print_bed6("chr" + gene.seqname, s + base, e + base, gene.strand, current_state, gene.gene_id)
Beispiel #6
0
def __main__():
    """
    Report per-species MAF coverage for a set of intervals.

    Command-line arguments (after the script name):

    1.  MAF source type: "user" (index the given MAF file) or "cached"
        (look up an existing index by uid).
    2.  MAF filename, or uid for a cached source.
    3.  Interval filename.
    4.  Output filename.
    5.  dbkey of the primary genome.
    6-8. 1-based chromosome, start and end column numbers of the interval
        file.
    9.  "true" to write one summary table, anything else to write one set
        of lines per interval.
    10. Directory containing maf_index.loc.
    11. Optional MAF index filename.

    For every interval of positive length a bitset per species records
    which positions are covered by a non-gap character in the alignment
    blocks overlapping that interval. In per-interval mode each species'
    covered and uncovered counts are appended to the interval's fields; in
    summary mode the totals per species and their fraction of the total
    interval length are written at the end.

    Error messages are printed to stderr and the process exits via
    sys.exit() (the exit status is left unchanged from the original).
    """
    maf_source_type = sys.argv.pop(1)
    input_maf_filename = sys.argv[1].strip()
    input_interval_filename = sys.argv[2].strip()
    output_filename = sys.argv[3].strip()
    dbkey = sys.argv[4].strip()
    try:
        # Columns are given 1-based on the command line.
        chr_col = int(sys.argv[5].strip()) - 1
        start_col = int(sys.argv[6].strip()) - 1
        end_col = int(sys.argv[7].strip()) - 1
    except (IndexError, ValueError):
        print(
            "You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file.",
            file=sys.stderr)
        sys.exit()
    summary = sys.argv[8].strip().lower() == "true"

    mafIndexFile = "%s/maf_index.loc" % sys.argv[9]
    try:
        maf_index_filename = sys.argv[10].strip()
    except IndexError:
        # The index filename argument is optional.
        maf_index_filename = None

    index = index_filename = None
    if maf_source_type == "user":
        # index maf for use here
        index, index_filename = maf_utilities.open_or_build_maf_index(
            input_maf_filename, maf_index_filename, species=[dbkey])
        if index is None:
            print("Your MAF file appears to be malformed.", file=sys.stderr)
            sys.exit()
    elif maf_source_type == "cached":
        # access existing indexes
        index = maf_utilities.maf_index_by_uid(input_maf_filename,
                                               mafIndexFile)
        if index is None:
            print("The MAF source specified (%s) appears to be invalid." %
                  (input_maf_filename),
                  file=sys.stderr)
            sys.exit()
    else:
        # Reported on stderr like every other fatal error in this tool.
        print('Invalid source type specified: %s' % maf_source_type,
              file=sys.stderr)
        sys.exit()

    num_region = None
    num_bad_region = 0
    species_summary = {}
    total_length = 0

    try:
        # Both files are closed even if processing raises part-way through.
        with open(output_filename, 'w') as out, \
                open(input_interval_filename, 'r') as interval_file:
            reader = bx.intervals.io.NiceReaderWrapper(
                interval_file,
                chrom_col=chr_col,
                start_col=start_col,
                end_col=end_col,
                fix_strand=True,
                return_header=False,
                return_comments=False)

            # loop through interval file
            for num_region, region in enumerate(reader):
                src = "%s.%s" % (dbkey, region.chrom)
                region_length = region.end - region.start
                if region_length < 1:
                    num_bad_region += 1
                    continue
                total_length += region_length
                coverage = {dbkey: BitSet(region_length)}

                for block in index.get_as_iterator(src, region.start, region.end):
                    for spec in maf_utilities.get_species_in_block(block):
                        if spec not in coverage:
                            coverage[spec] = BitSet(region_length)
                    # A distinct name keeps the outer loop variable intact.
                    for sub_block in maf_utilities.iter_blocks_split_by_species(block):
                        if not maf_utilities.component_overlaps_region(
                                sub_block.get_component_by_src(src), region):
                            continue
                        # need to chop and orient the block
                        sub_block = maf_utilities.orient_block_by_region(
                            maf_utilities.chop_block_by_region(sub_block, src, region),
                            src,
                            region,
                            force_strand='+')
                        start_offset, alignment = maf_utilities.reduce_block_by_primary_genome(
                            sub_block, dbkey, region.chrom, region.start)
                        # Mark every column where a species has a non-gap
                        # character.
                        for i in range(len(alignment[dbkey])):
                            for spec, text in alignment.items():
                                if text[i] != '-':
                                    coverage[spec].set(start_offset + i)

                if summary:
                    # record summary
                    for key, bits in coverage.items():
                        species_summary[key] = species_summary.get(key, 0) + bits.count_range()
                else:
                    # print coverage for interval: primary genome first,
                    # then the remaining species in sorted order
                    region_fields = "\t".join(region.fields)
                    other_keys = sorted(key for key in coverage if key != dbkey)
                    for key in [dbkey] + other_keys:
                        coverage_sum = coverage[key].count_range()
                        out.write("%s\t%s\t%s\t%s\n" %
                                  (region_fields, key, coverage_sum,
                                   region_length - coverage_sum))

            if summary:
                out.write("#species\tnucleotides\tcoverage\n")
                # species_summary is only non-empty when at least one valid
                # region was counted, so total_length is > 0 here.
                for spec in species_summary:
                    out.write("%s\t%s\t%.4f\n" %
                              (spec, species_summary[spec],
                               float(species_summary[spec]) / total_length))

        if num_region is not None:
            print("%i regions were processed with a total length of %i." %
                  (num_region + 1, total_length))
        if num_bad_region:
            print("%i regions were invalid." % (num_bad_region))
    finally:
        # Remove the temporary index even when processing fails.
        maf_utilities.remove_temp_index_file(index_filename)
Beispiel #7
0
         "%s.sql" % os.path.join(input_dir, base_filename))
     if not os.listdir(table_out_dir):
         print "Removing empty table (%s) directory (%s)." % (
             table_name, table_out_dir)
         os.rmdir(table_out_dir)
     continue
 #build bitsets from table
 bitset_dict = {}
 for line in gzip.open('%s.txt.gz' %
                       os.path.join(input_dir, base_filename)):
     fields = line.strip().split('\t')
     chrom = fields[chrom_col]
     start = int(fields[start_col])
     end = int(fields[end_col])
     if chrom not in bitset_dict:
         bitset_dict[chrom] = BitSet(
             chrom_lengths.get(chrom, options.bitset_size))
     bitset_dict[chrom].set_range(start, end - start)
 #write bitsets as profiled annotations
 for chrom_name, chrom_bits in bitset_dict.iteritems():
     out = open(
         os.path.join(table_out_dir, '%s.covered' % chrom_name),
         'wb')
     end = 0
     total_regions = 0
     total_coverage = 0
     max_size = chrom_lengths.get(chrom_name, options.bitset_size)
     while True:
         start = chrom_bits.next_set(end)
         if start >= max_size:
             break
         end = chrom_bits.next_clear(start)