def main():
    """Measure how much two interval sets overlap versus random placement.

    Arguments are read from ``sys.argv``:

    1. region file -- one region per line, whitespace separated; the first
       four fields are used as chromosome, start, stop and a label.
    2. mask file -- intervals removed from every set before counting.
    3. number of random samples to draw per region and feature.
    4. first interval file.
    5. (and following) second interval files, one "feature" each.

    For every region the first set and each second set are restricted to
    unmasked positions and their overlap is counted. The interval lengths of
    each second set are then re-thrown ``nsamples`` times via
    ``throw_random`` and the overlap with the first set is accumulated per
    sample. Progress goes to stderr; the result table (feature names,
    observed fraction, then one row of sampled fractions per sample) and the
    summary lines go to stdout.

    Output is produced with ``write`` calls rather than print statements so
    the block parses under both Python 2 and Python 3.
    """
    region_fname = sys.argv[1]
    mask_fname = sys.argv[2]
    nsamples = int(sys.argv[3])
    intervals1_fname = sys.argv[4]
    intervals2_fnames = sys.argv[5:]
    nfeatures = len(intervals2_fnames)

    log = sys.stderr.write
    emit = sys.stdout.write

    # Running totals over all regions, one slot per second-set file.
    total_actual = zeros(nfeatures)
    total_lengths2 = zeros(nfeatures)
    total_samples = zeros((nsamples, nfeatures))

    # The context manager closes the region file even if a region fails.
    with open(region_fname) as region_file:
        for line in region_file:
            fields = line.split()
            if not fields:
                # A blank line describes no region; skip it instead of
                # failing on fields[3].
                continue
            log("Processing region: %s\n" % fields[3])
            r_chr, r_start, r_stop = fields[0], int(fields[1]), int(fields[2])
            r_length = r_stop - r_start

            # Load the mask and build its complement.
            mask = overlapping_in_bed(mask_fname, r_chr, r_start, r_stop)
            bits_mask = as_bits(r_start, r_length, mask)
            bits_not_masked = bit_clone(bits_mask)
            bits_not_masked.invert()

            # Load the first set and drop everything under the mask.
            intervals1 = overlapping_in_bed(
                intervals1_fname, r_chr, r_start, r_stop)
            bits1 = as_bits(r_start, r_length, intervals1)
            bits1.iand(bits_not_masked)
            # Sanity check: nothing masked may survive.
            assert count_overlap(bits1, bits_mask) == 0

            for featnum, intervals2_fname in enumerate(intervals2_fnames):
                log("%s\n" % intervals2_fname)
                intervals2 = overlapping_in_bed(
                    intervals2_fname, r_chr, r_start, r_stop)
                bits2 = as_bits(r_start, r_length, intervals2)
                bits2.iand(bits_not_masked)
                assert count_overlap(bits2, bits_mask) == 0

                # Observed overlap for this region.
                total_actual[featnum] += count_overlap(bits1, bits2)

                # Lengths are materialized once and reused for every sample.
                lengths2 = list(interval_lengths(bits2))
                total_lengths2[featnum] += sum(lengths2)

                for i in range(nsamples):
                    # presumably scatters intervals of these lengths outside
                    # the mask -- confirm against throw_random
                    random2 = throw_random(lengths2, bits_mask)
                    random2 &= bits1
                    total_samples[i, featnum] += random2.count_range(
                        0, random2.size)
                    log("%s\n" % (total_samples[i, featnum],))

    # Table: header, observed fractions, then one row per sample.
    fraction_overlap = total_samples / total_lengths2
    emit("\t".join(intervals2_fnames) + "\n")
    emit("\t".join(map(str, total_actual / total_lengths2)) + "\n")
    for row in fraction_overlap:
        emit("\t".join(map(str, row)) + "\n")

    sample_mean = stats.amean(total_samples)
    sample_stdev = stats.asamplestdev(total_samples)
    # NOTE(review): total_actual holds one entry per feature, so the %d
    # formatting below presumably only works when exactly one second-set
    # file is given -- confirm intended behaviour for several features.
    emit("observed overlap: %d, sample mean: %d, sample stdev: %d\n"
         % (total_actual, sample_mean, sample_stdev))
    emit("z-score: %s\n" % ((total_actual - sample_mean) / sample_stdev,))
    # float() avoids floor division of the counts under Python 2.
    emit("percentile: %s\n"
         % (sum(total_actual > total_samples) / float(nsamples),))
def main():
    """Measure how much two interval sets overlap versus random placement.

    Arguments are read from ``sys.argv``:

    1. region file -- one region per line, whitespace separated; the first
       four fields are used as chromosome, start, stop and a label.
    2. mask file -- intervals removed from every set before counting.
    3. number of random samples to draw per region and feature.
    4. first interval file.
    5. (and following) second interval files, one "feature" each.

    For every region the first set and each second set are restricted to
    unmasked positions and their overlap is counted. The interval lengths of
    each second set are then re-thrown ``nsamples`` times via
    ``throw_random`` and the overlap with the first set is accumulated per
    sample. Progress goes to stderr; the result table (feature names,
    observed fraction, then one row of sampled fractions per sample) and the
    summary lines go to stdout.
    """
    region_fname = sys.argv[1]
    mask_fname = sys.argv[2]
    nsamples = int(sys.argv[3])
    intervals1_fname = sys.argv[4]
    intervals2_fnames = sys.argv[5:]
    nfeatures = len(intervals2_fnames)

    # Running totals over all regions, one slot per second-set file.
    total_actual = zeros(nfeatures)
    total_lengths2 = zeros(nfeatures)
    total_samples = zeros((nsamples, nfeatures))

    # The context manager closes the region file even if a region fails.
    with open(region_fname) as region_file:
        for line in region_file:
            fields = line.split()
            if not fields:
                # A blank line describes no region; skip it instead of
                # failing on fields[3].
                continue
            print("Processing region:", fields[3], file=sys.stderr)
            r_chr, r_start, r_stop = fields[0], int(fields[1]), int(fields[2])
            r_length = r_stop - r_start

            # Load the mask and build its complement.
            mask = overlapping_in_bed(mask_fname, r_chr, r_start, r_stop)
            bits_mask = as_bits(r_start, r_length, mask)
            bits_not_masked = bit_clone(bits_mask)
            bits_not_masked.invert()

            # Load the first set and drop everything under the mask.
            intervals1 = overlapping_in_bed(
                intervals1_fname, r_chr, r_start, r_stop)
            bits1 = as_bits(r_start, r_length, intervals1)
            bits1.iand(bits_not_masked)
            # Sanity check: nothing masked may survive.
            assert count_overlap(bits1, bits_mask) == 0

            for featnum, intervals2_fname in enumerate(intervals2_fnames):
                print(intervals2_fname, file=sys.stderr)
                intervals2 = overlapping_in_bed(
                    intervals2_fname, r_chr, r_start, r_stop)
                bits2 = as_bits(r_start, r_length, intervals2)
                bits2.iand(bits_not_masked)
                assert count_overlap(bits2, bits_mask) == 0

                # Observed overlap for this region.
                total_actual[featnum] += count_overlap(bits1, bits2)

                # Lengths are materialized once and reused for every sample.
                lengths2 = list(interval_lengths(bits2))
                total_lengths2[featnum] += sum(lengths2)

                for i in range(nsamples):
                    # presumably scatters intervals of these lengths outside
                    # the mask -- confirm against throw_random
                    random2 = throw_random(lengths2, bits_mask)
                    random2 &= bits1
                    total_samples[i, featnum] += random2.count_range(
                        0, random2.size)
                    print(total_samples[i, featnum], file=sys.stderr)

    # Table: header, observed fractions, then one row per sample.
    fraction_overlap = total_samples / total_lengths2
    print("\t".join(intervals2_fnames))
    print("\t".join(map(str, total_actual / total_lengths2)))
    for row in fraction_overlap:
        print("\t".join(map(str, row)))

    sample_mean = stats.amean(total_samples)
    sample_stdev = stats.asamplestdev(total_samples)
    # NOTE(review): total_actual holds one entry per feature, so the %d
    # formatting below presumably only works when exactly one second-set
    # file is given -- confirm intended behaviour for several features.
    print("observed overlap: %d, sample mean: %d, sample stdev: %d"
          % (total_actual, sample_mean, sample_stdev))
    print("z-score:", (total_actual - sample_mean) / sample_stdev)
    print("percentile:", sum(total_actual > total_samples) / nsamples)