def main(args):
    """
    Main entry point for the GenomicIntIntersection script.

    Depending on the options set, this either runs the unit tests, shows the
    usage message, or reads the two BED inputs named by the positional
    arguments and writes each region produced by regionsIntersection, one per
    line, to the output (the file given by the "output" option, else stdout).

    :param args: the arguments for this script, as a list of strings. Should
                 already have had things like the script name stripped. That
                 is, if there are no args provided, this should be an empty
                 list.
    :raise SystemExit: with a non-zero status if the "stranded" option is set,
                       since stranded mode is not implemented.
    """
    # get options and arguments
    ui = getUI(args)

    if ui.optionIsSet("test"):
        # just run unit tests
        unittest.main(argv=[sys.argv[0]])
    elif ui.optionIsSet("help"):
        # just show help
        ui.usage()
    else:
        verbose = ui.optionIsSet("verbose")

        # stranded?
        if ui.optionIsSet("stranded"):
            # Passing a string to sys.exit writes it (newline-terminated) to
            # stderr and exits with status 1, so a calling shell or pipeline
            # can tell the request was refused.
            sys.exit("Sorry, stranded mode hasn't been implemented yet.")

        # get output handle; remember whether we own it so that we only ever
        # close a file we opened ourselves (never sys.stdout).
        out_fh = sys.stdout
        owns_output = ui.optionIsSet("output")
        if owns_output:
            out_fh = open(ui.getValue("output"), "w")

        try:
            # get input regions -- we know we'll get exactly two inputs,
            # since we specified it in the UI definition
            regions_1 = list(BEDIterator(ui.getArgument(0), verbose=verbose))
            regions_2 = list(BEDIterator(ui.getArgument(1), verbose=verbose))

            for r in regionsIntersection(regions_1, regions_2):
                out_fh.write(str(r) + "\n")
        finally:
            # make sure buffered output is flushed to disk even on error
            if owns_output:
                out_fh.close()
def processBED(infh, outhandle, scheme, verbose=False):
    """
    Convert each read in a BED stream into one or two genomic intervals.

    The chrom field of every read is expected to have the form
    <chrom>_<start1>_<end1>_<start2>_<end2>, where <chrom> may itself contain
    underscores. NOTE(review): these look like the coordinates of two exons
    forming a junction that the read was mapped against -- confirm with the
    code that generates the input.

    :param infh:      input passed to BEDIterator to obtain the reads.
    :param outhandle: object with a write method; receives the output lines.
    :param scheme:    which interval(s) to output for each read; one of
                      FIRST_EXON, SECOND_EXON, BIGGEST_EXON, FIVE_PRIME_END
                      or BOTH_EXONS.
    :param verbose:   passed through to BEDIterator.
    :raise ValueError: if no output can be chosen for a read under the given
                       scheme (unknown scheme, or FIVE_PRIME_END with a strand
                       that is neither "+" nor "-"), or if an output line has
                       an empty chrom field.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices; splitting from
        # the right at most four times keeps any underscores that belong to
        # the chromosome name itself in the first field.
        fields = read.chrom.rsplit("_", 4)
        chrom = fields[0]
        chrom1SeqStart = int(fields[1])
        chrom1SeqEnd = int(fields[2])
        chrom2SeqStart = int(fields[3])

        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom,
                                        chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name, read.score,
                                        read.strand)
        if scheme != FIRST_EXON:
            end = (chrom2SeqStart +
                   (read.end - (chrom1SeqEnd - chrom1SeqStart)) - 1)
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end,
                                         read.name, read.score, read.strand)

        # we add %1 or %2 to the end of the read names so they can
        # be distinguished later
        if firstExon is not None:
            firstExon.name = firstExon.name + "%1"
        if secondExon is not None:
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion of
        # the read if both are the same
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and
                len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and
                  len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # Without this branch the output chosen for the previous read
            # would be silently written again (or, on the first read, an
            # UnboundLocalError would be raised).
            raise ValueError("cannot choose output for read with scheme " +
                             str(scheme) + " and strand " +
                             str(read.strand) + " -> " + str(read))

        # sanity check -- make sure we create a valid output string
        for l in out.split("\n"):
            e = parseBEDString(l)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
def main(args):
    """
    Main entry point for the GenomicIntJaccard script.

    Depending on the options set, this either runs the unit tests, shows the
    usage message, or reads the two BED inputs named by the positional
    arguments and prints the value returned by jaccardIndex for them.

    :param args: the arguments for this script, as a list of strings. Should
                 already have had things like the script name stripped. That
                 is, if there are no args provided, this should be an empty
                 list.
    :raise SystemExit: with a non-zero status if the "stranded" option is set,
                       since stranded mode is not implemented.
    """
    # get options and arguments
    ui = getUI(args)

    if ui.optionIsSet("test"):
        # just run unit tests
        unittest.main(argv=[sys.argv[0]])
    elif ui.optionIsSet("help"):
        # just show help
        ui.usage()
    else:
        verbose = ui.optionIsSet("verbose")

        if ui.optionIsSet("stranded"):
            # Passing a string to sys.exit writes it (newline-terminated) to
            # stderr and exits with status 1, so a calling shell or pipeline
            # can tell the request was refused.
            sys.exit("Sorry, stranded mode hasn't been implemented yet.")

        # we required two input files, so we know these will be present...
        regions_1 = list(BEDIterator(ui.getArgument(0), verbose=verbose))
        regions_2 = list(BEDIterator(ui.getArgument(1), verbose=verbose))

        # single-argument parenthesised form: prints the same thing whether
        # print is a statement or a function.
        print(jaccardIndex(regions_1, regions_2))
def main(args, prog_name):
    """
    Main entry point for the script.

    Depending on the options set, this either runs the unit tests, shows the
    usage message, or loads all regions from the input (first positional
    argument if present, else stdin) and writes the collapsed regions to the
    output (the file given by the "output" option, else stdout).

    :param args:      the arguments for this script, as a list of strings.
                      Should already have had things like the script name
                      stripped. That is, if there are no args provided, this
                      should be an empty list.
    :param prog_name: passed through to getUI; presumably the name to show
                      for this program in the usage message -- confirm
                      against getUI.
    """
    # get options and arguments
    ui = getUI(args, prog_name)

    if ui.optionIsSet("test"):
        # just run unit tests
        unittest.main(argv=[sys.argv[0]])
    elif ui.optionIsSet("help"):
        # just show help
        ui.usage()
    else:
        verbose = (ui.optionIsSet("verbose") is True) or DEFAULT_VERBOSITY

        # how to handle strand, names, and whether to collapse only regions
        # with exactly matching genomic loci?
        stranded = ui.optionIsSet("stranded")
        names = ui.optionIsSet("accumulate_names")
        exact = ui.optionIsSet("exact")

        out_fh = sys.stdout
        in_fh = sys.stdin
        # handles opened here (as opposed to the std streams), so that we
        # close exactly the files we own once we are done or on error.
        opened = []
        try:
            # get output handle
            if ui.optionIsSet("output"):
                out_fh = open(ui.getValue("output"), "w")
                opened.append(out_fh)

            # get input file-handle
            if ui.hasArgument(0):
                in_fh = open(ui.getArgument(0))
                opened.append(in_fh)

            # load data -- TODO at the moment we load everything; need to
            # think about whether it is possible to do this using a single
            # pass of the data, but not loading it all first.
            regions = list(BEDIterator(in_fh, verbose))

            if exact:
                collapse_exact(regions, stranded, names, out_fh)
            else:
                for x in collapseRegions(regions, stranded, names, verbose):
                    out_fh.write(str(x) + "\n")
        finally:
            for fh in opened:
                fh.close()
def test_conservation_profile_pid(self):
    """Test getting conservation profiles (PID) from genome alignments."""

    def check_profiles(expected, observed):
        # same number of profiles, same length for each profile, and each
        # value almost-equal to the one expected at that position
        self.assertEqual(len(expected), len(observed))
        for row_idx, expected_row in enumerate(expected):
            observed_row = observed[row_idx]
            self.assertEqual(len(expected_row), len(observed_row))
            for col_idx, expected_val in enumerate(expected_row):
                self.assertAlmostEqual(expected_val, observed_row[col_idx])

    maf_stream = StringIO.StringIO(self.maf1 + "\n\n" + self.maf2)
    ga = GenomeAlignment(
        [block for block in genome_alignment_iterator(maf_stream, "A")])

    # here we're testing directly; the actual script will adjust the regions
    # before it does this step.
    expect_raw = [
        [0.66666666, 1.00000000, 1.00000000, 1.0000000],
        [0.66666666, 0.33333333, None],
        [0.33333333, 0.33333333, 0.66666666, 0.33333333],
        [None, None, 1.00000000, 1.00000000],
        [0.33333333, 0.66666666],
        [0.33333333, 1.00000000, 1.00000000],
        [0.66666666, 1.00000000, 0.66666666],
    ]
    in_regions = [
        region for region in BEDIterator(StringIO.StringIO(self.roi))]
    res = [conservtion_profile_pid(region, ga) for region in in_regions]
    check_profiles(expect_raw, res)

    # now we test with the adjusted regions
    for region in in_regions:
        transform_locus(region, CENTRE, 4)
    res_adjusted = [
        conservtion_profile_pid(region, ga) for region in in_regions]
    expect_adjusted = [
        [0.66666666, 1.00000000, 1.00000000, 1.0000000],
        [0.66666666, 0.33333333, None, None],
        [0.33333333, 0.33333333, 0.66666666, 0.33333333],
        [None, None, 1.00000000, 1.00000000],
        [None, 0.33333333, 0.66666666, 0.66666666],
        [0.33333333, 1.00000000, 1.00000000, 0.66666666],
        [0.66666666, 1.00000000, 0.66666666, None],
    ]
    check_profiles(expect_adjusted, res_adjusted)
def process_anchor_start(regions_fn, to_count_fn, anchor_to=ANCHOR_START,
                         normalize=False, verbose=False):
    """
    Count hits by their position relative to the regions they fall in.

    A hit is counted for a region when the hit's start lies within
    [region.start, region.end); its relative position is whatever
    __to_relative gives for that start, the region and anchor_to.

    :param regions_fn:  input passed to BEDIterator to obtain the regions.
    :param to_count_fn: input passed to intervalTrees to build the
                        per-chromosome lookup of the hits to count.
    :param anchor_to:   passed through to __to_relative.
    :param normalize:   if True, also tally for each offset how many regions
                        are long enough to contain it, and hand both tallies
                        to __norm_counts before returning.
    :param verbose:     passed through to intervalTrees and BEDIterator.
    :return: list where each element is the number of hits at that location
             relative to the start of the regions.
    """
    counts = []
    coverage = []
    trees_by_chrom = intervalTrees(to_count_fn, verbose=verbose)

    for region in BEDIterator(regions_fn, verbose=verbose):
        if normalize:
            # every offset inside this region is one more place where a hit
            # could have landed
            region_len = len(region)
            shortfall = region_len - len(coverage)
            if shortfall > 0:
                coverage.extend([0] * shortfall)
            for offset in range(0, region_len):
                coverage[offset] += 1

        if region.chrom not in trees_by_chrom:
            continue

        overlapping = trees_by_chrom[region.chrom].intersectingInterval(
            region.start, region.end)
        for hit in overlapping:
            hit_pos = hit.start
            # only the hit's start matters; ignore hits starting outside
            if not (region.start <= hit_pos < region.end):
                continue
            rel_pos = __to_relative(hit_pos, region, anchor_to)
            assert (rel_pos < len(region))
            missing = rel_pos + 1 - len(counts)
            if missing > 0:
                counts.extend([0] * missing)
            counts[rel_pos] += 1

    if normalize:
        __norm_counts(counts, coverage)
    return counts
def processBED(fh, genome_alig, window_size, window_centre,
               mi_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS, species=None,
               verbose=False):
    """
    Process BED file, produce profile of conservation using whole genome alig.

    :param fh:            input passed to BEDIterator to obtain the intervals.
    :param genome_alig:   the whole-genome alignment to use to compute
                          conservation scores
    :param window_size:   length of the profile.
    :param window_centre: which part of each interval to place at the center
                          of the profile. Acceptable values are in the module
                          constant WINDOW_CENTRE_OPTIONS.
    :param mi_seqs:       how to treat sequence with no actual sequence data
                          for the column.
    :param species:       passed through to conservtion_profile_pid.
    :param verbose:       if True, output progress messages to stderr.
    :return: list with one entry per profile position, holding the mean
             accumulated at that position over all intervals.
    """
    # one running mean per position in the window
    accumulators = [RollingMean() for _ in range(window_size)]

    intervals = BEDIterator(fh, verbose=verbose, scoreType=float,
                            sortedby=ITERATOR_SORTED_START)
    for interval in intervals:
        # figure out which interval to look at...
        transform_locus(interval, window_centre, window_size)
        profile = conservtion_profile_pid(interval, genome_alig, mi_seqs,
                                          species)
        merge_profile(accumulators, profile)

    return [acc.mean for acc in accumulators]
def count(reads_fh, rois_fh, outfh, verbose=False):
    """
    Write each region of interest with its score set to its number of reads.

    :param reads_fh: input passed to BEDIterator to obtain the reads.
    :param rois_fh:  input passed to BEDIterator to obtain the regions.
    :param outfh:    object with a write method; receives one line per region.
    :param verbose:  passed through to both BEDIterators.
    """
    read_iter = BEDIterator(reads_fh, verbose=verbose)
    roi_iter = BEDIterator(rois_fh, verbose=verbose)
    for roi, bucket in bucketIterator(read_iter, roi_iter):
        # the score field is reused to carry the read count
        roi.score = len(bucket)
        outfh.write(str(roi) + "\n")