Example #1
0
def main(args):
    """
    Main entry point for the GenomicIntIntersection script.

    Depending on the options parsed from ``args`` this either runs the unit
    tests, shows the usage message, or loads the two input BED files and
    writes every region produced by ``regionsIntersection`` to the output,
    one per line.

    :param args: the arguments for this script, as a list of string. Should
                 already have had things like the script name stripped. That
                 is, if there are no args provided, this should be an empty
                 list.
    """
    # get options and arguments
    ui = getUI(args)

    if ui.optionIsSet("test"):
        # just run unit tests
        unittest.main(argv=[sys.argv[0]])
        return
    if ui.optionIsSet("help"):
        # just show help
        ui.usage()
        return

    verbose = ui.optionIsSet("verbose")

    # stranded mode is accepted by the UI but not supported; bail out early.
    # The exit status is deliberately left as it was (sys.exit() with no
    # argument) so that existing callers see no change.
    if ui.optionIsSet("stranded"):
        sys.stderr.write(
            "Sorry, stranded mode hasn't been implemented yet.\n")
        sys.exit()

    # get output handle; remember whether we opened it, since we must never
    # close sys.stdout on behalf of the caller.
    out_fh = sys.stdout
    opened_output = ui.optionIsSet("output")
    if opened_output:
        out_fh = open(ui.getValue("output"), "w")

    try:
        # we know we'll get exactly two input arguments, since we specified
        # it in the UI definition. Both inputs are loaded fully into memory.
        regions_1 = list(BEDIterator(ui.getArgument(0), verbose=verbose))
        regions_2 = list(BEDIterator(ui.getArgument(1), verbose=verbose))

        for r in regionsIntersection(regions_1, regions_2):
            out_fh.write(str(r) + "\n")
    finally:
        # make sure buffered output is flushed and the handle released even
        # if loading or intersecting fails part-way through.
        if opened_output:
            out_fh.close()
Example #2
0
def processBED(infh, outhandle, scheme, verbose=False):
    """
    Split each read from a BED stream into up to two genomic intervals and
    write the one(s) selected by ``scheme``.

    The chrom field of every read is expected to hold "_"-separated values
    of the form ``<chrom>_<start1>_<end1>_<start2>_<extra>``, where
    ``<chrom>`` may itself contain underscores and ``<extra>`` is not used
    here. Two intervals are derived from these values and the read's own
    start/end; "%1" is appended to the name of the first and "%2" to the
    name of the second so they can be told apart later.

    :param infh:      input passed straight to ``BEDIterator``.
    :param outhandle: object with a ``write`` method that receives the
                      output, one interval per line.
    :param scheme:    which interval(s) to output; one of the module
                      constants FIRST_EXON, SECOND_EXON, BIGGEST_EXON (the
                      longer interval, the first on ties), FIVE_PRIME_END
                      (first for "+" strand reads, second for "-" strand
                      reads) or BOTH_EXONS (first, then second).
    :param verbose:   forwarded to ``BEDIterator``.
    :raise ValueError: if ``scheme`` selects nothing for a read (unknown
                       scheme, or FIVE_PRIME_END with a strand that is
                       neither "+" nor "-"), or if an output line parses to
                       an empty chrom.
    """
    for read in BEDIterator(infh, verbose=verbose):
        # split the chrom field to get the genomic indices. Splitting from
        # the right keeps any underscores that belong to the chromosome name
        # itself together in the first field.
        fields = read.chrom.rsplit("_", 4)
        chrom = fields[0]
        chrom1SeqStart = int(fields[1])
        chrom1SeqEnd = int(fields[2])
        chrom2SeqStart = int(fields[3])

        # only build the intervals the scheme can actually need
        firstExon = None
        secondExon = None
        if scheme != SECOND_EXON:
            firstExon = GenomicInterval(chrom, chrom1SeqStart + read.start - 1,
                                        chrom1SeqEnd, read.name, read.score,
                                        read.strand)
        if scheme != FIRST_EXON:
            end = chrom2SeqStart + (read.end -
                                    (chrom1SeqEnd - chrom1SeqStart)) - 1
            secondExon = GenomicInterval(chrom, chrom2SeqStart, end, read.name,
                                         read.score, read.strand)

        # we add %1 or %2 to the end of the read names so they can
        # be distinguished later
        if firstExon is not None:
            firstExon.name = firstExon.name + "%1"
        if secondExon is not None:
            secondExon.name = secondExon.name + "%2"

        # arbitrarily decide the first exon contains the largest portion of
        # the read if both are the same
        if (scheme == FIRST_EXON) or \
           (scheme == BIGGEST_EXON and len(firstExon) >= len(secondExon)) or \
           (scheme == FIVE_PRIME_END and read.strand == "+"):
            out = str(firstExon)
        elif (scheme == SECOND_EXON) or \
             (scheme == BIGGEST_EXON and len(secondExon) > len(firstExon)) or \
             (scheme == FIVE_PRIME_END and read.strand == "-"):
            out = str(secondExon)
        elif scheme == BOTH_EXONS:
            out = str(firstExon) + "\n" + str(secondExon)
        else:
            # Without this branch ``out`` would either be unbound or, worse,
            # still hold the previous read's output and be written again.
            raise ValueError("cannot select output for scheme " +
                             str(scheme) + " and read -> " + str(read))

        # sanity check -- make sure we create a valid output string
        for line in out.split("\n"):
            e = parseBEDString(line)
            if e.chrom.strip() == "":
                raise ValueError(" got an emtpy chrom -> " + str(read))

        # write output
        outhandle.write(out + "\n")
Example #3
0
def main(args):
    """
    Main entry point for the GenomicIntJaccard script.

    Depending on the options parsed from ``args`` this either runs the unit
    tests, shows the usage message, or loads the two input BED files and
    writes the value returned by ``jaccardIndex`` for them to stdout.

    :param args: the arguments for this script, as a list of string. Should
                 already have had things like the script name stripped. That
                 is, if there are no args provided, this should be an empty
                 list.
    """
    # get options and arguments
    ui = getUI(args)

    if ui.optionIsSet("test"):
        # just run unit tests
        unittest.main(argv=[sys.argv[0]])
    elif ui.optionIsSet("help"):
        # just show help
        ui.usage()
    else:
        verbose = ui.optionIsSet("verbose")

        # stranded mode is accepted by the UI but not supported. The exit
        # status is deliberately unchanged (sys.exit() with no argument).
        if ui.optionIsSet("stranded"):
            sys.stderr.write(
                "Sorry, stranded mode hasn't been implemented yet.\n")
            sys.exit()

        # we required two input files, so we know these will be present...
        regions_1 = list(BEDIterator(ui.getArgument(0), verbose=verbose))
        regions_2 = list(BEDIterator(ui.getArgument(1), verbose=verbose))

        # Written explicitly rather than with a print statement so that the
        # module parses under both Python 2 and Python 3; the bytes emitted
        # are the same (str of the value followed by a newline).
        sys.stdout.write(str(jaccardIndex(regions_1, regions_2)) + "\n")
Example #4
0
def main(args, prog_name):
    """
    Main entry point for the script.

    Depending on the options parsed from ``args`` this either runs the unit
    tests, shows the usage message, or loads all regions from the input
    (first positional argument, or stdin when absent) and writes the
    collapsed regions to the output (``output`` option, or stdout).

    :param args:      the arguments for this script, as a list of string.
                      Should already have had things like the script name
                      stripped. That is, if there are no args provided, this
                      should be an empty list.
    :param prog_name: passed through to ``getUI`` together with ``args``.
    """
    # get options and arguments
    ui = getUI(args, prog_name)

    if ui.optionIsSet("test"):
        # just run unit tests
        unittest.main(argv=[sys.argv[0]])
    elif ui.optionIsSet("help"):
        # just show help
        ui.usage()
    else:
        verbose = (ui.optionIsSet("verbose") is True) or DEFAULT_VERBOSITY

        # how to handle strand, names, and whether to collapse only regions
        # with exactly matching genomic loci?
        stranded = ui.optionIsSet("stranded")
        names = ui.optionIsSet("accumulate_names")
        exact = ui.optionIsSet("exact")

        # Track only the handles we open ourselves; sys.stdin / sys.stdout
        # belong to the caller and must not be closed here.
        opened = []
        out_fh = sys.stdout
        in_fh = sys.stdin
        try:
            # get output handle
            if ui.optionIsSet("output"):
                out_fh = open(ui.getValue("output"), "w")
                opened.append(out_fh)

            # get input file-handle
            if ui.hasArgument(0):
                in_fh = open(ui.getArgument(0))
                opened.append(in_fh)

            # load data -- TODO at the moment we load everything; need to
            # think about whether it is possible to do this using a single
            # pass of the data, but not loading it all first.
            regions = list(BEDIterator(in_fh, verbose))

            if exact:
                collapse_exact(regions, stranded, names, out_fh)
            else:
                for x in collapseRegions(regions, stranded, names, verbose):
                    out_fh.write(str(x) + "\n")
        finally:
            # release the files (and flush buffered output) even on failure
            for fh in opened:
                fh.close()
Example #5
0
    def test_conservation_profile_pid(self):
        """Check PID conservation profiles computed from genome alignments."""

        def check_profiles(expected, observed):
            # Same number of profiles, each of the same length, and every
            # position equal as far as assertAlmostEqual is concerned.
            self.assertEqual(len(expected), len(observed))
            for exp_profile, obs_profile in zip(expected, observed):
                self.assertEqual(len(exp_profile), len(obs_profile))
                for exp_val, obs_val in zip(exp_profile, obs_profile):
                    self.assertAlmostEqual(exp_val, obs_val)

        maf_stream = StringIO.StringIO(self.maf1 + "\n\n" + self.maf2)
        ga = GenomeAlignment(list(genome_alignment_iterator(maf_stream, "A")))
        in_regions = list(BEDIterator(StringIO.StringIO(self.roi)))

        # First pass: profile the regions exactly as given. The actual
        # script adjusts the regions before it does this step.
        expect_raw = [
            [0.66666666, 1.00000000, 1.00000000, 1.0000000],
            [0.66666666, 0.33333333, None],
            [0.33333333, 0.33333333, 0.66666666, 0.33333333],
            [None, None, 1.00000000, 1.00000000],
            [0.33333333, 0.66666666],
            [0.33333333, 1.00000000, 1.00000000],
            [0.66666666, 1.00000000, 0.66666666],
        ]
        raw_profiles = [conservtion_profile_pid(r, ga) for r in in_regions]
        check_profiles(expect_raw, raw_profiles)

        # Second pass: adjust every region with transform_locus and profile
        # the same region objects again.
        for region in in_regions:
            transform_locus(region, CENTRE, 4)
        adjusted_profiles = [
            conservtion_profile_pid(r, ga) for r in in_regions
        ]
        expect_adjusted = [
            [0.66666666, 1.00000000, 1.00000000, 1.0000000],
            [0.66666666, 0.33333333, None, None],
            [0.33333333, 0.33333333, 0.66666666, 0.33333333],
            [None, None, 1.00000000, 1.00000000],
            [None, 0.33333333, 0.66666666, 0.66666666],
            [0.33333333, 1.00000000, 1.00000000, 0.66666666],
            [0.66666666, 1.00000000, 0.66666666, None],
        ]
        check_profiles(expect_adjusted, adjusted_profiles)
Example #6
0
def process_anchor_start(regions_fn,
                         to_count_fn,
                         anchor_to=ANCHOR_START,
                         normalize=False,
                         verbose=False):
    """
    Count hit start positions relative to the regions that contain them.

    :param regions_fn:  input passed to ``BEDIterator`` to obtain the regions.
    :param to_count_fn: input passed to ``intervalTrees`` to obtain the
                        per-chromosome lookup of items to count.
    :param anchor_to:   forwarded to the relative-position helper; selects
                        how an absolute position is made relative to a region.
    :param normalize:   if True, also track how many regions are long enough
                        to cover each relative position and hand both lists
                        to the normalisation helper before returning.
    :param verbose:     forwarded to ``intervalTrees`` and ``BEDIterator``.
    :return: list where each element is the number of hits at that location
             relative to the start of the regions.
    """
    counts = []
    coverage = []
    trees = intervalTrees(to_count_fn, verbose=verbose)

    for region in BEDIterator(regions_fn, verbose=verbose):
        if normalize:
            # every position inside this region was a possible hit location
            region_len = len(region)
            missing = region_len - len(coverage)
            if missing > 0:
                coverage.extend([0] * missing)
            for offset in range(region_len):
                coverage[offset] += 1

        # nothing to count on chromosomes that have no tree
        if region.chrom not in trees:
            continue

        tree = trees[region.chrom]
        for hit in tree.intersectingInterval(region.start, region.end):
            hit_pos = hit.start
            # only hits whose start lies inside the region are counted
            if region.start <= hit_pos < region.end:
                rel_pos = __to_relative(hit_pos, region, anchor_to)
                assert rel_pos < len(region)

                # grow the result just far enough to index rel_pos
                shortfall = rel_pos + 1 - len(counts)
                if shortfall > 0:
                    counts.extend([0] * shortfall)
                counts[rel_pos] += 1

    if normalize:
        __norm_counts(counts, coverage)
    return counts
Example #7
0
def processBED(fh,
               genome_alig,
               window_size,
               window_centre,
               mi_seqs=MissingSequenceHandler.TREAT_AS_ALL_GAPS,
               species=None,
               verbose=False):
    """
    Process BED file, produce profile of conservation using whole genome alig.

    :param fh:            input passed to ``BEDIterator``; it is read with
                          float scores and declared as sorted by start.
    :param genome_alig:   the whole-genome alignment to use to compute
                          conservation scores
    :param window_size:   length of the profile.
    :param window_centre: which part of each interval to place at the center
                          of the profile. Acceptable values are in the module
                          constant WINDOW_CENTRE_OPTIONS.
    :param mi_seqs:       how to treat sequence with no actual sequence data
                          for the column.
    :param species:       forwarded unchanged to the per-region profile
                          computation.
    :param verbose:       if True, output progress messages to stderr.

    :return: list of ``window_size`` values, the ``mean`` of the accumulator
             kept for each profile position.
    """
    # one running-mean accumulator per position in the window
    accumulators = [RollingMean() for _ in range(window_size)]

    regions = BEDIterator(fh,
                          verbose=verbose,
                          scoreType=float,
                          sortedby=ITERATOR_SORTED_START)
    for region in regions:
        # figure out which interval to look at; the return value is unused,
        # so the region is presumably adjusted in place.
        transform_locus(region, window_centre, window_size)
        profile = conservtion_profile_pid(region, genome_alig, mi_seqs,
                                          species)
        merge_profile(accumulators, profile)

    return [acc.mean for acc in accumulators]
Example #8
0
def count(reads_fh, rois_fh, outfh, verbose=False):
    """
    Write each region of interest with its score set to its number of reads.

    :param reads_fh: input passed to ``BEDIterator`` to obtain the reads.
    :param rois_fh:  input passed to ``BEDIterator`` to obtain the regions.
    :param outfh:    object with a ``write`` method; receives one region per
                     line.
    :param verbose:  forwarded to both ``BEDIterator`` calls.
    """
    read_iter = BEDIterator(reads_fh, verbose=verbose)
    roi_iter = BEDIterator(rois_fh, verbose=verbose)

    # bucketIterator pairs each region with the collection of reads it was
    # grouped with; the size of that collection becomes the region's score.
    for roi, bucket in bucketIterator(read_iter, roi_iter):
        roi.score = len(bucket)
        line = str(roi) + "\n"
        outfh.write(line)