Esempio n. 1
0
    def print_metagene(self, pretty=False, header=False, interval_override=False):
        """Render the counts in counts_array as finalized metagene profile text.

        Standard printing is one comma-delimited line per subset
        ("<name>,<subset>,<count>,..." with three decimal places) for input
        into metagene_analysis.py.
        Pretty printing (pretty=True) gives a human readable, if potentially
        super long, version: a "<subset>:" label followed by right-aligned
        counts with two decimal places.

        Keyword arguments:
        pretty -- use the human readable layout instead of the plain one
        header -- prefix the output with Metagene.print_full(pretty) for a
                  metagene of self.metagene_length (only consulted when
                  interval_override is false)
        interval_override -- emit the feature interval counts unchanged instead
                  of passing them through adjust_to_metagene(); the output is
                  then prefixed with a "# Metagene:" line and the plain
                  Metagene.print_full() of a metagene sized to this feature

        Returns the assembled text with surrounding whitespace stripped.
        """
        upstream = self.padding["Upstream"]
        interval_end = upstream + self.feature_interval

        # collect the pieces and join once instead of growing a string
        pieces = []
        if interval_override:
            metagene = Metagene(self.feature_interval, upstream, self.padding["Downstream"])
            pieces.append("# Metagene:\t{}\n".format(metagene))
            pieces.append(metagene.print_full())
        elif header:
            metagene = Metagene(self.metagene_length, upstream, self.padding["Downstream"])
            pieces.append(metagene.print_full(pretty))

        # process each subset grouping
        for subset in sorted(self.counts_array, reverse=True):
            counts = self.counts_array[subset]

            # break counts into sections -> upstream padding, feature interval, downstream padding
            interval_counts = counts[upstream:interval_end]
            if not interval_override:
                # compress (or expand) the interval to match the size of the internal metagene
                interval_counts = self.adjust_to_metagene(interval_counts)

            values = list(counts[:upstream])
            values.extend(interval_counts)
            values.extend(counts[interval_end:])

            if pretty:
                # keep 2 decimal places, right-aligned in 5 columns
                cells = ["{0:>5s}".format("{0:3.2f}".format(v)) for v in values]
                pieces.append("{0:15s}:\t".format(subset) + ",".join(cells) + "\n")
            else:
                # keep 3 decimal places in the outputted float
                cells = ["{},{}".format(self.name, subset)]
                cells.extend("{0:0.3f}".format(v) for v in values)
                pieces.append(",".join(cells) + "\n")

        return "".join(pieces).strip()  # remove trailing "\n"
Esempio n. 2
0
def check_print_metagene_plain(test, values):
    """Check that the plain Metagene.print_full() output spans the expected range.

    test   -- description of the test case, used in the failure message
    values -- positional arguments for Metagene(); the expected range is
              computed as (-values[1], values[0] + values[2] - 1)

    The third and the last comma-separated fields of the stripped output are
    parsed as integers and compared with the expected range.
    """
    wanted = (-values[1], values[0] + values[2] - 1)
    rendered = Metagene(*values).print_full().strip()
    fields = rendered.split(",")
    observed = (int(fields[2]), int(fields[-1]))
    print("\n\tOutput:\n\t{}".format(rendered))
    report = ("\nTest:    \t{}\n"
              "Expected:\t{}\n"
              "Range:   \t{}\n"
              "Output:  \t{}\n").format(test, wanted, observed, rendered)
    assert observed == wanted, "{}Error:   \tPrinted metagene does not match expected.".format(report)
Esempio n. 3
0
def check_print_metagene_pretty(test, values):
    """Check the pretty Metagene.print_full() output against the metagene shape.

    test   -- description of the test case, used in the failure message
    values -- positional arguments for Metagene(); also the expected result

    The occurrences of 'int', 'up' and 'down' in the stripped pretty output
    are counted (in that order) and the resulting tuple is compared with
    values.
    """
    rendered = Metagene(*values).print_full(pretty=True).strip()
    observed = tuple(len(re.findall(tag, rendered)) for tag in ('int', 'up', 'down'))
    print("\n\tOutput:\n\t{}".format("\n\t".join(rendered.split("\n"))))
    report = ("\nTest:    \t{}\n"
              "Expected:\t{}\n"
              "Range:   \t{}\n"
              "Output:  \t{}\n").format(test, values, observed, rendered)
    assert observed == values, "{}Error:   \tPrinted metagene does not match expected.".format(report)
Esempio n. 4
0
def check_create_metagene(test, values):
    """Check that str(Metagene(*values)) matches the expected summary line.

    test   -- description of the test case, used in the failure message
    values -- (interval, upstream, downstream) passed positionally to
              Metagene(); the expected length is the sum of the three
    """
    metagene = Metagene(*values)
    total = sum(values)
    interval, upstream, downstream = values
    wanted = "Upstream:{} -- Interval:{} -- Downstream:{}\tLength:{}".format(
        upstream, interval, downstream, total)
    report = ("\nTest:    \t{}\n"
              "Expected:\t{}\n"
              "Metagene:\t{}\n").format(test, wanted, metagene)
    assert str(metagene) == wanted, "{}Error:   \tMetagene does not match expected.".format(report)
Esempio n. 5
0
def setup():
    """Run the print-based checks of the Feature class.

    The function:
      * defines chromosome sizes and the chromosome-name conversion table,
      * fills the `good_input` table,
      * creates Features from BED and GFF lines for every feature_count
        method and compares their position arrays with `correct_features`,
      * counts a fixed set of Reads with every count method and compares
        print_metagene() with the expected text,
      * checks the handling of unstranded reads,
      * checks adjust_to_metagene() on a few hand-computed cases.

    Each check prints a line starting with "PASSED" or "**FAILED**"; nothing
    is returned and a failed check does not raise.

    NOTE(review): `good_input` and `correct_features` are not defined in this
    function; presumably they are module-level dictionaries -- confirm.
    """

    # Define chromosome sizes
    Read.extract_chromosome_sizes([
        "@HD\tVN:1.0\tSO:unsorted", "@SQ\tSN:chr1\tLN:300",
        "@SQ\tSN:chr2\tLN:200", "@PG\tID:test\tVN:0.1"
    ])
    Feature.process_set_chromosome_conversion(["1\tchr1", "2\tchr2"])

    good_input["bed input counting all of the read"] = (
        "all",
        "[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]"
    )
    good_input["bed input counting start of the read"] = (
        "start", "[17, 18, 19, 20, 21, 22, 23]")
    good_input["bed input counting end of the read"] = (
        "end", "[36, 37, 38, 39, 40, 41, 42]")
    good_input["gff input counting all of the read"] = (
        "all",
        "[43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8]"
    )
    good_input["gff input counting start of the read"] = (
        "start", "[43, 42, 41, 40, 39, 38, 37]")
    good_input["gff input counting end of the read"] = (
        "end", "[14, 13, 12, 11, 10, 9, 8]")

    for method in ['all', 'start', 'end']:
        print("\nTesting feature_count option: ****{}****".format(method))

        # counting the whole feature uses a 10-position interval, counting
        # only its start or end uses a single position
        if method == 'all':
            metagene = Metagene(10, 4, 2)
        else:
            metagene = Metagene(1, 4, 2)
        print("\t  with Metagene:\t{}".format(metagene))
        print("\t  with chromosome conversions:\t{}".format(
            Feature.chromosome_conversion))

        # create feature from BED line
        bedline = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
            1, 20, 40, "first", 44, "+")
        print("\t  with BED line:\t{}".format(bedline.strip()))
        try:
            feature1 = Feature.create_from_bed(method, metagene, bedline,
                                               False, False)
        except MetageneError:
            print("**FAILED**\t  Create Feature from BED line ?")
        else:
            # report PASSED only when the positions really match
            if str(feature1.position_array) != correct_features['bed'][method]:
                print("**FAILED**\t  Create Feature from BED line ?")
                print("\t  Desired positions:\t{}".format(
                    correct_features['bed'][method]))
                print("\t  Created positions:\t{}".format(
                    feature1.position_array))
            else:
                print("PASSED\t  Create Feature from BED line ?\t\t{}".format(
                    feature1.get_chromosome_region()))

        # create feature from GFF line
        gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            2, "test", "gene", 10, 39, ".", "-", ".", "second")
        print("\t  with GFF line:\t{}".format(gffline.strip()))
        try:
            feature2 = Feature.create_from_gff(method, metagene, gffline,
                                               False, False)
        except MetageneError:
            print("**FAILED**\t  Create Feature from GFF line ?")
        else:
            if str(feature2.position_array) != correct_features['gff'][method]:
                print("**FAILED**\t  Create Feature from GFF line ?\t**FAIL**")
                print("\t  Desired positions:\t{}".format(
                    correct_features['gff'][method]))
                print("\t  Created positions:\t{}".format(
                    feature2.position_array))
            else:
                print("PASSED\t  Create Feature from GFF line ?\t\t{}".format(
                    feature2.get_chromosome_region()))

        # create feature from GFF line with start and end swapped
        gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            2, "test", "gene", 39, 10, ".", "-", ".", "second")
        print("\t  with GFF line:\t{}".format(gffline.strip()))
        try:
            feature2 = Feature.create_from_gff(method, metagene, gffline,
                                               False, False)
        except MetageneError:
            print("**FAILED**\t  Create Feature from GFF line with swapped start and end ?")
        else:
            if str(feature2.position_array) != correct_features['gff'][method]:
                print("**FAILED**\t  Create Feature from GFF line with swapped start and end ?\t**FAIL**")
                print("\t  Desired positions:\t{}".format(
                    correct_features['gff'][method]))
                print("\t  Created positions:\t{}".format(
                    feature2.position_array))
            else:
                print("PASSED\t  Create Feature from GFF line with swapped start and end ?\t\t{}".format(
                    feature2.get_chromosome_region()))

        # swapped start and end on the + strand: creation is expected to
        # raise, so the exception is the passing outcome here
        gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            2, "test", "gene", 39, 10, ".", "+", ".", "second")
        print("\t  with GFF line:\t{}".format(gffline.strip()))
        try:
            feature2 = Feature.create_from_gff(method, metagene, gffline,
                                               False, False)
        except MetageneError as err:
            print("PASSED\t  Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
                err))
        else:
            print("**FAILED**\t  Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
                feature2.get_chromosome_region()))
            print("\t  Created positions:\t{}".format(
                feature2.position_array))

        ##TODO finish complete testing of Feature class
    print("\n##TODO finish testing of Feature class creation\n")

    print("\n**** Testing counting and maniputlation ****\n")

    # expected[feature method][count method] -> print_metagene() text
    expected = {'all': {}, 'start': {}, 'end': {}}
    #  Positions in metagene:                           17    18     19   20  21-22,23-24,25-26,27-28,29-30,31-32,33-34,35-36,37-38,39-40,  41,   42
    expected['all'] = {
        'all':
        "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000,0.000,0.286,0.571,0.571,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.100,0.100,0.100,0.100,0.100,0.000,0.000,0.000,0.000,0.000,0.111",
        'start':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000",
        'end':
        "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000"
    }
    #  Positions in metagene:                           17    18    19    20   [21]   22    23
    expected['start'] = {
        'all':
        "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.050",
        'start':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
        'end':
        "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.500"
    }
    #  Positions in metagene:                           36    37    38    39   [40]   41    42
    expected['end'] = {
        'all':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.111",
        'start':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
        'end':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,1.000"
    }

    metagene = {
        'all': Metagene(10, 4, 2),
        'start': Metagene(1, 4, 2),
        'end': Metagene(1, 4, 2)
    }

    for method in ['all', 'start', 'end']:
        print("\t  with Metagene:\t{}".format(metagene[method]))
        print("\t  with chromosome conversions:\t{}".format(
            Feature.chromosome_conversion))

        print("\nTesting feature_count option: ****{}****".format(method))
        feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
            1, 20, 40, "first", 44, "+")
        feature1 = Feature.create_from_bed(method, metagene[method],
                                           feature_line, False, False)
        print("\tFeature:\t{}".format(feature1.position_array))

        reads = [
            Read("chr1", "+", 3, 1, [10, 11, 12, 13, 14, 15, 16, 17, 18]),
            Read("chr1", "-", 1, 2, [23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
            Read("chr1", "+", 4, 2, [30, 31, 32, 33, 34, 40, 41]),
            Read("chr1", "-", 1, 1, [42, 43, 44, 45, 46, 47, 48, 49, 50]),
            Read("chr1", "+", 10, 1, [51, 52, 53, 54, 55]),
            Read("chr2", "+", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25]),
        ]

        # starting count
        for count_method in ['all', 'start', 'end']:
            print("\nTesting count_method option: ****{}****".format(
                count_method))

            # running log of the feature state, shown only on failure
            output = "{}\n".format(feature1)

            for r in reads:
                output += "{}\n".format(r)
                feature1.count_read(r, count_method, count_partial_reads=True)
                output += "{}\n".format(feature1)

            output += feature1.print_metagene(pretty=True)
            if str(feature1.print_metagene()).strip() == str(
                    expected[method][count_method]).strip():
                print("PASSED\tCreated correct metagene with feature method {} and count method {} ?".format(
                    method, count_method))
            else:
                print("**FAILED**\tCreated correct metagene with feature method {} and count method {} ?".format(
                    method, count_method))
                print("\tExpected:\n{}".format(expected[method][count_method]))
                print("\tActual  :\n{}".format(feature1.print_metagene()))
                print("\tSummary of run:\n{}".format(output))
            feature1 = Feature.create_from_bed(
                method, metagene[method], feature_line, False,
                False)  # zero out counter for next round

    # `feature1`, `method` and `metagene` carry over from the last iteration
    # of the loop above (method == 'end'); feature1 is a + strand feature
    try:
        unstranded_read = Read("chr1", ".", 10, 1,
                               [18, 19, 20, 21, 22, 23, 24, 25])
        feature1.count_read(unstranded_read, 'all')
    except MetageneError as err:
        print("PASSED\tCaught unstranded read on stranded count ?\t\t{}".format(
            err))
    else:
        print("**FAILED**\tCaught unstranded read on stranded count ?")

    try:
        feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
            1, 20, 40, "first", 44, ".")
        feature1 = Feature.create_from_bed(method, metagene[method],
                                           feature_line, False, False)
        unstranded_read = Read("chr1", ".", 10, 1,
                               [18, 19, 20, 21, 22, 23, 24, 25])
        feature1.count_read(unstranded_read, 'all')
    except MetageneError as err:
        print("**FAILED**\tAllowed unstranded read on unstranded count ?\t\t{}".format(
            err))
    else:
        print("PASSED\tAllowed unstranded read on unstranded count ?")

    print("\n**** Testing adjust_to_metagene ****\n")

    # ((metagene_tupple),(feature_tupple),expected_result_string, message_string)
    tests = [((8, 2, 2), (16, 8, 24, 4),
              '8.000,8.000,4.000,4.000,12.000,12.000,2.000,2.000',
              "Expand to metagene ?"),
             ((4, 2, 2), (6, 8, 6, 2, 4, 4, 2, 4, 24, 8),
              '17.000,9.000,8.000,34.000', "Contract to metagene ?"),
             ((4, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4),
              '5.500,9.333,17.825,9.475', "Contract with messy floats ?"),
             ((3, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4),
              '7.611,19.556,14.967', "Contract with other messy floats ?")]

    for shape, counts, wanted, label in tests:
        metagene = Metagene(*shape)
        print("\t{}".format(metagene))
        feature_line = "{}\t{}\t{}\n".format(1, 0, len(counts))
        feature = Feature.create_from_bed('all',
                                          metagene,
                                          feature_line,
                                          False,
                                          False,
                                          short=True)
        adjusted_feature = ",".join(
            "{0:0.3f}".format(f) for f in feature.adjust_to_metagene(counts))
        if adjusted_feature == wanted:
            print("PASSED\t{}".format(label))
        else:
            print("**FAILED**\t{}".format(label))
            print("\tExpected:\t{}".format(wanted))
            print("\tActual  :\t{}".format(adjusted_feature))
            print("\tOriginal:\t{}".format(feature.adjust_to_metagene(counts)))

    print("\n**** End of Testing the Feature class ****\n")


# end of Feature.test method
Esempio n. 6
0
def check_catch_bad_input(test, values):
    """Build a Metagene from values and print it.

    test   -- description of the test case (not used in the body)
    values -- positional arguments for Metagene()

    NOTE(review): judging by the name, the caller presumably expects
    Metagene() to raise for these values -- confirm against the test driver.
    """
    metagene = Metagene(*values)
    print(metagene)
Esempio n. 7
0
    def print_metagene(self,
                       pretty=False,
                       header=False,
                       interval_override=False):
        """Converts counts_array data to finalized metagene profiles for printing

        Standard printing is in comma-delimited lines for input into
        metagene_analysis.py: "<name>,<subset>,<count>,..." with three decimal
        places, one line per subset.
        Pretty printing (pretty=True) gives a human readable, if potentially
        super long, version: "<subset>:" followed by right-aligned counts with
        two decimal places.

        Keyword arguments:
        pretty -- use the human readable layout instead of the plain one
        header -- prefix the output with Metagene.print_full(pretty) for a
                  metagene of self.metagene_length (only consulted when
                  interval_override is false)
        interval_override -- emit the feature interval counts unchanged instead
                  of passing them through adjust_to_metagene(); the output is
                  then prefixed with a "# Metagene:" line and the plain
                  Metagene.print_full() of a metagene sized to this feature

        Returns the assembled text with surrounding whitespace stripped.
        """
        upstream = self.padding['Upstream']
        interval_end = upstream + self.feature_interval

        # gather the text fragments and join them once at the end
        chunks = []
        if interval_override:
            metagene = Metagene(self.feature_interval, upstream,
                                self.padding['Downstream'])
            chunks.append("# Metagene:\t{}\n".format(metagene))
            chunks.append(metagene.print_full())
        elif header:
            metagene = Metagene(self.metagene_length, upstream,
                                self.padding['Downstream'])
            chunks.append(metagene.print_full(pretty))

        if pretty:
            # keep 2 decimal places, right-aligned in 5 columns
            def render(value):
                return "{0:>5s}".format("{0:3.2f}".format(value))
        else:
            # keep 3 decimal places in the outputted float
            def render(value):
                return "{0:0.3f}".format(value)

        # process each subset grouping
        for subset in sorted(self.counts_array, reverse=True):
            counts = self.counts_array[subset]

            # sections: upstream padding, feature interval, downstream padding
            interval_counts = counts[upstream:interval_end]
            if not interval_override:
                # compress (or expand) interval_counts to match the size of
                # the internal metagene
                interval_counts = self.adjust_to_metagene(interval_counts)

            cells = [render(v) for v in counts[:upstream]]
            cells.extend(render(v) for v in interval_counts)
            cells.extend(render(v) for v in counts[interval_end:])

            if pretty:
                chunks.append("{0:15s}:\t".format(subset) + ",".join(cells) +
                              "\n")
            else:
                chunks.append(",".join(
                    ["{},{}".format(self.name, subset)] + cells) + "\n")

        return "".join(chunks).strip()  # remove trailing "\n"
Esempio n. 8
0
    def __init__(self,
                 count_method,
                 metagene_object,
                 name,
                 chromosome,
                 start,
                 end,
                 strand,
                 gap_counting=False,
                 ignore_strand=False):
        """Not normally called directly; use Feature.create(file_format, count_method,
        metagene_object, feature_line, chromosome_conversion_table) to call indirectly.

        Define a new feature with an interval (represents feature length),
        up and downstream padding (defined by metagene_object), and genomic
        (1-based) start and end positions.

        Keyword arguments:
        count_method    -- 'all' uses the whole feature (end - start + 1
                           positions) as the interval; any other value uses a
                           single position ('start' or 'end' selects which one)
        metagene_object -- supplies the padding and the final metagene
                           interval length (stored as self.metagene_length)
        name            -- feature name, stored as self.name
        chromosome      -- key into Feature.chromosome_conversion
        start, end      -- chromosomal coordinates, checked with
                           confirm_integer against the chromosome size
        strand          -- "+" or "-"; anything else is stored as "."
        gap_counting    -- keep separate 'ungapped' and 'gapped' count arrays
                           instead of a single 'allreads' array
        ignore_strand   -- keep a single 'unstranded' orientation even for a
                           stranded feature

        Raises KeyError when chromosome is not in Feature.chromosome_conversion
        or the converted name is not in Read.chromosome_sizes.
        """
        chromosome = Feature.chromosome_conversion[
            chromosome]  # convert to BAM-like chromosome designation
        chromosome_size = Read.chromosome_sizes[chromosome]
        # NOTE(review): start/end stay unconverted if confirm_integer returns a
        # false value; presumably it raises on bad input instead -- confirm
        if (confirm_integer(start, "Start", minimum=1,
                            maximum=chromosome_size) and
                confirm_integer(end, "End", minimum=1,
                                maximum=chromosome_size)):
            start = int(start)
            end = int(end)

        # Define feature-specific metagene where feature_interval represents
        # the length of the feature NOT the length of the final metagene interval
        if count_method == 'all':
            interval = (end - start + 1)  # length of feature
        else:
            interval = 1  # length of the start (or end) of feature

        Metagene.__init__(self, interval, metagene_object.padding['Upstream'],
                          metagene_object.padding['Downstream'])
        self.name = name
        self.chromosome = chromosome
        self.strand = strand
        self.metagene_length = metagene_object.feature_interval

        # define counts_array dictionary
        # key: orientation:gap_counts string
        #      where orientation = {'unstranded', 'sense', 'antisense'}
        #            gap_counts  = {'ungapped', 'gapped', 'allreads'}
        #      'ungapped' + 'gapped' = 'allreads'
        #      'sense' + 'antisense' = 'unstranded'
        #
        # values: arrays of self.length initialized to 0
        if self.strand not in ("+", "-"):
            self.strand = "."
            orientation = ['unstranded']
        elif ignore_strand:
            orientation = ['unstranded']
        else:
            orientation = ['sense', 'antisense']
        if gap_counting:
            gap_counts = ['ungapped', 'gapped']
        else:
            gap_counts = ['allreads']

        self.counts_array = {}
        for o in orientation:
            for g in gap_counts:
                # every key gets its own, independent list of zeros
                self.counts_array["{}:{}".format(o, g)] = [0] * self.length

        # define position_array
        # values  : chromosomal 1-based nucleotide positions in 5' to 3'
        #           orientation WITH RESPECT TO THE FEATURE
        # Example :
        #       + strand:   [10,11,12,13,14,15]
        #       - strand:   [15,14,13,12,11,10]
        # so position_array[0] is always the start of the feature (with upstream padding)
        #    position_array[-1] is always the end of the feature (with downstream padding)
        if self.strand == "-":
            # chromosome start = feature end
            # chromosome end   = feature start
            if count_method == 'start':
                start = end
            elif count_method == 'end':
                end = start
            region_start = start - self.padding[
                'Downstream']  # start is really end
            region_end = end + self.padding['Upstream']  # end is really start
            # list() keeps this a real list even where range() is lazy
            positions = list(range(region_start,
                                   region_end + 1))  # inclusive list
            positions.reverse()
        else:
            if count_method == 'start':
                end = start  # set both start and end to the start value
            elif count_method == 'end':
                start = end  # set both start and end to the end value
            region_start = start - self.padding['Upstream']
            region_end = end + self.padding['Downstream']
            positions = list(range(region_start,
                                   region_end + 1))  # inclusive list

        self.position_array = positions
def metagene_count():
    """Chain of command for metagene_count analysis.

    Reads the command-line arguments, configures the Read and Feature classes
    from the alignment file, builds the metagene template and then, for every
    non-comment line of the feature file, counts the reads that
    `samtools view` returns for the feature's region and writes the resulting
    profile to "<output_prefix>.metagene_counts.csv".

    The output file is opened once and always starts empty.  The
    "# Metagene:" line and the metagene.print_full() header are written at the
    top unless arguments.interval_variable is set; in that case each feature
    is printed with interval_override enabled instead.

    Raises MetageneError when the metagene template or the feature format
    cannot be set up, or when the region of a feature cannot be pulled from
    the alignment file.
    """
    arguments = get_arguments()
    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings so that \d reaches the regex engine untouched)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # open the output once in write mode so that results of an earlier run are
    # never mixed into this one, whichever header mode is used
    with open(output_path, 'w') as output_file:
        # print out the header line...
        if not arguments.interval_variable:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

        # for each feature
        with open(arguments.feature, 'r') as feature_file:
            for feature_line in read_chunk(feature_file, 1024):
                # skip blank lines and comment lines
                if not feature_line.strip() or feature_line.startswith("#"):
                    continue

                # change creation with feature_method
                feature = Feature.create(arguments.feature_count, metagene, feature_line,
                                         arguments.count_splicing, arguments.ignore_strand)

                # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
                # than Feature.get_chromosome_region() because only the first ensures that the interval does not
                # extend beyond the length of the chromosome which makes samtools view return no reads
                # NOTE(review): the command is assembled as a single string from arguments.alignment;
                # confirm how run_pipe executes it before passing untrusted file names
                (run_pipe_worked, sam_sample) = run_pipe(['samtools view {} {}'.format(
                    arguments.alignment,
                    feature.get_samtools_region())])
                if not run_pipe_worked:
                    raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                        feature.get_chromosome_region(),
                        feature.name,
                        arguments.alignment))

                for samline in sam_sample:
                    if len(samline) == 0:
                        continue
                    # create Read feature
                    (created_read, read) = Read.create_from_sam(samline,
                                                                Feature.chromosome_conversion.values(),
                                                                arguments.count_method,
                                                                arguments.uniquely_mapping,
                                                                arguments.ignore_strand,
                                                                arguments.count_secondary_alignments,
                                                                arguments.count_failed_quality_control,
                                                                arguments.count_PCR_optical_duplicate,
                                                                arguments.count_supplementary_alignment)

                    # count read (if it exists)
                    if created_read:
                        feature.count_read(read, arguments.count_method, arguments.count_splicing,
                                           arguments.count_partial_reads, arguments.ignore_strand)

                # output the resulting metagene
                output_file.write(
                    "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))
Esempio n. 10
0
    def __init__(
        self,
        count_method,
        metagene_object,
        name,
        chromosome,
        start,
        end,
        strand,
        gap_counting=False,
        ignore_strand=False,
    ):
        """Define a new feature with its own counting and position arrays.

        Not normally called directly; use Feature.create(file_format, count_method,
        metagene_object, feature_line, chromosome_conversion_table) to call indirectly.

        The feature is initialized as a Metagene whose interval is the length of
        the feature itself (or 1 when only the start or end is counted) and
        whose up and downstream padding are copied from metagene_object.

        Arguments:
        count_method    -- "all" spans the whole feature; "start" / "end"
                           collapse the feature to that single position
        metagene_object -- template metagene; supplies padding["Upstream"],
                           padding["Downstream"] and feature_interval (stored
                           here as metagene_length)
        name            -- feature name
        chromosome      -- chromosome name as written in the feature file; it is
                           translated through Feature.chromosome_conversion
        start, end      -- genomic (1-based) positions, checked with
                           confirm_integer against 1..chromosome size.  The
                           interval is computed as end - start + 1, so start is
                           expected to be the smaller chromosomal coordinate;
                           for a - strand (Crick strand) feature the chromosomal
                           end is therefore the 5' start of the feature.
        strand          -- "+" or "-"; any other value is stored as "."
        gap_counting    -- if True keep separate "ungapped" and "gapped" counts,
                           otherwise a single "allreads" count
        ignore_strand   -- if True keep only "unstranded" counts

        Sets name, chromosome, strand, metagene_length, counts_array and
        position_array on the instance.
        """
        chromosome = Feature.chromosome_conversion[chromosome]  # convert to BAM-like chromosome designation
        chromosome_size = Read.chromosome_sizes[chromosome]
        if confirm_integer(start, "Start", minimum=1, maximum=chromosome_size) and confirm_integer(
            end, "End", minimum=1, maximum=chromosome_size
        ):
            start = int(start)
            end = int(end)

        # Define feature-specific metagene where feature_interval represents
        # the length of the feature NOT the length of the final metagene interval
        if count_method == "all":
            interval = end - start + 1  # length of feature
        else:
            interval = 1  # length of the start (or end) of feature

        Metagene.__init__(self, interval, metagene_object.padding["Upstream"], metagene_object.padding["Downstream"])
        self.name = name
        self.chromosome = chromosome
        self.strand = strand
        self.metagene_length = metagene_object.feature_interval

        # define counts_array dictionary
        # key: orientation:gap_counts string
        #      where orientation = {'unstranded', 'sense', 'antisense'}
        #            gap_counts  = {'ungapped', 'gapped', 'allreads'}
        #      'ungapped' + 'gapped' = 'allreads'
        #      'sense' + 'antisense' = 'unstranded'
        #
        # values: arrays of self.length initialized to 0
        if self.strand != "+" and self.strand != "-":
            self.strand = "."
            orientation = ["unstranded"]
        elif ignore_strand:
            orientation = ["unstranded"]
        else:
            orientation = ["sense", "antisense"]
        if gap_counting:
            gap_counts = ["ungapped", "gapped"]
        else:
            gap_counts = ["allreads"]

        self.counts_array = {}
        for o in orientation:
            for g in gap_counts:
                # every key gets its own independent list of zeros
                self.counts_array["{}:{}".format(o, g)] = [0] * self.length

        # define position_array
        # values  : chromosomal 1-based nucleotide positions in 5' to 3'
        #           orientation WITH RESPECT TO THE FEATURE
        # Example :
        #       + strand:   [10,11,12,13,14,15]
        #       - strand:   [15,14,13,12,11,10]
        # so position_array[0] is always the start of the feature (with upstream padding)
        #    position_array[-1] is always the end of the feature (with downstream padding)
        upstream = self.padding["Upstream"]
        downstream = self.padding["Downstream"]
        if self.strand == "-":
            # chromosome start = feature end
            # chromosome end   = feature start
            if count_method == "start":
                start = end
            elif count_method == "end":
                end = start
            region_start = start - downstream  # start is really end
            region_end = end + upstream  # end is really start
            # list(...) keeps this a real, reversible list on Python 2 and 3 alike;
            # a bare range object has no reverse() method on Python 3
            positions = list(range(region_start, region_end + 1))  # inclusive list
            positions.reverse()
        else:
            if count_method == "start":
                end = start  # set both start and end to the start value
            elif count_method == "end":
                start = end  # set both start and end to the end value
            region_start = start - upstream
            region_end = end + downstream
            positions = list(range(region_start, region_end + 1))  # inclusive list

        self.position_array = positions
Esempio n. 11
0
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:
      1. parse the command line (get_arguments)
      2. read chromosome sizes from the alignment and build the feature-file
         to alignment chromosome name conversion
      3. register the abundance (NA:i:) and mappings (NH:i:) SAM tag patterns
      4. build the metagene template and set the feature file format
      5. unless arguments.interval_variable is set, (re)create
         <output_prefix>.metagene_counts.csv with the metagene header
      6. for every non-comment feature line: create the feature, pull its
         region with `samtools view`, count every usable read and append the
         feature's metagene to the output file

    Raises MetageneError if the metagene template or the feature format cannot
    be set up, or if the region of a feature cannot be pulled from the BAM file.
    """
    arguments = get_arguments()
    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: \d must reach the regex engine untouched)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment,
                     r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment,
                     r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding,
                            arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        # assign file format for the feature file
        Feature.set_format(arguments.feature)
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # print out the header line...
    # (with interval_variable the file is only ever appended to below, because
    # print_metagene(interval_override=True) emits a header per feature)
    if not arguments.interval_variable:
        with open(output_path, 'w') as output_file:
            # define for plotting later
            output_file.write("# Metagene:\t{}\n".format(metagene))
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            if feature_line[0] == "#":  # skip comment lines
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe([
                'samtools view {} {}'.format(arguments.alignment,
                                             feature.get_samtools_region())
            ])
            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}."
                    .format(feature.get_chromosome_region(), feature.name,
                            arguments.alignment))

            # fetched once per feature instead of once per SAM line
            bam_chromosomes = Feature.chromosome_conversion.values()
            for samline in sam_sample:
                if not samline:  # ignore empty lines in the samtools output
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(
                    samline,
                    bam_chromosomes,
                    arguments.count_method,
                    arguments.uniquely_mapping,
                    arguments.ignore_strand,
                    arguments.count_secondary_alignments,
                    arguments.count_failed_quality_control,
                    arguments.count_PCR_optical_duplicate,
                    arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(
                        read, arguments.count_method,
                        arguments.count_splicing,
                        arguments.count_partial_reads,
                        arguments.ignore_strand)

            # output the resulting metagene
            with open(output_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(
                        interval_override=arguments.interval_variable)))