Example #1
0
    def print_metagene(self, pretty=False, header=False, interval_override=False):
        """Convert counts_array data to finalized metagene profiles for printing.

        Standard printing is in comma-delimited lines for input into metagene_analysis.py
        Pretty printing (pretty=True) gives a human readable, if potentially super long, version

        Keyword arguments:
        pretty -- when True, each row is "<subset, padded to 15>:<tab>" followed by
                  right-aligned values with 2 decimal places; otherwise each row is
                  "<self.name>,<subset>" followed by values with 3 decimal places
        header -- when True (and interval_override is False), the output starts with
                  Metagene.print_full(pretty) for a metagene of self.metagene_length
        interval_override -- when True, the interval counts are reported as-is instead
                  of being passed through self.adjust_to_metagene(), and the output
                  starts with a "# Metagene:" line plus Metagene.print_full() for a
                  metagene of self.feature_interval (takes precedence over header)

        Returns one string, one row per subset (subsets in reverse sorted order),
        with leading/trailing whitespace stripped.
        """
        upstream = self.padding["Upstream"]
        interval_end = upstream + self.feature_interval

        # fragments are collected and joined once instead of growing a string with +=
        fragments = []
        if interval_override:
            metagene = Metagene(self.feature_interval, upstream, self.padding["Downstream"])
            fragments.append("# Metagene:\t{}\n".format(metagene))
            fragments.append(metagene.print_full())
        elif header:
            metagene = Metagene(self.metagene_length, upstream, self.padding["Downstream"])
            fragments.append(metagene.print_full(pretty))

        # process each subset grouping
        for subset in sorted(self.counts_array, reverse=True):
            counts = self.counts_array[subset]
            # break counts into sections -> upstream padding, interval_feature, and downstream padding
            upstream_counts = counts[:upstream]
            interval_counts = counts[upstream:interval_end]
            downstream_counts = counts[interval_end:]
            if not interval_override:
                # compress (or expand) interval_counts to match the size of the internal metagene
                interval_counts = self.adjust_to_metagene(interval_counts)
            sections = (upstream_counts, interval_counts, downstream_counts)

            if pretty:
                row = "{0:15s}:\t".format(subset) + "".join(
                    "{0:>5s},".format("{0:3.2f}".format(value))  # keep 2 decimal places in the outputted float
                    for section in sections
                    for value in section
                )
                # drop the final character (normally the trailing comma) before ending the row
                fragments.append(row[:-1] + "\n")
            else:
                fragments.append("{},{}".format(self.name, subset))
                fragments.extend(
                    ",{0:0.3f}".format(value)  # keep 3 decimal places in the outputted float
                    for section in sections
                    for value in section
                )
                fragments.append("\n")

        return "".join(fragments).strip()  # remove trailing "\n"
Example #2
0
def check_print_metagene_plain(test, values):
    """Check the position range printed by a plain Metagene.print_full().

    Arguments:
    test -- label for this check, included in the failure message
    values -- positional arguments for Metagene(); elsewhere in this code they are
              passed as (interval length, upstream padding, downstream padding)

    The expected range runs from -values[1] to values[0] + values[2] - 1.  It is
    compared against the third and the last comma-separated fields of the printed
    line (presumably the first two fields are labels -- confirm against
    Metagene.print_full).

    Raises AssertionError with a description of the mismatch.
    """
    expected = (-values[1], values[0] + values[2] - 1)
    plain_print = Metagene(*values).print_full().strip()
    fields = plain_print.split(",")
    new_range = (int(fields[2]), int(fields[-1]))
    # single-argument parenthesised print behaves the same on Python 2 and parses on Python 3
    print("\n\tOutput:\n\t{}".format(plain_print))
    test_description = "".join([
        "\nTest:    \t{}\n".format(test),
        "Expected:\t{}\n".format(expected),
        "Range:   \t{}\n".format(new_range),
        "Output:  \t{}\n".format(plain_print),
    ])
    assert new_range == expected, "{}Error:   \tPrinted metagene does not match expected.".format(test_description)
Example #3
0
def check_print_metagene_pretty(test, values):
    """Check the labels printed by a pretty Metagene.print_full(pretty=True).

    Arguments:
    test -- label for this check, included in the failure message
    values -- positional arguments for Metagene(); compared, in order, with the
              number of 'int', 'up' and 'down' occurrences in the pretty output

    Raises AssertionError with a description of the mismatch.
    """
    # the counts below form a tuple, so normalise the expectation to a tuple as
    # well; otherwise a list of values could never compare equal
    expected = tuple(values)
    pretty_print = Metagene(*values).print_full(pretty=True).strip()
    new_values = tuple(len(re.findall(label, pretty_print)) for label in ('int', 'up', 'down'))
    # single-argument parenthesised print behaves the same on Python 2 and parses on Python 3
    print("\n\tOutput:\n\t{}".format("\n\t".join(pretty_print.split("\n"))))
    test_description = "".join([
        "\nTest:    \t{}\n".format(test),
        "Expected:\t{}\n".format(expected),
        "Range:   \t{}\n".format(new_values),
        "Output:  \t{}\n".format(pretty_print),
    ])
    assert new_values == expected, "{}Error:   \tPrinted metagene does not match expected.".format(test_description)
Example #4
0
    def print_metagene(self,
                       pretty=False,
                       header=False,
                       interval_override=False):
        """Convert counts_array data to finalized metagene profiles for printing.

        Standard printing is in comma-delimited lines for input into metagene_analysis.py
        Pretty printing (pretty=True) gives a human readable, if potentially super long, version

        Keyword arguments:
        pretty -- when True, each row is "<subset, padded to 15>:<tab>" followed by
                  right-aligned values with 2 decimal places; otherwise each row is
                  "<self.name>,<subset>" followed by values with 3 decimal places
        header -- when True (and interval_override is False), the output starts with
                  Metagene.print_full(pretty) for a metagene of self.metagene_length
        interval_override -- when True, the interval counts are reported as-is instead
                  of being passed through self.adjust_to_metagene(), and the output
                  starts with a "# Metagene:" line plus Metagene.print_full() for a
                  metagene of self.feature_interval (takes precedence over header)

        Returns one string, one row per subset (subsets in reverse sorted order),
        with leading/trailing whitespace stripped.
        """
        upstream = self.padding['Upstream']
        interval_end = upstream + self.feature_interval

        # fragments are collected and joined once instead of growing a string with +=
        fragments = []
        if interval_override:
            metagene = Metagene(self.feature_interval, upstream,
                                self.padding['Downstream'])
            fragments.append("# Metagene:\t{}\n".format(metagene))
            fragments.append(metagene.print_full())
        elif header:
            metagene = Metagene(self.metagene_length, upstream,
                                self.padding['Downstream'])
            fragments.append(metagene.print_full(pretty))

        # process each subset grouping
        for subset in sorted(self.counts_array, reverse=True):
            counts = self.counts_array[subset]
            # break counts into sections -> upstream padding, interval_feature, and downstream padding
            upstream_counts = counts[:upstream]
            interval_counts = counts[upstream:interval_end]
            downstream_counts = counts[interval_end:]
            if not interval_override:
                # compress (or expand) interval_counts to match the size of the internal metagene
                interval_counts = self.adjust_to_metagene(interval_counts)
            sections = (upstream_counts, interval_counts, downstream_counts)

            if pretty:
                # keep 2 decimal places in the outputted float
                cells = [
                    "{0:>5s},".format("{0:3.2f}".format(value))
                    for section in sections for value in section
                ]
                row = "{0:15s}:\t".format(subset) + "".join(cells)
                # drop the final character (normally the trailing comma) before ending the row
                fragments.append(row[:-1] + "\n")
            else:
                # keep 3 decimal places in the outputted float
                cells = [
                    ",{0:0.3f}".format(value)
                    for section in sections for value in section
                ]
                fragments.append("{},{}".format(self.name, subset))
                fragments.extend(cells)
                fragments.append("\n")

        return "".join(fragments).strip()  # remove trailing "\n"
def metagene_count():
    """Chain of command for metagene_count analysis.

    Reads the command-line arguments, configures the Read and Feature classes from
    the alignment and feature files, builds the Metagene template, then for every
    feature line pulls the overlapping alignments with `samtools view`, counts them
    on the feature and appends the feature's metagene row(s) to
    "<output_prefix>.metagene_counts.csv".

    Raises MetageneError if the metagene template or the feature format cannot be
    set up, or if the alignments for a feature's region cannot be retrieved.
    """
    arguments = get_arguments()
    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is a regex escape, not a string escape)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    # print out the header line...
    # NOTE(review): this is the only place the output file is truncated; with
    # interval_variable set, rows are appended to whatever the file already holds --
    # confirm that is intended.
    if not arguments.interval_variable:
        with open(output_path, 'w') as output_file:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip empty lines (indexing them would raise IndexError) and comment lines
            if not feature_line or feature_line[0] == "#":
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene, feature_line, arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            # NOTE(review): the command is assembled as a single string from user-supplied
            # values; check how run_pipe executes it if those values may contain shell
            # metacharacters.
            samtools_command = 'samtools view {} {}'.format(arguments.alignment, feature.get_samtools_region())
            (run_pipe_worked, sam_sample) = run_pipe([samtools_command])
            if not run_pipe_worked:
                raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                    feature.get_chromosome_region(),
                    feature.name,
                    arguments.alignment))

            for samline in sam_sample:
                if len(samline) == 0:
                    continue
                # create Read feature
                (created_read, read) = Read.create_from_sam(samline,
                                                            Feature.chromosome_conversion.values(),
                                                            arguments.count_method,
                                                            arguments.uniquely_mapping,
                                                            arguments.ignore_strand,
                                                            arguments.count_secondary_alignments,
                                                            arguments.count_failed_quality_control,
                                                            arguments.count_PCR_optical_duplicate,
                                                            arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(read, arguments.count_method, arguments.count_splicing,
                                       arguments.count_partial_reads, arguments.ignore_strand)

            # output the resulting metagene
            with open(output_path, 'a') as output_file:
                output_file.write(
                    "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))
Example #6
0
def metagene_count():
    """Chain of command for metagene_count analysis.

    Reads the command-line arguments, configures the Read and Feature classes from
    the alignment and feature files, builds the Metagene template, then for every
    feature line pulls the overlapping alignments with `samtools view`, counts them
    on the feature and appends the feature's metagene row(s) to
    "<output_prefix>.metagene_counts.csv".

    Raises MetageneError if the metagene template or the feature format cannot be
    set up, or if the alignments for a feature's region cannot be retrieved.
    """
    arguments = get_arguments()
    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)
    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is a regex escape, not a string escape)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment,
                     r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment,
                     r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding,
                            arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(
            arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    # print out the header line...
    # NOTE(review): this is the only place the output file is truncated; with
    # interval_variable set, rows are appended to whatever the file already holds --
    # confirm that is intended.
    if not arguments.interval_variable:
        with open(output_path, 'w') as output_file:
            output_file.write("# Metagene:\t{}\n".format(
                metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip empty lines (indexing them would raise IndexError) and comment lines
            if not feature_line or feature_line[0] == "#":
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            # NOTE(review): the command is assembled as a single string from user-supplied
            # values; check how run_pipe executes it if those values may contain shell
            # metacharacters.
            samtools_command = 'samtools view {} {}'.format(
                arguments.alignment, feature.get_samtools_region())
            (run_pipe_worked, sam_sample) = run_pipe([samtools_command])
            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}."
                    .format(feature.get_chromosome_region(), feature.name,
                            arguments.alignment))

            for samline in sam_sample:
                if len(samline) == 0:
                    continue
                # create Read feature
                (created_read, read) = Read.create_from_sam(
                    samline,
                    Feature.chromosome_conversion.values(),
                    arguments.count_method,
                    arguments.uniquely_mapping,
                    arguments.ignore_strand,
                    arguments.count_secondary_alignments,
                    arguments.count_failed_quality_control,
                    arguments.count_PCR_optical_duplicate,
                    arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(
                        read, arguments.count_method,
                        arguments.count_splicing,
                        arguments.count_partial_reads,
                        arguments.ignore_strand)

            # output the resulting metagene
            with open(output_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(
                        interval_override=arguments.interval_variable)))