def print_metagene(self, pretty=False, header=False, interval_override=False):
    """Convert counts_array data to finalized metagene profiles for printing.

    Standard printing is in comma-delimited lines for input into
    metagene_analysis.py.  Pretty printing (pretty=True) gives a human
    readable, if potentially super long, version.

    Parameters:
        pretty: when true, emit one "<subset>:<tab>v1,v2,..." line per subset
            with 2 decimal places; otherwise emit "<name>,<subset>,v1,..."
            lines with 3 decimal places.
        header: when true (and interval_override is false), prefix the output
            with Metagene(...).print_full(pretty) built from
            self.metagene_length.
        interval_override: when true, the interval counts are printed as-is
            (no call to self.adjust_to_metagene) and the output is prefixed
            with a "# Metagene:" line plus the plain header of a Metagene
            built from self.feature_interval.  Takes precedence over header.

    Returns:
        The assembled text with leading/trailing whitespace stripped, which
        removes the final newline.
    """
    upstream_padding = self.padding["Upstream"]

    # Collect the pieces in a list and join once at the end rather than
    # growing a string with += for every value.
    chunks = []
    if interval_override:
        metagene = Metagene(self.feature_interval, upstream_padding, self.padding["Downstream"])
        chunks.append("# Metagene:\t{}\n".format(metagene))
        chunks.append(metagene.print_full())
    elif header:
        metagene = Metagene(self.metagene_length, upstream_padding, self.padding["Downstream"])
        chunks.append(metagene.print_full(pretty))

    interval_end = upstream_padding + self.feature_interval

    # process each subset grouping (reverse-sorted subset names)
    for subset in sorted(self.counts_array, reverse=True):
        counts = self.counts_array[subset]

        # break counts into sections -> upstream padding, interval feature,
        # and downstream padding
        upstream_counts = counts[:upstream_padding]
        interval_counts = counts[upstream_padding:interval_end]
        downstream_counts = counts[interval_end:]

        if interval_override:
            metagene_interval_counts = interval_counts
        else:
            # compress (or expand) interval_counts to match the size of the
            # internal metagene
            metagene_interval_counts = self.adjust_to_metagene(interval_counts)

        sections = (upstream_counts, metagene_interval_counts, downstream_counts)
        values = [value for section in sections for value in section]

        if pretty:
            # keep 2 decimal places, right-aligned in a 5-character column
            line = "{0:15s}:\t".format(subset) + "".join(
                "{0:>5s},".format("{0:3.2f}".format(value)) for value in values)
            # drop the final character (the trailing comma) before the newline
            chunks.append(line[:-1] + "\n")
        else:
            # keep 3 decimal places in the outputted float
            line = "{},{}".format(self.name, subset) + "".join(
                ",{0:0.3f}".format(value) for value in values)
            chunks.append(line + "\n")

    return "".join(chunks).strip()  # remove trailing "\n"
def check_print_metagene_plain(test, values):
    """Assert that the plain Metagene header spans the expected positions.

    A Metagene is built from ``Metagene(*values)`` and printed with
    ``print_full()``.  The third comma-separated field and the last field of
    that output are parsed as integers and compared against
    ``(-values[1], values[0] + values[2] - 1)``.

    Parameters:
        test: label for the test case, used only in the failure message.
        values: positional arguments for Metagene; presumably
            (interval, upstream padding, downstream padding), matching how
            Metagene is constructed elsewhere in this file.

    Raises:
        AssertionError: when the printed range differs from the expected one.
    """
    first_position = -values[1]
    last_position = values[0] + values[2] - 1
    expected = (first_position, last_position)

    plain_print = Metagene(*values).print_full().strip()
    fields = plain_print.split(",")
    new_range = (int(fields[2]), int(fields[-1]))

    print("\n\tOutput:\n\t{}".format(plain_print))

    # Assemble the diagnostic text shown when the assertion fails.
    test_description = "".join([
        "\nTest: \t{}\n".format(test),
        "Expected:\t{}\n".format(expected),
        "Range: \t{}\n".format(new_range),
        "Output: \t{}\n".format(plain_print),
    ])
    assert new_range == expected, "{}Error: \tPrinted metagene does not match expected.".format(test_description)
def check_print_metagene_pretty(test, values):
    """Assert that the pretty Metagene header labels match ``values``.

    A Metagene is built from ``Metagene(*values)`` and printed with
    ``print_full(pretty=True)``.  The occurrences of 'int', 'up' and 'down'
    in that text are counted (in that order) and the resulting tuple must
    equal ``values``.

    Parameters:
        test: label for the test case, used only in the failure message.
        values: positional arguments for Metagene; compared directly against
            a 3-tuple of counts, so it needs to be a tuple for the check to
            pass.

    Raises:
        AssertionError: when the label counts differ from ``values``.
    """
    pretty_print = Metagene(*values).print_full(pretty=True).strip()

    labels = ('int', 'up', 'down')
    new_values = tuple(len(re.findall(label, pretty_print)) for label in labels)

    # Indent continuation lines so the echoed output lines up under the tab.
    print("\n\tOutput:\n\t{}".format(pretty_print.replace("\n", "\n\t")))

    # Assemble the diagnostic text shown when the assertion fails.
    test_description = "".join([
        "\nTest: \t{}\n".format(test),
        "Expected:\t{}\n".format(values),
        "Range: \t{}\n".format(new_values),
        "Output: \t{}\n".format(pretty_print),
    ])
    assert new_values == values, "{}Error: \tPrinted metagene does not match expected.".format(test_description)
def print_metagene(self, pretty=False, header=False, interval_override=False):
    """Convert counts_array data to finalized metagene profiles for printing.

    Standard printing is in comma-delimited lines for input into
    metagene_analysis.py.  Pretty printing (pretty=True) gives a human
    readable, if potentially super long, version.

    Parameters:
        pretty: when true, emit one "<subset>:<tab>v1,v2,..." line per subset
            with 2 decimal places; otherwise emit "<name>,<subset>,v1,..."
            lines with 3 decimal places.
        header: when true (and interval_override is false), prefix the output
            with Metagene(...).print_full(pretty) built from
            self.metagene_length.
        interval_override: when true, the interval counts are printed as-is
            (no call to self.adjust_to_metagene) and the output is prefixed
            with a "# Metagene:" line plus the plain header of a Metagene
            built from self.feature_interval.  Takes precedence over header.

    Returns:
        The assembled text with leading/trailing whitespace stripped, which
        removes the final newline.
    """
    upstream_padding = self.padding['Upstream']

    # Collect the pieces in a list and join once at the end rather than
    # growing a string with += for every value.
    chunks = []
    if interval_override:
        metagene = Metagene(self.feature_interval, upstream_padding, self.padding['Downstream'])
        chunks.append("# Metagene:\t{}\n".format(metagene))
        chunks.append(metagene.print_full())
    elif header:
        metagene = Metagene(self.metagene_length, upstream_padding, self.padding['Downstream'])
        chunks.append(metagene.print_full(pretty))

    interval_end = upstream_padding + self.feature_interval

    # process each subset grouping (reverse-sorted subset names)
    for subset in sorted(self.counts_array, reverse=True):
        counts = self.counts_array[subset]

        # break counts into sections -> upstream padding, interval feature,
        # and downstream padding
        upstream_counts = counts[:upstream_padding]
        interval_counts = counts[upstream_padding:interval_end]
        downstream_counts = counts[interval_end:]

        if interval_override:
            metagene_interval_counts = interval_counts
        else:
            # compress (or expand) interval_counts to match the size of the
            # internal metagene
            metagene_interval_counts = self.adjust_to_metagene(interval_counts)

        sections = (upstream_counts, metagene_interval_counts, downstream_counts)
        values = [value for section in sections for value in section]

        if pretty:
            # keep 2 decimal places, right-aligned in a 5-character column
            line = "{0:15s}:\t".format(subset) + "".join(
                "{0:>5s},".format("{0:3.2f}".format(value)) for value in values)
            # drop the final character (the trailing comma) before the newline
            chunks.append(line[:-1] + "\n")
        else:
            # keep 3 decimal places in the outputted float
            line = "{},{}".format(self.name, subset) + "".join(
                ",{0:0.3f}".format(value) for value in values)
            chunks.append(line + "\n")

    return "".join(chunks).strip()  # remove trailing "\n"
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:
      1. parse the command-line arguments (get_arguments),
      2. configure the Read and Feature classes from the alignment file and
         the chromosome-name options,
      3. build the Metagene template and determine the feature file format,
      4. create "<output_prefix>.metagene_counts.csv" (with the metagene
         header unless arguments.interval_variable is set),
      5. for every non-comment feature line, pull the overlapping alignments
         with "samtools view", count the reads on the feature, and append
         the feature's metagene line(s) to the output file.

    Raises:
        MetageneError: when the metagene template or the feature format
            cannot be set up, or when run_pipe reports that the samtools
            call for a feature did not work.
    """
    arguments = get_arguments()

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    # TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in a normal string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Always start from an empty output file.  Previously the file was only
    # truncated when the shared header was written, so with
    # arguments.interval_variable set a rerun appended to stale results.
    with open(output_path, 'w') as output_file:
        if not arguments.interval_variable:
            # print out the header line...
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            if feature_line[0] == "#":  # skip comment lines
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count,
                                     metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            # NOTE(review): the command is assembled by string formatting; confirm how run_pipe
            # executes it if alignment paths may contain spaces or shell metacharacters.
            samtools_command = 'samtools view {} {}'.format(arguments.alignment, feature.get_samtools_region())
            (run_pipe_worked, sam_sample) = run_pipe([samtools_command])

            if not run_pipe_worked:
                raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                    feature.get_chromosome_region(),
                    feature.name,
                    arguments.alignment))

            for samline in sam_sample:
                if len(samline) == 0:
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(samline,
                                                            Feature.chromosome_conversion.values(),
                                                            arguments.count_method,
                                                            arguments.uniquely_mapping,
                                                            arguments.ignore_strand,
                                                            arguments.count_secondary_alignments,
                                                            arguments.count_failed_quality_control,
                                                            arguments.count_PCR_optical_duplicate,
                                                            arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(read,
                                       arguments.count_method,
                                       arguments.count_splicing,
                                       arguments.count_partial_reads,
                                       arguments.ignore_strand)

            # output the resulting metagene; appended per feature so results
            # already computed are on disk if a later feature fails
            with open(output_path, 'a') as output_file:
                output_file.write(
                    "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:
      1. parse the command-line arguments (get_arguments),
      2. configure the Read and Feature classes from the alignment file and
         the chromosome-name options,
      3. build the Metagene template and determine the feature file format,
      4. create "<output_prefix>.metagene_counts.csv" (with the metagene
         header unless arguments.interval_variable is set),
      5. for every non-comment feature line, pull the overlapping alignments
         with "samtools view", count the reads on the feature, and append
         the feature's metagene line(s) to the output file.

    Raises:
        MetageneError: when the metagene template or the feature format
            cannot be set up, or when run_pipe reports that the samtools
            call for a feature did not work.
    """
    arguments = get_arguments()

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    # TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in a normal string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Always start from an empty output file.  Previously the file was only
    # truncated when the shared header was written, so with
    # arguments.interval_variable set a rerun appended to stale results.
    with open(output_path, 'w') as output_file:
        if not arguments.interval_variable:
            # print out the header line...
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            if feature_line[0] == "#":  # skip comment lines
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count,
                                     metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            # NOTE(review): the command is assembled by string formatting; confirm how run_pipe
            # executes it if alignment paths may contain spaces or shell metacharacters.
            samtools_command = 'samtools view {} {}'.format(arguments.alignment, feature.get_samtools_region())
            (run_pipe_worked, sam_sample) = run_pipe([samtools_command])

            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}."
                    .format(feature.get_chromosome_region(),
                            feature.name,
                            arguments.alignment))

            for samline in sam_sample:
                if len(samline) == 0:
                    continue

                # create Read feature
                (created_read, read) = Read.create_from_sam(
                    samline,
                    Feature.chromosome_conversion.values(),
                    arguments.count_method,
                    arguments.uniquely_mapping,
                    arguments.ignore_strand,
                    arguments.count_secondary_alignments,
                    arguments.count_failed_quality_control,
                    arguments.count_PCR_optical_duplicate,
                    arguments.count_supplementary_alignment)

                # count read (if it exists)
                if created_read:
                    feature.count_read(
                        read,
                        arguments.count_method,
                        arguments.count_splicing,
                        arguments.count_partial_reads,
                        arguments.ignore_strand)

            # output the resulting metagene; appended per feature so results
            # already computed are on disk if a later feature fails
            with open(output_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(interval_override=arguments.interval_variable)))