def print_metagene(self, pretty=False, header=False, interval_override=False):
    """Convert counts_array data to finalized metagene profiles for printing.

    Standard printing is in comma-delimited lines for input into
    metagene_analysis.py.  Pretty printing (pretty=True) gives a human
    readable, if potentially super long, version.

    One row is produced per key of ``self.counts_array``, visiting the keys
    in reverse-sorted order.  Every row is the upstream padding counts,
    followed by the interval counts, followed by the downstream padding
    counts.

    Parameters:
        pretty: when true, each row is ``"<subset>:\\t"`` followed by values
            with 2 decimal places right-aligned in 5 columns; otherwise each
            row is ``"<name>,<subset>"`` followed by values with 3 decimal
            places.
        header: when true (and interval_override is false), the output
            starts with ``print_full(pretty)`` of a Metagene sized by
            ``self.metagene_length``.
        interval_override: when true, the interval counts are emitted as-is
            instead of being passed through ``self.adjust_to_metagene`` and
            the output starts with a "# Metagene:" line plus ``print_full()``
            of a Metagene sized by ``self.feature_interval``.

    Returns:
        The assembled text with leading/trailing whitespace stripped.
    """
    upstream_size = self.padding["Upstream"]
    downstream_size = self.padding["Downstream"]

    if interval_override:
        metagene = Metagene(self.feature_interval, upstream_size, downstream_size)
        pieces = ["# Metagene:\t{}\n".format(metagene), metagene.print_full()]
    elif header:
        metagene = Metagene(self.metagene_length, upstream_size, downstream_size)
        pieces = [metagene.print_full(pretty)]
    else:
        pieces = []

    interval_end = upstream_size + self.feature_interval

    # process each subset grouping
    for subset in sorted(self.counts_array, reverse=True):
        counts = self.counts_array[subset]

        # break counts into sections -> upstream padding, interval, downstream padding
        interval_counts = counts[upstream_size:interval_end]
        if not interval_override:
            # compress (or expand) interval_counts to match the size of the internal metagene
            interval_counts = self.adjust_to_metagene(interval_counts)

        values = list(counts[:upstream_size])
        values.extend(interval_counts)
        values.extend(counts[interval_end:])

        if pretty:
            # keep 2 decimal places, right-aligned in a 5 character column
            row = "{0:15s}:\t".format(subset) + ",".join("{0:>5.2f}".format(value) for value in values)
        else:
            # keep 3 decimal places in the outputted float
            fields = ["{},{}".format(self.name, subset)]
            fields.extend("{0:0.3f}".format(value) for value in values)
            row = ",".join(fields)
        pieces.append(row + "\n")

    # a single join avoids rebuilding the whole string for every value;
    # strip() removes the trailing "\n"
    return "".join(pieces).strip()
def check_print_metagene_plain(test, values):
    """Check the plain print_full() output of a Metagene built from *values*.

    The printed text is split on commas; the third field and the last field
    are read as integers and compared against
    ``(-values[1], values[0] + values[2] - 1)``.

    Parameters:
        test: label for the test case, used only in the failure message.
        values: positional arguments handed to Metagene (unpacked with ``*``).

    Raises:
        AssertionError: when the printed range differs from the expected one.
    """
    wanted_range = (-values[1], values[0] + values[2] - 1)

    rendered = Metagene(*values).print_full().strip()
    fields = rendered.split(",")
    # third field is where the printed range starts; the last field ends it
    observed_range = (int(fields[2]), int(fields[-1]))

    print("\n\tOutput:\n\t{}".format(rendered))

    report = "".join([
        "\nTest: \t{}\n".format(test),
        "Expected:\t{}\n".format(wanted_range),
        "Range: \t{}\n".format(observed_range),
        "Output: \t{}\n".format(rendered),
    ])
    assert observed_range == wanted_range, "{}Error: \tPrinted metagene does not match expected.".format(report)
def check_print_metagene_pretty(test, values):
    """Check the pretty print_full() output of a Metagene built from *values*.

    Counts the occurrences of 'int', 'up' and 'down' in the pretty-printed
    text and compares that triple against *values* itself.

    Parameters:
        test: label for the test case, used only in the failure message.
        values: positional arguments handed to Metagene (unpacked with ``*``).

    Raises:
        AssertionError: when the counted triple differs from *values*.
    """
    rendered = Metagene(*values).print_full(pretty=True).strip()

    # one count per label, in the same order as the Metagene arguments
    observed = tuple(len(re.findall(label, rendered)) for label in ('int', 'up', 'down'))

    # indent continuation lines so the whole output lines up under the heading
    print("\n\tOutput:\n\t{}".format(rendered.replace("\n", "\n\t")))

    report = "".join([
        "\nTest: \t{}\n".format(test),
        "Expected:\t{}\n".format(values),
        "Range: \t{}\n".format(observed),
        "Output: \t{}\n".format(rendered),
    ])
    assert observed == values, "{}Error: \tPrinted metagene does not match expected.".format(report)
def check_create_metagene(test, values):
    """Check that str() of a Metagene built from *values* has the expected form.

    Parameters:
        test: label for the test case, used only in the failure message.
        values: three-item sequence ``(interval, upstream, downstream)`` that
            is unpacked into the Metagene constructor.

    Raises:
        AssertionError: when ``str(metagene)`` differs from the expected text.
    """
    metagene = Metagene(*values)
    total_length = sum(values)
    interval, upstream, downstream = values

    wanted = "Upstream:{} -- Interval:{} -- Downstream:{}\tLength:{}".format(
        upstream, interval, downstream, total_length)

    report = "".join([
        "\nTest: \t{}\n".format(test),
        "Expected:\t{}\n".format(wanted),
        "Metagene:\t{}\n".format(metagene),
    ])
    assert str(metagene) == wanted, "{}Error: \tMetagene does not match expected.".format(report)
def setup():
    """Create fixtures, then exercise the Feature class and report results.

    The routine registers chromosome sizes and chromosome-name conversions,
    records a handful of entries in ``good_input``, and then prints
    PASSED / **FAILED** lines for:

    * creating Features from BED and GFF lines for each feature-count
      method ('all', 'start', 'end');
    * counting a fixed set of reads with each count method and comparing
      ``print_metagene()`` against hard-coded expected text;
    * stranded / unstranded read handling;
    * ``adjust_to_metagene`` expansion and contraction.

    NOTE(review): ``good_input`` and ``correct_features`` are not defined in
    this block; they are presumably module-level dictionaries -- confirm.
    NOTE(review): the block structure was reconstructed from source whose
    line breaks had been flattened; verify the nesting against the
    canonical file.
    """
    # Define chromosome sizes
    Read.extract_chromosome_sizes([
        "@HD\tVN:1.0\tSO:unsorted",
        "@SQ\tSN:chr1\tLN:300",
        "@SQ\tSN:chr2\tLN:200",
        "@PG\tID:test\tVN:0.1",
    ])
    Feature.process_set_chromosome_conversion(["1\tchr1", "2\tchr2"])

    good_input["bed input counting all of the read"] = (
        "all",
        "[17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]"
    )
    good_input["bed input counting start of the read"] = (
        "start", "[17, 18, 19, 20, 21, 22, 23]")
    good_input["bed input counting end of the read"] = (
        "end", "[36, 37, 38, 39, 40, 41, 42]")
    good_input["gff input counting all of the read"] = (
        "all",
        "[43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8]"
    )
    good_input["gff input counting start of the read"] = (
        "start", "[43, 42, 41, 40, 39, 38, 37]")
    good_input["gff input counting end of the read"] = (
        "end", "[14, 13, 12, 11, 10, 9, 8]")

    for method in ['all', 'start', 'end']:
        print("\nTesting feature_count option: ****{}****".format(method))
        # 'all' uses a 10-position interval; 'start'/'end' use a single position
        if method == 'all':
            metagene = Metagene(10, 4, 2)
        else:
            metagene = Metagene(1, 4, 2)
        print("\t with Metagene:\t{}".format(metagene))
        print("\t with chromosome conversions:\t{}".format(
            Feature.chromosome_conversion))

        # create feature from BED line
        # NOTE(review): a position mismatch prints **FAILED** inside the try
        # and the else clause then also prints PASSED, because no exception
        # was raised.
        try:
            bedline = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
                1, 20, 40, "first", 44, "+")
            print("\t with BED line:\t{}".format(bedline.strip()))
            feature1 = Feature.create_from_bed(method, metagene, bedline,
                                               False, False)
            if str(feature1.position_array) != correct_features['bed'][method]:
                print("**FAILED**\t Create Feature from BED line ?")
                print("\t Desired positions:\t{}".format(
                    correct_features['bed'][method]))
                print("\t Created positions:\t{}".format(
                    feature1.position_array))
        except MetageneError as err:
            print("**FAILED**\t Create Feature from BED line ?")
        else:
            print("PASSED\t Create Feature from BED line ?\t\t{}".format(
                feature1.get_chromosome_region()))

        # create feature from GFF line
        try:
            gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                2, "test", "gene", 10, 39, ".", "-", ".", "second")
            print("\t with GFF line:\t{}".format(gffline.strip()))
            feature2 = Feature.create_from_gff(method, metagene, gffline,
                                               False, False)
            if str(feature2.position_array) != correct_features['gff'][method]:
                print("**FAILED**\t Create Feature from GFF line ?\t**FAIL**")
                print("\t Desired positions:\t{}".format(
                    correct_features['gff'][method]))
                print("\t Created positions:\t{}".format(
                    feature2.position_array))
        except MetageneError as err:
            print("**FAILED**\t Create Feature from GFF line ?")
        else:
            print("PASSED\t Create Feature from GFF line ?\t\t{}".format(
                feature2.get_chromosome_region()))

        # create feature from GFF line with start and end swapped
        try:
            gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                2, "test", "gene", 39, 10, ".", "-", ".", "second")
            print("\t with GFF line:\t{}".format(gffline.strip()))
            feature2 = Feature.create_from_gff(method, metagene, gffline,
                                               False, False)
            if str(feature2.position_array) != correct_features['gff'][method]:
                print("**FAILED**\t Create Feature from GFF line with swapped start and end ?\t**FAIL**")
                print("\t Desired positions:\t{}".format(
                    correct_features['gff'][method]))
                print("\t Created positions:\t{}".format(
                    feature2.position_array))
        except MetageneError as err:
            print("**FAILED**\t Create Feature from GFF line with swapped start and end ?")
        else:
            print("PASSED\t Create Feature from GFF line with swapped start and end ?\t\t{}".format(
                feature2.get_chromosome_region()))

        # swapped start and end on the + strand: here a MetageneError is the
        # passing outcome and successful creation is the failure
        try:
            gffline = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                2, "test", "gene", 39, 10, ".", "+", ".", "second")
            print("\t with GFF line:\t{}".format(gffline.strip()))
            feature2 = Feature.create_from_gff(method, metagene, gffline,
                                               False, False)
            if str(feature2.position_array) != correct_features['gff'][method]:
                print("**FAILED**\t Do not create Feature from GFF line with swapped start and end, + strand ?\t**FAIL**")
                print("\t Desired positions:\t{}".format(
                    correct_features['gff'][method]))
                print("\t Created positions:\t{}".format(
                    feature2.position_array))
        except MetageneError as err:
            print("PASSED\t Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
                err))
        else:
            print("**FAILED**\t Do not create Feature from GFF line with swapped start and end, + strand ?\t\t{}".format(
                feature2.get_chromosome_region()))

    ##TODO finish complete testing of Feature class
    print("\n##TODO finish testing of Feature class creation\n")

    print("\n**** Testing counting and maniputlation ****\n")

    # expected[feature method][count method] -> text print_metagene() should give
    expected = {'all': {}, 'start': {}, 'end': {}}
    # Positions in metagene: 17 18 19 20 21-22,23-24,25-26,27-28,29-30,31-32,33-34,35-36,37-38,39-40, 41, 42
    expected['all'] = {
        'all':
        "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000,0.000,0.286,0.571,0.571,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.100,0.100,0.100,0.100,0.100,0.000,0.000,0.000,0.000,0.000,0.111",
        'start':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000",
        'end':
        "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,1.000"
    }
    # Positions in metagene: 17 18 19 20 [21] 22 23
    expected['start'] = {
        'all':
        "first,sense:allreads,0.333,0.333,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.050",
        'start':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
        'end':
        "first,sense:allreads,0.000,3.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.500"
    }
    # Positions in metagene: 36 37 38 39 [40] 41 42
    expected['end'] = {
        'all':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.286,0.286,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.111",
        'start':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,0.000",
        'end':
        "first,sense:allreads,0.000,0.000,0.000,0.000,0.000,2.000,0.000\nfirst,antisense:allreads,0.000,0.000,0.000,0.000,0.000,0.000,1.000"
    }

    metagene = {
        'all': Metagene(10, 4, 2),
        'start': Metagene(1, 4, 2),
        'end': Metagene(1, 4, 2)
    }

    for method in ['all', 'start', 'end']:
        print("\t with Metagene:\t{}".format(metagene[method]))
        print("\t with chromosome conversions:\t{}".format(
            Feature.chromosome_conversion))
        print("\nTesting feature_count option: ****{}****".format(method))

        feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
            1, 20, 40, "first", 44, "+")
        feature1 = Feature.create_from_bed(method, metagene[method],
                                           feature_line, False, False)
        print("\tFeature:\t{}".format(feature1.position_array))

        reads = [
            Read("chr1", "+", 3, 1, [10, 11, 12, 13, 14, 15, 16, 17, 18]),
            Read("chr1", "-", 1, 2, [23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
            Read("chr1", "+", 4, 2, [30, 31, 32, 33, 34, 40, 41]),
            Read("chr1", "-", 1, 1, [42, 43, 44, 45, 46, 47, 48, 49, 50]),
            Read("chr1", "+", 10, 1, [51, 52, 53, 54, 55]),
            Read("chr2", "+", 10, 1, [18, 19, 20, 21, 22, 23, 24, 25]),
        ]

        # starting count
        for count_method in ['all', 'start', 'end']:
            print("\nTesting count_method option: ****{}****".format(
                count_method))

            # 'output' is a running log that is only shown on failure
            output = "{}\n".format(feature1)
            for r in reads:
                output += "{}\n".format(r)
                feature1.count_read(r, count_method, count_partial_reads=True)
                # NOTE(review): placement of this line inside the read loop
                # is a reconstruction; it only affects the failure summary.
                output += "{}\n".format(feature1)
            output += feature1.print_metagene(pretty=True)

            if str(feature1.print_metagene()).strip() == str(
                    expected[method][count_method]).strip():
                print("PASSED\tCreated correct metagene with feature method {} and count method {} ?".format(
                    method, count_method))
            else:
                print("**FAILED**\tCreated correct metagene with feature method {} and count method {} ?".format(
                    method, count_method))
                print("\tExpected:\n{}".format(expected[method][count_method]))
                print("\tActual :\n{}".format(feature1.print_metagene()))
                print("\tSummary of run:\n{}".format(output))

            feature1 = Feature.create_from_bed(
                method, metagene[method], feature_line, False,
                False)  # zero out counter for next round

        # an unstranded read on a stranded feature must be rejected
        try:
            unstranded_read = Read("chr1", ".", 10, 1,
                                   [18, 19, 20, 21, 22, 23, 24, 25])
            feature1.count_read(unstranded_read, 'all')
        except MetageneError as err:
            # the placeholder was missing, so the error text was never shown
            print("PASSED\tCaught unstranded read on stranded count ?\t\t{}".format(
                err))
        else:
            print("**FAILED**\tCaught unstranded read on stranded count ?")

        # an unstranded read on an unstranded feature must be accepted
        try:
            feature_line = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
                1, 20, 40, "first", 44, ".")
            feature1 = Feature.create_from_bed(method, metagene[method],
                                               feature_line, False, False)
            unstranded_read = Read("chr1", ".", 10, 1,
                                   [18, 19, 20, 21, 22, 23, 24, 25])
            feature1.count_read(unstranded_read, 'all')
        except MetageneError as err:
            # the placeholder was missing, so the error text was never shown
            print("**FAILED**\tAllowed unstranded read on unstranded count ?\t\t{}".format(
                err))
        else:
            print("PASSED\tAllowed unstranded read on unstranded count ?")

    print("\n**** Testing adjust_to_metagene ****\n")

    # ((metagene_tupple),(feature_tupple),expected_result_string, message_string)
    tests = [((8, 2, 2), (16, 8, 24, 4),
              '8.000,8.000,4.000,4.000,12.000,12.000,2.000,2.000',
              "Expand to metagene ?"),
             ((4, 2, 2), (6, 8, 6, 2, 4, 4, 2, 4, 24, 8),
              '17.000,9.000,8.000,34.000', "Contract to metagene ?"),
             ((4, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4),
              '5.500,9.333,17.825,9.475', "Contract with messy floats ?"),
             ((3, 2, 2), (2.5, 4, (10.0 / 3), 10, 11, 7.3, 4),
              '7.611,19.556,14.967', "Contract with other messy floats ?")]

    for metagene_shape, counts, wanted, message in tests:
        metagene = Metagene(*metagene_shape)
        print("\t{}".format(metagene))
        feature_line = "{}\t{}\t{}\n".format(1, 0, len(counts))
        feature = Feature.create_from_bed('all', metagene, feature_line,
                                          False, False, short=True)
        adjusted_feature = ",".join(
            "{0:0.3f}".format(f) for f in feature.adjust_to_metagene(counts))
        if adjusted_feature == wanted:
            print("PASSED\t{}".format(message))
        else:
            print("**FAILED**\t{}".format(message))
            print("\tExpected:\t{}".format(wanted))
            print("\tActual :\t{}".format(adjusted_feature))
            print("\tOriginal:\t{}".format(feature.adjust_to_metagene(counts)))

    print("\n**** End of Testing the Feature class ****\n")
    # end of Feature.test method
def check_catch_bad_input(test, values):
    """Build a Metagene from *values* and print it.

    Parameters:
        test: label for the test case; not used by this function.
        values: positional arguments handed to Metagene (unpacked with ``*``).

    Any exception raised while constructing or printing the Metagene
    propagates to the caller.
    """
    candidate = Metagene(*values)
    print(candidate)
def print_metagene(self, pretty=False, header=False, interval_override=False):
    """Convert counts_array data to finalized metagene profiles for printing.

    Standard printing is in comma-delimited lines for input into
    metagene_analysis.py.  Pretty printing (pretty=True) gives a human
    readable, if potentially super long, version.

    One row is produced per key of ``self.counts_array``, visiting the keys
    in reverse-sorted order.  Every row is the upstream padding counts,
    followed by the interval counts, followed by the downstream padding
    counts.

    Parameters:
        pretty: when true, each row is ``"<subset>:\\t"`` followed by values
            with 2 decimal places right-aligned in 5 columns; otherwise each
            row is ``"<name>,<subset>"`` followed by values with 3 decimal
            places.
        header: when true (and interval_override is false), the output
            starts with ``print_full(pretty)`` of a Metagene sized by
            ``self.metagene_length``.
        interval_override: when true, the interval counts are emitted as-is
            instead of being passed through ``self.adjust_to_metagene`` and
            the output starts with a "# Metagene:" line plus ``print_full()``
            of a Metagene sized by ``self.feature_interval``.

    Returns:
        The assembled text with leading/trailing whitespace stripped.
    """
    upstream_size = self.padding['Upstream']
    downstream_size = self.padding['Downstream']

    if interval_override:
        metagene = Metagene(self.feature_interval, upstream_size, downstream_size)
        pieces = ["# Metagene:\t{}\n".format(metagene), metagene.print_full()]
    elif header:
        metagene = Metagene(self.metagene_length, upstream_size, downstream_size)
        pieces = [metagene.print_full(pretty)]
    else:
        pieces = []

    interval_end = upstream_size + self.feature_interval

    # process each subset grouping
    for subset in sorted(self.counts_array, reverse=True):
        counts = self.counts_array[subset]

        # break counts into sections -> upstream padding, interval, downstream padding
        interval_counts = counts[upstream_size:interval_end]
        if not interval_override:
            # compress (or expand) interval_counts to match the size of the internal metagene
            interval_counts = self.adjust_to_metagene(interval_counts)

        values = list(counts[:upstream_size])
        values.extend(interval_counts)
        values.extend(counts[interval_end:])

        if pretty:
            # keep 2 decimal places, right-aligned in a 5 character column
            row = "{0:15s}:\t".format(subset) + ",".join("{0:>5.2f}".format(value) for value in values)
        else:
            # keep 3 decimal places in the outputted float
            fields = ["{},{}".format(self.name, subset)]
            fields.extend("{0:0.3f}".format(value) for value in values)
            row = ",".join(fields)
        pieces.append(row + "\n")

    # a single join avoids rebuilding the whole string for every value;
    # strip() removes the trailing "\n"
    return "".join(pieces).strip()
def __init__(self, count_method, metagene_object, name, chromosome, start, end, strand, gap_counting=False, ignore_strand=False):
    """Not normally called directly; use Feature.create(file_format, count_method,
    metagene_object, feature_line, chromosome_conversion_table) to call indirectly.

    Define a new feature with an interval (represents feature length),
    up and downstream padding (defined by metagene_object), and genomic
    (1-based) start and end positions.

    Once defined here, the start and end represent the true start and end of
    the feature.  Therefore, if a - strand (Crick strand) feature the start
    will be larger than the end.

    Parameters:
        count_method: 'all' sizes the interval to the whole feature; any
            other value uses an interval of 1, and 'start' / 'end' also
            collapse the region onto that end of the feature.
        metagene_object: supplies the 'Upstream' / 'Downstream' padding and
            the final metagene interval length (``feature_interval``).
        name: stored as ``self.name``.
        chromosome: key into ``Feature.chromosome_conversion``.
        start, end: positions, converted with ``int()`` once
            ``confirm_integer`` accepts them.
        strand: "+" or "-"; anything else is stored as ".".
        gap_counting: when true, counts are split into 'ungapped' and
            'gapped' arrays instead of a single 'allreads' array.
        ignore_strand: when true, only an 'unstranded' orientation is kept.

    Raises:
        KeyError: if *chromosome* is missing from the conversion table or
            from ``Read.chromosome_sizes``.
    """
    chromosome = Feature.chromosome_conversion[chromosome]  # convert to BAM-like chromosome designation
    chromosome_size = Read.chromosome_sizes[chromosome]

    # NOTE(review): confirm_integer presumably raises on bad input; if it
    # can return False instead, start/end stay unconverted -- confirm.
    if (confirm_integer(start, "Start", minimum=1, maximum=chromosome_size)
            and confirm_integer(end, "End", minimum=1, maximum=chromosome_size)):
        start = int(start)
        end = int(end)

    # Define feature-specific metagene where feature_interval respresents
    # the length of the feature NOT the length of the final metagene interval
    if count_method == 'all':
        interval = end - start + 1  # length of feature
    else:
        interval = 1  # length of the start (or end) of feature

    Metagene.__init__(self, interval, metagene_object.padding['Upstream'],
                      metagene_object.padding['Downstream'])
    self.name = name
    self.chromosome = chromosome
    self.strand = strand
    self.metagene_length = metagene_object.feature_interval

    # define counts_array dictionary
    # key: orientation:gap_counts string
    #      where orientation = {'unstranded', 'sense', 'antisense'}
    #            gap_counts  = {'ungapped', 'gapped', 'allreads'}
    #      'ungapped' + 'gapped' = 'allreads'
    #      'sense' + 'antisense' = 'unstranded'
    #
    # values: arrays of self.length initialized to 0
    if self.strand not in ("+", "-"):
        self.strand = "."
        orientation = ['unstranded']
    elif ignore_strand:
        orientation = ['unstranded']
    else:
        orientation = ['sense', 'antisense']

    if gap_counting:
        gap_counts = ['ungapped', 'gapped']
    else:
        gap_counts = ['allreads']

    self.counts_array = {}
    for o in orientation:
        for g in gap_counts:
            # a fresh list per key so the arrays never share storage
            self.counts_array["{}:{}".format(o, g)] = [0] * self.length

    # define position_array
    # values  : chromosomal 1-based nucleotide positions in 5' to 3'
    #           orientation WITH RESPECT TO THE FEATURE
    # Example :
    #   + strand: [10,11,12,13,14,15]
    #   - strand: [15,14,13,12,11,10]
    # so position_array[0] is always the start of the feature (with upstream padding)
    #    position_array[-1] is always the end of the feature (with downstream padding)
    if self.strand == "-":
        # chromosome start = feature end
        # chromosome end   = feature start
        if count_method == 'start':
            start = end
        elif count_method == 'end':
            end = start
        region_start = start - self.padding['Downstream']  # start is really end
        region_end = end + self.padding['Upstream']  # end is really start
        # list(...) keeps this a real list even where range() is lazy
        positions = list(range(region_start, region_end + 1))  # inclusive list
        positions.reverse()
    else:
        if count_method == 'start':
            end = start  # set both start and end to the start value
        elif count_method == 'end':
            start = end  # set both start and end to the end value
        region_start = start - self.padding['Upstream']
        region_end = end + self.padding['Downstream']
        positions = list(range(region_start, region_end + 1))  # inclusive list

    self.position_array = positions
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:
      1. read the command-line arguments;
      2. register chromosome sizes, chromosome-name conversions and SAM tag
         patterns from the alignment file;
      3. build the Metagene template and set the feature file format;
      4. create/truncate ``<output_prefix>.metagene_counts.csv`` and, unless
         ``interval_variable`` is set, write the metagene header to it;
      5. for each non-comment feature line, pull the matching reads with
         ``samtools view``, count them onto the feature, and append the
         feature's metagene line to the output file.

    Raises:
        MetageneError: if the metagene template or feature format cannot be
            set up, or if the samtools call for a feature fails.
    """
    arguments = get_arguments()

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names, Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in a plain string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Always start from an empty output file.  Truncating only when the
    # header is written would leave results from an earlier run in place
    # whenever interval_variable is set, because features are appended below.
    with open(output_path, 'w') as output_file:
        if not arguments.interval_variable:
            # print out the header line...
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip empty lines (indexing them would fail) and comment lines
            if not feature_line or feature_line[0] == "#":
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene, feature_line,
                                     arguments.count_splicing, arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe(['samtools view {} {}'.format(
                arguments.alignment,
                feature.get_samtools_region())])

            if not run_pipe_worked:
                raise MetageneError("Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                    feature.get_chromosome_region(),
                    feature.name,
                    arguments.alignment))

            for samline in sam_sample:
                if len(samline) > 0:
                    # create Read feature
                    (created_read, read) = Read.create_from_sam(samline,
                                                                Feature.chromosome_conversion.values(),
                                                                arguments.count_method,
                                                                arguments.uniquely_mapping,
                                                                arguments.ignore_strand,
                                                                arguments.count_secondary_alignments,
                                                                arguments.count_failed_quality_control,
                                                                arguments.count_PCR_optical_duplicate,
                                                                arguments.count_supplementary_alignment)

                    # count read (if it exists)
                    if created_read:
                        feature.count_read(read,
                                           arguments.count_method,
                                           arguments.count_splicing,
                                           arguments.count_partial_reads,
                                           arguments.ignore_strand)

            # output the resulting metagene; appending per feature keeps
            # finished features on disk if a later feature fails
            with open(output_path, 'a') as output_file:
                output_file.write(
                    "{}\n".format(feature.print_metagene(interval_override=arguments.interval_variable)))
def __init__(
    self,
    count_method,
    metagene_object,
    name,
    chromosome,
    start,
    end,
    strand,
    gap_counting=False,
    ignore_strand=False,
):
    """Not normally called directly; use Feature.create(file_format, count_method,
    metagene_object, feature_line, chromosome_conversion_table) to call indirectly.

    Define a new feature with an interval (represents feature length),
    up and downstream padding (defined by metagene_object), and genomic
    (1-based) start and end positions.

    Once defined here, the start and end represent the true start and end of
    the feature.  Therefore, if a - strand (Crick strand) feature the start
    will be larger than the end.

    Parameters:
        count_method: "all" sizes the interval to the whole feature; any
            other value uses an interval of 1, and "start" / "end" also
            collapse the region onto that end of the feature.
        metagene_object: supplies the "Upstream" / "Downstream" padding and
            the final metagene interval length (``feature_interval``).
        name: stored as ``self.name``.
        chromosome: key into ``Feature.chromosome_conversion``.
        start, end: positions, converted with ``int()`` once
            ``confirm_integer`` accepts them.
        strand: "+" or "-"; anything else is stored as ".".
        gap_counting: when true, counts are split into "ungapped" and
            "gapped" arrays instead of a single "allreads" array.
        ignore_strand: when true, only an "unstranded" orientation is kept.

    Raises:
        KeyError: if *chromosome* is missing from the conversion table or
            from ``Read.chromosome_sizes``.
    """
    chromosome = Feature.chromosome_conversion[chromosome]  # convert to BAM-like chromosome designation
    chromosome_size = Read.chromosome_sizes[chromosome]

    # NOTE(review): confirm_integer presumably raises on bad input; if it
    # can return False instead, start/end stay unconverted -- confirm.
    if confirm_integer(start, "Start", minimum=1, maximum=chromosome_size) and confirm_integer(
        end, "End", minimum=1, maximum=chromosome_size
    ):
        start = int(start)
        end = int(end)

    # Define feature-specific metagene where feature_interval respresents
    # the length of the feature NOT the length of the final metagene interval
    if count_method == "all":
        interval = end - start + 1  # length of feature
    else:
        interval = 1  # length of the start (or end) of feature

    Metagene.__init__(self, interval, metagene_object.padding["Upstream"], metagene_object.padding["Downstream"])
    self.name = name
    self.chromosome = chromosome
    self.strand = strand
    self.metagene_length = metagene_object.feature_interval

    # define counts_array dictionary
    # key: orientation:gap_counts string
    #      where orientation = {'unstranded', 'sense', 'antisense'}
    #            gap_counts  = {'ungapped', 'gapped', 'allreads'}
    #      'ungapped' + 'gapped' = 'allreads'
    #      'sense' + 'antisense' = 'unstranded'
    #
    # values: arrays of self.length initialized to 0
    if self.strand not in ("+", "-"):
        self.strand = "."
        orientation = ["unstranded"]
    elif ignore_strand:
        orientation = ["unstranded"]
    else:
        orientation = ["sense", "antisense"]

    if gap_counting:
        gap_counts = ["ungapped", "gapped"]
    else:
        gap_counts = ["allreads"]

    self.counts_array = {}
    for o in orientation:
        for g in gap_counts:
            # a fresh list per key so the arrays never share storage
            self.counts_array["{}:{}".format(o, g)] = [0] * self.length

    # define position_array
    # values  : chromosomal 1-based nucleotide positions in 5' to 3'
    #           orientation WITH RESPECT TO THE FEATURE
    # Example :
    #   + strand: [10,11,12,13,14,15]
    #   - strand: [15,14,13,12,11,10]
    # so position_array[0] is always the start of the feature (with upstream padding)
    #    position_array[-1] is always the end of the feature (with downstream padding)
    if self.strand == "-":
        # chromosome start = feature end
        # chromosome end   = feature start
        if count_method == "start":
            start = end
        elif count_method == "end":
            end = start
        region_start = start - self.padding["Downstream"]  # start is really end
        region_end = end + self.padding["Upstream"]  # end is really start
        # list(...) keeps this a real list even where range() is lazy
        positions = list(range(region_start, region_end + 1))  # inclusive list
        positions.reverse()
    else:
        if count_method == "start":
            end = start  # set both start and end to the start value
        elif count_method == "end":
            start = end  # set both start and end to the end value
        region_start = start - self.padding["Upstream"]
        region_end = end + self.padding["Downstream"]
        positions = list(range(region_start, region_end + 1))  # inclusive list

    self.position_array = positions
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:
      1. read the command-line arguments;
      2. register chromosome sizes, chromosome-name conversions and SAM tag
         patterns from the alignment file;
      3. build the Metagene template and set the feature file format;
      4. create/truncate ``<output_prefix>.metagene_counts.csv`` and, unless
         ``interval_variable`` is set, write the metagene header to it;
      5. for each non-comment feature line, pull the matching reads with
         ``samtools view``, count them onto the feature, and append the
         feature's metagene line to the output file.

    Raises:
        MetageneError: if the metagene template or feature format cannot be
            set up, or if the samtools call for a feature fails.
    """
    arguments = get_arguments()

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: "\d" is not a valid escape in a plain string literal)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment,
                     r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment,
                     r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding,
                            arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(
            arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Always start from an empty output file.  Truncating only when the
    # header is written would leave results from an earlier run in place
    # whenever interval_variable is set, because features are appended below.
    with open(output_path, 'w') as output_file:
        if not arguments.interval_variable:
            # print out the header line...
            output_file.write("# Metagene:\t{}\n".format(
                metagene))  # define for plotting later
            output_file.write(metagene.print_full())

    # for each feature
    with open(arguments.feature, 'r') as feature_file:
        for feature_line in read_chunk(feature_file, 1024):
            # skip empty lines (indexing them would fail) and comment lines
            if not feature_line or feature_line[0] == "#":
                continue

            # change creation with feature_method
            feature = Feature.create(arguments.feature_count, metagene,
                                     feature_line,
                                     arguments.count_splicing,
                                     arguments.ignore_strand)

            # pull out sam file lines; it is important to use Feature.get_samtools_region(chromosome_lengths) rather
            # than Feature.get_chromosome_region() because only the first ensures that the interval does not
            # extend beyond the length of the chromosome which makes samtools view return no reads
            (run_pipe_worked, sam_sample) = run_pipe([
                'samtools view {} {}'.format(arguments.alignment,
                                             feature.get_samtools_region())
            ])

            if not run_pipe_worked:
                raise MetageneError(
                    "Could not pull chromosomal region {} for feature {} from BAM file {}."
                    .format(feature.get_chromosome_region(), feature.name,
                            arguments.alignment))

            for samline in sam_sample:
                if len(samline) > 0:
                    # create Read feature
                    (created_read, read) = Read.create_from_sam(
                        samline, Feature.chromosome_conversion.values(),
                        arguments.count_method,
                        arguments.uniquely_mapping,
                        arguments.ignore_strand,
                        arguments.count_secondary_alignments,
                        arguments.count_failed_quality_control,
                        arguments.count_PCR_optical_duplicate,
                        arguments.count_supplementary_alignment)

                    # count read (if it exists)
                    if created_read:
                        feature.count_read(
                            read, arguments.count_method,
                            arguments.count_splicing,
                            arguments.count_partial_reads,
                            arguments.ignore_strand)

            # output the resulting metagene; appending per feature keeps
            # finished features on disk if a later feature fails
            with open(output_path, 'a') as output_file:
                output_file.write("{}\n".format(
                    feature.print_metagene(
                        interval_override=arguments.interval_variable)))