def main(args):
    """Build the TES log2 fold change matrix and render it as a heatmap image.

    :param args: command line arguments, forwarded unchanged to ``get_args``
    """
    (numerator_seq_files_data, denominator_seq_files_data, matrix_params,
     heatmap_params, filenames, max_threads) = get_args(args)

    # Get the fold change matrix
    fold_change_matrix = get_fold_change_matrix(
        numerator_seq_files_data, denominator_seq_files_data,
        matrix_params, filenames, max_threads)

    # Now plot!
    (bp_width, width, height, max_log2_fc,
     interval_size, minor_ticks, major_ticks) = heatmap_params

    # The output prefix is the last entry of filenames
    output_filename = filenames[-1] + "_max_" + str(max_log2_fc) + "_width_" + str(bp_width) + \
        "bp_fold_change_TES_heatmap"

    only_heatmap_filename = generate_random_filename(".tiff")

    # The colour scale is symmetric around zero. A falsy maximum (None, but
    # also 0) leaves both bounds to generate_heatmap.
    if max_log2_fc:
        negative_log2_value = -1 * max_log2_fc
    else:
        negative_log2_value = None

    # NOTE(review): 2.2 is passed in the fourth position, presumably the
    # gamma value - confirm against generate_heatmap.
    generate_heatmap(fold_change_matrix, 'red/blue', only_heatmap_filename,
                     2.2, negative_log2_value, max_log2_fc)

    ticks_image_filename = make_ticks_image(
        width, interval_size, (minor_ticks, major_ticks))

    combine_images(ticks_image_filename, only_heatmap_filename,
                   output_filename)

    remove_files(fold_change_matrix, ticks_image_filename,
                 only_heatmap_filename)
def test_arguments(self):
    """parse_args exits on too few arguments and parses a valid command line."""
    # Zero, one and two arguments are all rejected
    for bad_args in ([], ["max_tss_file"], ["max_tss_file", 'left']):
        with self.assertRaises(SystemExit):
            with Quieter():
                sequence_from_region_around_max_tss.parse_args(bad_args)

    max_tss_file = generate_random_filename()

    with open(max_tss_file, 'w') as file:
        file.write(
            "\t".join(["chr16", "53607", "53608", "POLR3K", "0", "-"]) + "\n" +
            "\t".join(["chr16", "53872", "53873", "SNRNP25", "0", "+"]) + "\n")

    # try/finally so the temporary file is removed even when the assertion
    # (or parse_args itself) fails
    try:
        result = sequence_from_region_around_max_tss.parse_args(
            [max_tss_file, '-5', '10'])

        search = [["-", 5], ['+', 10]]
        self.assertEqual(result, (False, max_tss_file, search))
    finally:
        remove_files(max_tss_file)
def run_divergent_pileup_plots(regions_filename, sequencing_files_list,
                               region_length, max_threads):
    """Compute pileups for every sequencing file and output the metaplot data.

    :param regions_filename: regions file; an opposite stranded copy is made from it
    :param sequencing_files_list: iterable of sequencing filenames
    :param region_length: forwarded to get_pileups and output_metaplot_data
    :param max_threads: number of worker processes in the pool
    """
    # Make the opposite stranded regions file
    rev_region_filename = make_rev_region_file(regions_filename)

    # Split the files by strand
    region_files = split_bed_file(regions_filename)
    rev_region_files = split_bed_file(rev_region_filename)

    split_seq_filenames = []
    for filename in sequencing_files_list:
        split_seq_filenames.append((split_bed_file(filename), filename))

    # One get_pileups call per sequencing file
    pileup_args = [
        (region_files, rev_region_files, seq_files, region_length, filename)
        for seq_files, filename in split_seq_filenames
    ]

    with multiprocessing.Pool(processes=max_threads) as pool:
        averages = pool.starmap(get_pileups, pileup_args)

    # Remove all the stranded region files and the split sequencing files
    split_seq_filenames_to_delete = [
        seq_files for seq_files, _ in split_seq_filenames
    ]
    remove_files(region_files, rev_region_files, rev_region_filename,
                 split_seq_filenames_to_delete)

    output_metaplot_data(averages, region_length, "")
def make_rgb_heatmap(fold_change_matrix_filename, heatmap_params,
                     output_filename_prefix):
    """Render a fold change matrix as a red/blue heatmap with tick marks.

    The combined image is written by combine_images using the name
    ``<prefix>_max_<max>_width_<bp_width>bp_gene_body_fold_change_heatmap``.
    The fold change matrix file and the intermediate images are removed.

    :param fold_change_matrix_filename: matrix file passed to generate_heatmap
    :param heatmap_params: 8-tuple of (bp_width, width, height, gamma,
        max_fold_change, interval_size, minor_ticks_bp, major_ticks_bp)
    :param output_filename_prefix: prefix of the output image name
    """
    (bp_width, width, height, gamma, max_fold_change, interval_size,
     minor_ticks_bp, major_ticks_bp) = heatmap_params
    tick_params = minor_ticks_bp, major_ticks_bp

    only_heatmap_filename = generate_random_filename(extension=".tiff")

    # Symmetric colour scale; identity comparison is the correct test for None
    if max_fold_change is not None:
        negative_max_fold_change = -1 * max_fold_change
    else:
        negative_max_fold_change = None

    generate_heatmap(fold_change_matrix_filename, 'red/blue',
                     only_heatmap_filename, gamma, negative_max_fold_change,
                     max_fold_change, ticks=None)

    ticks_image_filename = make_ticks_image(width, interval_size, tick_params)

    # Combine the two images together
    output_filename = output_filename_prefix + "_max_" + str(
        max_fold_change) + "_width_" + str(
        bp_width) + "bp_gene_body_fold_change_heatmap"

    combine_images(ticks_image_filename, only_heatmap_filename,
                   output_filename)

    remove_files(fold_change_matrix_filename, ticks_image_filename,
                 only_heatmap_filename)
def make_ticks_image(width, interval_size, tick_params):
    """Render the tick mark strip as a .tiff image and return its filename.

    :param width: width passed to make_ticks_matrix
    :param interval_size: divisor converting the tick spacing in bp into intervals
    :param tick_params: tuple of (minor_ticks_bp, major_ticks_bp)
    :return: filename of the generated tick image (the caller removes it)
    """
    # Local import: keeps this block self-contained without touching the
    # file's import section.
    import subprocess

    minor_ticks_bp, major_ticks_bp = tick_params

    # Make the tick marks
    t = Ticks(minor_tick_mark_interval_size=(minor_ticks_bp / interval_size),
              major_tick_mark_interval_size=(major_ticks_bp / interval_size))

    # Ticks matrix with a height of 50 px and a max black value of 1
    ticks_matrix = make_ticks_matrix(width, 50, 1, t)

    # Write to a file
    ticks_matrix_filename = generate_random_filename()
    with open(ticks_matrix_filename, 'w') as file:
        for row in ticks_matrix:
            file.write("\t".join([str(val) for val in row]) + "\n")

    ticks_image_filename = generate_random_filename(".tiff")

    try:
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact. As before, the exit status is not checked.
        subprocess.run([
            "/usr/bin/Rscript", generate_heatmap_location,
            ticks_matrix_filename, "gray", ticks_image_filename, "2.2"
        ])
    finally:
        # The matrix file is only an intermediate; drop it even on failure
        remove_files(ticks_matrix_filename)

    return ticks_image_filename
def gather_data(sequencing_file, blacklist_filename, annotated_dataset,
                region_filenames, truQuant_regions_dict):
    """Count reads in the paused and gene body regions of one sequencing file.

    :param sequencing_file: sequencing filename
    :param blacklist_filename: blacklist file handed to run_subtract
    :param annotated_dataset: when truthy, per-gene region data is also collected
    :param region_filenames: tuple of (paused regions file, gene body regions file)
    :param truQuant_regions_dict: dict keyed by gene; each value has a "Pause" entry
    :return: tuple of (sequencing_file, indv_gene_counts_dict, region_data_dict)
    """
    paused_region_filename, gene_body_region_filename = region_filenames

    # We need to blacklist the data before running the program.
    # rna_blacklist_file is not a parameter; it comes from the enclosing scope.
    blacklisted_sequencing_filename = generate_random_filename()
    run_subtract(sequencing_file, rna_blacklist_file, blacklist_filename,
                 strand_specific=False,
                 output_filename=blacklisted_sequencing_filename)

    indv_gene_counts_dict = get_counts_in_paused_region(
        paused_region_filename, blacklisted_sequencing_filename)

    # The return value is not used here; presumably the dict is updated in
    # place - verify against get_counts_in_gene_bodies.
    get_counts_in_gene_bodies(gene_body_region_filename,
                              blacklisted_sequencing_filename,
                              indv_gene_counts_dict)

    # Only get the region data from the dataset which was annotated
    region_data_dict = {}
    if annotated_dataset:
        five_prime_counts_dict = build_counts_dict(sequencing_file, "five")

        for gene, regions in truQuant_regions_dict.items():
            region_data_dict[gene] = get_region_data(
                regions["Pause"], five_prime_counts_dict)

    remove_files(blacklisted_sequencing_filename)

    return sequencing_file, indv_gene_counts_dict, region_data_dict
def run_read_through_transcription(regions_filename, tsr_file,
                                   upstream_distance, downstream_distance,
                                   interval_size, sequencing_files,
                                   max_threads):
    """Quantify coverage over incremented regions and output the combined data.

    :param regions_filename: regions file the intervals are built from
    :param tsr_file: TSR file used for blacklisting, or the string 'no' to skip it
    :param upstream_distance: forwarded to make_incremented_regions and output_data
    :param downstream_distance: forwarded to make_incremented_regions
    :param interval_size: forwarded to make_incremented_regions and output_data
    :param sequencing_files: sequencing filenames
    :param max_threads: forwarded to get_coverage_files
    """
    # 1. Make the region intervals file from upstream distance to downstream distance in intervals
    incremented_regions_filename = make_incremented_regions(
        regions_filename, upstream_distance, downstream_distance,
        interval_size)

    if tsr_file == 'no':
        # No blacklisting requested: quantify the sequencing files directly
        coverage_files = get_coverage_files(
            sequencing_files, incremented_regions_filename, max_threads)
    else:
        # Blacklist the TSRs
        blacklisted_filenames = blacklist_tsrs(sequencing_files, tsr_file)
        coverage_files = get_coverage_files(
            blacklisted_filenames, incremented_regions_filename, max_threads)
        remove_files(blacklisted_filenames)

    combined_dict = coverage_files_to_dictionary(coverage_files,
                                                 sequencing_files)

    output_data(combined_dict, sequencing_files, upstream_distance,
                interval_size)

    # Remove all of the temporary files
    remove_files(incremented_regions_filename, coverage_files)
def test_get_pausing_distances_helper(self):
    """get_pausing_distances_helper returns one distance per gene on both strands."""
    region_filename = generate_random_filename()

    with open(region_filename, 'w') as file:
        file.write(
            "\t".join(["chr1", "100", "101", "positive_gene", "0", "+"]) + "\n" +
            "\t".join(["chr1", "9999", "10000", "negative_gene", "0", "-"]) + "\n"
        )

    transcripts_dict = {
        "chr1": {
            "+": {
                100: {200: 3, 300: 1, 800: 1, 283: 1}
            },
            "-": {
                9999: {9000: 3, 8000: 1, 7050: 1, 6542: 1}
            }
        }
    }

    expected = {
        "positive_gene": 101,
        "negative_gene": 999
    }

    # try/finally so the temporary file is removed even when the test fails
    try:
        result = tps_distance_per_gene.get_pausing_distances_helper(
            region_filename, transcripts_dict, 1)

        self.assertDictEqual(result, expected)
    finally:
        remove_files(region_filename)
def test_incorrect_number_of_arguments(self):
    """parse_args exits on a wrong argument count and parses a valid command line."""
    wrong_argument_lists = (
        [],
        ["max_tss_file"],
        ["max_tss_file", "region_width"],
        ["max_tss_file", "region_width", "vertical_average", "extra"],
    )

    for wrong_args in wrong_argument_lists:
        with self.assertRaises(SystemExit):
            with Quieter():
                nucleotide_heatmap.parse_args(wrong_args)

    max_tss_file = generate_random_filename()
    with open(max_tss_file, 'w') as file:
        file.write(
            "\t".join(['chr1', '1', '2', 'name', '0', '+'])
        )

    # try/finally so the temporary file is removed even when the test fails
    try:
        result = nucleotide_heatmap.parse_args(
            [max_tss_file, '50', '2000', '2'])
        self.assertEqual(result, (max_tss_file, 50, 2000, 2))
    finally:
        remove_files(max_tss_file)
def test_positive_read_three(self, stdout):
    """A single + strand read is reported once on the 3' sense strand.

    ``stdout`` is the captured output stream supplied to the test.
    """
    region_filename = generate_random_filename()
    seq_filename = generate_random_filename()

    with open(region_filename, 'w') as file:
        file.write("\t".join(["chr1", "0", "10", "name", "0", "+"]))

    with open(seq_filename, 'w') as file:
        file.write("\t".join(["chr1", "2", "9", "name", "0", "+"]))

    metaplot.main(['three', region_filename, seq_filename])

    # Get the result from stdout: the first non-empty line is the tab separated
    # header, every following line is a row of floats
    output_lines = [line for line in stdout.getvalue().split("\n") if line]
    result = [output_lines[0].split("\t")]
    for line in output_lines[1:]:
        result.append([float(val) for val in line.split()])

    seq_file_basename = seq_filename.split("/")[-1]

    header = [
        "Position",
        seq_file_basename + " 3' sense strand",
        seq_file_basename + " 3' divergent strand"
    ]
    expected = [
        header,
        [-5, 0, 0], [-4, 0, 0], [-3, 0, 0], [-2, 0, 0], [-1, 0, 0],
        [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 1, 0], [5, 0, 0]
    ]

    remove_files(region_filename, seq_filename)

    self.assertEqual(result, expected)
def get_matrix(seq_files_data, matrix_params, filenames, threads):
    """Build one matrix per sequencing file in parallel and add them together.

    :param seq_files_data: iterable of (seq_filename, spike_in) pairs
    :param matrix_params: 6-tuple of (upstream_distance, downstream_distance,
        bp_width, width, height, interval_size)
    :param filenames: 3-tuple of (truQuant_output_file, tsr_file,
        output_filename_prefix)
    :param threads: number of worker processes in the pool
    :return: whatever add_matrices returns for the individual matrices
    """
    (upstream_distance, downstream_distance, bp_width,
     width, height, interval_size) = matrix_params
    truQuant_output_file, tsr_file, output_filename_prefix = filenames

    blacklist_regions_file = blacklist_extended_gene_bodies(
        tsr_file, downstream_distance)

    # Make the intervals file to quantify
    intervals_filename = make_incremented_regions(
        truQuant_output_file, downstream_distance, upstream_distance,
        bp_width, interval_size)

    dimensions = width, height

    # One get_individual_matrix call per dataset
    jobs = []
    for seq_filename, spike_in in seq_files_data:
        job_filenames = [
            seq_filename, blacklist_regions_file, intervals_filename
        ]
        jobs.append((job_filenames, dimensions, spike_in))

    with multiprocessing.Pool(threads) as pool:
        individual_matrices = pool.starmap(get_individual_matrix, jobs)

    combined_matrix = add_matrices(individual_matrices)

    # NOTE(review): individual_matrices is not passed to remove_files here -
    # confirm whether add_matrices cleans those up.
    remove_files(blacklist_regions_file, intervals_filename)

    return combined_matrix
def test_complete_run(self, stdout):
    """A 'whole' run reports per-position coverage for both strands.

    ``stdout`` is the captured output stream supplied to the test.
    """
    reads_filename = generate_random_filename()
    regions_filename = generate_random_filename()

    with open(reads_filename, 'w') as file:
        file.write("chr1\t1\t10\tname\t0\t+\n")
        file.write("chr1\t10\t20\tname\t0\t-\n")

    with open(regions_filename, 'w') as file:
        file.write("chr1\t0\t20\tname\t0\t+\n")

    metaplot.main(['whole', regions_filename, reads_filename])

    # Skip the header line and parse every non-empty row into a tuple of floats
    data_lines = stdout.getvalue().split("\n")[1:]
    result = [
        tuple([float(val) for val in line.split()])
        for line in data_lines if line
    ]

    remove_files(reads_filename, regions_filename)

    # Positions run from -10 to 10 without a zero
    position = list(range(-10, 0)) + list(range(1, 11))
    fw_expected = [0] + [1] * 9 + [0] * 10
    rv_expected = [0] * 10 + [-1] * 10

    expected = list(zip(position, fw_expected, rv_expected))

    self.assertEqual(result, expected)
def get_individual_matrix(regions_filename, seq_file_data, end,
                          repeat_amounts):
    """Build the horizontally expanded, vertically averaged matrix of one file.

    :param regions_filename: regions file passed to get_original_matrix
    :param seq_file_data: tuple of (seq_file, norm_factor)
    :param end: forwarded to get_original_matrix
    :param repeat_amounts: tuple of (repeat_amount, vertical_averaging)
    :return: the result of average_matrix on the expanded matrix file
    """
    seq_file, norm_factor = seq_file_data
    repeat_amount, vertical_averaging = repeat_amounts

    # 2. Load 2D list containing the data to be outputted
    original_matrix = get_original_matrix(regions_filename, seq_file,
                                          norm_factor, end)

    # Expand the matrix using the repeat amounts and write it to a file
    matrix_filename = generate_random_filename(".matrix")

    with open(matrix_filename, 'w') as file:
        for row in original_matrix:
            # Make the row the correct size by repeating each element by repeat_amount
            expanded_row = [
                str(val) for val in row for _ in range(repeat_amount)
            ]
            file.write("\t".join(expanded_row) + "\n")

    # Do the vertical averaging
    heatmap_matrix = average_matrix(matrix_filename, vertical_averaging)

    remove_files(matrix_filename)

    return heatmap_matrix
def build_matrix(seq_file_data, matrix_params, filenames, threads):
    """Build one matrix per sequencing file in parallel and add them together.

    :param seq_file_data: iterable of (seq_filename, spike_in) pairs
    :param matrix_params: 5-tuple of (upstream_distance, distance_past_tes,
        width, height, interval_size)
    :param filenames: 3-tuple of (truQuant_output_file, tsr_file,
        output_filename_prefix)
    :param threads: number of worker processes in the pool
    :return: whatever add_matrices returns for the individual matrices
    """
    # Need to build a matrix for each sequencing file in seq_file_data
    (upstream_distance, distance_past_tes,
     width, height, interval_size) = matrix_params
    truQuant_output_file, tsr_file, output_filename_prefix = filenames

    blacklist_regions_file = blacklist_extended_gene_bodies(
        tsr_file, distance_past_tes)

    # Step 1. Make regions to quantify
    intervals_filename = make_incremented_regions(
        truQuant_output_file, distance_past_tes, interval_size,
        upstream_distance)

    dimensions = width, height

    # One build_individual_matrix call per dataset
    jobs = []
    for seq_filename, spike_in in seq_file_data:
        job_filenames = [
            seq_filename, blacklist_regions_file, intervals_filename
        ]
        jobs.append((job_filenames, dimensions, spike_in))

    with multiprocessing.Pool(threads) as pool:
        individual_matrices = pool.starmap(build_individual_matrix, jobs)

    combined_matrix = add_matrices(individual_matrices)

    # The individual matrices and the helper files are no longer needed
    remove_files(individual_matrices, blacklist_regions_file,
                 intervals_filename)

    return combined_matrix
def test_get_regions_file(self):
    """get_regions_file writes the expected region around each max TSS."""
    max_tss_file = generate_random_filename()
    # Every temporary file created by this test; all are removed in finally
    temp_files = [max_tss_file]

    with open(max_tss_file, 'w') as file:
        file.write(
            "\t".join(["chr16", "53607", "53608", "POLR3K", "0", "-"]) + "\n" +
            "\t".join(["chr16", "53872", "53873", "SNRNP25", "0", "+"]) + "\n")

    search = [["-", 5], ["+", 5]]

    expected = [["chr16", "53603", "53613", "POLR3K", "0", "-"],
                ["chr16", "53867", "53877", "SNRNP25", "0", "+"]]

    # try/finally so the temporary files are removed even when the test fails
    try:
        chrom_sizes = {}
        with open(hg38_chrom_sizes_random_file) as file:
            for line in file:
                chrom, size = line.split()
                chrom_sizes[chrom] = int(size)

        region_file, gene_names = sequence_from_region_around_max_tss.get_regions_file(
            max_tss_file, search, chrom_sizes)
        temp_files.append(region_file)

        result = []
        with open(region_file) as file:
            for line in file:
                result.append(line.split())

        self.assertEqual(result, expected)
    finally:
        remove_files(*temp_files)
def test_expand_region(self):
    """expand_region writes the expected widened region for both strands."""
    max_tss_file = generate_random_filename()
    # Every temporary file created by this test; all are removed in finally
    temp_files = [max_tss_file]

    with open(max_tss_file, 'w') as file:
        file.write(
            "\t".join(["chr1", "925739", "925740", "SAMD11", "0", "+"]) + "\n" +
            "\t".join(["chr1", "959255", "959256", "NOC2L", "0", "-"]) + "\n"
        )

    region_width = 20

    expected = [
        ["chr1", "925729", "925749", "SAMD11", "0", "+"],
        ["chr1", "959246", "959266", "NOC2L", "0", "-"]
    ]

    # try/finally so the temporary files are removed even when the test fails
    try:
        expanded_region = nucleotide_heatmap.expand_region(max_tss_file,
                                                           region_width)
        # Same removal order as before: expanded file first, input second
        temp_files.insert(0, expanded_region)

        result = []
        with open(expanded_region) as file:
            for line in file:
                result.append(line.split())

        self.assertEqual(result, expected)
    finally:
        remove_files(*temp_files)
def get_fold_change_matrix(numerator_seq_files_data,
                           denominator_seq_files_data, matrix_params,
                           filenames, max_threads):
    """Build the numerator and denominator matrices and return their log2 fold change.

    :param numerator_seq_files_data: sequencing data for the numerator matrix
    :param denominator_seq_files_data: sequencing data for the denominator matrix
    :param matrix_params: forwarded to TES_heatmap.get_matrix
    :param filenames: forwarded to TES_heatmap.get_matrix
    :param max_threads: size of the outer pool; each matrix gets half of it
    :return: the value returned by make_log_two_fold_change_matrix
    """
    # We use max_threads / 2 because we will be running two instances of the combined.
    # "or 1" makes sure a single thread request does not end up as 0.
    threads_per_heatmap = int(max_threads / 2) or 1

    matrix_jobs = [
        (seq_files_data, matrix_params, filenames, threads_per_heatmap)
        for seq_files_data in (numerator_seq_files_data,
                               denominator_seq_files_data)
    ]

    with NestedPool(max_threads) as pool:
        matrix_filenames = pool.starmap(TES_heatmap.get_matrix, matrix_jobs)

    numerator_matrix_filename, denominator_matrix_filename = matrix_filenames

    # Make the fold change matrix
    log_two_fold_change_matrix_filename = make_log_two_fold_change_matrix(
        numerator_matrix_filename, denominator_matrix_filename)

    remove_files(numerator_matrix_filename, denominator_matrix_filename)

    return log_two_fold_change_matrix_filename
def run_read_end_fold_change_heatmap(args):
    """Build a log2 fold change region heatmap from the command line arguments.

    :param args: command line arguments, forwarded unchanged to ``parse_input``
    """
    (end, filenames, max_log2_fc, repeat_amounts, numerator_seq_files_data,
     denominator_seq_files_data, threads, tick_parameters) = parse_input(args)

    regions_file, output_prefix = filenames

    # We use max_threads / 2 because we will be running two instances of the combined.
    # "or 1" makes sure a single thread request does not end up as 0.
    threads_per_heatmap = int(threads / 2) or 1

    # Get the numerators and denominators matrix
    matrix_jobs = [
        (regions_file, seq_files_data, end, repeat_amounts,
         threads_per_heatmap)
        for seq_files_data in (numerator_seq_files_data,
                               denominator_seq_files_data)
    ]

    with NestedPool(threads) as pool:
        numerator_matrix, denominator_matrix = pool.starmap(
            region_heatmap.get_matrix, matrix_jobs)

    # Do the log2 fold change for them
    log2_matrix = make_log_two_fold_change_matrix(numerator_matrix,
                                                  denominator_matrix)

    remove_files(numerator_matrix, denominator_matrix)

    px_per_bp, vertical_averaging = repeat_amounts

    # Any ".tiff" in the prefix is stripped before the suffix is appended
    output_filename = output_prefix.replace(".tiff", "") + "_max_" + str(max_log2_fc) + "_vertical_averaging_" + \
        str(vertical_averaging) + "_px_per_bp_" + str(px_per_bp) + "_region_heatmap.tiff"

    # Make the heatmap
    make_heatmap(log2_matrix, max_log2_fc, tick_parameters, output_filename,
                 px_per_bp)

    remove_files(log2_matrix)
def test_parse_input(self):
    """parse_input rejects an empty command line and honours the thread options."""
    # No arguments throws error
    with self.assertRaises(SystemExit):
        with Quieter():
            metaplot.parse_input([])

    # Needs a region file and at least one seq file
    regions_file = generate_random_filename()

    with open(regions_file, 'w') as file:
        file.write("\t".join(['chr1', '1', '3', 'name', '0', '+']))

    # The region written above spans 1 to 3
    region_length = 2
    max_threads = multiprocessing.cpu_count()

    # try/finally so the temporary file is removed even when the test fails
    try:
        # These will work!!
        result = metaplot.parse_input(['five', regions_file, 'seq_file'])
        self.assertEqual(result, ('five', regions_file, ['seq_file'],
                                  region_length, max_threads))

        result = metaplot.parse_input(['three', regions_file, 'seq_file'])
        self.assertEqual(result, ('three', regions_file, ['seq_file'],
                                  region_length, max_threads))

        # Test the threading is working
        result = metaplot.parse_input(
            ['five', regions_file, 'seq_file', '-t', '4'])
        self.assertEqual(result, ('five', regions_file, ['seq_file'],
                                  region_length, 4))

        result = metaplot.parse_input(
            ['three', regions_file, 'seq_file', '--threads', '4'])
        self.assertEqual(result, ('three', regions_file, ['seq_file'],
                                  region_length, 4))
    finally:
        remove_files(regions_file)
def test_get_counts(self):
    """Pause and gene body counts are reported per gene for both strands."""
    pause_regions_filename = generate_random_filename()

    with open(pause_regions_filename, 'w') as file:
        file.write(
            "\t".join(["chr1", "100", "250", "positive_gene", "90", "+"]) +
            "\n" + "\t".join(
                ["chr1", "700", "850", "negative_gene", "1523", "-"]) + "\n")

    gene_body_filename = generate_random_filename()

    with open(gene_body_filename, 'w') as file:
        file.write(
            "\t".join(["chr1", "251", "750", "positive_gene", "90", "+"]) +
            "\n" + "\t".join(
                ["chr1", "200", "700", "negative_gene", "1523", "-"]) + "\n")

    blacklisted_sequencing_file = generate_random_filename()

    with open(blacklisted_sequencing_file, 'w') as file:
        file.write(
            "\t".join(["chr1", "80", "220", "5'not_counted", "0", "+"]) + "\n" +
            "\t".join(["chr1", "259", "285", "5'not_counted", "0", "+"]) + "\n" +
            "\t".join(["chr1", "132", "220", "5'count", "0", "+"]) + "\n" +
            "\t".join(["chr1", "132", "800", "5'count", "0", "-"]) + "\n" +
            "\t".join(["chr1", "750", "783", "5'count", "0", "-"]) + "\n" +
            "\t".join(["chr1", "750", "900", "5'not_counted", "0", "-"]) + "\n" +
            "\t".join(["chr1", "500", "600", "5'not_counted", "0", "-"]) + "\n")

    expected_indv_gene_counts_dict = {
        "positive_gene": {
            "Pause": 1,
            "Body": 1
        },
        "negative_gene": {
            "Pause": 2,
            "Body": 1
        }
    }

    # try/finally so the temporary files are removed even when the test fails
    try:
        indv_gene_counts_dict = truQuant.get_counts_in_paused_region(
            pause_regions_filename, blacklisted_sequencing_file)

        indv_gene_counts_dict = truQuant.get_counts_in_gene_bodies(
            gene_body_filename, blacklisted_sequencing_file,
            indv_gene_counts_dict)

        self.assertDictEqual(indv_gene_counts_dict,
                             expected_indv_gene_counts_dict)
    finally:
        remove_files(pause_regions_filename, gene_body_filename,
                     blacklisted_sequencing_file)
def test_map_tsrs_to_search_regions(self):
    """TSRs overlapping a search region map to its gene; the rest flow through."""
    # Test # TSRs
    # One that is contained in the search region, one with partial overlap in the 5' end,
    # one with partial overlap in the 3' end. One with no overlap before the TSR. One with no overlap after the TSR
    # One on the opposite strand

    # Need to define a TSR file and a search regions dict
    search_regions_dict = {
        "chr1": [["chr1", "100", "200", "positive_strand_test", "0", "+"],
                 ["chr1", "500", "600", "negative_strand_test", "0", "-"]]
    }

    tsr_filename = generate_random_filename(".tab")

    additional_columns = [
        "tss_left", "tss_right", "tss_strength", "avg_tss"
    ]

    tsr_rows = [
        ["chr1", "40", "60", "no_overlap", "0", "+"],
        ["chr1", "300", "320", "no_overlap2", "0", "+"],
        ["chr1", "90", "110", "partial_overlap_5'", "0", "+"],
        ["chr1", "190", "210", "partial_overlap_3'", "0", "+"],
        ["chr1", "140", "160", "complete_overlap", "0", "+"],
        ["chr1", "140", "160", "opposite_strand", "0", "-"],
    ]

    with open(tsr_filename, 'w') as file:
        for tsr_row in tsr_rows:
            file.write("\t".join(tsr_row + additional_columns) + "\n")

    expected_gene_tsr_dict = {
        "positive_strand_test":
        [["chr1", "90", "110", "partial_overlap_5'", "0", "+", "avg_tss"],
         ["chr1", "190", "210", "partial_overlap_3'", "0", "+", "avg_tss"],
         ["chr1", "140", "160", "complete_overlap", "0", "+", "avg_tss"]]
    }

    expected_flow_through_tsrs = [
        ["chr1", "40", "60", "no_overlap", "0", "+", "avg_tss"],
        ["chr1", "300", "320", "no_overlap2", "0", "+", "avg_tss"],
        ["chr1", "140", "160", "opposite_strand", "0", "-", "avg_tss"]
    ]

    # try/finally so the temporary file is removed even when the test fails
    try:
        gene_tsr_dict, flow_through_tsrs = truQuant.map_tsrs_to_search_regions(
            tsr_filename, search_regions_dict)

        self.assertDictEqual(gene_tsr_dict, expected_gene_tsr_dict)
        self.assertEqual(flow_through_tsrs, expected_flow_through_tsrs)
    finally:
        remove_files(tsr_filename)
def test_with_n_in_sequence(self, stdout):
    """base_distribution.main yields no sequence for the chr1 1-11 region.

    ``stdout`` is the captured output stream supplied to the test.
    """
    region_file = generate_random_filename()

    with open(region_file, 'w') as file:
        file.write("chr1\t1\t11\tname\t0\t+")

    # try/finally so the temporary file is removed even when the test fails
    try:
        # Run the program first: the assertion has to inspect the output it
        # produced, not the still-empty stream from before the run.
        base_distribution.main([region_file])

        self.assertFalse(self.get_sequence(stdout.getvalue()))
    finally:
        remove_files(region_file)
def main(args):
    """Build the combined matrix, render it as a heatmap and remove the matrix.

    :param args: command line arguments, forwarded unchanged to ``get_args``
    """
    parsed_args = get_args(args)
    seq_files_data, matrix_params, heatmap_params, filenames, threads = parsed_args

    matrix = build_matrix(seq_files_data, matrix_params, filenames, threads)

    # The output prefix is the last entry of filenames
    make_heatmap(matrix, heatmap_params, filenames[-1])

    remove_files(matrix)
def test_two_sequencing_files(self, stdout):
    """Two sequencing files produce two sense/divergent column pairs.

    ``stdout`` is the captured output stream supplied to the test.
    """
    reads_filename = generate_random_filename()
    reads_filename_two = generate_random_filename()

    with open(reads_filename, 'w') as file:
        file.write("chr1\t1\t10\tname\t0\t+\n")
        file.write("chr1\t10\t20\tname\t0\t-\n")

    with open(reads_filename_two, 'w') as file:
        file.write("chr1\t1\t10\tname\t0\t-\n")
        file.write("chr1\t10\t20\tname\t0\t+\n")

    regions_filename = generate_random_filename()
    with open(regions_filename, 'w') as file:
        file.write("chr1\t0\t20\tname\t0\t+\n")

    # The input files are only needed while metaplot runs. Removing them in
    # finally means a failing run or a failing assertion cannot leak them.
    try:
        metaplot.main(
            ['whole', regions_filename, reads_filename, reads_filename_two])
    finally:
        remove_files(reads_filename, regions_filename, reads_filename_two)

    output = stdout.getvalue().split("\n")
    header = output[0].split("\t")

    reads_basename = reads_filename.split("/")[-1]
    reads_basename_two = reads_filename_two.split("/")[-1]

    expected_header = [
        "Position", reads_basename + " whole sense strand",
        reads_basename + " whole divergent strand",
        reads_basename_two + " whole sense strand",
        reads_basename_two + " whole divergent strand"
    ]

    self.assertEqual(header, expected_header)

    result = [
        tuple([float(val) for val in line.split()])
        for line in output[1:] if line
    ]

    # Positions run from -10 to 10 without a zero
    position = list(range(-10, 0)) + list(range(1, 11))
    fw_expected = [0] + [1] * 9 + [0] * 10
    rv_expected = [0] * 10 + [-1] * 10

    fw_expected_two = [0] * 10 + [1] * 10
    rv_expected_two = [0] + [-1] * 9 + [0] * 10

    expected = list(
        zip(position, fw_expected, rv_expected, fw_expected_two,
            rv_expected_two))

    self.assertEqual(result, expected)
def combine_images(ticks_image_filename, only_heatmap_filename,
                   output_filename):
    """Stack the heatmap above the tick strip and save the result as a .tiff.

    The output canvas takes its width from the ticks image and its height
    from both images. Both input files are removed afterwards.

    :param ticks_image_filename: image file holding the tick marks
    :param only_heatmap_filename: image file holding the heatmap
    :param output_filename: output path without the ".tiff" extension
    """
    # Assumes Image is PIL.Image, whose images work as context managers.
    # Closing both inputs before remove_files deletes them avoids leaking the
    # file handles (and deleting files that are still open).
    with Image.open(ticks_image_filename) as ticks_image, \
            Image.open(only_heatmap_filename) as heatmap_image:
        final_image = Image.new(
            'RGB',
            (ticks_image.width, ticks_image.height + heatmap_image.height))

        # Heatmap on top, ticks directly underneath
        final_image.paste(heatmap_image, (0, 0))
        final_image.paste(ticks_image, (0, heatmap_image.height))

    final_image.save(output_filename + ".tiff")

    remove_files(ticks_image_filename, only_heatmap_filename)
def run_base_distribution(regions_file, region_length):
    """Output the per-position averages of the sequences in the given regions.

    :param regions_file: regions file passed to run_getfasta
    :param region_length: forwarded to output_data
    """
    # 1. Get the sequences of the region; the fasta file is only an intermediate
    fasta_file = run_getfasta(regions_file)
    sequences = read_fasta(fasta_file)
    remove_files(fasta_file)

    # 2. Get the percentages at each position, then
    # 3. Output into a file
    output_data(calculate_averages(sequences), region_length)
def get_coverage_files_helper(filename, region_intervals_file):
    """Run coverage of a file's 3' ends over the region intervals.

    :param filename: sequencing file passed to make_read_end_file
    :param region_intervals_file: intervals file passed to run_coverage
    :return: name of the coverage file that run_coverage was asked to write
    """
    coverage_file = generate_random_filename()

    # First makes the three bed file
    three_prime_end_file = make_read_end_file(filename, 'three')

    # Run coverage on the three bed file
    run_coverage(region_intervals_file, three_prime_end_file,
                 output_filename=coverage_file)

    # The read end file is only an intermediate
    remove_files(three_prime_end_file)

    return coverage_file
def run_region_heatmap(args):
    """Build the combined region matrix, render the heatmap and remove the matrix.

    :param args: command line arguments, forwarded unchanged to ``parse_input``
    """
    (end, filenames, seq_files_data, heatmap_parameters, repeat_amounts,
     tick_parameters, threads) = parse_input(args)

    regions_filename, output_prefix = filenames

    combined_matrix = get_matrix(regions_filename, seq_files_data, end,
                                 repeat_amounts, threads)

    make_heatmap(combined_matrix, output_prefix, heatmap_parameters,
                 tick_parameters)

    remove_files(combined_matrix)
def test_make_search_regions(self):
    """make_search_regions builds search regions for two extension sizes."""
    # Tests both positive and negative strands with differing extensions
    regions_filename = generate_random_filename()

    with open(regions_filename, 'w') as file:
        file.write("\t".join([
            "chr1", "100", "200", "+", "positive_strand_test", "108", "111"
        ]) + "\n")
        file.write("\t".join([
            "chr1", "2", "505", "-", "negative_strand_test", "498", "501"
        ]) + "\n")

    # The annotations are expected to be identical for both extensions
    expected_annotations_dict = {
        "positive_strand_test":
        ["chr1", "100", "200", "positive_strand_test", "0", "+"],
        "negative_strand_test":
        ["chr1", "2", "505", "negative_strand_test", "0", "-"]
    }

    # try/finally so the temporary file is removed even when the test fails
    try:
        search_regions_dict, annotations_dict = truQuant.make_search_regions(
            regions_filename, 10)

        expected_search_regions_dict = {
            "chr1": [["chr1", "90", "108", "positive_strand_test", "0", "+"],
                     ["chr1", "501", "515", "negative_strand_test", "0", "-"]]
        }

        self.assertDictEqual(search_regions_dict,
                             expected_search_regions_dict)
        self.assertDictEqual(annotations_dict, expected_annotations_dict)

        search_regions_dict, annotations_dict = truQuant.make_search_regions(
            regions_filename, 100)

        expected_search_regions_dict = {
            "chr1": [["chr1", "0", "108", "positive_strand_test", "0", "+"],
                     ["chr1", "501", "605", "negative_strand_test", "0", "-"]]
        }

        self.assertDictEqual(search_regions_dict,
                             expected_search_regions_dict)
        self.assertDictEqual(annotations_dict, expected_annotations_dict)
    finally:
        remove_files(regions_filename)
def test_make_incremented_regions(self):
    """make_incremented_regions writes the expected intervals (currently disabled)."""
    # This test was switched off with a bare ``return`` as its first
    # statement, which made it count as passing without checking anything.
    # Reporting it as skipped keeps it disabled but visible in the results.
    # The reason for disabling is not recorded here; delete this line to
    # re-enable the test.
    self.skipTest("disabled: test body was short-circuited by an early return")

    truQuant_output_file = generate_random_filename('-truQuant_output.txt')

    tQ_text = """
Gene Chromosome Pause Region Left Pause Region Right Strand Total 5' Reads MaxTSS MaxTSS 5' Reads Weighted Pause Region Center STDEV of TSSs Gene Body Left Gene Body Right Gene Body Distance seq_file.bed Pause Region seq_file.bed Gene Body
negative_gene chr1 5000 5150 - 194 5100 46 5100 13.306459171023036 4000 5000 600 194 18
positive_gene chr1 1000 1150 + 234 1100 27 1100 25.417791063821863 1150 2000 850 234 17"""

    tQ_text = [line.split() for line in tQ_text.split("\n") if line]

    with open(truQuant_output_file, 'w') as file:
        for line in tQ_text:
            file.write("\t".join(line) + "\n")

    downstream_distance = 0
    upstream_distance = 200
    bp_width = 1000
    interval_size = 200

    incremented_regions_file = TES_heatmap.make_incremented_regions(
        truQuant_output_file, downstream_distance, upstream_distance,
        bp_width, interval_size)

    result = []
    with open(incremented_regions_file) as file:
        for line in file:
            result.append(line.split())

    expected = [
        ["chr1", "5100", "5300", "negative_gene", "46", "-"],
        ["chr1", "4900", "5100", "negative_gene", "46", "-"],
        ["chr1", "4700", "4900", "negative_gene", "46", "-"],
        ["chr1", "4500", "4700", "negative_gene", "46", "-"],
        ["chr1", "4300", "4500", "negative_gene", "46", "-"],
        ["chr1", "900", "1100", "positive_gene", "27", "+"],
        ["chr1", "1100", "1300", "positive_gene", "27", "+"],
        ["chr1", "1300", "1500", "positive_gene", "27", "+"],
        ["chr1", "1500", "1700", "positive_gene", "27", "+"],
        ["chr1", "1700", "1900", "positive_gene", "27", "+"],
    ]

    remove_files(truQuant_output_file)

    self.assertEqual(result, expected)