def get_coverage_files_helper(filename, region_intervals_file):
    """Create a coverage file of 3' read ends over the given regions.

    Args:
        filename: Sequencing file handed to ``make_read_end_file``.
        region_intervals_file: Regions file passed as the first argument to
            ``run_coverage``.

    Returns:
        The output filename obtained from ``generate_random_filename`` and
        given to ``run_coverage``. This function does not remove it.
    """
    # First make the 3' end bed file
    three_prime_end_file = make_read_end_file(filename, 'three')

    try:
        # Run coverage on the 3' end bed file, writing to a fresh filename
        coverage_file = generate_random_filename()
        run_coverage(region_intervals_file,
                     three_prime_end_file,
                     output_filename=coverage_file)
    finally:
        # The intermediate file is only needed for the coverage step; remove
        # it even when that step fails so it is not left behind.
        remove_files(three_prime_end_file)

    return coverage_file
Example #2
0
def get_counts_in_gene_bodies(regions_filename, blacklisted_sequencing_file,
                              indv_gene_counts_dict):
    """Store the 3' end read count of every gene body region.

    Args:
        regions_filename: Regions file passed as the first argument to
            ``run_coverage``.
        blacklisted_sequencing_file: Sequencing file handed to
            ``make_read_end_file``.
        indv_gene_counts_dict: Dict keyed by gene name whose values are dicts;
            the ``"Body"`` entry of each gene found in the coverage output is
            overwritten in place.

    Returns:
        The same ``indv_gene_counts_dict`` object, after the update.

    Raises:
        KeyError: If the coverage output names a gene that is not already a
            key of ``indv_gene_counts_dict``.
    """
    three_bed_filename = make_read_end_file(blacklisted_sequencing_file,
                                            'three')

    try:
        random_filename = run_coverage(regions_filename, three_bed_filename)
    finally:
        # The 3' end file is only an input to the coverage step; remove it
        # even if that step fails.
        remove_files(three_bed_filename)

    try:
        with open(random_filename) as file:
            for line in file:
                # Split once and reuse the fields for both values
                fields = line.split()

                # The count is read from the fourth column from the end and
                # the gene name from the fourth column.
                # NOTE(review): presumably the default bedtools coverage
                # layout appended to a 6 column bed file -- confirm.
                counts = int(fields[-4])
                gene_name = fields[3]

                indv_gene_counts_dict[gene_name]["Body"] = counts
    finally:
        # Remove the coverage output even when a line fails to parse
        remove_files(random_filename)

    return indv_gene_counts_dict
Example #3
0
def quantify_intervals(sequencing_filename, blacklist_filename,
                       intervals_file):
    """Count blacklist-filtered 3' read ends over the given intervals.

    Args:
        sequencing_filename: Sequencing file handed to
            ``make_read_end_file``.
        blacklist_filename: Passed to ``run_subtract`` as its second
            argument.
        intervals_file: Intervals file passed as the first argument to
            ``run_coverage``.

    Returns:
        The filename returned by ``run_coverage``. This function does not
        remove it.
    """
    # First make the sequencing file 3' ends only
    three_prime_filename = make_read_end_file(sequencing_filename, 'three')

    try:
        # Now blacklist the sequencing file
        # NOTE(review): rna_blacklist_file is neither a parameter nor a local,
        # so it must resolve as a module-level name -- confirm it is defined
        # or imported in this module.
        blacklisted_sequencing_filename = run_subtract(three_prime_filename,
                                                       blacklist_filename,
                                                       rna_blacklist_file)

        try:
            # Use bedtools coverage to get the number of 3' reads for each
            # interval
            coverage_file = run_coverage(intervals_file,
                                         blacklisted_sequencing_filename)
        finally:
            # The blacklisted reads are only an input to the coverage step
            remove_files(blacklisted_sequencing_filename)
    finally:
        # Remove the 3' reads file even if a later step failed
        remove_files(three_prime_filename)

    return coverage_file
Example #4
0
def gather_data(read_type, seq_file, regions_file):
    """Run coverage for one sequencing file and organize the counts.

    Args:
        read_type: When ``'five'`` or ``'three'``, coverage is run on the file
            produced by ``make_read_end_file(seq_file, read_type)``; for any
            other value ``seq_file`` itself is used.
        seq_file: Sequencing file to quantify. It is also passed to
            ``organize_counts`` together with the coverage output.
        regions_file: Regions file passed as the first argument to
            ``run_coverage``.

    Returns:
        Whatever ``organize_counts`` returns for the coverage file.
    """
    need_to_remove_modified_seq_filename = read_type in ['five', 'three']

    if need_to_remove_modified_seq_filename:
        modified_seq_filename = make_read_end_file(seq_file, read_type)
    else:
        # The caller's own file is used directly and must never be deleted
        modified_seq_filename = seq_file

    try:
        # Then quantify the selected reads over the regions
        coverage_file = run_coverage(regions_file, modified_seq_filename)

        try:
            data = organize_counts(coverage_file, seq_file)
        finally:
            # Remove the coverage output even if organizing the counts failed
            remove_files(coverage_file)
    finally:
        # Only the file derived here is removed, also when a step failed
        if need_to_remove_modified_seq_filename:
            remove_files(modified_seq_filename)

    return data
Example #5
0
def get_counts_in_paused_region(pause_region_filename,
                                blacklisted_sequencing_file):
    """Collect the 5' end read count of every pause region.

    Args:
        pause_region_filename: Regions file passed as the first argument to
            ``run_coverage``.
        blacklisted_sequencing_file: Sequencing file handed to
            ``make_read_end_file``.

    Returns:
        A plain dict mapping each gene name found in the coverage output to
        ``{"Pause": <count>, "Body": -1}``. If a gene name occurs on several
        lines, the count from the last of them is kept.
    """
    five_bed_filename = make_read_end_file(blacklisted_sequencing_file, 'five')

    try:
        # Run bedtools coverage on the 5' bed file
        random_filename = run_coverage(pause_region_filename,
                                       five_bed_filename)
    finally:
        # The 5' end file is only an input to the coverage step; remove it
        # even if that step fails.
        remove_files(five_bed_filename)

    # Not using a defaultdict because multiprocessing does not like it
    indv_gene_counts_dict = {}

    try:
        with open(random_filename) as file:
            for line in file:
                # Split once and reuse the fields for both values
                fields = line.split()

                # The count is read from the fourth column from the end and
                # the gene name from the fourth column.
                # NOTE(review): presumably the default bedtools coverage
                # layout appended to a 6 column bed file -- confirm.
                counts = int(fields[-4])
                gene_name = fields[3]

                if gene_name not in indv_gene_counts_dict:
                    # -1 marks a value that has not been filled in yet
                    indv_gene_counts_dict[gene_name] = {"Pause": -1,
                                                        "Body": -1}

                indv_gene_counts_dict[gene_name]["Pause"] = counts
    finally:
        # Remove the coverage output even when a line fails to parse
        remove_files(random_filename)

    return indv_gene_counts_dict