def test_run_process(self):
    # `touch` a file through run_process and confirm it now exists on disk
    target = "{0}/tests/data/test_file.txt".format(MODULE_DIR)
    remove_file(target)
    run_process(["touch", target])
    self.assertEqual(os.path.isfile(target), True)
def intersect_with_bed(input_bam, input_bed, output_file, force_strand=True):
    """
    Given a bam file, intersect with a bed file to leave only those reads
    that correspond to entries in the bed file.

    Parameters
    ---------
    input_bam : str
        Path to the .bam file containing the reads
    input_bed : str
        Path to the .bed file containing the intervals
    output_file : str
        Path to the output file
    force_strand : bool
        If set, ensure the reads map to the same strand

    Examples
    ---------
    >>> from bioUtilities.bam import intersect_with_bed
    >>> intersect_with_bed("reads.bam", "exons.bed", "exon_reads.bam")
    """
    # bedtools performs the intersection; "-s" enforces same-strand matches
    bedtools_args = ["bedtools", "intersect"]
    if force_strand:
        bedtools_args.append("-s")
    bedtools_args.extend(["-abam", input_bam, "-b", input_bed])
    run_process(bedtools_args, file_for_output=output_file)
def nm_filter(input_file, output_file, lower_limit = 0, upper_limit = 10):
    """
    Filter a .bam/.sam file by NM (edit distance) tag value.

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output
    lower_limit : int
        If set, the lower boundary of NM values, for which all reads have
        to be greater than or equal to
    upper_limit : int
        If set, the upper boundary of NM values, for which all reads have
        to be less than or equal to

    Raises
    ---------
    Exception
        If both limits are falsy (nothing to filter on).

    Examples
    ---------
    >>> from bioUtilities.bam import nm_filter
    >>> nm_filter("test.bam", "test_output.bam", lower_limit = 2)
    >>> nm_filter("test.bam", "test_output.bam", upper_limit = 6)
    >>> nm_filter("test.bam", "test_output.bam", lower_limit = 2, upper_limit = 6)
    """
    # if neither threshold is specified
    # NOTE(review): a 0 limit is indistinguishable from "unset" here because
    # the check uses truthiness, not `is None` — kept for compatibility
    if not lower_limit and not upper_limit:
        raise Exception("\nERROR: You must specify at least one of the lower_limit or upper_limit thresholds.\n")
    if not lower_limit:
        print("Using the default lower limit of 0.")
    if not upper_limit:
        # BUG FIX: message previously said "uppper"
        print("Using the default upper limit of 10.")
    # write to a temporary .sam first if the final output should be .bam
    if output_file[-4:] == ".bam":
        temp_output_file = "{0}.sam".format(output_file[:-4])
    else:
        temp_output_file = output_file
    # build one grep pattern matching header lines ("^@") or any NM tag value
    # in [lower_limit, upper_limit]; "\\|" is a literal \| (BRE alternation)
    grep_args = ["^@"]
    for i in range(lower_limit, upper_limit + 1):
        grep_args.append("\\|\tNM:i:{0}\t".format(i))
    grep_args = "".join(grep_args)
    # read in the file, including the header
    file_reads = run_process(["samtools", "view", "-h", input_file])
    # keep only header lines and reads whose NM value is within range
    output = run_process(["grep", grep_args], input_to_pipe = file_reads, file_for_output = temp_output_file)
    # if the requested output is bam, convert the temp sam and remove it
    if output_file != temp_output_file:
        samtools_args = ["samtools", "view", "-bh", temp_output_file]
        run_process(samtools_args, file_for_output = output_file)
        remove_file(temp_output_file)
def test_mapq_filter_lower_limit(self):
    input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_mapq_filter_1.sam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_mapq_filter_1.bam".format(MODULE_DIR)
    mapq_filter(input_file, observed_file, lower_limit = 200)
    # the filtered output is bam; dump it to sam so fields can be compared
    temp_observed = "{0}/tests/data/observed_mapq_filter_1.sam".format(MODULE_DIR)
    run_process(["samtools", "view", observed_file], file_for_output = temp_observed)
    expected = read_many_fields(expected_file, "\t")
    observed = read_many_fields(temp_observed, "\t")
    self.assertEqual(expected, observed)
    # tidy up the generated files
    remove_file(temp_observed)
    remove_file(observed_file)
def read_count(input_file):
    """
    Get the number of reads in a .bam/.sam file.

    Parameters
    ---------
    input_file : str
        Path to the file to be counted

    Returns
    ---------
    read_count : int
        The number of reads in the specified file

    Examples
    ---------
    >>> from bioUtilities.bam import read_count
    >>> reads = read_count("test.bam")
    >>> print(reads)
    >>> 15
    """
    # `samtools view -c` prints the read count; extract the first integer
    count_output = run_process(["samtools", "view", "-c", input_file])
    return int(re.findall("(\d+)", count_output)[0])
def test_xt_filter(self):
    input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_xt_filter.sam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_xt_filter.bam".format(MODULE_DIR)
    xt_filter(input_file, observed_file, filter="XT:A:U")
    # the filtered output is bam; dump it to sam so fields can be compared
    temp_observed = "{0}/tests/data/observed_xt_filter.sam".format(MODULE_DIR)
    run_process(["samtools", "view", observed_file], file_for_output=temp_observed)
    expected = read_many_fields(expected_file, "\t")
    observed = read_many_fields(temp_observed, "\t")
    self.assertEqual(expected, observed)
    # tidy up the generated files
    remove_file(temp_observed)
    remove_file(observed_file)
def xt_filter(input_file, output_file, filter=None):
    """
    Filter a .bam/.sam file by the XT tag.

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output
    filter : str
        Filter that reads should contain, e.g. "XT:A:U"

    Raises
    ---------
    Exception
        If no filter is supplied.

    Examples
    ---------
    >>> from bioUtilities.bam import xt_filter
    >>> xt_filter("test.bam", "test_xt_filtered.bam", filter = "XT:A:U")
    """
    # BUG FIX: the original tested `not xt_filter` — the function object
    # itself, which is always truthy — so a missing filter was never caught;
    # test the `filter` argument instead
    if not filter:
        raise Exception('\nXT filter not specified.\n')
    # if the output format is .bam, temporarily create .sam output file
    if output_file[-4:] == ".bam":
        temp_output_file = "{0}.sam".format(output_file[:-4])
    else:
        temp_output_file = output_file
    # read the file, including the header
    sam_output = run_process(["samtools", "view", "-h", input_file])
    grep_args = []
    # keep header lines
    grep_args.append("^@")
    # keep reads whose XT tag matches the filter; "\\|" is a literal \|
    grep_args.append("\\|\t{0}\t".format(filter))
    grep_args = "".join(grep_args)
    # run the filter
    run_process(["grep", grep_args], input_to_pipe=sam_output,
                file_for_output=temp_output_file)
    # if wanting to create bam, create bam and delete the temp sam
    if output_file != temp_output_file:
        samtools_args = ["samtools", "view", "-bh", temp_output_file]
        run_process(samtools_args, file_for_output=output_file)
        remove_file(temp_output_file)
def test_intersect_with_bed(self):
    input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
    input_bam = "{0}/tests/data/input3.bam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_intersect_with_bed.bam".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_intersect_with_bed.sam".format(MODULE_DIR)
    remove_file(observed_file)
    intersect_with_bed(input_bam, input_bed, observed_file)
    # dump the bam output to sam (with header) so it can be field-compared
    observed_sam = "{0}/tests/data/observed_intersect_with_bed.sam".format(MODULE_DIR)
    run_process(["samtools", "view", "-h", observed_file], file_for_output=observed_sam)
    expected = read_many_fields(expected_file)
    observed = read_many_fields(observed_sam)
    self.assertEqual(expected, observed)
    # tidy up the generated files
    remove_file(observed_file)
    remove_file(observed_sam)
def line_count(filepath):
    """
    Count the number of lines in a file.

    Parameters
    ---------
    filepath : str
        Path to the file

    Returns
    ---------
    line_count : int
        The number of lines in the file

    Examples
    ---------
    >>> from bioUtilities.files import line_count
    >>> line_count("test_file.txt")
    3
    """
    # `wc -l` prints "<count> <path>"; extract the integer
    wc_output = run_process(["wc", "-l", filepath])
    return int(re.findall("\s+(\d+)", wc_output)[0])
def mapq_filter(input_file, output_file, lower_limit=None, upper_limit=None):
    """
    Filter a .bam/.sam file by MAPQ value.

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output
    lower_limit : int
        If set, the lower boundary of mapq values, for which all reads have
        to be greater than or equal to
    upper_limit : int
        If set, the upper boundary of mapq values, for which all reads have
        to be less than or equal to

    Examples
    ---------
    >>> from bioUtilities.bam import mapq_filter
    >>> mapq_filter("test.bam", "test_output.bam", lower_limit = 100)
    >>> mapq_filter("test.bam", "test_output.bam", upper_limit = 250)
    >>> mapq_filter("test.bam", "test_output.bam", lower_limit = 100, upper_limit = 250)
    """
    # if neither threshold is specified there is nothing to filter on
    # NOTE(review): truthiness means an explicit limit of 0 is treated as
    # "unset" — confirm callers never pass 0
    if not lower_limit and not upper_limit:
        raise Exception(
            "ERROR: You must specify at least one of the lower_limit or upper_limit thresholds."
        )
    samtools_args = ["samtools", "view", "-h"]
    # if both thresholds are specified, we want the reads with values between these
    if lower_limit and upper_limit:
        # create a uniquely-named temp directory for the intermediate file
        temp_directory = "temp_mapq_filter.{0}".format(random.random())
        create_directory(temp_directory)
        temp_file = "{0}/{1}".format(temp_directory, output_file.split("/")[-1])
        # pass 1: keep everything with MAPQ >= lower_limit
        temp_args = samtools_args.copy()
        temp_args.extend(["-q", lower_limit, input_file])
        run_process(temp_args, file_for_output=temp_file)
        # pass 2: now get everything below the upper limit; samtools -q keeps
        # reads >= threshold, so add 1 and collect the rejected reads via -U,
        # i.e. reads with MAPQ < upper_limit + 1
        temp_args = samtools_args.copy()
        upper_limit = upper_limit + 1
        temp_args.extend(["-q", upper_limit, temp_file, "-U", output_file])
        run_process(temp_args)
        # cleanup the temp files
        remove_directory(temp_directory)
    # if only a lower limit is specified
    elif lower_limit and not upper_limit:
        # -b writes bam directly; keep reads with MAPQ >= lower_limit
        samtools_args.extend(["-bq", lower_limit, input_file])
        run_process(samtools_args, file_for_output=output_file)
    # if only the upper threshold is specified
    elif upper_limit and not lower_limit:
        # -q keeps reads >= threshold, so add 1 and take the rejects via -U
        upper_limit = upper_limit + 1
        samtools_args.extend(
            ["-q", upper_limit, input_file, "-U", output_file])
        run_process(samtools_args)
def count_interval_reads(input_file, input_bam, output_file, paired_end=False, min_qual=None, min_length=50):
    """
    For each interval in bed format, count the number of reads in the bam file.

    Parameters
    ---------
    input_file : str
        Path to the file containing the intervals
    input_bam : str
        Path to the .bam file containing the reads
    output_file : str
        Path to the output file
    paired_end : bool
        If set, count read pairs (featureCounts -p)
    min_qual : int
        If set, minimum mapping quality for a read to be counted
    min_length : int
        Minimum read length for a read to be counted

    Dependencies
    ---------
    featureCounts v1.6.4

    Examples
    ---------
    >>> from bioUtilities.bam import count_interval_reads
    >>> count_interval_reads("exon_junctions.bed", "reads.bam", "exon_junction_reads.bed")
    """
    # check that featureCounts command exists
    if not shutil.which('featureCounts'):
        raise Exception('\nERROR: featureCounts must be installed.\n')
    # if input_file is in bed format, need to convert to .saf format
    # (.saf format is 1-based)
    if get_extension(input_file) == ".bed":
        working_input_file = "{0}.saf".format(input_file[:-4])
        # BUG FIX: original called bed_to_saf(old_input_file, input_file) —
        # `old_input_file` was undefined; convert the .bed input into the
        # temporary .saf file instead
        bed_to_saf(input_file, working_input_file)
    else:
        working_input_file = input_file
    # if the requested output is .bed, have featureCounts write a temp .saf
    if get_extension(output_file) == ".bed":
        working_output_file = "{0}.saf".format(output_file[:-4])
    else:
        working_output_file = output_file
    # now use featureCounts to count reads; output is in 'saf' format
    args = ["featureCounts", "-fO", "-F", "SAF", "-g", "ID"]
    if paired_end:
        args.append("-p")
    if min_qual:
        args.extend(["-Q", min_qual])
    if min_length:
        args.extend(["-d", min_length])
    args.extend(
        ["-a", working_input_file, "-o", working_output_file, input_bam])
    # now run the count
    run_process(args)
    # if the output format is bed, convert the saf output to bed
    # (saf is 1-based, bed is 0-based, hence the -1 on the coordinates)
    if get_extension(output_file) == ".bed":
        entries = read_many_fields(working_output_file)[2:]
        with open(output_file, "w") as outfile:
            for entry in entries:
                output = [
                    entry[1],
                    str(int(entry[2]) - 1),
                    str(int(entry[3]) - 1), entry[0], ".", entry[4]
                ]
                output.extend(entry[5:])
                outfile.write("{0}\n".format("\t".join(output)))
    # now clean up the temporary files
    if working_input_file != input_file:
        remove_file(working_input_file)
    if working_output_file != output_file:
        # BUG FIX: original removed output_file here, deleting the final
        # result; remove the temporary .saf file instead
        remove_file(working_output_file)