Beispiel #1
0
 def test_run_process(self):
     """Run `touch` through run_process and check that the file now exists."""
     target = "{0}/tests/data/test_file.txt".format(MODULE_DIR)
     # make sure a file left over from an earlier run cannot satisfy the check
     remove_file(target)
     touch_command = ["touch", target]
     run_process(touch_command)
     file_exists = os.path.isfile(target)
     self.assertEqual(file_exists, True)
Beispiel #2
0
 def test_remove_file(self):
     """Create an empty file, delete it with remove_file, and check it is gone."""
     target = "{0}/tests/data/test_remove_file.txt".format(MODULE_DIR)
     # opening in write mode and closing immediately leaves an empty file behind
     open(target, "w").close()
     remove_file(target)
     still_present = os.path.isfile(target)
     self.assertFalse(still_present)
 def test_convert_one_to_zero(self):
     """Compare convert_one_to_zero output for input.bed with the expected fixture."""
     input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_convert_one_to_zero.bed".format(MODULE_DIR)
     # The expected fixture must be a different file from the observed output;
     # comparing the observed file with itself could never fail.
     # NOTE(review): assumes the fixture follows the expected_<name> naming used
     # by the other tests - confirm the file exists in tests/data.
     expected_file = "{0}/tests/data/expected_convert_one_to_zero.bed".format(MODULE_DIR)
     # remove stale output so the comparison only sees this run's result
     remove_file(observed_file)
     convert_one_to_zero(input_bed, observed_file)
     observed = read_many_fields(observed_file)
     expected = read_many_fields(expected_file)
     self.assertEqual(expected, observed)
     remove_file(observed_file)
Beispiel #4
0
def nm_filter(input_file, output_file, lower_limit = 0, upper_limit = 10):
    """
    Filter a .bam/.sam by NM value

    Header lines (starting with "@") are always kept. Alignment lines are
    kept when they contain a tab-delimited ``NM:i:<n>`` tag with
    lower_limit <= n <= upper_limit.

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output. If it ends in ".bam", a temporary .sam file with
        the same stem is written first, converted to .bam and then removed.
    lower_limit : int
        The lower boundary of NM values, for which all reads have to
        be greater than or equal to. None falls back to 0.
    upper_limit : int
        The upper boundary of NM values, for which all reads have to
        be less than or equal to. None falls back to 10.

    Raises
    ---------
    Exception
        If both lower_limit and upper_limit are None

    Examples
    ---------
    >>> from bioUtilities.bam import nm_filter
    >>> nm_filter("test.bam", "test_output.bam", lower_limit = 2)
    >>> nm_filter("test.bam", "test_output.bam", upper_limit = 6)
    >>> nm_filter("test.bam", "test_output.bam", lower_limit = 2, upper_limit = 6)
    """

    # Compare against None explicitly: 0 is a legitimate limit and must not be
    # mistaken for "not specified".
    if lower_limit is None and upper_limit is None:
        raise Exception("\nERROR: You must specify at least one of the lower_limit or upper_limit thresholds.\n")

    if lower_limit is None:
        print("Using the default lower limit of 0.")
        lower_limit = 0
    if upper_limit is None:
        print("Using the default uppper limit of 10.")
        upper_limit = 10

    # grep writes text, so a .bam target goes through a temporary .sam file
    if output_file[-4:] == ".bam":
        temp_output_file = "{0}.sam".format(output_file[:-4])
    else:
        temp_output_file = output_file

    # Build one basic-regex alternation: header lines, or any accepted NM tag.
    # "\\|" is the literal backslash-pipe that grep reads as alternation; the
    # tag has to be surrounded by tabs to match.
    nm_alternatives = [
        "\\|\tNM:i:{0}\t".format(nm_value)
        for nm_value in range(lower_limit, upper_limit + 1)
    ]
    grep_pattern = "^@" + "".join(nm_alternatives)

    # read in the file, header included
    file_reads = run_process(["samtools", "view", "-h", input_file])
    # now pipe the reads through grep into the (temporary) output
    run_process(["grep", grep_pattern], input_to_pipe = file_reads, file_for_output = temp_output_file)

    # if the output file is in bam format, convert and drop the temporary sam
    if output_file != temp_output_file:
        samtools_args = ["samtools", "view", "-bh", temp_output_file]
        run_process(samtools_args, file_for_output = output_file)
        remove_file(temp_output_file)
Beispiel #5
0
 def test_get_exon_junctions1(self):
     """Compare get_exon_junctions output for the coding-exon input with the expected .bed."""
     exons_bed = "{0}/tests/data/input_coding_exons.bed".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_get_exon_junctions1.bed".format(MODULE_DIR)
     observed_path = "{0}/tests/data/observed_get_exon_junctions1.bed".format(MODULE_DIR)
     # drop output left over from an earlier run before regenerating it
     remove_file(observed_path)
     get_exon_junctions(exons_bed, observed_path)
     self.assertEqual(
         read_many_fields(observed_path),
         read_many_fields(expected_path),
     )
     remove_file(observed_path)
Beispiel #6
0
 def test_bed_to_saf(self):
     """Compare the bed_to_saf conversion of input.bed with the expected .saf file."""
     source_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     observed_path = "{0}/tests/data/observed_bed_to_saf.saf".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_bed_to_saf.saf".format(MODULE_DIR)
     # drop output left over from an earlier run before regenerating it
     remove_file(observed_path)
     bed_to_saf(source_bed, observed_path)
     observed_rows = read_many_fields(observed_path)
     expected_rows = read_many_fields(expected_path)
     self.assertEqual(expected_rows, observed_rows)
     remove_file(observed_path)
Beispiel #7
0
 def test_parse_gtf1(self):
     """Compare parse_gtf output for exon features with protein_coding=True with the expected .bed."""
     gtf_path = "{0}/tests/data/input.gtf".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_parse_gtf1.bed".format(MODULE_DIR)
     observed_path = "{0}/tests/data/observed_parse_gtf1.bed".format(MODULE_DIR)
     parse_options = {
         "features": ["exon"],
         "protein_coding": True,
         "output_file": observed_path,
     }
     parse_gtf(gtf_path, **parse_options)
     # both files are read as tab-separated rows
     expected_rows = read_many_fields(expected_path, "\t")
     observed_rows = read_many_fields(observed_path, "\t")
     self.assertEqual(observed_rows, expected_rows)
     remove_file(observed_path)
Beispiel #8
0
 def test_parse_gtf2(self):
     """Compare parse_gtf output for exon features of one transcript id with the expected .bed."""
     gtf_path = "{0}/tests/data/input.gtf".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_parse_gtf2.bed".format(MODULE_DIR)
     observed_path = "{0}/tests/data/observed_parse_gtf2.bed".format(MODULE_DIR)
     parse_options = {
         "features": ["exon"],
         "transcript_ids": ["ENST00000456328"],
         "output_file": observed_path,
     }
     parse_gtf(gtf_path, **parse_options)
     # both files are read as tab-separated rows
     expected_rows = read_many_fields(expected_path, "\t")
     observed_rows = read_many_fields(observed_path, "\t")
     self.assertEqual(observed_rows, expected_rows)
     remove_file(observed_path)
Beispiel #9
0
 def test_mapq_filter_lower_limit(self):
     """Run mapq_filter with lower_limit=200 and compare the result with the expected .sam."""
     bam_in = "{0}/tests/data/input.bam".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_mapq_filter_1.sam".format(MODULE_DIR)
     observed_bam = "{0}/tests/data/observed_mapq_filter_1.bam".format(MODULE_DIR)
     observed_sam = "{0}/tests/data/observed_mapq_filter_1.sam".format(MODULE_DIR)
     mapq_filter(bam_in, observed_bam, lower_limit = 200)
     expected_rows = read_many_fields(expected_path, "\t")
     # the filtered .bam is dumped to text with samtools so it can be compared
     # row by row with the expected .sam
     run_process(["samtools", "view", observed_bam], file_for_output = observed_sam)
     observed_rows = read_many_fields(observed_sam, "\t")
     self.assertEqual(expected_rows, observed_rows)
     # clean up both generated files
     remove_file(observed_sam)
     remove_file(observed_bam)
Beispiel #10
0
 def test_xt_filter(self):
     """Run xt_filter with the "XT:A:U" filter and compare the result with the expected .sam."""
     bam_in = "{0}/tests/data/input.bam".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_xt_filter.sam".format(MODULE_DIR)
     observed_bam = "{0}/tests/data/observed_xt_filter.bam".format(MODULE_DIR)
     observed_sam = "{0}/tests/data/observed_xt_filter.sam".format(MODULE_DIR)
     xt_filter(bam_in, observed_bam, filter="XT:A:U")
     # the filtered .bam is dumped to text with samtools so it can be compared
     # row by row with the expected .sam
     run_process(["samtools", "view", observed_bam], file_for_output=observed_sam)
     expected_rows = read_many_fields(expected_path, "\t")
     observed_rows = read_many_fields(observed_sam, "\t")
     self.assertEqual(expected_rows, observed_rows)
     # clean up both generated files
     remove_file(observed_sam)
     remove_file(observed_bam)
Beispiel #11
0
def xt_filter(input_file, output_file, filter=None):
    """
    Filter a .bam/.sam file by the XT tag

    Header lines (starting with "@") are always kept. Alignment lines are
    kept when they contain the given filter as a tab-delimited field.

    Parameters
    ---------
    input_file : str
        Path to the file to be filtered
    output_file : str
        Path to the output. If it ends in ".bam", a temporary .sam file with
        the same stem is written first, converted to .bam and then removed.
    filter : str
        Tag that reads should contain, e.g. "XT:A:U"

    Raises
    ---------
    Exception
        If no filter is given

    Examples
    ---------
    >>> from bioUtilities.bam import xt_filter
    >>> xt_filter("test.bam", "test_xt_filtered.bam", filter = "XT:A:U")
    """

    # Validate the `filter` argument itself; the function object `xt_filter`
    # is always truthy and so can never signal a missing filter.
    if not filter:
        raise Exception('\nXT filter not specified.\n')

    # if the output format is .bam, temporarily create .sam output file
    if output_file[-4:] == ".bam":
        temp_output_file = "{0}.sam".format(output_file[:-4])
    else:
        temp_output_file = output_file

    # dump the input as text, header included
    sam_output = run_process(["samtools", "view", "-h", input_file])

    # One basic-regex alternation: header lines, or lines holding the tag
    # between tabs. "\\|" is the literal backslash-pipe grep reads as "or".
    grep_pattern = "^@" + "\\|\t{0}\t".format(filter)

    # run the filter
    run_process(["grep", grep_pattern],
                input_to_pipe=sam_output,
                file_for_output=temp_output_file)

    # if wanting to create bam, create bam and delete sam
    if output_file != temp_output_file:
        samtools_args = ["samtools", "view", "-bh", temp_output_file]
        run_process(samtools_args, file_for_output=output_file)
        remove_file(temp_output_file)
 def test_read_count(self):
     """Compare count_interval_reads output for input2.bed / input2.bam with the expected .saf."""
     intervals_bed = "{0}/tests/data/input2.bed".format(MODULE_DIR)
     reads_bam = "{0}/tests/data/input2.bam".format(MODULE_DIR)
     observed_path = "{0}/tests/data/observed_count_interval_reads.saf".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_count_interval_reads.saf".format(MODULE_DIR)
     # drop output left over from an earlier run before regenerating it
     remove_file(observed_path)
     count_interval_reads(intervals_bed, reads_bam, observed_path)
     # the first two rows of the generated file are not part of the comparison
     # (presumably the featureCounts comment and header rows - confirm)
     observed_rows = read_many_fields(observed_path)
     observed_rows = observed_rows[2:]
     expected_rows = read_many_fields(expected_path)
     self.assertEqual(observed_rows, expected_rows)
     # remove the output and the ".summary" file that accompanies it
     remove_file(observed_path)
     remove_file("{0}.summary".format(observed_path))
 def test_fasta_from_bed1(self):
     """Compare fasta_from_bed output for input.bed on the test genome with the expected fasta."""
     intervals_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     genome_fasta = "{0}/tests/data/test_genome.fa".format(MODULE_DIR)
     genome_index = "{0}/tests/data/test_genome.fa.fai".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_fasta_from_bed1.fa".format(MODULE_DIR)
     observed_path = "{0}/tests/data/observed_fasta_from_bed1.fa".format(MODULE_DIR)
     # start without stale output and without an existing genome index
     for stale_path in (observed_path, genome_index):
         remove_file(stale_path)
     fasta_from_bed(intervals_bed, genome_fasta, observed_path)
     self.assertEqual(read_fasta(observed_path), read_fasta(expected_path))
     remove_file(observed_path)
 def test_intersect_with_bed(self):
     """Compare intersect_with_bed output for input3.bam / input.bed with the expected .sam."""
     intervals_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     reads_bam = "{0}/tests/data/input3.bam".format(MODULE_DIR)
     observed_bam = "{0}/tests/data/observed_intersect_with_bed.bam".format(MODULE_DIR)
     expected_path = "{0}/tests/data/expected_intersect_with_bed.sam".format(MODULE_DIR)
     # drop output left over from an earlier run before regenerating it
     remove_file(observed_bam)
     intersect_with_bed(reads_bam, intervals_bed, observed_bam)
     # dump the resulting .bam (header included) to text so it can be compared
     # with the expected .sam
     observed_sam = "{0}/tests/data/observed_intersect_with_bed.sam".format(MODULE_DIR)
     run_process(["samtools", "view", "-h", observed_bam], file_for_output=observed_sam)
     observed_rows = read_many_fields(observed_sam)
     expected_rows = read_many_fields(expected_path)
     self.assertEqual(expected_rows, observed_rows)
     # clean up both generated files
     for generated_path in (observed_bam, observed_sam):
         remove_file(generated_path)
def count_interval_reads(input_file,
                         input_bam,
                         output_file,
                         paired_end=False,
                         min_qual=None,
                         min_length=50):
    """
    For each interval in bed format, count the number of reads in the bam file

    Parameters
    ---------
    input_file : str
        Path to the file containing the intervals. A ".bed" file is first
        converted to a temporary ".saf" file with the same stem.
    input_bam : str
        Path to the .bam file containing the reads
    output_file : str
        Path to the output file. If it has a ".bed" extension, featureCounts
        writes to a temporary ".saf" file that is then converted to bed.
    paired_end : bool
        If True, pass "-p" to featureCounts
    min_qual : int
        If not None, passed to featureCounts as the "-Q" value
    min_length : int
        If not None, passed to featureCounts as the "-d" value

    Raises
    ---------
    Exception
        If the featureCounts executable cannot be found

    Dependencies
    ---------
    featureCounts v1.6.4

    Examples
    ---------
    >>> from bioUtilities.bam import count_interval_reads
    >>> count_interval_reads("exon_junctions.bed", "reads.bam", "exon_junction_reads.bed")
    """

    # check that featureCounts command exists
    if not shutil.which('featureCounts'):
        raise Exception('\nERROR: featureCounts must be installed.\n')

    # if input_file is in bed format, need to convert to .saf format
    # .saf format is 1-based
    if get_extension(input_file) == ".bed":
        working_input_file = "{0}.saf".format(input_file[:-4])
        # convert FROM the bed file INTO the temporary saf file
        bed_to_saf(input_file, working_input_file)
    else:
        working_input_file = input_file

    output_is_bed = get_extension(output_file) == ".bed"
    if output_is_bed:
        working_output_file = "{0}.saf".format(output_file[:-4])
    else:
        working_output_file = output_file

    # now can use featureCounts to count reads
    # this returns the file in 'saf' format
    args = ["featureCounts", "-fO", "-F", "SAF", "-g", "ID"]
    if paired_end:
        args.append("-p")
    # Compare against None so an explicit 0 is still forwarded, and convert to
    # str because command-line arguments must be strings.
    if min_qual is not None:
        args.extend(["-Q", str(min_qual)])
    if min_length is not None:
        args.extend(["-d", str(min_length)])
    args.extend(
        ["-a", working_input_file, "-o", working_output_file, input_bam])

    # now run the count
    run_process(args)

    # if the output format is bed, convert the saf output to bed
    if output_is_bed:
        # skip the first two rows of the featureCounts output
        # (presumably its comment and column-header rows - confirm)
        entries = read_many_fields(working_output_file)[2:]
        with open(output_file, "w") as outfile:
            for entry in entries:
                # NOTE(review): both start and end are shifted by one here;
                # confirm this mirrors the convention bed_to_saf uses.
                bed_fields = [
                    entry[1],
                    str(int(entry[2]) - 1),
                    str(int(entry[3]) - 1), entry[0], ".", entry[4]
                ]
                bed_fields.extend(entry[5:])
                outfile.write("{0}\n".format("\t".join(bed_fields)))

    # now clean up the temporary files, never the caller's own paths
    if working_input_file != input_file:
        remove_file(working_input_file)
    if working_output_file != output_file:
        remove_file(working_output_file)