def test_convert_one_to_zero(self):
     input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_convert_one_to_zero.bed".format(MODULE_DIR)
     expected_file = "{0}/tests/data/observed_convert_one_to_zero.bed".format(MODULE_DIR)
     remove_file(observed_file)
     convert_one_to_zero(input_bed, observed_file)
     observed = read_many_fields(observed_file)
     expected = read_many_fields(expected_file)
     self.assertEqual(expected, observed)
     remove_file(observed_file)
Example #2
 def test_get_exon_junctions1(self):
     input_file = "{0}/tests/data/input_coding_exons.bed".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_get_exon_junctions1.bed".format(
         MODULE_DIR)
     observed_file = "{0}/tests/data/observed_get_exon_junctions1.bed".format(
         MODULE_DIR)
     remove_file(observed_file)
     get_exon_junctions(input_file, observed_file)
     observed = read_many_fields(observed_file)
     expected = read_many_fields(expected_file)
     self.assertEqual(observed, expected)
     remove_file(observed_file)
Example #3
 def test_bed_to_saf(self):
     input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_bed_to_saf.saf".format(
         MODULE_DIR)
     expected_file = "{0}/tests/data/expected_bed_to_saf.saf".format(
         MODULE_DIR)
     remove_file(observed_file)
     bed_to_saf(input_bed, observed_file)
     observed = read_many_fields(observed_file)
     expected = read_many_fields(expected_file)
     self.assertEqual(expected, observed)
     remove_file(observed_file)
Example #4
 def test_read_count(self):
     input_bed = "{0}/tests/data/input2.bed".format(MODULE_DIR)
     input_bam = "{0}/tests/data/input2.bam".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_count_interval_reads.saf".format(
         MODULE_DIR)
     expected_file = "{0}/tests/data/expected_count_interval_reads.saf".format(
         MODULE_DIR)
     remove_file(observed_file)
     count_interval_reads(input_bed, input_bam, observed_file)
     observed = read_many_fields(observed_file)[2:]
     expected = read_many_fields(expected_file)
     self.assertEqual(observed, expected)
     remove_file(observed_file)
     remove_file("{0}.summary".format(observed_file))
Example #5
 def test_parse_gtf1(self):
     input_file = "{0}/tests/data/input.gtf".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_parse_gtf1.bed".format(
         MODULE_DIR)
     observed_file = "{0}/tests/data/observed_parse_gtf1.bed".format(
         MODULE_DIR)
     parse_gtf(input_file,
               features=["exon"],
               protein_coding=True,
               output_file=observed_file)
     expected = read_many_fields(expected_file, "\t")
     observed = read_many_fields(observed_file, "\t")
     self.assertEqual(observed, expected)
     remove_file(observed_file)
Example #6
 def test_parse_gtf2(self):
     input_file = "{0}/tests/data/input.gtf".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_parse_gtf2.bed".format(
         MODULE_DIR)
     observed_file = "{0}/tests/data/observed_parse_gtf2.bed".format(
         MODULE_DIR)
     parse_gtf(input_file,
               features=["exon"],
               transcript_ids=["ENST00000456328"],
               output_file=observed_file)
     expected = read_many_fields(expected_file, "\t")
     observed = read_many_fields(observed_file, "\t")
     self.assertEqual(observed, expected)
     remove_file(observed_file)
Example #7
 def test_mapq_filter_lower_limit(self):
     input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_mapq_filter_1.sam".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_mapq_filter_1.bam".format(MODULE_DIR)
     mapq_filter(input_file, observed_file, lower_limit = 200)
     expected = read_many_fields(expected_file, "\t")
     # convert bam to sam to check the output is correct
     # use samtools view to extract the alignments in sam format
     temp_observed = "{0}/tests/data/observed_mapq_filter_1.sam".format(MODULE_DIR)
     samtools_args = ["samtools", "view", observed_file]
     run_process(samtools_args, file_for_output = temp_observed)
     observed = read_many_fields(temp_observed, "\t")
     self.assertEqual(expected, observed)
     remove_file(temp_observed)
     remove_file(observed_file)
Example #8
def convert_chr_name(input_file, output_file, full_name=False, delimiter="\t"):
    """
    Given a bed file, convert the chromosome name between Ensembl style ("1")
    and UCSC style ("chr1")

    Parameters
    ---------
    input_file : str
        Path to the file to convert
    output_file : str
        Path to the output file
    full_name : bool
        If True, prepend "chr" to the chromosome name; otherwise remove it
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import convert_chr_name
    >>> convert_chr_name("input.bed", "input_converted.bed")
    >>> convert_chr_name("input1.bed", "input_converted1.bed", full_name = True)
    """

    entries = read_many_fields(input_file, delimiter=delimiter)

    with open(output_file, "w") as outfile:
        for entry in entries:
            if full_name:
                entry[0] = "chr{0}".format(entry[0])
            else:
                # strip("chr") would remove any of the characters c, h and r
                # from both ends of the name, so remove the literal prefix instead
                entry[0] = entry[0][3:] if entry[0].startswith("chr") else entry[0]
            outfile.write("{0}\n".format(delimiter.join(entry)))
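For illustration, a minimal sketch of the renaming rule applied to a single hypothetical entry (the values are made up):

# hypothetical BED fields
entry = ["1", "11868", "12227", "ENST00000456328.1", ".", "+"]
print("chr{0}".format(entry[0]))  # full_name=True gives "chr1"
name = "chr1"
print(name[3:] if name.startswith("chr") else name)  # full_name=False gives "1"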
Example #9
def convert_one_to_zero(input_file, output_file, delimiter="\t"):
    """
    Given a bed file in index 1 format, convert entries to index 0 format

    Parameters
    ---------
    input_file : str
        Path to the file to convert
    output_file : str
        Path to the output file
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import convert_one_to_zero
    >>> convert_one_to_zero("exon_junctions.bed", "exon_junction_index_0.bed")
    """

    entries = read_many_fields(input_file, delimiter)
    with open(output_file, "w") as outfile:
        for entry in entries:
            entry[1] = str(int(entry[1]) - 1)
            entry[2] = str(int(entry[2]) - 1)
            outfile.write("{0}\n".format(delimiter.join(entry)))
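A worked example of the shift on a hypothetical entry; note that the function as written subtracts one from both the start and the end field:

# in:  chr1    11869   12228   exon.1  .   +
# out: chr1    11868   12227   exon.1  .   +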
Example #10
 def test_read_many_fields_comma(self):
     filepath = "{0}/tests/data/test_file_comma_delimited.txt".format(
         MODULE_DIR)
     expected = [["entry1.1", "entry1.2", "entry1.3"],
                 ["entry2.1", "entry2.2", "entry2.3"]]
     observed = read_many_fields(filepath, ",")
     self.assertEqual(expected, observed)
Example #11
def bed_to_saf(input_file, output_file, header = False, delimiter = "\t"):
    """
    Given a bed file, convert to saf format
    See http://bioinf.wehi.edu.au/featureCounts/

    Parameters
    ---------
    input_file : str
        Path to the file to convert
    output_file : str
        Path to the output file
    header : bool
        If True, the input file has a header row, which is skipped
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import bed_to_saf
    >>> bed_to_saf("input.bed", "output.saf")
    """

    entries = read_many_fields(input_file, delimiter = delimiter)
    # if header exists, ignore it
    if header:
        entries = entries[1:]

    with open(output_file, "w") as outfile:
        header = ["GeneID", "Chr", "Start", "End", "Strand"]
        outfile.write("{0}\n".format("\t".join(header)))
        for entry in entries:
            # .saf coordinates are 1-based, so shift both start and end up by one
            output = [entry[3], entry[0], str(int(entry[1])+1), str(int(entry[2])+1), entry[5]]
            outfile.write("{0}\n".format("\t".join(output)))
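As an illustration of the coordinate shift, a hypothetical input line and the .saf line the function above would write for it:

# input .bed line (chr, start, end, name, score, strand):
#   chr1    11868   12227   ENST00000456328.1   .   +
# output .saf line (GeneID, Chr, Start, End, Strand):
#   ENST00000456328.1   chr1    11869   12228   +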
Example #12
 def test_xt_filter(self):
     input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
     expected_file = "{0}/tests/data/expected_xt_filter.sam".format(
         MODULE_DIR)
     observed_file = "{0}/tests/data/observed_xt_filter.bam".format(
         MODULE_DIR)
     xt_filter(input_file, observed_file, filter="XT:A:U")
     # convert bam to sam to check correct output
     temp_observed = "{0}/tests/data/observed_xt_filter.sam".format(
         MODULE_DIR)
     samtools_args = ["samtools", "view", observed_file]
     run_process(samtools_args, file_for_output=temp_observed)
     expected = read_many_fields(expected_file, "\t")
     observed = read_many_fields(temp_observed, "\t")
     self.assertEqual(expected, observed)
     remove_file(temp_observed)
     remove_file(observed_file)
Example #13
 def test_intersect_with_bed(self):
     input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
     input_bam = "{0}/tests/data/input3.bam".format(MODULE_DIR)
     observed_file = "{0}/tests/data/observed_intersect_with_bed.bam".format(
         MODULE_DIR)
     expected_file = "{0}/tests/data/expected_intersect_with_bed.sam".format(
         MODULE_DIR)
     remove_file(observed_file)
     intersect_with_bed(input_bam, input_bed, observed_file)
     observed_sam = "{0}/tests/data/observed_intersect_with_bed.sam".format(
         MODULE_DIR)
     args = ["samtools", "view", "-h", observed_file]
     run_process(args, file_for_output=observed_sam)
     observed = read_many_fields(observed_sam)
     expected = read_many_fields(expected_file)
     self.assertEqual(expected, observed)
     remove_file(observed_file)
     remove_file(observed_sam)
Example #14
def get_terminal_coordinates(input_file, output_file, delimiter="\t"):
    """
    Given a bed file of sequence coordinates, write both the 5' and 3' terminal
    coordinates to the output file

    Parameters
    ---------
    input_file : str
        Path to the file to get the coordinates for
    output_file : str
        Path to the output file
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import get_terminal_coordinates
    >>> get_terminal_coordinates("exons.bed", "exon_terminal_nucleotides.bed")
    """

    entries = read_many_fields(input_file, delimiter)

    with open(output_file, "w") as outfile:
        for entry in entries:
            entry_id = entry[3]
            start = int(entry[1])
            end = int(entry[2])
            five_prime_entry = [
                entry[0],
                str(start),
                str(start + 1), "{0}.5".format(entry_id)
            ] + entry[4:]
            three_prime_entry = [
                entry[0], str(end - 1),
                str(end), "{0}.3".format(entry_id)
            ] + entry[4:]
            outfile.write("{0}\n".format(delimiter.join(five_prime_entry)))
            outfile.write("{0}\n".format(delimiter.join(three_prime_entry)))
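For intuition, a hypothetical exon and the two single-base entries the function writes for it; the .5 entry covers the first base of the interval and the .3 entry the last:

# in:   chr1    100     200     exon.1      .   +
# out:  chr1    100     101     exon.1.5    .   +
#       chr1    199     200     exon.1.3    .   +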
Example #15
        seq = seq_list[id]
        seq = seq.upper()
        hits = []
        matches = re.finditer(motif_search, seq)
        for hit in matches:
            hits.extend(range(hit.span()[0], hit.span()[0] + len(hit.group(1))))
        hits = sorted(list(set(hits)))
        for i in hits:
            hit_count[i-length] += 1
    densities = {i: np.divide(hit_count[i], len(seq_list)) for i in hit_count}
    return densities
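# For intuition, a self-contained toy version of the per-position hit counting
# above; the sequences, motif and variable names here are hypothetical
import re
import numpy as np

toy_seqs = ["ATGCAGCATG", "GCATGCAGCA"]
toy_hit_count = {}
for s in toy_seqs:
    # lookahead capture so that overlapping motif matches are also counted
    for hit in re.finditer(r"(?=(GCA))", s):
        for pos in range(hit.start(), hit.start() + len(hit.group(1))):
            toy_hit_count[pos] = toy_hit_count.get(pos, 0) + 1
toy_densities = {i: np.divide(toy_hit_count[i], len(toy_seqs)) for i in toy_hit_count}
print(toy_densities)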


length = 50
# get_sequences(length)

motifs = [i[0] for i in files.read_many_fields(ess_file, "\t")]

decoys = files.read_fasta(decoy_file)
decoys = {id: decoys.sequences[i] for i, id in enumerate(decoys.ids)}

all_seqs = files.read_fasta(output_file)
non_decoys = {id: all_seqs.sequences[i] for i, id in enumerate(all_seqs.ids) if id not in decoys}


cds_entries = files.read_fasta(transcripts_file)
cds_entries = {id: cds_entries.sequences[i] for i, id in enumerate(cds_entries.ids)}
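# The id -> sequence mapping above is built three times; a small helper along
# these lines (hypothetical, relying on the same .ids and .sequences attributes
# used above) would avoid the repetition:
def fasta_to_dict(fasta):
    # map each record id to its sequence
    return {record_id: fasta.sequences[i] for i, record_id in enumerate(fasta.ids)}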

seq = "ATCAGCAGTCAG"
query = "GCA"
index = seq.index(query)
print((index + len(query)) % 3)
Example #16
def get_exon_junctions(input_bed, output_file, all_exons_file = None):
    """
    Given a .bed file, write all the exon junctions to the output file

    Parameters
    ---------
    input_bed : str
        Path to the input bed file
    output_file : str
        Path to the output file
    all_exons_file : str
        If set, path to a file containing all exons; used to recover junctions
        that would otherwise be missed, for example when the main set contains
        only the coding exons

    Examples
    ---------
    >>> from bioUtilities.bed import get_exon_junctions
    >>> get_exon_junctions("coding_exons.bed", "exon_junctions.bed", all_exons_file = "all_exons.bed")
    """


    entries = read_many_fields(input_bed)

    # index each of the entries
    entry_exons = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict())))
    for entry in entries:
        entry_exons[entry[0]][entry[5]][entry[3].split(".")[0]][int(entry[3].split(".")[1])] = [int(entry[1]), int(entry[2])]

    # if a file containing all exons has also been given
    if all_exons_file:
        all_entries = read_many_fields(all_exons_file)
        all_exons = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict())))
        for entry in all_entries:
            all_exons[entry[0]][entry[5]][entry[3].split(".")[0]][int(entry[3].split(".")[1])] = [int(entry[1]), int(entry[2])]

    retained = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict()))

    for chr in entry_exons:
        for strand in entry_exons[chr]:
            for transcript_id in entry_exons[chr][strand]:
                for exon_id in sorted(entry_exons[chr][strand][transcript_id]):
                    focal = entry_exons[chr][strand][transcript_id][exon_id]

                    # now get each downstream exon, where present
                    try:
                        downstream = entry_exons[chr][strand][transcript_id][exon_id + 1]
                        if strand == "-":
                            junction = [downstream[1], focal[0]]
                        else:
                            junction = [focal[1], downstream[0]]
                        junction_id = "{0}-{1}".format(exon_id, exon_id + 1)
                        retained[chr][transcript_id][junction_id] = [junction, strand]
                    except KeyError:
                        try:
                            # these are cases where the downstream exon is noncoding
                            # and the all-exons file has been given
                            if all_exons_file:
                                downstream = all_exons[chr][strand][transcript_id][exon_id + 1]
                                if strand == "-":
                                    junction = [downstream[1], focal[0]]
                                else:
                                    junction = [focal[1], downstream[0]]
                                junction_id = "{0}-{1}".format(exon_id, exon_id + 1)
                                retained[chr][transcript_id][junction_id] = [junction, strand]
                        except KeyError:
                            pass

                    # upstream junction
                    upstream_junction_id = "{0}-{1}".format(exon_id - 1, exon_id)
                    if all_exons_file and upstream_junction_id not in retained[chr][transcript_id]:
                        # get the case where the first coding exon has a noncoding exon upstream
                        try:
                            upstream = all_exons[chr][strand][transcript_id][exon_id - 1]
                            if strand == "-":
                                junction = [focal[1], upstream[0]]
                            else:
                                junction = [upstream[1], focal[0]]
                            retained[chr][transcript_id][upstream_junction_id] = [junction, strand]
                        except KeyError:
                            pass

    junctions = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict()))
    for chr in retained:
        for transcript_id in retained[chr]:
            for exon_junction in retained[chr][transcript_id]:
                exon1 = int(exon_junction.split("-")[0])
                exon2 = int(exon_junction.split("-")[1])
                info = retained[chr][transcript_id][exon_junction]
                junctions[chr][transcript_id][exon1] = [exon2] + info

    # now write each of the entries to the output file
    with open(output_file, "w") as outfile:
        for chr in sorted(junctions):
            for transcript_id in sorted(junctions[chr]):
                for exon1 in sorted(junctions[chr][transcript_id]):
                    exon2, coordinates, strand = junctions[chr][transcript_id][exon1]
                    output = [chr, coordinates[0], coordinates[1], "{0}.{1}-{2}".format(transcript_id, exon1, exon2), ".",  strand]
                    outfile.write("{0}\n".format("\t".join([str(i) for i in output])))
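For intuition, two hypothetical plus-strand coding exons and the junction entry the function above writes for them:

# exon 1:  chr1    100     200     ENST.1  .   +
# exon 2:  chr1    300     400     ENST.2  .   +
# junction written to the output file:
#   chr1    200     300     ENST.1-2    .   +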
Example #17
import ftp_ops
from bioUtilities.files import read_many_fields
from bioUtilities.dir import create_directory

link = "ftp://ftp.ensembl.org/pub/release-96/bamcov/homo_sapiens/genebuild/"

output_dir = "human_bodymap_bams"
create_directory(output_dir)

files = [i[0] for i in read_many_fields("filelist.txt", "\t")]

host = "ftp.ensembl.org"
user = None
password = None

needed_dir = "/pub/release-96/bamcov/homo_sapiens/genebuild/"

ftp = ftp_ops.ftp_connect(host, user, password, directory = needed_dir)

for file in files:
    ftp = ftp_ops.ftp_retrieve(ftp, host, user, password, needed_dir, file, destination = output_dir)
Example #18
import collections

from bioUtilities.files import fasta_from_bed, read_many_fields
from bioUtilities.bed import convert_chr_name

input_file = "source_data/alternative_5_splice_site_exons_chr.bed"
entries_file = "source_data/alternative_5_splice_site_exons.bed"
output_fasta = "results/alternative_5_splice_site_exons.fa"
genome_fasta = "../source_data/Genomes/hg38/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

# convert_chr_name(input_file, entries_file)

entries = read_many_fields(input_file)


entry_list = collections.defaultdict(lambda: collections.defaultdict(lambda: []))
for entry in entries[1:]:
    entry_list[entry[0]][int(entry[2])].append(entry)

output_ids = collections.Counter()

with open(entries_file, "w") as outfile:
    outfile.write("#chr\tstart\tend\tid\t.\tstrand\t{0}\n".format("\t".join(entries[0][4:-1])))
    for chr in sorted(entry_list):
        print(chr)
        for start in sorted(entry_list[chr]):
            for entry in entry_list[chr][start]:
                entry_chr = entry[0][3:] if entry[0].startswith("chr") else entry[0]
                strand = entry[1]
                start = entry[2]
                end = entry[3]
Example #19
def count_interval_reads(input_file,
                         input_bam,
                         output_file,
                         paired_end=False,
                         min_qual=None,
                         min_length=50):
    """
    For each interval in bed format, count the number of reads in the bam file

    Parameters
    ---------
    input_file : str
        Path to the file containing the intervals
    input_bam : str
        Path to the .bam file containing the reads
    output_file : str
        Path to the output file
    paired_end : bool
        If True, count the reads as paired-end fragments
    min_qual : int
        If set, the minimum mapping quality a read needs to be counted
    min_length : int
        The minimum read length required for a read to be counted

    Dependencies
    ---------
    featureCounts v1.6.4

    Examples
    ---------
    >>> from bioUtilities.bam import count_interval_reads
    >>> count_interval_reads("exon_junctions.bed", "reads.bam", "exon_junction_reads.bed")
    """

    # check that featureCounts command exists
    if not shutil.which('featureCounts'):
        raise Exception('\nERROR: featureCounts must be installed.\n')

    # if input_file is in bed format, need to convert to .saf format
    # the .saf format is 1-based
    if get_extension(input_file) == ".bed":
        working_input_file = "{0}.saf".format(input_file[:-4])
        bed_to_saf(input_file, working_input_file)
    else:
        working_input_file = input_file

    if get_extension(output_file) == ".bed":
        working_output_file = "{0}.saf".format(output_file[:-4])
    else:
        working_output_file = output_file

    # now use featureCounts to count the reads
    # this writes the output in .saf format
    args = ["featureCounts", "-fO", "-F", "SAF", "-g", "ID"]
    if paired_end:
        args.append("-p")
    if min_qual:
        args.extend(["-Q", str(min_qual)])
    if min_length:
        args.extend(["-d", str(min_length)])
    args.extend(
        ["-a", working_input_file, "-o", working_output_file, input_bam])

    # now run the count
    run_process(args)

    # if the output format is bed, convert the saf output to bed
    if get_extension(output_file) == ".bed":
        entries = read_many_fields(working_output_file)[2:]
        with open(output_file, "w") as outfile:
            for entry in entries:
                output = [
                    entry[1],
                    str(int(entry[2]) - 1),
                    str(int(entry[3]) - 1), entry[0], ".", entry[4]
                ]
                output.extend(entry[5:])
                outfile.write("{0}\n".format("\t".join(output)))

    # now clean up the files
    if working_input_file != input_file:
        remove_file(working_input_file)
    if working_output_file != output_file:
        remove_file(working_output_file)
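For reference, with the default arguments the assembled featureCounts call resembles the following (file names hypothetical):

# featureCounts -fO -F SAF -g ID -d 50 -a intervals.saf -o counts.saf reads.bam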
Example #20
import collections
import bioUtilities.files as files
import bioUtilities.seq as seq
import re

gtf = "../source_data/Genomes/hg38/Homo_sapiens.GRCh38.94.gtf"

entries = [i for i in files.read_many_fields(gtf, "\t") if not i[0].startswith('#')]


genes = collections.defaultdict(lambda: collections.defaultdict(lambda: []))


for i in entries[:5000]:
    feature_type = i[2]

    print(i)

    if feature_type == "exon":
        info = i[-1]
        try:
            gene_id = re.findall('gene_id "(.*?)"', info)[0]
            transcript_id = re.findall('transcript_id "(.*?)"', info)[0]
            biotype = re.findall('transcript_biotype "(.*?)"', info)[0]
            exon_number = int(re.findall('exon_number "(.*?)"', info)[0])

            if biotype == "protein_coding":
                # print(gene_id, biotype)
                # print(info)

                genes[gene_id][transcript_id].append(exon_number)
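# A short illustrative follow-up, not part of the original script: once genes
# is populated, the number of exons seen per transcript can be summarised
for gene_id in genes:
    for transcript_id in genes[gene_id]:
        exon_numbers = genes[gene_id][transcript_id]
        print(gene_id, transcript_id, "exons seen:", len(exon_numbers))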