def test_convert_one_to_zero(self):
    input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_convert_one_to_zero.bed".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_convert_one_to_zero.bed".format(MODULE_DIR)
    remove_file(observed_file)
    convert_one_to_zero(input_bed, observed_file)
    observed = read_many_fields(observed_file)
    expected = read_many_fields(expected_file)
    self.assertEqual(expected, observed)
    remove_file(observed_file)
def test_get_exon_junctions1(self):
    input_file = "{0}/tests/data/input_coding_exons.bed".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_get_exon_junctions1.bed".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_get_exon_junctions1.bed".format(MODULE_DIR)
    remove_file(observed_file)
    get_exon_junctions(input_file, observed_file)
    observed = read_many_fields(observed_file)
    expected = read_many_fields(expected_file)
    self.assertEqual(observed, expected)
    remove_file(observed_file)
def test_bed_to_saf(self):
    input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_bed_to_saf.saf".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_bed_to_saf.saf".format(MODULE_DIR)
    remove_file(observed_file)
    bed_to_saf(input_bed, observed_file)
    observed = read_many_fields(observed_file)
    expected = read_many_fields(expected_file)
    self.assertEqual(expected, observed)
    remove_file(observed_file)
def test_read_count(self):
    input_bed = "{0}/tests/data/input2.bed".format(MODULE_DIR)
    input_bam = "{0}/tests/data/input2.bam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_count_interval_reads.saf".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_count_interval_reads.saf".format(MODULE_DIR)
    remove_file(observed_file)
    count_interval_reads(input_bed, input_bam, observed_file)
    # skip the two featureCounts header lines before comparing
    observed = read_many_fields(observed_file)[2:]
    expected = read_many_fields(expected_file)
    self.assertEqual(observed, expected)
    remove_file(observed_file)
    remove_file("{0}.summary".format(observed_file))
def test_parse_gtf1(self):
    input_file = "{0}/tests/data/input.gtf".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_parse_gtf1.bed".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_parse_gtf1.bed".format(MODULE_DIR)
    parse_gtf(input_file, features=["exon"], protein_coding=True, output_file=observed_file)
    expected = read_many_fields(expected_file, "\t")
    observed = read_many_fields(observed_file, "\t")
    self.assertEqual(observed, expected)
    remove_file(observed_file)
def test_parse_gtf2(self):
    input_file = "{0}/tests/data/input.gtf".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_parse_gtf2.bed".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_parse_gtf2.bed".format(MODULE_DIR)
    parse_gtf(input_file, features=["exon"], transcript_ids=["ENST00000456328"], output_file=observed_file)
    expected = read_many_fields(expected_file, "\t")
    observed = read_many_fields(observed_file, "\t")
    self.assertEqual(observed, expected)
    remove_file(observed_file)
def test_mapq_filter_lower_limit(self):
    input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_mapq_filter_1.sam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_mapq_filter_1.bam".format(MODULE_DIR)
    mapq_filter(input_file, observed_file, lower_limit=200)
    expected = read_many_fields(expected_file, "\t")
    # convert the observed bam to sam with samtools so it can be compared
    # against the expected sam file
    temp_observed = "{0}/tests/data/observed_mapq_filter_1.sam".format(MODULE_DIR)
    samtools_args = ["samtools", "view", observed_file]
    run_process(samtools_args, file_for_output=temp_observed)
    observed = read_many_fields(temp_observed, "\t")
    self.assertEqual(expected, observed)
    remove_file(temp_observed)
    remove_file(observed_file)
def convert_chr_name(input_file, output_file, full_name=False, delimiter="\t"):
    """
    Given a bed file, convert the chromosome name for use between formats

    Parameters
    ---------
    input_file : str
        Path to the file to convert
    output_file : str
        Path to the output file
    full_name : bool
        If True, add the "chr" prefix to chromosome names; otherwise remove it
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import convert_chr_name
    >>> convert_chr_name("input.bed", "input_converted.bed")
    >>> convert_chr_name("input1.bed", "input_converted1.bed", full_name = True)
    """

    entries = read_many_fields(input_file, delimiter=delimiter)
    with open(output_file, "w") as outfile:
        for entry in entries:
            if full_name:
                entry[0] = "chr{0}".format(entry[0])
            else:
                # remove the prefix explicitly rather than using str.strip,
                # which would also remove stray "c", "h" or "r" characters
                if entry[0].startswith("chr"):
                    entry[0] = entry[0][3:]
            outfile.write("{0}\n".format("\t".join(entry)))
def convert_one_to_zero(input_file, output_file, delimiter="\t"):
    """
    Given a bed file in index 1 format, convert entries to index 0 format

    Parameters
    ---------
    input_file : str
        Path to the file to convert
    output_file : str
        Path to the output file
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import convert_one_to_zero
    >>> convert_one_to_zero("exon_junctions.bed", "exon_junction_index_0.bed")
    """

    entries = read_many_fields(input_file, delimiter)
    with open(output_file, "w") as outfile:
        for entry in entries:
            # shift both the start and end coordinates down by one
            entry[1] = str(int(entry[1]) - 1)
            entry[2] = str(int(entry[2]) - 1)
            outfile.write("{0}\n".format(delimiter.join(entry)))
def test_read_many_fields_comma(self):
    filepath = "{0}/tests/data/test_file_comma_delimited.txt".format(MODULE_DIR)
    expected = [["entry1.1", "entry1.2", "entry1.3"], ["entry2.1", "entry2.2", "entry2.3"]]
    observed = read_many_fields(filepath, ",")
    self.assertEqual(expected, observed)
def bed_to_saf(input_file, output_file, header=False, delimiter="\t"):
    """
    Given a bed file, convert to saf format
    See http://bioinf.wehi.edu.au/featureCounts/

    Parameters
    ---------
    input_file : str
        Path to the file to convert
    output_file : str
        Path to the output file
    header : bool
        If true, header present and ignore
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import bed_to_saf
    >>> bed_to_saf("input.bed", "output.saf")
    """

    entries = read_many_fields(input_file, delimiter=delimiter)
    # if a header exists, ignore it
    if header:
        entries = entries[1:]

    with open(output_file, "w") as outfile:
        saf_header = ["GeneID", "Chr", "Start", "End", "Strand"]
        outfile.write("{0}\n".format("\t".join(saf_header)))
        for entry in entries:
            # saf coordinates are 1-based, so shift the bed coordinates up by one
            output = [entry[3], entry[0], str(int(entry[1]) + 1), str(int(entry[2]) + 1), entry[5]]
            outfile.write("{0}\n".format("\t".join(output)))
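# A minimal, self-contained sketch of the coordinate convention used by
# bed_to_saf above. The example BED line and file paths are hypothetical,
# and the block only runs when the module is executed directly.
if __name__ == "__main__":
    import os
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    demo_bed = os.path.join(tmp_dir, "demo.bed")
    demo_saf = os.path.join(tmp_dir, "demo.saf")
    # one hypothetical 0-based BED interval: chr1:99-200 on the + strand
    with open(demo_bed, "w") as demo:
        demo.write("chr1\t99\t200\texon1\t.\t+\n")
    bed_to_saf(demo_bed, demo_saf)
    # expect the GeneID/Chr/Start/End/Strand header followed by
    # "exon1  chr1  100  201  +", given the +1 shift applied above
    with open(demo_saf) as demo:
        print(demo.read())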
def test_xt_filter(self):
    input_file = "{0}/tests/data/input.bam".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_xt_filter.sam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_xt_filter.bam".format(MODULE_DIR)
    xt_filter(input_file, observed_file, filter="XT:A:U")
    # convert bam to sam to check correct output
    temp_observed = "{0}/tests/data/observed_xt_filter.sam".format(MODULE_DIR)
    samtools_args = ["samtools", "view", observed_file]
    run_process(samtools_args, file_for_output=temp_observed)
    expected = read_many_fields(expected_file, "\t")
    observed = read_many_fields(temp_observed, "\t")
    self.assertEqual(expected, observed)
    remove_file(temp_observed)
    remove_file(observed_file)
def test_intersect_with_bed(self):
    input_bed = "{0}/tests/data/input.bed".format(MODULE_DIR)
    input_bam = "{0}/tests/data/input3.bam".format(MODULE_DIR)
    observed_file = "{0}/tests/data/observed_intersect_with_bed.bam".format(MODULE_DIR)
    expected_file = "{0}/tests/data/expected_intersect_with_bed.sam".format(MODULE_DIR)
    remove_file(observed_file)
    intersect_with_bed(input_bam, input_bed, observed_file)
    # convert the observed bam to sam, keeping the header, to compare with the expected sam
    observed_sam = "{0}/tests/data/observed_intersect_with_bed.sam".format(MODULE_DIR)
    args = ["samtools", "view", "-h", observed_file]
    run_process(args, file_for_output=observed_sam)
    observed = read_many_fields(observed_sam)
    expected = read_many_fields(expected_file)
    self.assertEqual(expected, observed)
    remove_file(observed_file)
    remove_file(observed_sam)
def get_terminal_coordinates(input_file, output_file, delimiter="\t"):
    """
    Given a bed file of sequence coordinates, return both the 5' and 3'
    terminal coordinates

    Parameters
    ---------
    input_file : str
        Path to the file to get the coordinates for
    output_file : str
        Path to the output file
    delimiter : str
        If set, the delimiter for the bed file

    Examples
    ---------
    >>> from bioUtilities.bed import get_terminal_coordinates
    >>> get_terminal_coordinates("exons.bed", "exon_terminal_nucleotides.bed")
    """

    entries = read_many_fields(input_file, delimiter)
    with open(output_file, "w") as outfile:
        for entry in entries:
            entry_id = entry[3]
            start = int(entry[1])
            end = int(entry[2])
            # first position of the interval, labelled with the .5 suffix
            five_prime_entry = [entry[0], str(start), str(start + 1), "{0}.5".format(entry_id)] + entry[4:]
            # last position of the interval, labelled with the .3 suffix
            three_prime_entry = [entry[0], str(end - 1), str(end), "{0}.3".format(entry_id)] + entry[4:]
            outfile.write("{0}\n".format(delimiter.join(five_prime_entry)))
            outfile.write("{0}\n".format(delimiter.join(three_prime_entry)))
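# A minimal, self-contained sketch of get_terminal_coordinates on one
# hypothetical interval; the paths and coordinates are made up, and the block
# only runs when the module is executed directly.
if __name__ == "__main__":
    import os
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    demo_bed = os.path.join(tmp_dir, "demo.bed")
    demo_terminal = os.path.join(tmp_dir, "demo_terminal.bed")
    with open(demo_bed, "w") as demo:
        demo.write("chr1\t100\t200\texon1\t.\t+\n")
    get_terminal_coordinates(demo_bed, demo_terminal)
    # expect two single-base intervals, the first and last positions:
    # "chr1  100  101  exon1.5  .  +" and "chr1  199  200  exon1.3  .  +"
    with open(demo_terminal) as demo:
        print(demo.read())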
# tail of a motif-density helper; the enclosing function definition and the
# loop over seq_list that sets id, motif_search, length and hit_count are not
# shown in this fragment
seq = seq_list[id]
seq = seq.upper()
hits = []
matches = re.finditer(motif_search, seq)
# record every position covered by a motif match
for hit in matches:
    hits.extend(range(hit.span()[0], hit.span()[0] + len(hit.group(1))))
hits = sorted(set(hits))
for i in hits:
    hit_count[i - length] += 1
# convert the raw counts to per-sequence densities
densities = {i: np.divide(hit_count[i], len(seq_list)) for i in hit_count}
return densities


length = 50
# get_sequences(length)
motifs = [i[0] for i in files.read_many_fields(ess_file, "\t")]
decoys = files.read_fasta(decoy_file)
decoys = {seq_id: decoys.sequences[i] for i, seq_id in enumerate(decoys.ids)}
all_seqs = files.read_fasta(output_file)
non_decoys = {seq_id: all_seqs.sequences[i] for i, seq_id in enumerate(all_seqs.ids) if seq_id not in decoys}
cds_entries = files.read_fasta(transcripts_file)
cds_entries = {seq_id: cds_entries.sequences[i] for i, seq_id in enumerate(cds_entries.ids)}

# quick check of the reading frame position immediately after a query motif
seq = "ATCAGCAGTCAG"
query = "GCA"
index = seq.index(query)
print((index + len(query)) % 3)
def get_exon_junctions(input_bed, output_file, all_exons_file=None):
    """
    Given a .bed file, return all the exon junctions

    Parameters
    ---------
    input_bed : str
        Path to the input bed file
    output_file : str
        Path to the output file
    all_exons_file : str
        If set, path to a file containing all exons, used to get junctions that
        might not appear in the main set because it may contain only coding exons

    Examples
    ---------
    >>> from bioUtilities.bed import get_exon_junctions
    >>> get_exon_junctions("coding_exons.bed", "exon_junctions.bed", all_exons_file = "all_exons.bed")
    """

    entries = read_many_fields(input_bed)

    # index each of the entries by chromosome, strand, transcript id and exon number
    entry_exons = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(dict)))
    for entry in entries:
        entry_exons[entry[0]][entry[5]][entry[3].split(".")[0]][int(entry[3].split(".")[1])] = [int(entry[1]), int(entry[2])]

    # if the file containing all exons has been given too, index it in the same way
    if all_exons_file:
        all_entries = read_many_fields(all_exons_file)
        all_exons = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(dict)))
        for entry in all_entries:
            all_exons[entry[0]][entry[5]][entry[3].split(".")[0]][int(entry[3].split(".")[1])] = [int(entry[1]), int(entry[2])]

    retained = collections.defaultdict(lambda: collections.defaultdict(dict))
    for chr in entry_exons:
        for strand in entry_exons[chr]:
            for transcript_id in entry_exons[chr][strand]:
                for exon_id in sorted(entry_exons[chr][strand][transcript_id]):
                    focal = entry_exons[chr][strand][transcript_id][exon_id]
                    # now get each downstream case
                    try:
                        downstream = entry_exons[chr][strand][transcript_id][exon_id + 1]
                        if strand == "-":
                            junction = [downstream[1], focal[0]]
                        else:
                            junction = [focal[1], downstream[0]]
                        junction_id = "{0}-{1}".format(exon_id, exon_id + 1)
                        retained[chr][transcript_id][junction_id] = [junction, strand]
                    except KeyError:
                        # these are cases where the downstream exon is a noncoding exon
                        # and the all exons file has been defined
                        if all_exons_file:
                            try:
                                downstream = all_exons[chr][strand][transcript_id][exon_id + 1]
                                if strand == "-":
                                    junction = [downstream[1], focal[0]]
                                else:
                                    junction = [focal[1], downstream[0]]
                                junction_id = "{0}-{1}".format(exon_id, exon_id + 1)
                                retained[chr][transcript_id][junction_id] = [junction, strand]
                            except KeyError:
                                pass
                    # upstream junction
                    upstream_junction_id = "{0}-{1}".format(exon_id - 1, exon_id)
                    if upstream_junction_id not in retained[chr][transcript_id] and all_exons_file:
                        # get the case where the first coding exon has a noncoding exon upstream
                        try:
                            upstream = all_exons[chr][strand][transcript_id][exon_id - 1]
                            if strand == "-":
                                junction = [focal[1], upstream[0]]
                            else:
                                junction = [upstream[1], focal[0]]
                            retained[chr][transcript_id][upstream_junction_id] = [junction, strand]
                        except KeyError:
                            pass

    # reindex the retained junctions by the first exon number
    junctions = collections.defaultdict(lambda: collections.defaultdict(dict))
    for chr in retained:
        for transcript_id in retained[chr]:
            for exon_junction in retained[chr][transcript_id]:
                exon1 = int(exon_junction.split("-")[0])
                exon2 = int(exon_junction.split("-")[1])
                info = retained[chr][transcript_id][exon_junction]
                junctions[chr][transcript_id][exon1] = [exon2] + info

    # now write each of the entries to the output file
    with open(output_file, "w") as outfile:
        for chr in sorted(junctions):
            for transcript_id in sorted(junctions[chr]):
                for exon1 in sorted(junctions[chr][transcript_id]):
                    exon2 = junctions[chr][transcript_id][exon1][0]
                    coordinates = junctions[chr][transcript_id][exon1][1]
                    strand = junctions[chr][transcript_id][exon1][2]
                    output = [chr, coordinates[0], coordinates[1], "{0}.{1}-{2}".format(transcript_id, exon1, exon2), ".", strand]
                    outfile.write("{0}\n".format("\t".join([str(i) for i in output])))
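# A minimal, self-contained sketch of get_exon_junctions on two hypothetical
# plus-strand coding exons of one transcript; the paths, transcript id and
# coordinates are made up, and the block only runs when the module is
# executed directly.
if __name__ == "__main__":
    import os
    import tempfile

    tmp_dir = tempfile.mkdtemp()
    demo_exons = os.path.join(tmp_dir, "demo_exons.bed")
    demo_junctions = os.path.join(tmp_dir, "demo_junctions.bed")
    # ids must follow the transcript_id.exon_number convention expected above
    with open(demo_exons, "w") as demo:
        demo.write("1\t100\t200\tENST0000.1\t.\t+\n")
        demo.write("1\t300\t400\tENST0000.2\t.\t+\n")
    get_exon_junctions(demo_exons, demo_junctions)
    # expect a single junction spanning the end of exon 1 and the start of
    # exon 2: "1  200  300  ENST0000.1-2  .  +"
    with open(demo_junctions) as demo:
        print(demo.read())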
import ftp_ops
from bioUtilities.files import read_many_fields
from bioUtilities.dir import create_directory

link = "ftp://ftp.ensembl.org/pub/release-96/bamcov/homo_sapiens/genebuild/"
output_dir = "human_bodymap_bams"
create_directory(output_dir)

# list of files to retrieve from the ftp site
files = [i[0] for i in read_many_fields("filelist.txt", "\t")]

host = "ftp.ensembl.org"
user = None
password = None
needed_dir = "/pub/release-96/bamcov/homo_sapiens/genebuild/"

ftp = ftp_ops.ftp_connect(host, user, password, directory=needed_dir)
for file in files:
    ftp = ftp_ops.ftp_retrieve(ftp, host, user, password, needed_dir, file, destination=output_dir)
import collections
from bioUtilities.files import fasta_from_bed, read_many_fields
from bioUtilities.bed import convert_chr_name

input_file = "source_data/alternative_5_splice_site_exons_chr.bed"
entries_file = "source_data/alternative_5_splice_site_exons.bed"
output_fasta = "results/alternative_5_splice_site_exons.fa"
genome_fasta = "../source_data/Genomes/hg38/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

# convert_chr_name(input_file, entries_file)

entries = read_many_fields(input_file)
# group the entries by chromosome and start coordinate
entry_list = collections.defaultdict(lambda: collections.defaultdict(list))
for entry in entries[1:]:
    entry_list[entry[0]][int(entry[2])].append(entry)

output_ids = collections.Counter()
with open(entries_file, "w") as outfile:
    outfile.write("#chr\tstart\tend\tid\t.\tstrand\t{0}\n".format("\t".join(entries[0][4:-1])))
    for chr in sorted(entry_list):
        print(chr)
        for start in sorted(entry_list[chr]):
            for entry in entry_list[chr][start]:
                entry_chr = entry[0].strip("chr")
                strand = entry[1]
                start = entry[2]
                end = entry[3]
def count_interval_reads(input_file, input_bam, output_file, paired_end=False, min_qual=None, min_length=50):
    """
    For each interval in bed format, count the number of reads in the bam file

    Parameters
    ---------
    input_file : str
        Path to the file containing the intervals
    input_bam : str
        Path to the .bam file containing the reads
    output_file : str
        Path to the output file
    paired_end : bool
        If true, count fragments rather than reads
    min_qual : int
        If set, the minimum mapping quality a read must have to be counted
    min_length : int
        If set, the minimum fragment length to be counted

    Dependencies
    ---------
    featureCounts v1.6.4

    Examples
    ---------
    >>> from bioUtilities.bam import count_interval_reads
    >>> count_interval_reads("exon_junctions.bed", "reads.bam", "exon_junction_reads.bed")
    """

    # check that the featureCounts command exists
    if not shutil.which('featureCounts'):
        raise Exception('\nERROR: featureCounts must be installed.\n')

    # if input_file is in bed format, it needs converting to .saf format
    # .saf format is 1-based
    if get_extension(input_file) == ".bed":
        working_input_file = "{0}.saf".format(input_file[:-4])
        bed_to_saf(input_file, working_input_file)
    else:
        working_input_file = input_file

    if get_extension(output_file) == ".bed":
        working_output_file = "{0}.saf".format(output_file[:-4])
    else:
        working_output_file = output_file

    # now use featureCounts to count the reads
    # this returns the file in 'saf' format
    args = ["featureCounts", "-fO", "-F", "SAF", "-g", "ID"]
    if paired_end:
        args.append("-p")
    if min_qual:
        args.extend(["-Q", str(min_qual)])
    if min_length:
        args.extend(["-d", str(min_length)])
    args.extend(["-a", working_input_file, "-o", working_output_file, input_bam])
    # now run the count
    run_process(args)

    # if the output format is bed, convert the saf output to bed
    if get_extension(output_file) == ".bed":
        # skip the two featureCounts header lines
        entries = read_many_fields(working_output_file)[2:]
        with open(output_file, "w") as outfile:
            for entry in entries:
                output = [entry[1], str(int(entry[2]) - 1), str(int(entry[3]) - 1), entry[0], ".", entry[4]]
                output.extend(entry[5:])
                outfile.write("{0}\n".format("\t".join(output)))

    # now clean up the intermediate files
    if working_input_file != input_file:
        remove_file(working_input_file)
    if working_output_file != output_file:
        remove_file(working_output_file)
import collections
import bioUtilities.files as files
import bioUtilities.seq as seq
import re

gtf = "../source_data/Genomes/hg38/Homo_sapiens.GRCh38.94.gtf"
# read the gtf, skipping the header lines
entries = [i for i in files.read_many_fields(gtf, "\t") if not i[0].startswith('#')]

genes = collections.defaultdict(lambda: collections.defaultdict(list))
for i in entries[:5000]:
    feature = i[2]
    print(i)
    if feature == "exon":
        info = i[-1]
        try:
            gene_id = re.findall('gene_id "(.*?)"', info)[0]
            transcript_id = re.findall('transcript_id "(.*?)"', info)[0]
            biotype = re.findall('transcript_biotype "(.*?)"', info)[0]
            exon_number = int(re.findall('exon_number "(.*?)"', info)[0])
            if biotype == "protein_coding":
                # print(gene_id, biotype)
                # print(info)
                genes[gene_id][transcript_id].append(exon_number)