def get_conservation(transcript_list, output_file, max_dS_threshold=None, max_omega_threshold=None):
    """
    Get the conservation for a list of sequences and only keep those that pass.

    Args:
        transcript_list (dict): dict containing the transcript id, the cds and the ortholog seqs
        output_file (str): path to output file
        max_dS_threshold (float): if set, the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, the omega threshold you wish alignments to be below
    """

    print("Getting the most conserved ortholog for each transcript...")

    temp_dir = "temp_conservation_files"
    gen.create_output_directories(temp_dir)
    # get a list of the transcript ids
    transcript_ids = list(transcript_list.keys())
    # transcript_ids = transcript_ids[:200]
    # run this linearly because it doesn't like being parallelised
    # outputs = run_conservation_check(transcript_ids, transcript_list, max_dS_threshold, max_omega_threshold, temp_dir)
    outputs = gen.run_parallel_function(transcript_ids, [transcript_list, max_dS_threshold, max_omega_threshold, temp_dir], run_conservation_check, parallel=False)
    # remove the old output file if there is one
    gen.remove_file(output_file)
    # now concat the output files
    args = ["cat"]
    args.extend(outputs)
    gen.run_process(args, file_for_output=output_file)
    gen.remove_directory(temp_dir)
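# Usage sketch (hypothetical IDs and paths; assumes each transcript id maps to its CDS followed by its
# ortholog sequences, in whatever structure run_conservation_check expects):
#
#   transcripts = {"ENST00000000001": ["ATGGCC...TGA", "ATGGCA...TGA"]}
#   get_conservation(transcripts, "results/conserved_orthologs.txt", max_dS_threshold=2, max_omega_threshold=0.5)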
def convert_bed(input_bed, output_bed=None, to_hg38=True):
    """
    Convert a bed file from hg37 to hg38 and vice versa.

    Args:
        input_bed (str): path to bed file
        output_bed (str): if set, path to the output file; otherwise the input file is overwritten
        to_hg38 (bool): if set, convert to hg38, else convert to hg37
    """

    # create a temp file if no output file is given
    if not output_bed:
        file_to_write = "temp_files/{0}.bed".format(random.random())
    else:
        file_to_write = output_bed

    entries = gen.read_many_fields(input_bed, "\t")
    with open(file_to_write, "w") as outfile:
        for entry in entries:
            if to_hg38:
                # remove the "chr" prefix (str.strip("chr") would also eat other leading/trailing c, h, r characters)
                if entry[0].startswith("chr"):
                    entry[0] = entry[0][len("chr"):]
            else:
                entry[0] = "chr{0}".format(entry[0])
            outfile.write("{0}\n".format("\t".join(entry)))

    # if no output file was given, overwrite the input file with the converted temp file
    if not output_bed:
        gen.run_process(["mv", file_to_write, input_bed])
        gen.remove_file(file_to_write)
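# Usage sketch (hypothetical paths):
#
#   # strip "chr" prefixes in place (no output file given, so the input is overwritten)
#   convert_bed("regions.bed")
#   # add "chr" prefixes and write to a new file
#   convert_bed("regions.bed", "regions_chr.bed", to_hg38=False)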
def fasta_from_intervals(bed_file, fasta_file, genome_fasta, force_strand=True, names=False):
    """
    Takes a bed file and creates a fasta file with the corresponding sequences.
    Credit: Rosina Savisaar.

    Args:
        bed_file (str): the bed file path to create fasta from
        fasta_file (str): the output fasta file path
        genome_fasta (str): the file path to the genome fasta file
        force_strand (bool): if True, extract sequences with respect to strand (bedtools -s)
        names (bool): if False, the fasta record names will be generated from the sequence coordinates;
            if True, the fasta names will correspond to whatever is in the 'name' field of the bed file
    """

    # if the index file exists, check whether the expected features are present
    genome_fasta_index = genome_fasta + '.fai'
    if os.path.exists(genome_fasta_index):
        bed_chrs = sorted(list(set([entry[0] for entry in gen.read_many_fields(bed_file, "\t")])))
        index_chrs = sorted(list(set([entry[0] for entry in gen.read_many_fields(genome_fasta_index, "\t")])))
        if not set(bed_chrs).issubset(set(index_chrs)):
            gen.remove_file(genome_fasta_index)

    bedtools_args = ["bedtools", "getfasta", "-s", "-fi", genome_fasta, "-bed", bed_file, "-fo", fasta_file]
    if not force_strand:
        del bedtools_args[2]
    if names:
        bedtools_args.append("-name")
    gen.run_process(bedtools_args)

    names, seqs = gen.read_fasta(fasta_file)
    seqs = [i.upper() for i in seqs]
    gen.write_to_fasta(names, seqs, fasta_file)
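# Usage sketch (hypothetical paths; requires bedtools on the PATH):
#
#   fasta_from_intervals("exons.bed", "exons.fasta", "genome/GRCh38.fa", names=True)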
def bam_xt_filter(input_bam, output, xt_filter=None):
    '''
    Filter a bam/sam file by XT tag.
    '''

    if not xt_filter:
        print("Please specify XT filter.")
        raise Exception

    # create output file
    if output[-4:] == ".bam":
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output

    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])

    grep_args = []
    # get header lines
    grep_args.append("^@")
    # get XT values with xt_filter
    grep_args.append("\|\tXT:A:{0}\t".format(xt_filter))
    grep_args = "".join(grep_args)
    gen.run_process(["grep", grep_args], input_to_pipe=sam_output, file_for_output=output_file)

    # if wanting to create a bam, create the bam and delete the sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)
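# Usage sketch (hypothetical paths; requires samtools on the PATH):
#
#   # keep header lines and reads whose XT tag is "U" (unique), writing a filtered bam
#   bam_xt_filter("sample.bam", "sample_xt_unique.bam", xt_filter="U")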
def run_simulations(simulation_sets, required_simulations):
    '''
    Run the simulations.
    '''

    for motif_set in simulation_sets:
        motif_file = motif_set[0]
        simulation_output_file = motif_set[1]
        stops_count_output_file = motif_set[2]

        # clean up any previous simulations
        gen.remove_file(simulation_output_file)
        gen.remove_file(stops_count_output_file)

        motif_list = gen.read_many_fields(motif_file, ",")
        # get motifs, avoiding the header if there is one
        motifs = [i[0] for i in motif_list if i[0][0] != "#"]

        # get the number of stop codons found in the real set
        real_count = se.get_stop_codon_count(motifs)

        # generate simulated motifs using the motif set
        print('Simulating {0}...'.format(motif_file))
        se.generate_motifs_sets(motifs, required_simulations, output_file=simulation_output_file)
        simulated_motif_sets = gen.read_many_fields(simulation_output_file, "|")

        with open(stops_count_output_file, "w") as output:
            output.write('id,stop_count\n')
            output.write('real,{0}\n'.format(real_count))
            for i, simulated_set in enumerate(simulated_motif_sets):
                stop_count = se.get_stop_codon_count(simulated_set)
                output.write('{0},{1}\n'.format(i + 1, stop_count))
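# Usage sketch (hypothetical paths; each simulation set is
# [motif_file, simulation_output_file, stops_count_output_file]):
#
#   sets = [["motifs/ESEs.csv", "results/ESE_simulants.txt", "results/ESE_stop_counts.csv"]]
#   run_simulations(sets, required_simulations=1000)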
def retrieve_bams_core(all_files, local_directory, host, user, password, ftp_directory, expect_string):
    '''
    Core function parallelized in retrieve_bams above.
    '''

    # connect to FTP server
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)

    # loop over .bam files
    for pos, bam_file in enumerate(all_files):
        expect_file = "temp_data/expect_file{0}.txt".format(random.random())
        start_time = time.time()
        print("{0}/{1}".format(pos, len(all_files)))
        local_bam_file = "{0}/{1}".format(local_directory, bam_file)
        # retrieve the current file if it isn't already present locally
        if not os.path.isfile(local_bam_file):
            ftp = gen.ftp_retrieve(ftp, host, user, password, ftp_directory, bam_file, destination=local_directory)
        # transfer the file to Watson
        current_expect_string = str.replace(expect_string, "foo", bam_file)
        with open(expect_file, "w") as e_file:
            e_file.write(current_expect_string)
        gen.run_process(["expect", expect_file])
        print("Transferred to Watson.")
        gen.remove_file(expect_file)
        gen.remove_file(local_bam_file)
        print("Time spent: {0} minutes.\n".format(round((time.time() - start_time) / 60, 3)))
        ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)

    ftp.quit()
def bam_nm_filter(input_bam, output, nm_less_equal_to=None):
    '''
    Filter bam reads by NM value.

    nm_less_equal_to: the NM value you wish to filter by (reads with NM <= this value are kept).
    '''

    # use an explicit None check so a threshold of 0 is accepted
    if nm_less_equal_to is None:
        print("Please provide NM filter value.")
        raise Exception

    # create output file
    if output[-4:] == ".bam":
        output_file = "{0}.sam".format(output[:-4])
    else:
        output_file = output

    sam_output = gen.run_process(["samtools", "view", "-h", input_bam])

    # create grep args and include header fields if they exist
    grep_args = ["^@"]
    # for each nm less than or equal to the threshold, create a grep arg
    for i in range(nm_less_equal_to + 1):
        grep_args.append("\|\tNM:i:{0}\t".format(i))
    grep_args = "".join(grep_args)
    gen.run_process(["grep", grep_args], input_to_pipe=sam_output, file_for_output=output_file)

    # if wanting to create a bam, create the bam and delete the sam
    if output != output_file:
        samtools_args = ["samtools", "view", "-bh", output_file]
        gen.run_process(samtools_args, file_for_output=output)
        gen.remove_file(output_file)
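# Usage sketch (hypothetical paths; requires samtools on the PATH):
#
#   # keep reads with at most 6 mismatches (NM:i:0 through NM:i:6)
#   bam_nm_filter("sample.bam", "sample_nm_filtered.bam", nm_less_equal_to=6)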
def calc_ds(aligned_sequences):
    '''
    Calculate dS for a pair of aligned coding sequences by running codeml (PAML) on the alignment.
    '''

    aligned_sequences_iupac = [Seq("".join(i), IUPAC.unambiguous_dna) for i in aligned_sequences]
    alignment = MultipleSeqAlignment([SeqRecord(aligned_sequences_iupac[0], id="seq"), SeqRecord(aligned_sequences_iupac[1], id="orth_seq")])

    gen.create_output_directories("temp_files")
    random_instance = random.random()
    temp_phylip_file = "temp_files/{0}.phy".format(random_instance)
    temp_output_file = "temp_files/{0}.out".format(random_instance)
    fo.write_to_phylip(alignment, temp_phylip_file)

    # run paml on the sequences
    working_dir = "temp_dir.{0}".format(random.random())
    paml = sequo.PAML_Functions(input_file=temp_phylip_file, output_file=temp_output_file, working_dir=working_dir)
    # run codeml and extract dS
    codeml_output = paml.run_codeml()
    ds = codeml_output["NSsites"][0]["parameters"]["dS"]

    # clean up files
    gen.remove_file(temp_phylip_file)
    gen.remove_file(temp_output_file)
    paml.cleanup()

    return ds
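# Usage sketch (toy in-frame sequences; requires PAML's codeml to be installed and on the PATH):
#
#   ds = calc_ds(["ATGAAACCCGGGTGA", "ATGAAGCCCAAATGA"])
#   print(ds)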
def test_sort_bed(self):
    infile = "test_data/bam_ops/test_sort_bed/test_intersect_bed_A_file_unsorted.bed"
    expected_file = "test_data/bam_ops/test_sort_bed/expected_test_intersect_bed_A_file.bed"
    observed_file = "test_data/bam_ops/test_sort_bed/observed_test_sort_bed.bed"
    gen.remove_file(observed_file)
    sort_bed(infile, observed_file)
    expected = gen.read_many_fields(expected_file, "\t")
    observed = gen.read_many_fields(observed_file, "\t")
    self.assertEqual(expected, observed)
def test_remove_overlaps2(self):
    in_bed = "test_data/bed_ops/test_remove_overlaps2/in.bed"
    expected = "test_data/bed_ops/test_remove_overlaps2/expected.bed"
    observed = "test_data/bed_ops/test_remove_overlaps2/observed.bed"
    gen.remove_file(observed)
    remove_overlaps(in_bed, observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_compare_PSI_haplotypes(self):
    SNPs = "test_data/bam_ops/test_compare_PSI_haplotypes/SNPs.bed"
    bam_folder = "test_data/bam_ops/test_compare_PSI_haplotypes/bam_folder"
    expected = gen.read_many_fields("test_data/bam_ops/test_compare_PSI_haplotypes/expected.txt", "\t")
    observed = "test_data/bam_ops/test_compare_PSI_haplotypes/observed.txt"
    gen.remove_file(observed)
    compare_PSI_haplotypes(SNPs, bam_folder, observed, 3)
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_filter_by_snp_type(self):
    input_snps = "test_data/snp_ops/test_filter_by_snp_type/input_snps.bed"
    expected = "test_data/snp_ops/test_filter_by_snp_type/expected_snps.bed"
    observed = "test_data/snp_ops/test_filter_by_snp_type/observed_snps.bed"
    gen.remove_file(observed)
    filter_by_snp_type(input_snps, observed, "non")
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(observed, expected)
def test_get_snp_relative_cds_position_plus_strand_split(self):
    relative_exon_position_file = gen.read_many_fields("test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/test_snp_relative_exon_position.bed", "\t")
    bed_file = "test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/full_bed.bed"
    expected = gen.read_many_fields("test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/expected_test_snp_relative_cds_position.bed", "\t")
    observed = "test_data/snp_ops/test_get_snp_relative_cds_position_plus_strand_split/observed_test_snp_relative_cds_position.bed"
    gen.remove_file(observed)
    get_snp_relative_cds_position(relative_exon_position_file, observed, bed_file)
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(observed, expected)
def test_group_flags(self):
    input_bed = "test_data/bam_ops/test_group_flags/test_tabix.bed"
    observed = "test_data/bam_ops/test_group_flags/observed_test_group_flags.bed"
    gen.remove_file(observed)
    flag_start = 3
    group_flags(input_bed, observed, flag_start)
    expected = gen.read_many_fields("test_data/bam_ops/test_group_flags/expected_test_group_flags.bed", "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_tabix(self):
    bed_file = "test_data/snp_ops/test_tabix/test_tabix_bed.txt"
    expected = gen.read_many_fields("test_data/snp_ops/test_tabix/expected_test_tabix.txt", "\t")
    observed = "test_data/snp_ops/observed_test_tabix.bed"
    gen.remove_file(observed)
    vcf = "../source_data/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.gz"
    tabix(bed_file, observed, vcf)
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(sorted(observed), sorted(expected))
def test_intersect_bed_overlap(self):
    A_file = "test_data/bam_ops/test_intersect_bed_overlap/test_intersect_bed_A_file.bed"
    B_file = "test_data/bam_ops/test_intersect_bed_overlap/test_intersect_bed_B_file.bed"
    expected_file = "test_data/bam_ops/test_intersect_bed_overlap/expected_test_intersect_bed_overlap.bed"
    observed_file = "test_data/bam_ops/test_intersect_bed_overlap/observed_test_intersect_bed_overlap.bed"
    gen.remove_file(observed_file)
    intersect_bed(A_file, B_file, output_file=observed_file, no_dups=False, overlap=0.5)
    expected = gen.read_many_fields(expected_file, "\t")
    observed = gen.read_many_fields(observed_file, "\t")
    self.assertEqual(expected, observed)
def test_intersect_bed_force_strand_hit_count(self):
    A_file = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/test_intersect_bed_A_file.bed"
    B_file = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/test_intersect_bed_B_file.bed"
    expected_file = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/expected_test_intersect_bed_force_strand_hit_count.bed"
    observed_file = "test_data/bam_ops/test_intersect_bed_force_strand_hit_count/observed_test_intersect_bed_force_strand_hit_count.bed"
    gen.remove_file(observed_file)
    intersect_bed(A_file, B_file, output_file=observed_file, no_dups=False, force_strand=True, hit_count=True)
    expected = gen.read_many_fields(expected_file, "\t")
    observed = gen.read_many_fields(observed_file, "\t")
    self.assertEqual(expected, observed)
def test_intersect_bed_intersect_bedops(self):
    A_file = "test_data/bam_ops/test_intersect_bed_intersect_bedops/test_intersect_bed_A_file.bed"
    B_file = "test_data/bam_ops/test_intersect_bed_intersect_bedops/test_intersect_bed_B_file.bed"
    expected_file = "test_data/bam_ops/test_intersect_bed_intersect_bedops/expected_test_intersect_bed_intersect_bedops.bed"
    observed_file = "test_data/bam_ops/test_intersect_bed_intersect_bedops/observed_test_intersect_bed_intersect_bedops.bed"
    gen.remove_file(observed_file)
    intersect_bed(A_file, B_file, output_file=observed_file, no_dups=False, use_bedops=True, intersect=True)
    expected = gen.read_many_fields(expected_file, "\t")
    observed = gen.read_many_fields(observed_file, "\t")
    self.assertEqual(expected, observed)
def test_filter_exon_junctions(self):
    exon_junctions_file = "test_data/bed_ops/test_filter_exon_junctions/exon_junctions.bed"
    exons_file = "test_data/bed_ops/test_filter_exon_junctions/exons.bed"
    expected = "test_data/bed_ops/test_filter_exon_junctions/expected_filter_exon_junctions.bed"
    observed = "test_data/bed_ops/test_filter_exon_junctions/observed_filter_exon_junctions.bed"
    gen.remove_file(observed)
    filter_exon_junctions(exon_junctions_file, exons_file, observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_extract_exon_junctions_window(self):
    exons = "test_data/bed_ops/test_extract_exon_junctions_window/test_extract_exon_junctions.bed"
    observed = "test_data/bed_ops/test_extract_exon_junctions_window/observed_test_extract_exon_window_junctions.bed"
    gen.remove_file(observed)
    extract_exon_junctions(exons, observed, 30)
    expected = gen.read_many_fields("test_data/bed_ops/test_extract_exon_junctions_window/expected_test_extract_exon_window_junctions.bed", "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_extract_features_cdss_stops(self):
    gtf_file = "test_data/bed_ops/test_extract_features_cdss_stops/test_extract_features.gtf"
    observed = "test_data/bed_ops/test_extract_features_cdss_stops/observed_test_extract_features_cdss_stops.bed"
    gen.remove_file(observed)
    extract_features(gtf_file, observed, ['CDS', 'stop_codon'])
    expected = gen.read_many_fields("test_data/bed_ops/test_extract_features_cdss_stops/expected_test_extract_features_cdss_stops.bed", "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(observed, expected)
def test_filter_bed_from_fasta(self):
    bed = "test_data/bed_ops/test_filter_bed_from_fasta/test_filter_bed_from_fasta.bed"
    fasta = "test_data/bed_ops/test_filter_bed_from_fasta/test_filter_bed_from_fasta.fasta"
    observed = "test_data/bed_ops/test_filter_bed_from_fasta/observed_test_filter_bed_from_fasta.bed"
    gen.remove_file(observed)
    expected = "test_data/bed_ops/test_filter_bed_from_fasta/expected_test_filter_bed_from_fasta.bed"
    filter_bed_from_fasta(bed, fasta, observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_get_descriptions(self):
    gtf = "test_data/bed_ops/test_get_descriptions/descriptions.gtf"
    names = ["ENST100", "ENST7", "ENST0003", "ENST5"]
    expected = "test_data/bed_ops/test_get_descriptions/expected_get_descriptions.txt"
    observed = "test_data/bed_ops/test_get_descriptions/observed_get_descriptions.txt"
    gen.remove_file(observed)
    get_descriptions(names, gtf, observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_check_coding(self):
    exon_file = "test_data/bed_ops/test_check_coding/exons.bed"
    CDS_file = "test_data/bed_ops/test_check_coding/CDSs.bed"
    expected = "test_data/bed_ops/test_check_coding/expected_check_coding.bed"
    observed = "test_data/bed_ops/test_check_coding/observed_check_coding.bed"
    gen.remove_file(observed)
    check_coding(exon_file, CDS_file, observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_merge_and_header(self):
    file1 = "test_data/snp_ops/test_merge_and_header/file1.txt"
    file2 = "test_data/snp_ops/test_merge_and_header/file2.txt"
    expected = "test_data/snp_ops/test_merge_and_header/expected.txt"
    observed = "test_data/snp_ops/test_merge_and_header/observed.txt"
    gen.remove_file(observed)
    merge_and_header(file1, file2, observed)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_extract_exons(self):
    gtf = "test_data/bed_ops/test_extract_exons/test_extract_exons.gtf"
    observed = "test_data/bed_ops/test_extract_exons/observed_test_extract_exons.bed"
    gen.remove_file(observed)
    extract_exons(gtf, observed)
    expected = gen.read_many_fields("test_data/bed_ops/test_extract_exons/expected_test_extract_exons.bed", "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def test_filter_fasta_intervals_from_fasta(self):
    fasta = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_fasta.fasta"
    fasta_intervals = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/test_filter_fasta_intervals_from_fasta_intervals.fasta"
    observed = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/observed_filtered_intervals.fasta"
    gen.remove_file(observed)
    expected = "test_data/bed_ops/test_filter_fasta_intervals_from_fasta/expected_filtered_intervals.fasta"
    filter_fasta_intervals_from_fasta(fasta_intervals, fasta, observed)
    expected = gen.read_fasta(expected)
    observed = gen.read_fasta(observed)
    self.assertEqual(expected, observed)
def test_ptc_locations(self):
    ptc_file = "test_data/snp_ops/test_ptc_locations/test_PTC_file.txt"
    snp_file = "test_data/snp_ops/test_ptc_locations/test_SNP_relative_exon_position.bed"
    bam_output_file = "test_data/snp_ops/test_ptc_locations/bam_analysis_output.txt"
    observed = "test_data/snp_ops/test_ptc_locations/observed_ptc_location.txt"
    expected = "test_data/snp_ops/test_ptc_locations/expected_ptc_location.txt"
    gen.remove_file(observed)
    ptc_locations(ptc_file, snp_file, bam_output_file, observed)
    observed = gen.read_many_fields(observed, "\t")
    expected = gen.read_many_fields(expected, "\t")
    self.assertEqual(observed, expected)
def test_filter_motif_SNPs_complement(self):
    motifs = "test_data/snp_ops/test_filter_motif_snps_complement/ESEs.txt"
    fasta = "test_data/snp_ops/test_filter_motif_snps_complement/CDS.fasta"
    SNPs = "test_data/snp_ops/test_filter_motif_snps_complement/snps.bed"
    expected = "test_data/snp_ops/test_filter_motif_snps_complement/expected.txt"
    observed = "test_data/snp_ops/test_filter_motif_snps_complement/observed.txt"
    gen.remove_file(observed)
    filter_motif_SNPs(fasta, SNPs, motifs, observed, complement=True)
    expected = gen.read_many_fields(expected, "\t")
    observed = gen.read_many_fields(observed, "\t")
    self.assertEqual(expected, observed)
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''

    # This is done via a temp file because that way you can specify the same file as input and output,
    # and thus overwrite the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    gen.run_process(["sort-bed", input_file_name], file_for_output=temp_file_name)
    gen.run_process(["mv", temp_file_name, output_file_name])
    gen.remove_file(temp_file_name)
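# Usage sketch (hypothetical path; requires bedops' sort-bed on the PATH; input and output may be the same file):
#
#   sort_bed("exons.bed", "exons.bed")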