def main():
    """Simulate a motif set and/or exon hexamers to test for stop codon depletion.

    Parses the command line with ``gen.parse_arguments`` and then:

    * with ``motif_simulation`` set, creates a sub-directory of ``output_dir``
      named after the motif file and hands the motif file plus the two output
      paths to ``run_simulations``;
    * with ``exon_simulation`` set, calls ``run_exon_simulation`` on
      ``<results_dir>_CDS_intervals.fasta``.

    Raises:
        Exception: if ``required_simulations`` is missing or falsy.
    """
    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    argument_names = [
        "motif_file",
        "output_dir",
        "results_dir",
        "required_simulations",
        "motif_simulation",
        "exon_simulation",
    ]
    args = gen.parse_arguments(description, argument_names, flags = [4,5], ints = [3])

    motif_file = args.motif_file
    output_dir = args.output_dir
    results_dir = args.results_dir
    required_simulations = args.required_simulations
    motif_simulation = args.motif_simulation
    exon_simulation = args.exon_simulation

    if not required_simulations:
        print('You must specify the number of simulations you require.')
        raise Exception

    gen.create_output_directories(output_dir)

    if motif_simulation:
        # the motif set is named after the motif file: drop the final
        # extension first, then keep only the last path component
        path_without_extension = ".".join(motif_file.split('.')[:-1])
        motif_set_name = path_without_extension.split('/')[-1]

        # each motif set gets its own output directory
        motif_output_dir = "{0}/{1}".format(output_dir, motif_set_name)
        gen.create_output_directories(motif_output_dir)

        simulated_motifs_output = "{0}/simulations_{1}.txt".format(motif_output_dir, required_simulations)
        output_file = "{0}/stop_counts_{1}.txt".format(motif_output_dir, required_simulations)

        # a single simulation set: motif file, simulants file, results file
        simulation_sets = [[motif_file, simulated_motifs_output, output_file]]
        run_simulations(simulation_sets, required_simulations)

    if exon_simulation:
        exon_hexamer_simulation = "{0}/region_hexamer_sim.csv".format(output_dir)
        exon_fasta = "{0}_CDS_intervals.fasta".format(results_dir)
        run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, exon_hexamer_simulation)
def main():
    """Run the motif-set tests selected on the command line.

    Depending on the flags given, this generates dinucleotide-matched motif
    controls, calculates stop densities for the motif file, compares stop
    codon densities against the controls, and/or compares densities of other
    codon combinations against the controls. Outputs are written beneath
    ``<output_directory>/motif_tests``.

    Raises:
        Exception: if the stop codon density simulation is requested with
            more simulations than there are entries in the controls directory.
    """
    arguments = [
        "output_directory",
        "motif_file",
        "simulations",
        "controls_directory",
        "exons_fasta",
        "motifs_stop_density",
        "motif_stop_codon_densities_sim",
        "motif_codon_densities",
        "motif_densities_exon_dinucleotides",
        "generate_motif_controls",
        "match_density",
        "match_subs",
    ]
    description = ""
    args = gen.parse_arguments(description, arguments, opt_flags=[2,3,4], flags = [5,6,7,8,9,10,11])

    output_directory = args.output_directory
    motif_file = args.motif_file
    controls_directory = args.controls_directory

    # the simulation count is optional; convert to an integer only when given
    simulations = int(args.simulations) if args.simulations else args.simulations

    # everything is written beneath one global output directory
    global_output_directory = "{0}/motif_tests".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # generate the motif controls
    if args.generate_motif_controls:
        simopc.generate_motif_dinucleotide_controls(
            motif_file,
            simulations,
            output_directory,
            match_density = args.match_density,
            match_subs = args.match_subs,
        )

    # stop densities for the motif file
    if args.motifs_stop_density:
        mtop.calc_stop_densities(motif_file)

    # stop codon densities in the motif set compared with the controls
    if args.motif_stop_codon_densities_sim:
        local_output_directory = "{0}/motif_stop_density_simulations".format(global_output_directory)
        gen.create_output_directories(local_output_directory)

        motif_set_name = motif_file.split("/")[-1].split(".")[0]
        output_file = "{0}/{1}_stop_codon_densities.csv".format(local_output_directory, motif_set_name)

        # refuse to run when there are fewer controls than requested simulations
        available_controls = len(os.listdir(controls_directory))
        if simulations > available_controls:
            print("Please create more simulants...")
            raise Exception

        mtop.motif_stop_codon_densities(motif_file, controls_directory, simulations, output_file)

    # densities of other codon combinations in the motif set
    if args.motif_codon_densities:
        local_output_directory = "{0}/codon_combination_densities".format(global_output_directory)
        gen.create_output_directories(local_output_directory)

        # the codon combinations file is generated once and then reused
        codon_combinations_file = "{0}/codon_combinations.txt".format(local_output_directory)
        if not os.path.isfile(codon_combinations_file):
            seqo.generate_all_motif_combinations(stops, codon_combinations_file)

        motif_set_name = motif_file.split("/")[-1].split(".")[0]
        output_file = "{0}/{1}_codon_combination_densities.csv".format(local_output_directory, motif_set_name)

        # too few controls: discard the existing ones and regenerate
        available_controls = len(os.listdir(controls_directory))
        if simulations > available_controls:
            gen.remove_directory(controls_directory)
            simopc.generate_motif_controls(motif_file, simulations, controls_directory, match_density = False)

        mtop.motif_codon_densities(motif_file, codon_combinations_file, controls_directory, simulations, output_file)
def main():
    """Compare expression of transcripts with true PTCs against pseudo-PTC simulants.

    Reads the expression table, obtains the expression values for the true
    PTCs file and for every file in the pseudo-PTCs directory via
    ``PTCs_to_expression``, stacks the simulant values row-wise and passes
    both to ``display_comparison`` for each of the four expression
    parameters (columns 0-3).

    Raises:
        IndexError: if the pseudo-PTCs directory contains no files.
    """
    description = "Compare expression parameters for transcripts that contain true PTCs vs transcripts that contain pseudo-PTCs."
    args = gen.parse_arguments(description, ["PTCs_file", "pseudo_PTCs_directory", "expression_file"])
    PTCs_file = args.PTCs_file
    pseudo_PTCs_directory = args.pseudo_PTCs_directory
    expression_file = args.expression_file

    expression = gen.read_many_fields(expression_file, "\t")

    # expression parameters for the true PTCs
    true_values = PTCs_to_expression(PTCs_file, expression)

    # the same for each of the simulant PTC files
    sim_files = os.listdir(pseudo_PTCs_directory)
    if not sim_files:
        # previously this surfaced as a bare IndexError from sim_files[0]
        raise IndexError("No simulant PTC files found in {0}".format(pseudo_PTCs_directory))

    per_file_values = [
        PTCs_to_expression("{0}/{1}".format(pseudo_PTCs_directory, sim_file), expression)
        for sim_file in sim_files
    ]

    # Stack once instead of re-stacking inside the loop, which copied the
    # growing array on every iteration. A lone simulant is passed through
    # unstacked, as it was before, so its shape is unchanged.
    if len(per_file_values) == 1:
        sim_values = per_file_values[0]
    else:
        sim_values = np.vstack(per_file_values)

    # label -> column index of the expression parameter
    comparisons = [
        ("breadth", 0),
        ("maximum TPM", 1),
        ("median TPM", 2),
        ("median TPM (if expressed)", 3),
    ]
    for label, column in comparisons:
        display_comparison(label, true_values, sim_values, column)
def main():
    """Run the miscellaneous tests selected on the command line.

    Each of the flags ``get_filtered``, ``large_effect_ese_hits_simulation``,
    ``large_effect_locations`` and ``large_effect_lengths`` triggers the
    corresponding function; the other parsed arguments are not used here.
    """
    description = "Miscellaneous tests."
    argument_names = [
        "results_prefix",
        "disease_output_dir",
        "ese_file",
        "get_filtered",
        "get_info",
        "disease_locations_chisquare",
        "large_effect_ese_hits_simulation",
        "large_effect_locations",
        "large_effect_lengths",
    ]
    args = gen.parse_arguments(description, argument_names, flags=[3, 4, 5, 6, 7, 8])

    if args.get_filtered:
        get_filtered_exons()

    # tests on the large effect cases
    if args.large_effect_ese_hits_simulation:
        ese_hits_simulation(args.ese_file)

    if args.large_effect_locations:
        large_effect_locations_sim()

    if args.large_effect_lengths:
        large_effects_lengths_sim()
def main():
    """Simulate the chosen motif sets to test for stop codon depletion.

    Builds the list of requested motif sets (all of ``ese_sets`` with
    ``all_sets``, otherwise one per set flag), creates an output directory
    per set beneath ``output_data`` and passes one entry per simulation to
    ``run_simulations``. With ``split_RBPs`` each set yields two entries,
    tagged ``1``/"Positive ND" and ``-1``/"Negative ND".

    Raises:
        Exception: if ``split_RBPs`` is given without ``filter_RBPs``, if
            ``required_simulations`` is missing, or if no motif set is chosen.
    """
    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    argument_names = [
        "required_simulations",
        "all_sets",
        "ESR",
        "Ke",
        "PESE",
        "RESCUE",
        "INT3",
        "RBP_motifs",
        "filter_RBPs",
        "split_RBPs",
    ]
    args = gen.parse_arguments(description, argument_names, flags=[1, 2, 3, 4, 5, 6, 7, 8, 9])

    required_simulations = args.required_simulations
    RBP_motifs = args.RBP_motifs
    filter_RBPs = args.filter_RBPs
    split_rbps = args.split_RBPs

    if split_rbps and not filter_RBPs:
        print('You must specify the filtered RBPs if you want to split by ND.')
        raise Exception

    if not required_simulations:
        print('You must specify the number of simulations you require.')
        raise Exception

    # all results live beneath a fixed output directory
    output_directory = "output_data"
    gen.create_directory(output_directory)

    # work out which motif sets were requested
    if args.all_sets:
        required_sets = list(ese_sets)
    else:
        flag_to_set = [
            (args.ESR, "ESR"),
            (args.Ke, "Ke400_ESEs"),
            (args.PESE, "PESE"),
            (args.RESCUE, "RESCUE"),
            (args.INT3, "INT3"),
            (RBP_motifs and not filter_RBPs, "RBP_motifs"),
            (RBP_motifs and filter_RBPs, "RBP_motifs_filtered"),
        ]
        required_sets = [set_name for chosen, set_name in flag_to_set if chosen]

    # nothing chosen: list the available sets and stop
    if not required_sets:
        print("\nPlease choose a motif set to analyse:\n")
        for set_name in sorted(ese_sets):
            print("--{0}".format(set_name))
        print("\n")
        raise Exception

    # build one entry per simulation that has to be run
    simulation_sets = []
    for ese_set in required_sets:
        # the filtered RBP motifs share the RBP_motifs directory
        dir_name = "RBP_motifs" if ese_set == "RBP_motifs_filtered" else ese_set

        motif_output_directory = "{0}/{1}".format(output_directory, dir_name)
        gen.create_directory(motif_output_directory)

        if not split_rbps:
            # one simulants file and one results file per set
            simulated_set_output = "{0}/{1}_simulants_{2}.txt".format(
                motif_output_directory, dir_name, required_simulations)
            output_file = "{0}/{1}_stop_counts_{2}.csv".format(
                motif_output_directory, dir_name, required_simulations)
            simulation_sets.append([ese_set, simulated_set_output, output_file])
            continue

        # splitting by ND needs two lots of outputs: positive, then negative
        simulated_set_output_pos_nd = "{0}/{1}_simulants_pos_nd_{2}.txt".format(
            motif_output_directory, dir_name, required_simulations)
        output_file_pos_nd = "{0}/{1}_stop_counts_pos_nd_{2}.csv".format(
            motif_output_directory, dir_name, required_simulations)
        simulation_sets.append(
            [ese_set, simulated_set_output_pos_nd, output_file_pos_nd, 1, "Positive ND"])

        simulated_set_output_neg_nd = "{0}/{1}_simulants_neg_nd_{2}.txt".format(
            motif_output_directory, dir_name, required_simulations)
        output_file_neg_nd = "{0}/{1}_stop_counts_neg_nd_{2}.csv".format(
            motif_output_directory, dir_name, required_simulations)
        simulation_sets.append(
            [ese_set, simulated_set_output_neg_nd, output_file_neg_nd, -1, "Negative ND"])

    run_simulations(simulation_sets, int(required_simulations))
def main():
    """Run the lincRNA analyses selected on the command line.

    Every boolean flag enables one independent analysis (sequence extraction,
    GC content, ORF length simulations, the stop density simulations, excess
    tests, motif hit tests, ...). Test outputs are written beneath
    ``<output_directory>/tests/lincrna``. When ``families_file`` is given,
    most simulations use the "grouped" output names and are repeated 10 times
    instead of once.

    Several parsed flags (``density_sim``, ``get_exon_dint_controls``,
    ``get_intron_dint_controls``, ``exon_region_density``,
    ``compare_stop_density``, ``motif_nd``, ``single_exon``,
    ``motif_overlap``, ``motif_overlap_density``) have no branch here.
    """
    arguments = [
        "input_bed", "input_fasta", "output_directory", "input_fasta2",
        "input_file", "required_simulations", "motif_file", "families_file",
        "output_prefix", "controls_dir", "extract_sequences", "calc_gc",
        "density_sim", "get_exon_dint_controls", "get_intron_dint_controls",
        "exon_region_density", "compare_stop_density", "sim_orf_lengths",
        "sim_orf_lengths_masked", "sim_stop_density",
        "sim_stop_density_introns", "sim_stop_density_within_genes",
        "sim_stop_density_removed_motifs",
        "sim_stop_density_removed_motifs_sim_seqs", "sim_stop_density_diff",
        "exon_intron_density", "motif_nd", "excess_test", "single_exon",
        "motif_overlap", "motif_overlap_density", "clean_alignments",
        "seq_hits_linc", "upstream_atg", "excess_length_thresholds",
        "density_regions", "extract_second", "seq_no"
    ]
    description = "Container for analysis on lincRNAs"
    # indices 10-36 are boolean flags; 3-9 and 37 are optional values
    args = gen.parse_arguments(description,
                               arguments,
                               flags=list(range(10, 37)),
                               opt_flags=[3, 4, 5, 6, 7, 8, 9, 37])

    input_bed = args.input_bed
    input_fasta = args.input_fasta
    output_directory = args.output_directory
    input_fasta2 = args.input_fasta2
    input_file = args.input_file
    motif_file = args.motif_file
    families_file = args.families_file
    controls_dir = args.controls_dir

    # make required simulations an int
    required_simulations = int(
        args.required_simulations) if args.required_simulations else None
    # a non-empty output prefix gets a trailing underscore
    output_prefix = args.output_prefix + "_" if args.output_prefix else ""
    # seq_no is converted (so a malformed value still fails here) but no
    # branch below uses it
    seq_no = int(args.seq_no) if args.seq_no else None

    # create the output directory for the tests
    global_output_directory = "{0}/tests/lincrna".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # get the sequences
    if args.extract_sequences:
        lincRNA_single_exon_bed = "{0}/lincrna/lincRNA.single_exon.bed".format(
            output_directory)
        lincRNA_single_exon_fasta = "{0}/lincrna/lincRNA.single_exon.fasta".format(
            output_directory)
        lincRNA_single_exon_families = "{0}/lincrna/lincRNA.single_exon_families.bed".format(
            output_directory)
        lincRNA_multi_exon_bed = "{0}/lincrna/lincRNA.multi_exon.bed".format(
            output_directory)
        lincRNA_multi_exon_intron_bed = "{0}/lincrna/lincRNA.multi_exon.introns.bed".format(
            output_directory)
        lincRNA_multi_exon_fasta = "{0}/lincrna/lincRNA.multi_exon.fasta".format(
            output_directory)
        lincRNA_multi_exon_intron_fasta = "{0}/lincrna/lincRNA.multi_exon.introns.fasta".format(
            output_directory)
        lincRNA_multi_exon_families = "{0}/lincrna/lincRNA.multi_exon_families.bed".format(
            output_directory)
        cont.extract_lincRNA_sequences(input_bed,
                                       input_fasta,
                                       lincRNA_single_exon_bed,
                                       lincRNA_multi_exon_bed,
                                       lincRNA_single_exon_fasta,
                                       lincRNA_multi_exon_fasta,
                                       lincRNA_multi_exon_intron_bed,
                                       lincRNA_multi_exon_intron_fasta,
                                       lincRNA_single_exon_families,
                                       lincRNA_multi_exon_families,
                                       clean_run=None)

    # clean the alignments to get in usable form
    # might need this
    if args.clean_alignments:
        # NOTE(review): these paths are passed with a literal "{0}" because
        # they are never formatted; the intended directory is not visible
        # here, so they are left unchanged - confirm and add .format(...).
        output_exon_file = "{0}/clean_exon_alignments.fasta"
        output_intron_file = "{0}/clean_intron_alignments.fasta"
        ltests.clean_alignments(input_bed, input_fasta, output_exon_file,
                                output_intron_file)

    if args.calc_gc:
        output_file = "{0}/{1}_gc.csv".format(global_output_directory,
                                              output_prefix)
        ltests.calc_gc(input_fasta, output_file, families_file=families_file)

    # orf length test
    if args.sim_orf_lengths:
        sim_orf_length_output_file = "{0}/{1}sim_orf_lengths.csv".format(
            global_output_directory, output_prefix)
        if families_file:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs_grouped.csv".format(
                global_output_directory, output_prefix)
        else:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs.csv".format(
                global_output_directory, output_prefix)
        # run the test
        simopc.sim_orf_length(input_fasta, required_simulations,
                              sim_orf_length_output_file)
        ltests.process_length_sim(sim_orf_length_output_file,
                                  sim_orf_length_z_file,
                                  families_file=families_file)

    if args.sim_orf_lengths_masked:
        masked_output_file = "{0}_{1}_masked.csv".format(
            input_file.split(".")[0],
            motif_file.split("/")[-1].split(".")[0])
        # run the test
        simopc.sim_orf_length_masked(input_fasta,
                                     required_simulations,
                                     motif_file,
                                     input_file,
                                     controls_dir,
                                     masked_output_file,
                                     families_file=families_file)

    # stop density test
    if args.sim_stop_density:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_output_dir)
        for run in range(runs):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # within genes
    if args.sim_stop_density_within_genes:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_within_gene_output_dir)
        for run in range(runs):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_within_gene_output_dir, run + 1)
            ltests.sim_stop_density_within_genes(
                input_fasta,
                output_file,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_within_gene_outputs(
            sim_stop_density_within_gene_output_dir,
            sim_stop_density_within_gene_output_file)

    # stop density test in the introns
    if args.sim_stop_density_introns:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            # unlike the other grouped simulations, this one runs only once
            runs = 1
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_output_dir)
        for run in range(runs):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file,
                                    introns=True,
                                    input_fasta2=input_fasta2)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # remove motifs and test
    if args.sim_stop_density_removed_motifs:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        motif_name = motif_file.split("/")[-1].split(".")[0]
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs".format(
                local_output_directory, output_prefix, motif_name)
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs.csv".format(
                local_output_directory, output_prefix, motif_name)
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix, motif_name)
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix, motif_name)
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)
        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs(
                input_fasta,
                run_output_file,
                motif_file,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    # remove motifs and test within seqs
    if args.sim_stop_density_removed_motifs_sim_seqs:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        motif_name = motif_file.split("/")[-1].split(".")[0]
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim".format(
                local_output_directory, output_prefix, motif_name)
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim.csv".format(
                local_output_directory, output_prefix, motif_name)
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim".format(
                local_output_directory, output_prefix, motif_name)
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim.csv".format(
                local_output_directory, output_prefix, motif_name)
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)
        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs_seq_sim(
                input_fasta,
                run_output_file,
                motif_file,
                controls_dir,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    if args.sim_stop_density_diff:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        motif_name = motif_file.split("/")[-1].split(".")[0]
        # NOTE(review): these outputs go to the global directory, not the
        # stop_density directory created just above - confirm this is intended.
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_diff_grouped_families".format(
                global_output_directory, output_prefix, motif_name)
            sim_output_file = "{0}/{1}_{2}_stop_density_stop_density_diff_grouped_families.csv".format(
                global_output_directory, output_prefix, motif_name)
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_stop_density_diff_all_genes".format(
                global_output_directory, output_prefix, motif_name)
            sim_output_file = "{0}/{1}_{2}_stop_density_stop_density_diff_all_genes.csv".format(
                global_output_directory, output_prefix, motif_name)
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)
        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_diff(input_fasta,
                                         run_output_file,
                                         motif_file,
                                         controls_dir,
                                         simulations=int(required_simulations),
                                         families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_diffs(sim_output_dir,
                                              sim_output_file,
                                              greater_than=False)

    # get density in exons and introns
    if args.exon_intron_density:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/exon_intron_stop_density.csv".format(
            local_output_directory)
        ltests.exon_intron_stop_density(input_fasta,
                                        input_fasta2,
                                        output_file,
                                        families_file=families_file)

    # test whether there is an excess in flanks
    if args.excess_test:
        gen.check_files_exists([input_fasta, motif_file])
        # local output directory
        local_output_directory = "{0}/stop_excesses".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        motif_name = motif_file.split("/")[-1].split(".")[0]
        # if the families file exists, group by family
        if families_file:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses_grouped.csv".format(
                local_output_directory, motif_name)
        else:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses.csv".format(
                local_output_directory, motif_name)
        # run the test
        ltests.excess_test(input_fasta,
                           motif_file,
                           excess_test_output_file,
                           simulations=required_simulations,
                           families_file=families_file)

    # upstream from the atg
    if args.upstream_atg:
        output_file = "{0}/stop_density/upstream_atg_stop_density.csv".format(
            global_output_directory)
        ltests.upstream_atg(input_fasta,
                            output_file,
                            simulations=int(required_simulations),
                            families_file=families_file)

    # calculate the density in the different regions
    if args.density_regions:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/stop_density_regions_chisq.csv".format(
            local_output_directory)
        output_file1 = "{0}/stop_density_regions1.csv".format(
            local_output_directory)
        output_file2 = "{0}/stop_density_regions_per_seq.csv".format(
            local_output_directory)
        ltests.density_regions(input_fasta,
                               motif_file,
                               output_file,
                               output_file1,
                               output_file2,
                               required_simulations=required_simulations,
                               families_file=families_file)

    # test hits to seqs
    if args.seq_hits_linc:
        local_output_dir = "{0}/ese_hits".format(global_output_directory)
        motif_name = motif_file.split("/")[-1].split(".")[0]
        if output_prefix:
            tests_output_dir = "{0}/{1}_{2}".format(local_output_dir,
                                                    output_prefix[:-1],
                                                    motif_name)
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix, motif_name)
        else:
            tests_output_dir = "{0}/{1}_{2}".format(local_output_dir,
                                                    output_prefix, motif_name)
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix[:-1], motif_name)
        gen.create_output_directories(tests_output_dir)

        runs = 10
        for run in range(runs):
            if output_prefix:
                output_file = "{0}/{1}_{2}_hits_{3}.csv".format(
                    tests_output_dir, output_prefix, motif_name, run + 1)
            else:
                output_file = "{0}/{1}_hits_{2}.csv".format(
                    tests_output_dir, motif_name, run + 1)
            mto.calc_seq_hits_linc(input_fasta,
                                   output_file,
                                   motif_file,
                                   controls_dir,
                                   required_simulations=required_simulations,
                                   families_file=families_file)
        mto.process_seq_hits_linc(tests_output_dir, final_output_file)

    if args.excess_length_thresholds:
        local_output_dir = "{0}/orf_length_thresholds".format(
            global_output_directory)
        gen.create_output_directories(local_output_dir)
        # Pass the directory created just above. The original passed
        # `local_output_directory`, which is unbound unless an earlier
        # branch ran (and then points at that branch's directory).
        ltests.orf_exceed_length_threshold(
            input_fasta,
            local_output_dir,
            required_simulations=required_simulations,
            families_file=families_file)

    # extract second set
    if args.extract_second:
        local_output_dir = "{0}/genome_sequences/lincrna/{1}".format(
            output_directory, output_prefix)
        lmisco.extract_second_seqs(input_bed, input_file, input_fasta,
                                   local_output_dir)
def main():
    """Run the lincRNA sequence-preparation steps selected on the command line.

    All intermediate files live in ``working_directory`` under fixed names.
    Each flag enables one step: extracting exon/intron coordinates from the
    input bed file, splitting exons into single- and multi-exon sets,
    extracting exon and intron sequences, building transcripts, and grouping
    the multi-exon transcripts into families. Steps that need an earlier
    step's output check that the file exists first.
    """
    arguments = [
        "working_directory", "output_directory", "genome_path", "input_bed",
        "input_fasta", "clean_run", "extract_exon_intron_bed",
        "extract_exons", "extract_introns", "sort_by_exon_number",
        "build_transcripts", "extract_families", "orf_length_sim"
    ]
    description = "Wrapper for miscellaneous operations on lincRNA"
    # Flags are argument indices 5-12. The original list also named index
    # 13, which does not exist (there are 13 arguments, indices 0-12).
    args = gen.parse_arguments(description,
                               arguments,
                               flags=[5, 6, 7, 8, 9, 10, 11, 12],
                               opt_flags=[2, 3, 4])

    working_directory = args.working_directory
    output_directory = args.output_directory
    genome_path = args.genome_path
    input_bed = args.input_bed
    clean_run = args.clean_run

    # create the directories
    gen.create_output_directories(working_directory)
    gen.create_output_directories(output_directory)

    # file paths
    exons_bed = "{0}/exons.bed".format(working_directory)
    single_exons_bed = "{0}/single_exons.bed".format(working_directory)
    multi_exons_bed = "{0}/multi_exons.bed".format(working_directory)
    exons_fasta = "{0}/exons.fasta".format(working_directory)
    single_exons_fasta = "{0}/single_exons.fasta".format(working_directory)
    multi_exons_fasta = "{0}/multi_exons.fasta".format(working_directory)
    introns_bed = "{0}/introns.bed".format(working_directory)
    introns_fasta = "{0}/introns.fasta".format(working_directory)
    transcript_sequences_fasta = "{0}/transcript_sequences.fasta".format(working_directory)
    multi_exon_transcript_sequences_fasta = "{0}/multi_exon_transcript_sequences.fasta".format(working_directory)
    multi_exon_blast_file = "{0}/multi_exons_blast_all_against_all.csv".format(working_directory)
    multi_exon_blast_database = "{0}/multi_exon_blast_all_against_all".format(working_directory)
    multi_exon_families_file = "{0}/multi_exon_families.txt".format(working_directory)

    # create the exons and introns files from bed
    if args.extract_exon_intron_bed:
        # copy the main file to the folder
        gen.copy_file(input_bed, "{0}/{1}".format(working_directory, input_bed.split("/")[-1]))
        # extract the features
        lmo.extract_bed_coordinates_block_format(input_bed, exons_bed, introns_bed)

    # split the exons into single and multi exon files
    if args.sort_by_exon_number:
        gen.check_files_exists([exons_bed])
        lmo.sort_by_exon_number(exons_bed, single_exons_bed, multi_exons_bed)

    # get exons
    if args.extract_exons:
        gen.check_files_exists([exons_bed])
        fo.fasta_from_intervals(exons_bed, exons_fasta, genome_path, names=True)
        # if the single exons bed file exists, get just the single exon sequences
        if os.path.isfile(single_exons_bed):
            lmo.sort_fasta_by_bed(single_exons_bed, exons_fasta, single_exons_fasta)
        # if the multi exons bed file exists, get just the multi exon sequences
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, exons_fasta, multi_exons_fasta)

    # get introns
    if args.extract_introns:
        gen.check_files_exists([introns_bed])
        fo.fasta_from_intervals(introns_bed, introns_fasta, genome_path, names=True)

    # build transcripts
    if args.build_transcripts:
        gen.check_files_exists([exons_fasta])
        lmo.build_transcripts(exons_fasta, transcript_sequences_fasta)
        # if the multi exons bed file exists, get just the multi exon sequences
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, transcript_sequences_fasta, multi_exon_transcript_sequences_fasta)

    # now group into paralogous families
    if args.extract_families:
        gen.check_files_exists([multi_exon_transcript_sequences_fasta])
        cons.filter_families(multi_exon_transcript_sequences_fasta,
                             multi_exon_blast_file,
                             multi_exon_families_file,
                             database_path=multi_exon_blast_database,
                             clean_run=clean_run)
def main():
    """Analyse disease SNPs, running only the steps switched on by flags.

    Steps communicate through files in the output directory and files that
    share the results prefix, so a later step can be run on its own once the
    files it reads have been produced.
    """
    description = "Look at disease snps."
    arguments = [
        "disease_snps_file", "output_directory", "results_prefix", "simulations",
        "ese_file", "intersect_snps", "get_relative_positions", "get_snp_status",
        "get_info", "simulate_ptc_location", "get_possible_ptc_locations",
        "required_simulations", "get_overlaps", "intersect_ptcs", "compare_ptcs",
        "get_introns", "compare_distances", "clinvar_ptc_locations",
        "location_simulation", "exclude_cpg", "ese_hit_simulation", "only_disease",
        "only_kgenomes", "only_ese", "get_unique_ptcs", "get_unique_rel_pos",
        "excess_test", "disease_locations_chisquare",
    ]
    args = gen.parse_arguments(description, arguments, flags=list(range(5, 28)), ints=[3])

    disease_snps_file = args.disease_snps_file
    output_directory = args.output_directory
    results_prefix = args.results_prefix
    simulations = args.simulations
    ese_file = args.ese_file
    exclude_cpg = args.exclude_cpg
    only_ese = args.only_ese

    def in_output(file_name):
        # path of a file written to the output directory
        return "{0}/{1}".format(output_directory, file_name)

    def with_prefix(suffix):
        # path of a file that shares the results prefix
        return "{0}{1}".format(results_prefix, suffix)

    if simulations and not isinstance(simulations, int):
        print("\nERROR: Please provide the correct number for simulations.\n")
        raise Exception

    # create the output directory if it doesnt already exist
    gen.create_output_directories(output_directory)

    # the vcf has to come with its index, e.g. ./source_data/clinvar_20180429.vcf.gz(.tbi)
    disease_snps_index_file = "{0}.tbi".format(disease_snps_file)
    if not (os.path.isfile(disease_snps_file) and os.path.isfile(disease_snps_index_file)):
        print("\nERROR: Please provide the required disease SNPs file(s).\n")
        raise Exception

    # intersect the coding exons with the disease snps
    exon_bed = with_prefix("_coding_exons.bed")
    disease_snp_intersect_file_vcf = in_output("disease_snp_intersect.vcf")
    disease_snp_intersect_file_bed = in_output("disease_snp_intersect.bed")
    if args.intersect_snps:
        print("Intersecting snps with exons")
        so.intersect_snps_parallel(exon_bed, disease_snps_file, disease_snp_intersect_file_vcf)
        so.intersect_vcf_to_bed(exon_bed, disease_snp_intersect_file_vcf, disease_snp_intersect_file_bed, change_names=True)

    # get relative positions of the snps in cds and exons
    full_bed = with_prefix("_CDS.bed")
    disease_snps_relative_exon_positions = in_output("disease_snp_relative_exon_positions.bed")
    disease_snps_relative_cds_positions = in_output("disease_snp_relative_cds_positions.bed")
    if args.get_relative_positions:
        print("Getting snp relative positions...")
        so.get_snp_relative_exon_position(disease_snp_intersect_file_bed, disease_snps_relative_exon_positions)
        # the cds position function takes the parsed rows rather than a path
        exon_position_rows = gen.read_many_fields(disease_snps_relative_exon_positions, "\t")
        so.get_snp_relative_cds_position(exon_position_rows, disease_snps_relative_cds_positions, full_bed)

    # get the change status of the snps to check them
    cds_fasta = with_prefix("_CDS.fasta")
    disease_ptcs_file = in_output("disease_ptcs.txt")
    disease_other_file = in_output("disease_other_snps.txt")
    if args.get_snp_status:
        print("Getting snp status...")
        so.get_snp_change_status(disease_snps_relative_cds_positions, cds_fasta, disease_ptcs_file, disease_other_file)

    # get intersect between the clinvar ptcs and 1000 genomes ptcs
    ptc_file = with_prefix("_ptc_file.txt")
    ptc_intersect_file = in_output("ptc_intersect.bed")
    if args.intersect_ptcs:
        # both inputs are rewritten into temporary files that are removed afterwards
        temp_disease_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(disease_ptcs_file, temp_disease_ptc_file)
        temp_k_genomes_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(ptc_file, temp_k_genomes_ptc_file, header=True)
        bao.intersect_bed(
            temp_k_genomes_ptc_file,
            temp_disease_ptc_file,
            write_both=True,
            no_dups=False,
            output_file=ptc_intersect_file,
        )
        for temp_file in (temp_disease_ptc_file, temp_k_genomes_ptc_file):
            gen.remove_file(temp_file)

    # get a list of ptcs unique to each dataset
    unique_ptcs = in_output("disease_ptcs_no_intersect.bed")
    unique_ptcs_kgenomes = in_output("kgenomes_ptcs_no_intersect.bed")
    if args.get_unique_ptcs:
        dso.get_unique_ptcs(disease_ptcs_file, ptc_file, ptc_intersect_file, unique_ptcs, unique_ptcs_kgenomes)

    # get the relative positions of the ptcs unique to each dataset
    unique_ptcs_rel_pos_file = in_output("disease_ptcs_no_intersect_rel_pos.bed")
    kgenomes_relative_positions = with_prefix("_PTC_relative_exon_positions.bed")
    kgenomes_unique_ptcs_rel_pos_file = in_output("kgenomes_ptcs_no_intersect_rel_pos.bed")
    if args.get_unique_rel_pos:
        dso.get_unique_rel_pos(
            unique_ptcs,
            disease_snps_relative_exon_positions,
            unique_ptcs_kgenomes,
            kgenomes_relative_positions,
            unique_ptcs_rel_pos_file,
            kgenomes_unique_ptcs_rel_pos_file,
        )

    # the ese file name without directories or extension, used in output names
    ese_file_name = ese_file.split('/')[-1].split('.')[0]
    # get the coding exons fasta file path
    coding_exons_fasta = with_prefix("_coding_exons.fasta")

    # simulation picking random reference allele matched simulants
    clinvar_location_simulation_file = in_output("clinvar_ptc_location_simulation.csv")
    clinvar_location_simulation_ese_overlap_file = in_output(
        "clinvar_ptc_location_simulation_{0}_ese_overlaps.csv".format(ese_file_name))
    kgenomes_location_simulation_file = in_output("1000_genomes_simulations.csv")
    kgenomes_location_simulation_ese_overlap_file = in_output("1000_genomes_simulations_ese_overlaps.csv")

    # the only_* flags each switch off the other dataset
    run_disease = not args.only_kgenomes
    run_kgenomes = not args.only_disease

    if args.location_simulation:
        if run_disease:
            print('Running ptc location simulation on disease PTCs...')
            dso.ptc_location_simulation(
                unique_ptcs_rel_pos_file, coding_exons_fasta, simulations,
                clinvar_location_simulation_file,
                clinvar_location_simulation_ese_overlap_file,
                ese_file, only_ese, exclude_cpg)
        if run_kgenomes:
            print('Running ptc location simulation on 1000 genomes PTCs...')
            dso.ptc_location_simulation(
                kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations,
                kgenomes_location_simulation_file,
                kgenomes_location_simulation_ese_overlap_file,
                ese_file, only_ese, exclude_cpg)

    window_start = 3
    window_end = 69
    window_tag = "{0}_{1}".format(window_start, window_end)
    clinvar_ese_hit_simulation_file = in_output(
        "clinvar_ese_hit_simulation_{0}_{1}.csv".format(window_tag, ese_file_name))
    kgenomes_ese_hit_simulation_file = in_output(
        "1000_genomes_ese_hit_simulation_{0}_{1}.csv".format(window_tag, ese_file_name))

    # do a simulation picking only sites from within the region
    if args.ese_hit_simulation:
        if run_disease:
            print("Simulating ESE hits on the {0}-{1} region for disease PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(
                unique_ptcs_rel_pos_file, coding_exons_fasta, simulations,
                clinvar_ese_hit_simulation_file, ese_file,
                window_start, window_end, exclude_cpg)
        if run_kgenomes:
            print("Simulating ESE hits on the {0}-{1} region for 1000 genomes PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(
                kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations,
                kgenomes_ese_hit_simulation_file, ese_file,
                window_start, window_end, exclude_cpg)

    excess_test_file = in_output("clinvar_ptc_{0}_excesses.csv".format(window_tag))
    if args.excess_test:
        dso.excess_test(unique_ptcs_rel_pos_file, coding_exons_fasta, excess_test_file)

    location_test_file = in_output("clinvar_locations_chisquare.csv")
    if args.disease_locations_chisquare:
        dso.disease_ptc_location_test(unique_ptcs_rel_pos_file, coding_exons_fasta, location_test_file)
def main():
    """Build a per-transcript expression table from CAGE peak data.

    Steps:
      1. extract transcript coordinates from the GTF at a fixed relative path;
      2. write a 1001 bp region centred on the TSS of each transcript that is
         named in ``clean_fasta`` to ``promoters_file_name``;
      3. intersect the CAGE peaks with those regions;
      4. average the "|"-separated expression values over all peaks of a
         transcript, one mean per position in the peak name;
      5. write breadth, maximum, median and median-among-expressed to
         ``<out_prefix>_FANTOM_expression_per_transcript.txt``.

    Raises:
        RuntimeError: if a transcript of interest has a strand other than
            "+" or "-".
    """
    description = "Take an output file from prepare_FANTOM.py and make a file with the expression data for each gene."
    args = gen.parse_arguments(
        description,
        ["clean_fasta", "promoters_file_name", "cage_file_name", "out_prefix", "TPM_threshold"],
        ints=[4])
    clean_fasta = args.clean_fasta
    promoters_file_name = args.promoters_file_name
    cage_file_name = args.cage_file_name
    out_prefix = args.out_prefix
    TPM_threshold = args.TPM_threshold

    #extract transcript coordinates
    transcripts_file = "{0}_transcripts_clean.bed".format(out_prefix)
    bo.extract_features("../source_data/Homo_sapiens.GRCh37.87.gtf", transcripts_file, ["transcript"])

    #get the names of the transcripts you're interested
    names = gen.read_fasta(clean_fasta)[0]
    #a set makes the per-line membership test constant time
    names_of_interest = set(names)

    #write the coordinates of the promoter regions of those transcripts to file
    with open(promoters_file_name, "w") as out_file, open(transcripts_file, "r") as in_file:
        for line in in_file:
            parsed = line.rstrip("\n").split("\t")
            #parse out the transcript name
            name = parsed[3].split(".")[0]
            #skip transcripts that aren't among your transcripts of interest
            if name not in names_of_interest:
                continue
            #determine the coordinates of a 1001 bp region centered on the TSS (the supposed promoter region)
            strand = parsed[5]
            if strand == "+":
                tss = int(parsed[1])
                region_start, region_end = tss - 500, tss + 500 + 1
            elif strand == "-":
                tss = int(parsed[2])
                region_start, region_end = tss - 500 - 1, tss + 500
            else:
                #the exception used to be created without being raised, so the
                #loop carried on with the previous transcript's coordinates
                raise RuntimeError("Invalid strand information!")
            current_line = ["chr" + parsed[0], region_start, region_end, name, ".", strand]
            out_file.write("\t".join([str(i) for i in current_line]))
            out_file.write("\n")

    #check which CAGE peaks overlap which promoters
    overlapping_peaks_file = "{0}_FANTOM_overlap_peaks.bed".format(out_prefix)
    bmo.intersect_bed(cage_file_name, promoters_file_name, output_file=overlapping_peaks_file,
                      force_strand=True, write_both=True, no_dups=False)

    #for each transcript, get all overlapping peaks
    #(store only the expression information)
    peaks_dict = {name: [] for name in names}
    with open(overlapping_peaks_file, "r") as peaks:
        for peak in peaks:
            peak = peak.split("\t")
            #column 10 holds the promoter (transcript) name, column 4 the peak name
            peaks_dict[peak[9]].append(peak[3])

    #for each transcript,
    #store the mean TPM within each tissue (averaged over the different peaks
    #associated to that transcript)
    mean_dict = {}
    np.set_printoptions(suppress=True)
    for name, peak_values in peaks_dict.items():
        if peak_values:
            current_mat = np.array([[float(j) for j in i.split("|")] for i in peak_values])
            mean_dict[name] = np.mean(current_mat, axis=0)

    #calculate expression parameters
    final_dict = {}
    for gene, means in mean_dict.items():
        expressed_values = [i for i in means if i > TPM_threshold]
        fraction = len(expressed_values) / len(means)
        maximum = np.max(means)
        median_expr = np.median(means)
        #nothing above the threshold: report nan without asking numpy for the
        #median of an empty list
        median_if_expressed = np.median(expressed_values) if expressed_values else np.nan
        final_dict[gene] = [fraction, maximum, median_expr, median_if_expressed]

    output_file_name = "{0}_FANTOM_expression_per_transcript.txt".format(out_prefix)
    with open(output_file_name, "w") as out_handle:
        out_handle.write("gene\tbreadth\tmax\tmedian\tmedian_expr\n")
        #every value in final_dict is a list, so no None check is needed
        for gene in sorted(final_dict):
            out_handle.write("\t".join([gene] + [str(j) for j in final_dict[gene]]))
            out_handle.write("\n")
def main():
    """Run the PTC / exon skipping pipeline steps enabled on the command line.

    Output file names are derived from ``out_prefix``. The prefix is extended
    along the way (out-of-frame analysis, motif filtering), so files named
    after that point use the extended prefix.

    NOTE(review): block nesting was reconstructed from a copy of this script
    whose indentation had been lost - confirm against the original layout.
    """
    description = "Check whether PTCs are associated with greater rates of exon skipping."
    argument_names = [
        "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
        "out_prefix", "bam_analysis_folder", "number_of_simulations",
        "simulation_output_folder", "motif_file", "filter_genome_data",
        "get_SNPs", "process_bams", "simulate_ptc_snps", "motif_complement",
        "overwrite_intersect", "use_old_sims", "out_of_frame",
        "simulate_ptcs_with_monomorphic", "generate_monomorphic_indices",
        "ignore_determine_snp_type", "ignore_psi_calculation",
        "ptc_location_analysis",
    ]
    args = gen.parse_arguments(description, argument_names, flags=list(range(10, 23)), ints=[7])

    gtf = args.gtf
    genome_fasta = args.genome_fasta
    bams_folder = args.bams_folder
    out_prefix = args.out_prefix
    bam_analysis_folder = args.bam_analysis_folder
    number_of_simulations = args.number_of_simulations
    simulation_output_folder = args.simulation_output_folder
    motif_file = args.motif_file
    motif_complement = args.motif_complement
    use_old_sims = args.use_old_sims
    out_of_frame = args.out_of_frame

    start = time.time()

    # create any necessary output diretories
    gen.create_output_directories("/".join(out_prefix.split('/')[:-1]))
    gen.create_directory('temp_data/')

    # files named after the prefix exactly as given on the command line
    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)
    families_file = "{0}_families.txt".format(out_prefix)

    if args.filter_genome_data:
        #extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf, CDS_bed, CDS_fasta, genome_fasta, all_checks=True,
                       uniquify=True, clean_chrom_only=True, full_chr_name=True)
        gen.get_time(start)

        #group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl("../source_data/GRCh37_ensembl_protein_families.txt", names, families_file)
        gen.get_time(start)

        print("Extracting and filtering exons...")
        #extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        #only leave exons from transcripts that passed quality control in the extract_cds step above.
        #also only leave a single gene per family
        bo.filter_bed_from_fasta(exon_bed, CDS_fasta, filtered_exon_bed, families_file=families_file)
        gen.get_time(start)

        #extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed, exon_junctions_file, window_of_interest=2)
        gen.get_time(start)

        #make another exons bed that only contains fully coding exons.
        #This is because in the final analysis, we should only consider fully protein-coding exons.
        #However, for getting the exon junctions we need the full exons file because fully protein-coding exons might
        #be flanked by exons that are not. This is why we couldn't do this filtering step earlier.
        print("Filtering out overlapping, non-coding and partially coding, as well as terminal exons...")
        bo.check_coding(filtered_exon_bed, CDS_bed, coding_exon_bed, remove_overlapping=True)
        gen.get_time(start)

    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    # everything named from here on carries the out-of-frame suffix when requested
    if out_of_frame:
        out_prefix += "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    cds_stem, cds_extension = os.path.splitext(CDS_fasta)
    CDS_interval_file = "{0}_intervals{1}".format(cds_stem, cds_extension)

    #check which individuals were included in Geuvadis
    full_sample_names = [
        entry for entry in os.listdir(bams_folder)
        if entry.endswith(".bam") and "proc" not in entry
    ]
    sample_names = [entry.split(".")[0] for entry in full_sample_names]
    sample_names = [sample for sample in sample_names if len(sample) > 0]
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))

    #for some reason, 17 of the samples from Geuvadis don't appear in the 1000genomes vcf
    #so only samples that are listed for the vcf are kept for now
    with open("../source_data/samples_in_vcf.txt") as vcf_samples_handle:
        samples_in_vcf = [row.rstrip("\n") for row in vcf_samples_handle.readlines()]
    sample_names = [sample for sample in sample_names if sample in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))

    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing all sequences for exons with snp
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed, coding_exons_fasta, genome_fasta, names=True)

    if args.get_SNPs:
        #get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, args.vcf_folder, args.panel_file,
                           sample_names, sample_file, intersect_file, out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file, out_prefix)
        gen.get_time(start)

    if not args.ignore_determine_snp_type:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file, CDS_fasta, PTC_file, syn_nonsyn_file,
                                 out_of_frame=out_of_frame, ref_check=True, headers=True)
        gen.get_time(start)

    #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step.
    print("Filtering exon-exon junctions to only leave those that flank exons with a PTC variant...")
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file, PTC_exon_junctions_file)

    #make a list of all the .bam files and modify them to have the full path rather than just the file name
    bam_files = []
    for entry in full_sample_names:
        if entry.split(".")[0] in sample_names:
            bam_files.append("{0}/{1}".format(bams_folder, entry))

    #in parallel, do the processing on individual .bam files
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(out_prefix)
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)

    if args.process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            # strip the out-of-frame suffix from the last path component only
            folder_parts = exon_junctions_bam_output_folder.split('/')
            folder_parts[-1] = folder_parts[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(folder_parts)
        gen.create_directory(exon_junctions_bam_output_folder)
        #we have to do it like this because you can't pass flags into run_in_parallel
        keyword_dict = {"overwrite_intersect": args.overwrite_intersect}
        worker_arguments = [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict,
        ]
        processes = gen.run_in_parallel(bam_files, worker_arguments,
                                        nao.process_bam_per_individual, workers=36)
        for process in processes:
            process.get()
        gen.get_time(start)

    #if required, filter PTCs to only leave ones that overlap motifs from a specified set
    motif_filtering = False
    if motif_file != "None":
        print("Filtering SNPs based on whether or not they overlap a motif from the specified set...")
        motif_suffix = motif_file.split("/")[-1].split(".")[0]
        prefix_template = "{0}_{1}_complement" if motif_complement else "{0}_{1}"
        out_prefix = prefix_template.format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta, PTC_file, motif_file, filtered_ptc, complement=motif_complement)
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    if not args.ignore_psi_calculation:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    #run the simulation that swaps ptcs for nonsynonymous snps
    if args.simulate_ptc_snps:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix, simulation_output_folder, PTC_file, syn_nonsyn_file,
                               exon_junctions_file, bam_files, number_of_simulations,
                               exon_junctions_bam_output_folder, use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if args.simulate_ptcs_with_monomorphic:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix, simulation_output_folder, sample_file, genome_fasta,
            PTC_file, syn_nonsyn_file, coding_exon_bed, coding_exon_fasta,
            exon_junctions_file, bam_files, number_of_simulations,
            generate_indices=args.generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if args.ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        required_files = (coding_exon_fasta, snp_relative_exon_position_file, PTC_file)
        if not all(os.path.exists(path) for path in required_files):
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file, ptc_location_analysis_output_file)
def main():
    """Convert an osc expression table into a bed file, optionally keeping only selected samples.

    Three kinds of input line are handled; any other line is ignored:
      * "#" header lines - with ``filter_samples`` set, the CNhs identifier of
        every wanted pooled adult sample is collected;
      * the "00Anno" column header - with ``filter_samples`` set, the positions
        of the columns whose header contains a collected identifier are stored;
      * "chr" data lines - the first field is read as
        ``chrom:start..end,strand`` and the kept columns are joined with "|"
        into the bed name field, so they travel with the record through a
        coordinate lift-over.

    A running line count is printed every 1000 lines.
    """
    description = "Filter an osc file to only contain the samples that you want and format it as a bed file so that you could lift over the coordinates."
    arguments = ["input_file_name", "output_file_name", "filter_samples"]
    args = parse_arguments(description, arguments, flags=[2])
    input_file_name = args.input_file_name
    output_file_name = args.output_file_name
    filter_samples = args.filter_samples

    #this is all the pooled ones except for all the brain subregion ones which I removed because otherwise like
    #11/40 would have been brain tissues. I left in the retina though.
    ones_I_want = [
        'of adipose tissue, adult, pool1', 'of adrenal gland, adult, pool1',
        'of aorta, adult, pool1', 'of bladder, adult, pool1',
        'of blood, adult, pool1', 'of brain, adult, pool1',
        'of cervix, adult, pool1', 'of colon, adult, pool1',
        'of esophagus, adult, pool1', 'of heart, adult, pool1',
        'of kidney, adult, pool1', 'of liver, adult, pool1',
        'of lung, adult, pool1', 'of ovary, adult, pool1',
        'of placenta, adult, pool1', 'of prostate, adult, pool1',
        'of retina, adult, pool1', 'of salivary gland, adult, pool1',
        'of skeletal muscle, adult, pool1', 'of small intestine, adult, pool1',
        'of smooth muscle, adult, pool1', 'of spleen, adult, pool1',
        'of testis, adult, pool1', 'of thymus, adult, pool1',
        'of thyroid, adult, pool1', 'of tonsil, adult, pool1',
        'of trachea, adult, pool1', 'of uterus, adult, pool1',
    ]

    # Raw string: the previous plain literal relied on invalid escape
    # sequences (\d, \., \-, \w). Compiled once instead of per matching line.
    sample_id_pattern = re.compile(r"CNhs[\d\.\-\w]*")

    IDs = []
    indices = []
    counter = 0
    with open(input_file_name) as file, open(output_file_name, "w") as output_file:
        for line in file:
            counter = counter + 1
            if counter % 1000 == 0:
                print(counter)
            if line.startswith("#"):
                if filter_samples and "adult, pool1" in line:
                    # one ID is appended per matching description, as before
                    for search in ones_I_want:
                        if search in line:
                            IDs.append(sample_id_pattern.findall(line)[0])
            elif line[:6] == "00Anno":
                if filter_samples:
                    for pos, elem in enumerate(line.split("\t")):
                        for ID in IDs:
                            if ID in elem:
                                indices.append(pos)
            elif line[:3] == "chr":
                #I'm going to pretend that the actual data bit is just
                #the name of the bed record so it would survive the CrossMapping
                fields = line.split("\t")
                coords = fields[0]
                fields[-1] = fields[-1].rstrip("\n")
                if filter_samples:
                    values = [fields[i] for i in indices]
                else:
                    values = fields[1:]
                # coordinates look like chrom:start..end,strand
                left, right = coords.split("..")[:2]
                chrom, start = left.split(":")[:2]
                end, strand = right.split(",")[:2]
                name = "|".join(values)
                output_line = [chrom, start, end, name, ".", strand]
                output_file.write("\t".join(output_line))
                output_file.write("\n")
def main():
    """Extract the sequence sets selected by command-line flags.

    Each flag enables one extraction step; outputs are written below
    ``output_directory``. Steps read files written by earlier steps, so they
    run in a fixed order.

    NOTE(review): block nesting was reconstructed from a copy of this script
    whose indentation had been lost - confirm against the original layout.
    """
    # "genome_fasta" used to be listed twice, which registers the same option
    # name twice. It is now listed once and the positional index lists below
    # are shifted to match; the set of command-line options is unchanged.
    arguments = [
        "output_directory", "genome_gtf", "genome_fasta", "ortholog_gtf",
        "ortholog_fasta", "input_file", "mapping_file", "codes_file",
        "ensembl_links", "extract_protein_coding", "extract_exons",
        "extract_introns", "extract_coding_exons", "extract_non_coding_exons",
        "extract_non_transcribed_regions", "extract_lincrna_seqs", "clean_run",
    ]
    description = ""
    args = gen.parse_arguments(
        description,
        arguments,
        opt_flags=[1, 2, 3, 4, 5, 6, 7, 8],
        flags=[9, 10, 11, 12, 13, 14, 15, 16])

    output_directory = args.output_directory
    genome_gtf = args.genome_gtf
    genome_fasta = args.genome_fasta
    ortholog_gtf = args.ortholog_gtf
    ortholog_fasta = args.ortholog_fasta
    input_file = args.input_file
    mapping_file = args.mapping_file
    codes_file = args.codes_file
    ensembl_links = args.ensembl_links
    extract_protein_coding = args.extract_protein_coding
    extract_exons = args.extract_exons
    extract_introns = args.extract_introns
    extract_coding_exons = args.extract_coding_exons
    extract_non_coding_exons = args.extract_non_coding_exons
    extract_non_transcribed_regions = args.extract_non_transcribed_regions
    extract_lincrna_seqs = args.extract_lincrna_seqs
    clean_run = args.clean_run

    # set a start time
    start = time.time()

    # create the output_directory if it doesn't already exist
    gen.create_output_directories(output_directory)

    # get the sequences
    if extract_protein_coding:
        # genome_gtf / genome_fasta describe genome 1, ortholog_gtf / ortholog_fasta
        # genome 2, ensembl_links is the orthologs file
        cont.extract_clean_sequences(genome_gtf, genome_fasta, ortholog_gtf, ortholog_fasta,
                                     ensembl_links, output_directory, clean_run=clean_run)

    full_exon_file = "{0}/genome_sequences/human/human.exons.bed".format(output_directory)
    if extract_exons:
        cont.extract_exons(genome_gtf, genome_fasta, output_directory, full_exon_file, clean_run=clean_run)
        sequo.clean_feature_file(full_exon_file)

    exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_filtered_exons.bed".format(
        output_directory, "human")
    coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.bed".format(
        output_directory, "human")
    coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.fasta".format(
        output_directory, "human")
    if extract_coding_exons:
        sequo.get_coding_exon_coordinates(full_exon_file, exons_bed, coding_exons_bed)
        fo.fasta_from_intervals(coding_exons_bed, coding_exons_fasta, genome_fasta, names=True)

    if extract_non_coding_exons:
        non_coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.bed".format(
            output_directory, "human")
        non_coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.fasta".format(
            output_directory, "human")
        sequo.get_non_coding_exon_coordinates(full_exon_file, exons_bed, non_coding_exons_bed)
        fo.fasta_from_intervals(non_coding_exons_bed, non_coding_exons_fasta, genome_fasta, names=True)

    if extract_introns:
        intron_bed = "{0}/genome_sequences/human/human.clean_introns.bed".format(output_directory)
        intron_fasta = "{0}/genome_sequences/human/human.clean_introns.fasta".format(output_directory)
        # introns are derived from the coding exon coordinates
        sequo.get_intron_coordinates(coding_exons_bed, intron_bed)
        fo.fasta_from_intervals(intron_bed, intron_fasta, genome_fasta, names=True)

    if extract_non_transcribed_regions:
        all_features_bed = "{0}/genome_sequences/human/human.all_features.bed".format(output_directory)
        non_transcribed_bed = "{0}/genome_sequences/human/human.non_transcribed.bed".format(output_directory)
        non_transcribed_fasta = "{0}/genome_sequences/human/human.non_transcribed.fasta".format(output_directory)
        # NOTE(review): this is the only call through `seqo`; every other call
        # here uses `sequo` - confirm that `seqo` is a real import
        seqo.get_non_transcribed_regions(genome_gtf, genome_fasta, all_features_bed,
                                         non_transcribed_bed, non_transcribed_fasta, output_directory)

    # extract sequences from source file
    if extract_lincrna_seqs:
        # set up the output fasta to contain the exon seqs
        lincrna_exons_bed = "{0}/lincRNA_exons.bed".format(output_directory)
        lincrna_exons_fasta = "{0}/lincRNA_exons.fasta".format(output_directory)
        lincrna_seqs_fasta = "{0}/lincRNA_seqs.fasta".format(output_directory)
        print("Extracting lincRNA seqs...")
        # NOTE(review): `hg38` and `NONCODE` are neither parsed from the command
        # line nor assigned in this function - presumably module-level names;
        # confirm they exist or this call raises NameError
        fo.extract_seqs(input_file, genome_fasta, lincrna_exons_bed, lincrna_exons_fasta,
                        lincrna_seqs_fasta, mapping_file, codes_file,
                        exclude_XY=True, hg38=hg38, NONCODE=NONCODE)
        print("Use lincRNA_misc.py to do further filtering...")