def main():
    """Run the stop codon depletion simulations selected on the command line.

    Parses the arguments, then runs the motif-set simulation when
    ``motif_simulation`` is set and the exon simulation when
    ``exon_simulation`` is set.

    Raises:
        Exception: if ``required_simulations`` is missing or zero.
    """
    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    argument_names = [
        "motif_file",
        "output_dir",
        "results_dir",
        "required_simulations",
        "motif_simulation",
        "exon_simulation",
    ]
    args = gen.parse_arguments(description, argument_names, flags=[4, 5], ints=[3])
    motif_file = args.motif_file
    output_dir = args.output_dir
    results_dir = args.results_dir
    required_simulations = args.required_simulations
    motif_simulation = args.motif_simulation
    exon_simulation = args.exon_simulation

    # nothing can run without knowing how many simulations are wanted
    if not required_simulations:
        print('You must specify the number of simulations you require.')
        raise Exception

    gen.create_output_directories(output_dir)

    if motif_simulation:
        # each motif set gets its own directory, named after the motif file
        # with its leading directories and final extension stripped
        motif_path_no_extension = ".".join(motif_file.split('.')[:-1])
        motif_set_name = motif_path_no_extension.split('/')[-1]
        motif_output_dir = "{0}/{1}".format(output_dir, motif_set_name)
        gen.create_output_directories(motif_output_dir)

        simulated_motifs_output = "{0}/simulations_{1}.txt".format(motif_output_dir, required_simulations)
        output_file = "{0}/stop_counts_{1}.txt".format(motif_output_dir, required_simulations)

        # one [motif file, simulated motifs file, counts file] entry per set
        simulation_sets = [[motif_file, simulated_motifs_output, output_file]]
        run_simulations(simulation_sets, required_simulations)

    exon_hexamer_simulation = "{0}/region_hexamer_sim.csv".format(output_dir)
    if exon_simulation:
        exon_fasta = "{0}_CDS_intervals.fasta".format(results_dir)
        run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, exon_hexamer_simulation)
def motif_codon_density(motif_file, output_directory):
    """Write the density of a series of codon sets in a motif file.

    The codon sets are read from
    ``<output_directory>/gc_matched_combinations.bed`` (created with
    ``seqo.get_gc_matched_motifs`` if it does not exist yet), with the three
    stop codons appended as the final set. Results are written to
    ``<output_directory>/motif_densities/<motif file name>.csv``.

    Args:
        motif_file (str): path to the motif file
        output_directory (str): path to the output directory
    """
    stop_codons = ["TAA", "TAG", "TGA"]

    gc_matched_path = "{0}/gc_matched_combinations.bed".format(output_directory)
    # reuse the combinations file from an earlier run if there is one
    if not os.path.isfile(gc_matched_path):
        seqo.get_gc_matched_motifs(stop_codons, gc_matched_path)

    scratch_dir = "temp_motif_density"
    gen.create_output_directories(scratch_dir)

    codon_sets = gen.read_many_fields(gc_matched_path, "\t")
    codon_sets.append(["TAA", "TAG", "TGA"])
    result_files = simoc.run_simulation_function(
        codon_sets,
        [motif_file, scratch_dir],
        ops.calc_codon_density_in_motifs,
        sim_run=False)

    densities_dir = "{0}/motif_densities".format(output_directory)
    gen.create_output_directories(densities_dir)
    motif_set_name = motif_file.split("/")[-1].split(".")[0]
    output_file = "{0}/{1}.csv".format(densities_dir, motif_set_name)

    with open(output_file, "w") as outfile:
        outfile.write("id,motifs,density\n")
        # only the first row of each result file is reported
        for row_id, result_file in enumerate(sorted(result_files), start=1):
            first_row = gen.read_many_fields(result_file, ",")[0]
            outfile.write("{0},{1},{2}\n".format(row_id, first_row[0], first_row[1]))

    gen.remove_directory(scratch_dir)
def get_conservation(transcript_list, output_file, max_dS_threshold=None, max_omega_threshold=None):
    """
    Get the conservation for a list of sequences and only keep those that pass

    Runs run_conservation_check over every transcript id and concatenates the
    files it returns into output_file. Any existing output_file is replaced.

    Args:
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        output_file (str): path to output file
        max_dS_threshold (float): if set, pass in the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, pass in the omega threshold you wish alignments to be below
    """

    print("Getting the most conserved ortholog for each transcript...")

    temp_dir = "temp_conservation_files"
    gen.create_output_directories(temp_dir)

    # get a list of the transcript ids
    transcript_ids = list(transcript_list)

    # run this linearly because it doesnt like being parallelised
    outputs = gen.run_parallel_function(
        transcript_ids,
        [transcript_list, max_dS_threshold, max_omega_threshold, temp_dir],
        run_conservation_check,
        parallel=False)
    output_paths = list(outputs)

    # remove the old output file if there is one
    gen.remove_file(output_file)

    if output_paths:
        # now concat the output files
        gen.run_process(["cat"] + output_paths, file_for_output=output_file)
    else:
        # `cat` without file operands reads standard input, which can block
        # forever; with nothing to concatenate just leave an empty output file
        with open(output_file, "w"):
            pass

    gen.remove_directory(temp_dir)
def calc_ds(aligned_sequences):
    """Return the dS value codeml reports for a pair of aligned sequences.

    The first two aligned sequences are written to a temporary phylip file,
    codeml is run on it through sequo.PAML_Functions, and the temporary files
    are removed again, including when codeml or the result lookup fails.

    Args:
        aligned_sequences (list): aligned sequences, each an iterable of
            string pieces that are joined into one sequence; only the first
            two are placed in the alignment

    Returns:
        the ``dS`` parameter of the first ``NSsites`` entry in the codeml output
    """
    aligned_sequences_iupac = [
        Seq("".join(i), IUPAC.unambiguous_dna) for i in aligned_sequences
    ]
    alignment = MultipleSeqAlignment([
        SeqRecord(aligned_sequences_iupac[0], id="seq"),
        SeqRecord(aligned_sequences_iupac[1], id="orth_seq")
    ])

    gen.create_output_directories("temp_files")
    # the phylip and output files share one random stem
    random_instance = random.random()
    temp_phylip_file = "temp_files/{0}.phy".format(random_instance)
    temp_output_file = "temp_files/{0}.out".format(random_instance)
    working_dir = "temp_dir.{0}".format(random.random())

    paml = None
    try:
        fo.write_to_phylip(alignment, temp_phylip_file)
        # run codeml on the sequences
        paml = sequo.PAML_Functions(input_file=temp_phylip_file,
                                    output_file=temp_output_file,
                                    working_dir=working_dir)
        codeml_output = paml.run_codeml()
        ds = codeml_output["NSsites"][0]["parameters"]["dS"]
    finally:
        # clean up files even if codeml failed, so failed runs do not pile up
        gen.remove_file(temp_phylip_file)
        gen.remove_file(temp_output_file)
        if paml is not None:
            paml.cleanup()
    return ds
def extract_second_seqs(input_bed, input_file, genome_fasta, output_dir):
    """
    Extract the second set of sequences

    Keeps the bed entries whose name is listed by extract_lncrna_only, pulls
    their exon sequences from the genome, joins the exons of multi exon
    transcripts into full transcripts and groups those into families with
    cons.filter_families.

    Args:
        input_bed (str): path to the bed file; derived files are written next to it
        input_file (str): path to the file passed to extract_lncrna_only
        genome_fasta (str): path to the genome fasta
        output_dir (str): path to the output directory
    """

    # get a set of ids that correspond only to lincrna entries
    id_file = "{0}/lncrna_ids.txt".format(output_dir)
    extract_lncrna_only(input_file, id_file)

    # now keep only the bed entries that are in the id list
    filtered_bed = "{0}.filtered".format(input_bed)
    # read_many_fields gives one list of fields per line, so testing a name
    # against the raw rows never matches; flatten the rows to a set of ids
    # NOTE(review): assumes the id file holds nothing but ids - confirm
    # against extract_lncrna_only
    ids = {
        field
        for row in gen.read_many_fields(id_file, "\t")
        for field in row
    }
    bed_entries = gen.read_many_fields(input_bed, "\t")
    with open(filtered_bed, "w") as outfile:
        for entry in bed_entries:
            if entry[3] in ids:
                outfile.write("{0}\n".format("\t".join(entry)))

    # now write the bed to an exon bed
    exons_bed = "{0}.exons.bed".format(input_bed)
    fo.entries_to_bed(filtered_bed, exons_bed, hg38=True)

    # now get the exon sequences
    exons_fasta = "{0}.exons.fasta".format(input_bed)
    fo.fasta_from_intervals(exons_bed, exons_fasta, genome_fasta,
                            force_strand=True, names=True)

    # now generate the full transcript for multi exon transcripts
    transcripts_fasta = "{0}.multi_exon_transcripts.fasta".format(input_bed)
    names, seqs = gen.read_fasta(exons_fasta)
    # transcript id -> {exon number: exon sequence}
    exon_seqs = collections.defaultdict(dict)
    for name, seq in zip(names, seqs):
        # the part of the name before "(" is "<transcript id>.<exon number>"
        name_parts = name.split("(")[0].split(".")
        transcript_id = ".".join(name_parts[:-1])
        exon_number = int(name_parts[-1])
        exon_seqs[transcript_id][exon_number] = seq

    with open(transcripts_fasta, "w") as outfile:
        for transcript_id in sorted(exon_seqs):
            exons = exon_seqs[transcript_id]
            if len(exons) <= 1:
                continue
            seq = "".join(exons[exon_number] for exon_number in sorted(exons))
            if "N" not in seq and len(seq) >= 200:
                # write the id with ":" instead of "." - per the original
                # author the dotted form causes problems when sorting later
                fasta_id = ":".join(transcript_id.split("."))
                outfile.write(">{0}\n{1}\n".format(fasta_id, seq))

    # blast to get paralogous families
    # the directory name "bast_db" is kept as is so existing databases are still found
    blast_db_path = "{0}/bast_db".format(output_dir)
    output_blast_file = "{0}/blast_output.csv".format(output_dir)
    families_file = "{0}/families.txt".format(output_dir)
    gen.create_output_directories(blast_db_path)
    cons.filter_families(transcripts_fasta, output_blast_file, families_file,
                         database_path=blast_db_path, clean_run=True)
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content

    Writes one row for the real motifs and one row per simulated motif file
    to output_file, then removes the directory of simulated hexamers.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)

    # get the motifs
    motifs = sequo.read_motifs(motif_file)

    # the default of None cannot be compared with a count, so treat it as
    # "no simulations required"
    simulations_needed = required_simulations or 0
    simulations_present = len(os.listdir(hexamers_dir))

    # if there are not enough simulations, generate them
    if simulations_present < simulations_needed:
        required = list(range(simulations_needed - simulations_present))
        names, seqs = gen.read_fasta(input_fasta)
        # group the sequences by the part of the name before the first "."
        seqs_list = collections.defaultdict(list)
        for name, seq in zip(names, seqs):
            seqs_list[name.split(".")[0]].append(seq)

        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        all_seqs = []
        for seq_id in seqs_list:
            all_seqs.extend(seqs_list[seq_id])
        # the sequences are joined with "X" between them before sampling
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for hexamer_file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, hexamer_file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        # nucleotide contents are written in sorted key order to match the header
        real_nt_fields = gen.stringify([real_nt_content[nt] for nt in sorted(real_nt_content)])
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(real_nt_fields)))
        simulated = zip(test_purine_content, test_nt_content)
        for sim_id, (purine_content, nt_content) in enumerate(simulated, start=1):
            nt_fields = gen.stringify([nt_content[nt] for nt in sorted(nt_content)])
            outfile.write("{0},{1},{2}\n".format(sim_id, purine_content, ",".join(nt_fields)))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
def align_sequences(muscle_exe, seq1, seq2, seq1_id=None, seq2_id=None, temp_input_file=None, temp_output_file=None):
    """
    Align two protein sequences using Muscle.

    Args:
        muscle_exe (str): path to muscle tool
        seq1 (str): protein sequence 1
        seq2 (str): protein sequence 2
        seq1_id (str): if set, the id for sequence 1
        seq2_id (str): if set, the id for sequence 2; otherwise derived from
            seq1_id by replacing its last two characters with "_2"
        temp_input_file (str): if set, the path to the alignment input file
        temp_output_file (str): if set, the path to the alignment output file

    Returns:
        tuple: (temp_input_file, temp_output_file), the paths of the fasta
            file given to muscle and of the alignment it wrote
    """

    # all default alignment files live in one temporary directory
    temp_dir = "temp_alignment"
    gen.create_output_directories(temp_dir)

    # the default input and output files share one random suffix
    random_alignment = random.random()
    temp_input_file = temp_input_file or "{0}/protein_alignment_input_{1}.fasta".format(
        temp_dir, random_alignment)
    temp_output_file = temp_output_file or "{0}/protein_alignment_output_{1}.fasta".format(
        temp_dir, random_alignment)

    # fall back to generated ids when none were passed in
    seq1_id = seq1_id or "seq_id_{0}_1".format(random.random())
    seq2_id = seq2_id or "{0}_2".format(seq1_id[:-2])

    # write both sequences to the fasta file muscle reads from
    fasta_entries = ">{0}\n{1}\n>{2}\n{3}\n".format(seq1_id, seq1, seq2_id, seq2)
    with open(temp_input_file, "w") as temp_file:
        temp_file.write(fasta_entries)

    # build the muscle command, then call it to run the alignment
    run_muscle = MuscleCommandline(muscle_exe,
                                   input=temp_input_file,
                                   out=temp_output_file)
    run_muscle()
    return temp_input_file, temp_output_file
def sort_bed(input_file, output_file):
    """
    Sort a bed file with sortBed.

    The sorted output goes to a temporary file first and is then moved to
    output_file, so input_file and output_file may be the same path.

    Args:
        input_file (str): path to the input file
        output_file (str): path to the output file
    """

    gen.create_output_directories("temp_data")
    sorted_temp_file = "temp_data/temp_sorted_bed{0}.bed".format(random.random())

    sort_command = ["sortBed", "-i", input_file]
    gen.run_process(sort_command, file_for_output=sorted_temp_file)

    move_command = ["mv", sorted_temp_file, output_file]
    gen.run_process(move_command)

    # clears the temporary file if it is still there after the move
    gen.remove_file(sorted_temp_file)
def motif_stop_codon_densities(motif_file, motif_controls_directory, required_simulations, output_file):
    """Write the results of calculate_stop_codon_densities for a motif set and its controls.

    calculate_stop_codon_densities is run over the real motif file and up to
    required_simulations files from the controls directory; the first row of
    every result file is written to output_file as a csv line under the
    header "sim_id,stop_density".

    Args:
        motif_file (str): path to the real motif file
        motif_controls_directory (str): path to the directory of control motif files
        required_simulations (int): maximum number of control files to use
        output_file (str): path to the output file
    """
    # "real" marks the real motif set, controls are keyed by their index
    filelist = {"real": motif_file}
    control_files = os.listdir(motif_controls_directory)[:required_simulations]
    for i, control_file in enumerate(control_files):
        filelist[i] = "{0}/{1}".format(motif_controls_directory, control_file)
    file_ids = list(filelist)

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    args = [filelist, temp_dir]
    outputs = simopc.run_simulation_function(file_ids, args, calculate_stop_codon_densities, sim_run = False)

    with open(output_file, "w") as outfile:
        outfile.write("sim_id,stop_density\n")
        for result_file in outputs:
            first_row = gen.read_many_fields(result_file, "\t")[0]
            outfile.write("{0}\n".format(",".join(first_row)))

    # remove the temporary results, as motif_codon_densities does for the
    # same directory
    gen.remove_directory(temp_dir)
def cds_motif_test(cds_fasta, output_file):
    """Write the density of every set of three distinct codons in a CDS fasta.

    ops.calc_motif_densities is run for each combination of three codons.
    The first row of each result file supplies the gc value and the density,
    and the rows written to output_file are ordered by gc value, then motif.

    Args:
        cds_fasta (str): path to the CDS fasta file
        output_file (str): path to the output file
    """
    bases = ["A", "C", "G", "T"]
    codons = sorted("".join(triplet) for triplet in it.product(bases, repeat=3))
    codon_triples = [sorted(combo) for combo in it.combinations(codons, 3)]

    scratch_dir = "temp_motif_densities"
    gen.create_output_directories(scratch_dir)
    result_files = simoc.run_simulation_function(
        codon_triples,
        [cds_fasta, scratch_dir],
        ops.calc_motif_densities,
        sim_run=False)

    # gc value -> {motif: density}; the motif is taken from the result file name
    densities = collections.defaultdict(dict)
    for result_file in list(result_files):
        motif = result_file.split("/")[-1].split(".")[0]
        first_row = gen.read_many_fields(result_file, ",")[0]
        gc_value, density = first_row[0], first_row[1]
        densities[gc_value][motif] = density

    with open(output_file, "w") as outfile:
        outfile.write("id,motif,gc,density\n")
        ordered_rows = (
            (motif, gc_value, densities[gc_value][motif])
            for gc_value in sorted(densities)
            for motif in sorted(densities[gc_value])
        )
        for row_id, (motif, gc_value, density) in enumerate(ordered_rows, start=1):
            outfile.write("{0},{1},{2},{3}\n".format(row_id, motif, gc_value, density))
def blast_all_against_all(fasta_file, output_file, database_path=None, remove_database=None, clean_run=None):
    """
    Blast all sequences against all other sequences

    Args:
        fasta_file (str): path to fasta file containing sequences
        output_file (str): path to output file
        database_path (str): path to the database directory; if not set, use temp dir
        remove_database (bool): if set, remove the database directory once blast has run
        clean_run (bool): if set, run new blast
    """

    print("BLASTing sequences against each other...")

    # remove the old database if we want a clean run; without a path there
    # is no old database to remove
    if clean_run and database_path:
        gen.remove_directory(database_path)

    # create the blast database directory
    if not database_path:
        database_path = "temp_blast_db/{0}".format(random.random())
        print("Temp blast db: {0}".format(database_path))
    gen.create_output_directories(database_path)

    # keep the directory and the database prefix apart, so the directory can
    # still be removed at the end
    database_dir = database_path
    database_prefix = "{0}/blast".format(database_dir)

    # rebuild the database unless all of its files are already present
    expected_files = ["blast.nhr", "blast.nin", "blast.nsq"]
    existing_files = set(os.listdir(database_dir))
    database_complete = all(name in existing_files for name in expected_files)
    if clean_run or not database_complete:
        make_blast_database(fasta_file, database_prefix)

    # now blast each sequence against each other
    blast_sequences(fasta_file, database_prefix, output_file)

    # remove the database
    if remove_database:
        gen.remove_directory(database_dir)
def run_conservation_check(input_list, transcript_list, max_dS_threshold, max_omega_threshold, temp_dir):
    """
    Wrapper to run the conservation check in parallel

    Args:
        input_list (list): list of transcript ids to iterate over
        transcript_list (dict): dict containing transcript id, the cds and the ortholog seqs
        max_dS_threshold (float): if set, pass in the dS threshold you wish alignments to be below
        max_omega_threshold (float): if set, pass in the omega threshold you wish alignments to be below
        temp_dir (str): path to the directory the output file is written to

    Returns:
        list: paths of the files written; empty when input_list is empty,
            otherwise one file of tab separated transcript id / ortholog id lines
    """

    # working directory handed to check_conservation for this run only
    codeml_dir = "temp_codeml_dir.{0}".format(random.random())
    gen.create_output_directories(codeml_dir)

    written_files = []
    if input_list:
        matches_file = "{0}/best_ortholog_match.{1}.bed".format(
            temp_dir, random.random())
        written_files.append(matches_file)

        total = len(input_list)
        with open(matches_file, "w") as outfile:
            for position, transcript_id in enumerate(input_list, start=1):
                # progress counter
                print("{0}/{1}".format(position, total))
                ortholog_id = check_conservation(
                    transcript_id,
                    transcript_list[transcript_id][0],
                    transcript_list[transcript_id][1],
                    codeml_dir,
                    max_dS_threshold=max_dS_threshold,
                    max_omega_threshold=max_omega_threshold)
                # transcripts for which nothing is returned are left out
                if ortholog_id:
                    outfile.write("{0}\t{1}\n".format(transcript_id, ortholog_id))

    gen.remove_directory(codeml_dir)
    return written_files
def motif_codon_densities(motif_file, codon_combinations_file, motif_controls_directory, required_simulations, output_file):
    """Write codon set densities in a motif file, normalised by its controls.

    calculate_motif_densities is run over the real motif file and up to
    required_simulations control files. For every codon set in the real
    results, nd is written as (real - mean of controls) / mean of controls.

    Args:
        motif_file (str): path to the real motif file
        codon_combinations_file (str): path to the tab separated codon sets file
        motif_controls_directory (str): path to the directory of control motif files
        required_simulations (int): maximum number of control files to use
        output_file (str): path to the output file
    """
    # "real" marks the real motif set, controls are keyed by their index
    filelist = {"real": motif_file}
    control_files = os.listdir(motif_controls_directory)[:required_simulations]
    for index, control_file in enumerate(control_files):
        filelist[index] = "{0}/{1}".format(motif_controls_directory, control_file)
    file_ids = list(filelist)

    codon_sets = gen.read_many_fields(codon_combinations_file, "\t")

    scratch_dir = "temp_motif_dir"
    gen.create_output_directories(scratch_dir)
    result_files = simopc.run_simulation_function(
        file_ids,
        [filelist, codon_sets, scratch_dir],
        calculate_motif_densities,
        sim_run = False)

    real_densities = {}
    control_densities = collections.defaultdict(list)
    for result_file in result_files:
        rows = gen.read_many_fields(result_file, "\t")
        # a result file with "real" in its path holds the real densities
        from_real_set = "real" in result_file
        for row in rows:
            if from_real_set:
                real_densities[row[0]] = float(row[1])
            else:
                control_densities[row[0]].append(float(row[1]))

    with open(output_file, "w") as outfile:
        outfile.write("codons,gc_content,purine_content,density,nd\n")
        for codon_set in sorted(real_densities):
            real_density = real_densities[codon_set]
            control_mean = np.mean(control_densities[codon_set])
            nd = np.divide(real_density - control_mean, control_mean)
            # codon sets are stored as their codons joined with "_"
            codons = codon_set.split("_")
            fields = [
                codon_set,
                seqo.calc_gc_seqs_combined(codons),
                sequo.calc_purine_content(codon_set.split("_")),
                real_density,
                nd,
            ]
            outfile.write("{0}\n".format(",".join(gen.stringify(fields))))

    gen.remove_directory(scratch_dir)
def main():
    """Run the motif tests selected on the command line.

    Each boolean flag switches on one step: generating dinucleotide matched
    controls, the stop densities of the motifs, the stop codon density
    simulation, and the codon combination densities.

    Raises:
        Exception: if the stop codon density simulation is requested with
            fewer control files than simulations.
    """
    arguments = [
        "output_directory",
        "motif_file",
        "simulations",
        "controls_directory",
        "exons_fasta",
        "motifs_stop_density",
        "motif_stop_codon_densities_sim",
        "motif_codon_densities",
        "motif_densities_exon_dinucleotides",
        "generate_motif_controls",
        "match_density",
        "match_subs",
    ]
    description = ""
    args = gen.parse_arguments(description, arguments, opt_flags=[2, 3, 4], flags=[5, 6, 7, 8, 9, 10, 11])
    output_directory = args.output_directory
    motif_file = args.motif_file
    simulations = args.simulations
    controls_directory = args.controls_directory
    exons_fasta = args.exons_fasta
    motifs_stop_density = args.motifs_stop_density
    motif_stop_codon_densities_sim = args.motif_stop_codon_densities_sim
    motif_codon_densities = args.motif_codon_densities
    motif_densities_exon_dinucleotides = args.motif_densities_exon_dinucleotides
    generate_motif_controls = args.generate_motif_controls
    match_density = args.match_density
    match_subs = args.match_subs

    # convert the simulations to an int if they were given
    if simulations:
        simulations = int(simulations)

    # every step writes below this directory
    global_output_directory = "{0}/motif_tests".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # base name of the motif file, used to name the result files
    motif_set_name = motif_file.split("/")[-1].split(".")[0]

    # generate the controls
    if generate_motif_controls:
        simopc.generate_motif_dinucleotide_controls(
            motif_file, simulations, output_directory,
            match_density = match_density, match_subs = match_subs)

    # stop density of the motifs
    if motifs_stop_density:
        mtop.calc_stop_densities(motif_file)

    # stop codon densities in the motif set and its controls
    if motif_stop_codon_densities_sim:
        local_output_directory = "{0}/motif_stop_density_simulations".format(global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/{1}_stop_codon_densities.csv".format(local_output_directory, motif_set_name)
        # this step does not create controls itself, so stop if too few exist
        if simulations > len(os.listdir(controls_directory)):
            print("Please create more simulants...")
            raise Exception
        mtop.motif_stop_codon_densities(motif_file, controls_directory, simulations, output_file)

    # densities of other codon combinations in the motif set
    if motif_codon_densities:
        local_output_directory = "{0}/codon_combination_densities".format(global_output_directory)
        gen.create_output_directories(local_output_directory)
        # all the possible sets of 3 unique codons, generated once and reused
        codon_combinations_file = "{0}/codon_combinations.txt".format(local_output_directory)
        if not os.path.isfile(codon_combinations_file):
            # NOTE(review): `stops` is not defined in this function - presumably a module level constant, confirm
            seqo.generate_all_motif_combinations(stops, codon_combinations_file)
        output_file = "{0}/{1}_codon_combination_densities.csv".format(local_output_directory, motif_set_name)
        # with too few controls, throw the old ones away and regenerate them
        if simulations > len(os.listdir(controls_directory)):
            gen.remove_directory(controls_directory)
            simopc.generate_motif_controls(motif_file, simulations, controls_directory, match_density = False)
        mtop.motif_codon_densities(motif_file, codon_combinations_file, controls_directory, simulations, output_file)
names, seqs = gen.read_fasta(seqs_file) seq_list = { name: seqs[i] for i, name in enumerate(names) if "N" not in seqs[i] } seq_list = sequo.pick_random_family_member(families_file, seq_list) def randomise_seq(seq): nts = list(seq) np.random.shuffle(nts) return "".join(nts) output_directory = "temp_shuffle_linc" gen.create_output_directories(output_directory) def run_simulations(iterations, seq_list, codon_sets, output_directory): outputs = [] if len(iterations) > 0: np.random.seed() for i, iteration in enumerate(iterations): print("{0}/{1}".format(i + 1, len(iterations))) if iteration != "real": new_seqs = [] for id in seq_list: new_seqs.append(randomise_seq(seq_list[id])) else:
def main():
    """Run the lincRNA file operations selected on the command line.

    Each boolean flag switches on one step; all intermediate files are
    written to the working directory under fixed names, so a later step
    picks up the files of an earlier one.
    """
    arguments = [
        "working_directory",
        "output_directory",
        "genome_path",
        "input_bed",
        "input_fasta",
        "clean_run",
        "extract_exon_intron_bed",
        "extract_exons",
        "extract_introns",
        "sort_by_exon_number",
        "build_transcripts",
        "extract_families",
        "orf_length_sim",
    ]
    description = "Wrapper for miscellaneous operations on lincRNA"
    # NOTE(review): flag index 13 has no entry in `arguments` (the last index
    # is 12) - confirm gen.parse_arguments tolerates this
    args = gen.parse_arguments(description, arguments, flags = [5,6,7,8,9,10,11,12,13], opt_flags = [2,3,4])
    working_directory = args.working_directory
    output_directory = args.output_directory
    genome_path = args.genome_path
    input_bed = args.input_bed
    input_fasta = args.input_fasta
    clean_run = args.clean_run
    extract_exon_intron_bed = args.extract_exon_intron_bed
    extract_exons = args.extract_exons
    extract_introns = args.extract_introns
    sort_by_exon_number = args.sort_by_exon_number
    build_transcripts = args.build_transcripts
    extract_families = args.extract_families
    orf_length_sim = args.orf_length_sim

    # create the directories
    for directory in (working_directory, output_directory):
        gen.create_output_directories(directory)

    # file paths: bed files
    exons_bed = "{0}/exons.bed".format(working_directory)
    single_exons_bed = "{0}/single_exons.bed".format(working_directory)
    multi_exons_bed = "{0}/multi_exons.bed".format(working_directory)
    # file paths: exon sequences
    exons_fasta = "{0}/exons.fasta".format(working_directory)
    single_exons_fasta = "{0}/single_exons.fasta".format(working_directory)
    multi_exons_fasta = "{0}/multi_exons.fasta".format(working_directory)
    # file paths: introns
    introns_bed = "{0}/introns.bed".format(working_directory)
    introns_fasta = "{0}/introns.fasta".format(working_directory)
    # file paths: transcripts and families
    transcript_sequences_fasta = "{0}/transcript_sequences.fasta".format(working_directory)
    multi_exon_transcript_sequences_fasta = "{0}/multi_exon_transcript_sequences.fasta".format(working_directory)
    multi_exon_blast_file = "{0}/multi_exons_blast_all_against_all.csv".format(working_directory)
    multi_exon_blast_database = "{0}/multi_exon_blast_all_against_all".format(working_directory)
    multi_exon_families_file = "{0}/multi_exon_families.txt".format(working_directory)

    # create the exons and introns files from bed
    if extract_exon_intron_bed:
        # keep a copy of the input bed in the working directory
        bed_copy = "{0}/{1}".format(working_directory, input_bed.split("/")[-1])
        gen.copy_file(input_bed, bed_copy)
        # extract the features
        lmo.extract_bed_coordinates_block_format(input_bed, exons_bed, introns_bed)

    # split the exons bed by exon number
    if sort_by_exon_number:
        gen.check_files_exists([exons_bed])
        lmo.sort_by_exon_number(exons_bed, single_exons_bed, multi_exons_bed)

    # get exons
    if extract_exons:
        gen.check_files_exists([exons_bed])
        fo.fasta_from_intervals(exons_bed, exons_fasta, genome_path, names=True)
        # the single / multi exon sequences are only written when the
        # matching bed file exists
        if os.path.isfile(single_exons_bed):
            lmo.sort_fasta_by_bed(single_exons_bed, exons_fasta, single_exons_fasta)
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, exons_fasta, multi_exons_fasta)

    # get introns
    if extract_introns:
        gen.check_files_exists([introns_bed])
        fo.fasta_from_intervals(introns_bed, introns_fasta, genome_path, names=True)

    # build transcripts
    if build_transcripts:
        gen.check_files_exists([exons_fasta])
        lmo.build_transcripts(exons_fasta, transcript_sequences_fasta)
        # if the multi exons bed file exists, get just the multi exon transcripts
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, transcript_sequences_fasta, multi_exon_transcript_sequences_fasta)

    # now group into paralogous families
    if extract_families:
        gen.check_files_exists([multi_exon_transcript_sequences_fasta])
        cons.filter_families(
            multi_exon_transcript_sequences_fasta,
            multi_exon_blast_file,
            multi_exon_families_file,
            database_path = multi_exon_blast_database,
            clean_run = clean_run)
def main():
    """Run the disease SNP analysis steps selected on the command line.

    Each boolean flag switches on one step. All file paths are derived from
    ``output_directory`` and ``results_prefix``, so a later step reads the
    files an earlier step wrote.

    Raises:
        Exception: if ``simulations`` is given but is not an int, or if the
            disease SNPs file or its ``.tbi`` index does not exist.
    """
    description = "Look at disease snps."
    arguments = [
        "disease_snps_file",
        "output_directory",
        "results_prefix",
        "simulations",
        "ese_file",
        "intersect_snps",
        "get_relative_positions",
        "get_snp_status",
        "get_info",
        "simulate_ptc_location",
        "get_possible_ptc_locations",
        "required_simulations",
        "get_overlaps",
        "intersect_ptcs",
        "compare_ptcs",
        "get_introns",
        "compare_distances",
        "clinvar_ptc_locations",
        "location_simulation",
        "exclude_cpg",
        "ese_hit_simulation",
        "only_disease",
        "only_kgenomes",
        "only_ese",
        "get_unique_ptcs",
        "get_unique_rel_pos",
        "excess_test",
        "disease_locations_chisquare",
    ]
    # every argument from index 5 up to the last one (27) is a flag
    args = gen.parse_arguments(description, arguments, flags=list(range(5, 28)), ints=[3])
    disease_snps_file = args.disease_snps_file
    output_directory = args.output_directory
    results_prefix = args.results_prefix
    simulations = args.simulations
    ese_file = args.ese_file
    intersect_snps = args.intersect_snps
    get_relative_positions = args.get_relative_positions
    get_snp_status = args.get_snp_status
    get_info = args.get_info
    simulate_ptc_location = args.simulate_ptc_location
    get_possible_ptc_locations = args.get_possible_ptc_locations
    required_simulations = args.required_simulations
    get_overlaps = args.get_overlaps
    intersect_ptcs = args.intersect_ptcs
    compare_ptcs = args.compare_ptcs
    get_introns = args.get_introns
    compare_distances = args.compare_distances
    clinvar_ptc_locations = args.clinvar_ptc_locations
    location_simulation = args.location_simulation
    exclude_cpg = args.exclude_cpg
    ese_hit_simulation = args.ese_hit_simulation
    only_disease = args.only_disease
    only_kgenomes = args.only_kgenomes
    only_ese = args.only_ese
    get_unique_ptcs = args.get_unique_ptcs
    get_unique_rel_pos = args.get_unique_rel_pos
    excess_test = args.excess_test
    disease_locations_chisquare = args.disease_locations_chisquare

    if simulations and not isinstance(simulations, int):
        print("\nERROR: Please provide the correct number for simulations.\n")
        raise Exception

    # create the output directory if it doesnt already exist
    gen.create_output_directories(output_directory)

    # both the disease SNPs file and its index must be present
    disease_snps_index_file = "{0}.tbi".format(disease_snps_file)
    if not os.path.isfile(disease_snps_file) or not os.path.isfile(disease_snps_index_file):
        print("\nERROR: Please provide the required disease SNPs file(s).\n")
        raise Exception

    # intersect the coding exons with the disease snps
    exon_bed = "{0}_coding_exons.bed".format(results_prefix)
    disease_snp_intersect_file_vcf = "{0}/disease_snp_intersect.vcf".format(output_directory)
    disease_snp_intersect_file_bed = "{0}/disease_snp_intersect.bed".format(output_directory)
    if intersect_snps:
        print("Intersecting snps with exons")
        so.intersect_snps_parallel(exon_bed, disease_snps_file, disease_snp_intersect_file_vcf)
        so.intersect_vcf_to_bed(
            exon_bed,
            disease_snp_intersect_file_vcf,
            disease_snp_intersect_file_bed,
            change_names = True)

    # get relative positions of the snps in cds and exons
    full_bed = "{0}_CDS.bed".format(results_prefix)
    disease_snps_relative_exon_positions = "{0}/disease_snp_relative_exon_positions.bed".format(output_directory)
    disease_snps_relative_cds_positions = "{0}/disease_snp_relative_cds_positions.bed".format(output_directory)
    if get_relative_positions:
        print("Getting snp relative positions...")
        so.get_snp_relative_exon_position(disease_snp_intersect_file_bed, disease_snps_relative_exon_positions)
        # the cds step takes the exon positions as rows rather than as a path
        relative_positions = gen.read_many_fields(disease_snps_relative_exon_positions, "\t")
        so.get_snp_relative_cds_position(relative_positions, disease_snps_relative_cds_positions, full_bed)

    # get the change status of the snps to check them
    cds_fasta = "{0}_CDS.fasta".format(results_prefix)
    disease_ptcs_file = "{0}/disease_ptcs.txt".format(output_directory)
    disease_other_file = "{0}/disease_other_snps.txt".format(output_directory)
    if get_snp_status:
        print("Getting snp status...")
        so.get_snp_change_status(
            disease_snps_relative_cds_positions,
            cds_fasta,
            disease_ptcs_file,
            disease_other_file)

    # get intersect between the clinvar ptcs and 1000 genomes ptcs
    ptc_file = "{0}_ptc_file.txt".format(results_prefix)
    ptc_intersect_file = "{0}/ptc_intersect.bed".format(output_directory)
    if intersect_ptcs:
        # both ptc files are refactored into temporary files before intersecting
        temp_disease_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(disease_ptcs_file, temp_disease_ptc_file)
        temp_k_genomes_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(ptc_file, temp_k_genomes_ptc_file, header=True)
        bao.intersect_bed(
            temp_k_genomes_ptc_file,
            temp_disease_ptc_file,
            write_both = True,
            no_dups=False,
            output_file = ptc_intersect_file)
        gen.remove_file(temp_disease_ptc_file)
        gen.remove_file(temp_k_genomes_ptc_file)

    # get a list of ptcs unique to each dataset
    unique_ptcs = "{0}/disease_ptcs_no_intersect.bed".format(output_directory)
    unique_ptcs_kgenomes = "{0}/kgenomes_ptcs_no_intersect.bed".format(output_directory)
    if get_unique_ptcs:
        dso.get_unique_ptcs(
            disease_ptcs_file,
            ptc_file,
            ptc_intersect_file,
            unique_ptcs,
            unique_ptcs_kgenomes)

    # get the relative positions of the ptcs unique to each dataset
    unique_ptcs_rel_pos_file = "{0}/disease_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    kgenomes_relative_positions = "{0}_PTC_relative_exon_positions.bed".format(results_prefix)
    kgenomes_unique_ptcs_rel_pos_file = "{0}/kgenomes_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    if get_unique_rel_pos:
        dso.get_unique_rel_pos(
            unique_ptcs,
            disease_snps_relative_exon_positions,
            unique_ptcs_kgenomes,
            kgenomes_relative_positions,
            unique_ptcs_rel_pos_file,
            kgenomes_unique_ptcs_rel_pos_file)

    # get the ese file name (no directories, nothing after the first ".")
    ese_file_name = ese_file.split('/')[-1].split('.')[0]
    # get the coding exons fasta file path
    coding_exons_fasta = "{0}_coding_exons.fasta".format(results_prefix)

    # simulation picking random reference allele matched simulants
    clinvar_location_simulation_file = "{0}/clinvar_ptc_location_simulation.csv".format(output_directory)
    clinvar_location_simulation_ese_overlap_file = "{0}/clinvar_ptc_location_simulation_{1}_ese_overlaps.csv".format(output_directory, ese_file_name)
    kgenomes_location_simulation_file = "{0}/1000_genomes_simulations.csv".format(output_directory)
    kgenomes_location_simulation_ese_overlap_file = "{0}/1000_genomes_simulations_ese_overlaps.csv".format(output_directory)
    if location_simulation:
        if not only_kgenomes:
            print('Running ptc location simulation on disease PTCs...')
            dso.ptc_location_simulation(
                unique_ptcs_rel_pos_file,
                coding_exons_fasta,
                simulations,
                clinvar_location_simulation_file,
                clinvar_location_simulation_ese_overlap_file,
                ese_file,
                only_ese,
                exclude_cpg)
        if not only_disease:
            print('Running ptc location simulation on 1000 genomes PTCs...')
            dso.ptc_location_simulation(
                kgenomes_unique_ptcs_rel_pos_file,
                coding_exons_fasta,
                simulations,
                kgenomes_location_simulation_file,
                kgenomes_location_simulation_ese_overlap_file,
                ese_file,
                only_ese,
                exclude_cpg)

    # bounds of the region used by the ESE hit simulation and in file names
    window_start = 3
    window_end = 69
    clinvar_ese_hit_simulation_file = "{0}/clinvar_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name)
    kgenomes_ese_hit_simulation_file = "{0}/1000_genomes_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name)

    # do a simulation picking only sites from within the region
    if ese_hit_simulation:
        if not only_kgenomes:
            print("Simulating ESE hits on the {0}-{1} region for disease PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(
                unique_ptcs_rel_pos_file,
                coding_exons_fasta,
                simulations,
                clinvar_ese_hit_simulation_file,
                ese_file,
                window_start,
                window_end,
                exclude_cpg)
        if not only_disease:
            print("Simulating ESE hits on the {0}-{1} region for 1000 genomes PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(
                kgenomes_unique_ptcs_rel_pos_file,
                coding_exons_fasta,
                simulations,
                kgenomes_ese_hit_simulation_file,
                ese_file,
                window_start,
                window_end,
                exclude_cpg)

    excess_test_file = "{0}/clinvar_ptc_{1}_{2}_excesses.csv".format(output_directory, window_start, window_end)
    if excess_test:
        dso.excess_test(unique_ptcs_rel_pos_file, coding_exons_fasta, excess_test_file)

    location_test_file = "{0}/clinvar_locations_chisquare.csv".format(output_directory)
    if disease_locations_chisquare:
        dso.disease_ptc_location_test(unique_ptcs_rel_pos_file, coding_exons_fasta, location_test_file)
def get_coding_exons(exons_file, cds_file, output_file, remove_overlapping=False):
    """
    Write a bed file of the fully coding, internal exons found in exons_file.

    An exon is kept when it
    1) is reported by the intersect with the CDS coordinates, and
    2) is neither the first nor the last exon of its transcript.
    NB! Assumes that all the coordinates are from non-overlapping transcripts.
    If this is not the case, set remove_overlapping to True and overlapping
    intervals are removed first.
    Modified from LA and RS.

    Args:
        exons_file (str): path to the bed file containing exon coordinates;
            the name field (4th column) is read as "<transcript>.<exon rank>"
        cds_file (str): path to bed file containing the cds coordinates
        output_file (str): path to output file
        remove_overlapping (bool): if true, remove overlapping intervals.
            NOTE(review): sort_bed/remove_bed_overlaps are called with
            exons_file as both arguments, so this presumably rewrites
            exons_file in place -- confirm against those helpers.
    """
    if remove_overlapping:
        sort_bed(exons_file, exons_file)
        remove_bed_overlaps(exons_file, exons_file)

    # Keep only exons that are fully coding. Both records are written out so
    # that we can later check that an exon was not retained because of an
    # overlap with a transcript that is absent from the exons file.
    gen.create_output_directories("temp_data")
    intersect_output = "temp_data/temp{0}.txt".format(random.random())
    intersect_bed(exons_file, cds_file, overlap=1, overlap_rec=True,
                  output_file=intersect_output, force_strand=True,
                  write_both=True, no_dups=False, no_name_check=False)

    # Drop terminal exons. In theory none should survive the previous step,
    # but unannotated UTRs can make a partially coding terminal exon look
    # fully coding.
    internal_exons_file = "temp_data/temp{0}.txt".format(random.random())
    with open(internal_exons_file, "w") as internal_out:
        # rank of the last exon of every transcript in the exons file
        exon_records = gen.read_many_fields(exons_file, "\t")
        name_parts = [record[3].split(".") for record in exon_records
                      if len(record) > 3]
        ranks_by_transcript = gen.list_to_dict(name_parts, 0, 1, as_list=True)
        last_rank = {}
        for transcript in ranks_by_transcript:
            last_rank[transcript] = max(
                [int(rank) for rank in ranks_by_transcript[transcript]])

        for record in gen.read_many_fields(intersect_output, "\t"):
            # column 11 holds the name of the overlapping CDS record
            cds_transcript = record[10].split(".")[0]
            if cds_transcript not in last_rank:
                continue
            exon_name = record[3].split(".")
            if exon_name[-1] == "1":
                # first exon of the transcript
                continue
            if int(exon_name[-1]) == last_rank[exon_name[0]]:
                # last exon of the transcript
                continue
            fields = [str(field) for field in record[:7]]
            internal_out.write("{0}\n".format("\t".join(fields)))

    sort_bed(internal_exons_file, internal_exons_file)
    merge_command = ["mergeBed", "-i", internal_exons_file,
                     "-c", "4,5,6,7",
                     "-o", "distinct,distinct,distinct,distinct"]
    gen.run_process(merge_command, file_for_output=output_file)

    gen.remove_file(intersect_output)
    gen.remove_file(internal_exons_file)
def main():
    """
    Command-line entry point: container for the lincRNA analyses.

    Parses the command line with gen.parse_arguments and then runs every
    analysis whose flag was set. Positional arguments are input_bed,
    input_fasta and output_directory; input_fasta2, input_file,
    required_simulations, motif_file, families_file, output_prefix,
    controls_dir and seq_no are passed as opt_flags; everything else is a
    flag selecting an analysis. All test outputs are written below
    "<output_directory>/tests/lincrna".

    Several flags (density_sim, get_exon_dint_controls,
    get_intron_dint_controls, exon_region_density, compare_stop_density,
    motif_nd, single_exon, motif_overlap, motif_overlap_density) are accepted
    but have no corresponding branch in this function.
    """

    arguments = [
        "input_bed", "input_fasta", "output_directory", "input_fasta2",
        "input_file", "required_simulations", "motif_file", "families_file",
        "output_prefix", "controls_dir", "extract_sequences", "calc_gc",
        "density_sim", "get_exon_dint_controls", "get_intron_dint_controls",
        "exon_region_density", "compare_stop_density", "sim_orf_lengths",
        "sim_orf_lengths_masked", "sim_stop_density",
        "sim_stop_density_introns", "sim_stop_density_within_genes",
        "sim_stop_density_removed_motifs",
        "sim_stop_density_removed_motifs_sim_seqs", "sim_stop_density_diff",
        "exon_intron_density", "motif_nd", "excess_test", "single_exon",
        "motif_overlap", "motif_overlap_density", "clean_alignments",
        "seq_hits_linc", "upstream_atg", "excess_length_thresholds",
        "density_regions", "extract_second", "seq_no"
    ]
    description = "Container for analysis on lincRNAs"
    # indices 10-36 are the analysis flags
    args = gen.parse_arguments(description, arguments,
                               flags=list(range(10, 37)),
                               opt_flags=[3, 4, 5, 6, 7, 8, 9, 37])

    input_bed = args.input_bed
    input_fasta = args.input_fasta
    output_directory = args.output_directory
    input_fasta2 = args.input_fasta2
    input_file = args.input_file
    motif_file = args.motif_file
    families_file = args.families_file
    controls_dir = args.controls_dir

    # make required simulations an int
    required_simulations = int(
        args.required_simulations) if args.required_simulations else None
    # process output prefix
    output_prefix = args.output_prefix + "_" if args.output_prefix else ""
    # seq_no is not used by any branch below; the conversion is kept so that
    # a non-numeric value is still rejected up front
    seq_no = int(args.seq_no) if args.seq_no else None

    # create the output directory if it doesn't already exist
    global_output_directory = "{0}/tests/lincrna".format(output_directory)
    gen.create_output_directories(global_output_directory)

    def motif_name():
        # Evaluated lazily: motif_file is optional and only some analyses
        # need it.
        return motif_file.split("/")[-1].split(".")[0]

    def stop_density_directory():
        # Directory shared by all of the stop density analyses.
        path = "{0}/stop_density".format(global_output_directory)
        gen.create_output_directories(path)
        return path

    # get the sequences
    if args.extract_sequences:
        lincRNA_single_exon_bed = "{0}/lincrna/lincRNA.single_exon.bed".format(
            output_directory)
        lincRNA_single_exon_fasta = "{0}/lincrna/lincRNA.single_exon.fasta".format(
            output_directory)
        lincRNA_single_exon_families = "{0}/lincrna/lincRNA.single_exon_families.bed".format(
            output_directory)
        lincRNA_multi_exon_bed = "{0}/lincrna/lincRNA.multi_exon.bed".format(
            output_directory)
        lincRNA_multi_exon_intron_bed = "{0}/lincrna/lincRNA.multi_exon.introns.bed".format(
            output_directory)
        lincRNA_multi_exon_fasta = "{0}/lincrna/lincRNA.multi_exon.fasta".format(
            output_directory)
        lincRNA_multi_exon_intron_fasta = "{0}/lincrna/lincRNA.multi_exon.introns.fasta".format(
            output_directory)
        lincRNA_multi_exon_families = "{0}/lincrna/lincRNA.multi_exon_families.bed".format(
            output_directory)
        cont.extract_lincRNA_sequences(input_bed,
                                       input_fasta,
                                       lincRNA_single_exon_bed,
                                       lincRNA_multi_exon_bed,
                                       lincRNA_single_exon_fasta,
                                       lincRNA_multi_exon_fasta,
                                       lincRNA_multi_exon_intron_bed,
                                       lincRNA_multi_exon_intron_fasta,
                                       lincRNA_single_exon_families,
                                       lincRNA_multi_exon_families,
                                       clean_run=None)

    # clean the alignments to get in usable form
    # might need this
    if args.clean_alignments:
        # The templates used to be passed on without being formatted, i.e.
        # with a literal "{0}" as the directory name.
        # NOTE(review): the test output directory is assumed to be the
        # intended location -- confirm.
        output_exon_file = "{0}/clean_exon_alignments.fasta".format(
            global_output_directory)
        output_intron_file = "{0}/clean_intron_alignments.fasta".format(
            global_output_directory)
        ltests.clean_alignments(input_bed, input_fasta, output_exon_file,
                                output_intron_file)

    if args.calc_gc:
        output_file = "{0}/{1}_gc.csv".format(global_output_directory,
                                              output_prefix)
        ltests.calc_gc(input_fasta, output_file, families_file=families_file)

    # orf length test
    if args.sim_orf_lengths:
        sim_orf_length_output_file = "{0}/{1}sim_orf_lengths.csv".format(
            global_output_directory, output_prefix)
        if families_file:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs_grouped.csv".format(
                global_output_directory, output_prefix)
        else:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs.csv".format(
                global_output_directory, output_prefix)
        # run the test
        simopc.sim_orf_length(input_fasta, required_simulations,
                              sim_orf_length_output_file)
        ltests.process_length_sim(sim_orf_length_output_file,
                                  sim_orf_length_z_file,
                                  families_file=families_file)

    if args.sim_orf_lengths_masked:
        masked_output_file = "{0}_{1}_masked.csv".format(
            input_file.split(".")[0], motif_name())
        # run the test
        simopc.sim_orf_length_masked(input_fasta,
                                     required_simulations,
                                     motif_file,
                                     input_file,
                                     controls_dir,
                                     masked_output_file,
                                     families_file=families_file)

    # stop density test
    if args.sim_stop_density:
        local_output_directory = stop_density_directory()
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_output_dir)

        for run in range(runs):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # within genes
    if args.sim_stop_density_within_genes:
        local_output_directory = stop_density_directory()
        if families_file:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_within_gene_output_dir)

        for run in range(runs):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_within_gene_output_dir, run + 1)
            ltests.sim_stop_density_within_genes(
                input_fasta,
                output_file,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_within_gene_outputs(
            sim_stop_density_within_gene_output_dir,
            sim_stop_density_within_gene_output_file)

    # stop density test in the introns
    if args.sim_stop_density_introns:
        local_output_directory = stop_density_directory()
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
        # a single run is used whether or not sequences are grouped by family
        runs = 1
        gen.create_output_directories(sim_stop_density_output_dir)

        for run in range(runs):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file,
                                    introns=True,
                                    input_fasta2=input_fasta2)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # remove motifs and test
    if args.sim_stop_density_removed_motifs:
        local_output_directory = stop_density_directory()
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs".format(
                local_output_directory, output_prefix, motif_name())
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs.csv".format(
                local_output_directory, output_prefix, motif_name())
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix, motif_name())
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix, motif_name())
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs(
                input_fasta,
                run_output_file,
                motif_file,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    # remove motifs and test within seqs
    if args.sim_stop_density_removed_motifs_sim_seqs:
        local_output_directory = stop_density_directory()
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim".format(
                local_output_directory, output_prefix, motif_name())
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim.csv".format(
                local_output_directory, output_prefix, motif_name())
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim".format(
                local_output_directory, output_prefix, motif_name())
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim.csv".format(
                local_output_directory, output_prefix, motif_name())
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs_seq_sim(
                input_fasta,
                run_output_file,
                motif_file,
                controls_dir,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    if args.sim_stop_density_diff:
        # the stop_density directory is created here, but the outputs of this
        # analysis are written directly under global_output_directory
        stop_density_directory()
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_diff_grouped_families".format(
                global_output_directory, output_prefix, motif_name())
            sim_output_file = "{0}/{1}_{2}_stop_density_stop_density_diff_grouped_families.csv".format(
                global_output_directory, output_prefix, motif_name())
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_stop_density_diff_all_genes".format(
                global_output_directory, output_prefix, motif_name())
            sim_output_file = "{0}/{1}_{2}_stop_density_stop_density_diff_all_genes.csv".format(
                global_output_directory, output_prefix, motif_name())
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in range(runs):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_diff(input_fasta,
                                         run_output_file,
                                         motif_file,
                                         controls_dir,
                                         simulations=int(required_simulations),
                                         families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_diffs(sim_output_dir,
                                              sim_output_file,
                                              greater_than=False)

    # get density in exons and introns
    if args.exon_intron_density:
        local_output_directory = stop_density_directory()
        output_file = "{0}/exon_intron_stop_density.csv".format(
            local_output_directory)
        ltests.exon_intron_stop_density(input_fasta,
                                        input_fasta2,
                                        output_file,
                                        families_file=families_file)

    # test whether there is an excess in flanks
    if args.excess_test:
        gen.check_files_exists([input_fasta, motif_file])
        # local output directory
        local_output_directory = "{0}/stop_excesses".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        # if the families file exists, group by family
        if families_file:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses_grouped.csv".format(
                local_output_directory, motif_name())
        else:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses.csv".format(
                local_output_directory, motif_name())
        # run the test
        ltests.excess_test(input_fasta,
                           motif_file,
                           excess_test_output_file,
                           simulations=required_simulations,
                           families_file=families_file)

    # upstream from the atg
    if args.upstream_atg:
        output_file = "{0}/stop_density/upstream_atg_stop_density.csv".format(
            global_output_directory)
        ltests.upstream_atg(input_fasta,
                            output_file,
                            simulations=int(required_simulations),
                            families_file=families_file)

    # calculate the density in the different regions
    if args.density_regions:
        local_output_directory = stop_density_directory()
        output_file = "{0}/stop_density_regions_chisq.csv".format(
            local_output_directory)
        output_file1 = "{0}/stop_density_regions1.csv".format(
            local_output_directory)
        output_file2 = "{0}/stop_density_regions_per_seq.csv".format(
            local_output_directory)
        ltests.density_regions(input_fasta,
                               motif_file,
                               output_file,
                               output_file1,
                               output_file2,
                               required_simulations=required_simulations,
                               families_file=families_file)

    # test hits to seqs
    if args.seq_hits_linc:
        local_output_dir = "{0}/ese_hits".format(global_output_directory)
        # output_prefix already carries a trailing "_" when it is set; the
        # resulting file names are kept exactly as they were so that existing
        # outputs are still found
        if output_prefix:
            tests_output_dir = "{0}/{1}_{2}".format(local_output_dir,
                                                    output_prefix[:-1],
                                                    motif_name())
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix, motif_name())
        else:
            tests_output_dir = "{0}/{1}_{2}".format(local_output_dir,
                                                    output_prefix,
                                                    motif_name())
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix[:-1], motif_name())
        gen.create_output_directories(tests_output_dir)

        runs = 10
        for run in range(runs):
            if output_prefix:
                output_file = "{0}/{1}_{2}_hits_{3}.csv".format(
                    tests_output_dir, output_prefix, motif_name(), run + 1)
            else:
                output_file = "{0}/{1}_hits_{2}.csv".format(
                    tests_output_dir, motif_name(), run + 1)
            mto.calc_seq_hits_linc(input_fasta,
                                   output_file,
                                   motif_file,
                                   controls_dir,
                                   required_simulations=required_simulations,
                                   families_file=families_file)
        mto.process_seq_hits_linc(tests_output_dir, final_output_file)

    if args.excess_length_thresholds:
        local_output_dir = "{0}/orf_length_thresholds".format(
            global_output_directory)
        gen.create_output_directories(local_output_dir)
        # pass the directory created just above; the previous code passed
        # local_output_directory, which is only defined when another
        # analysis has run first (and then points somewhere else)
        ltests.orf_exceed_length_threshold(
            input_fasta,
            local_output_dir,
            required_simulations=required_simulations,
            families_file=families_file)

    # extract second set
    if args.extract_second:
        local_output_dir = "{0}/genome_sequences/lincrna/{1}".format(
            output_directory, output_prefix)
        lmisco.extract_second_seqs(input_bed, input_file, input_fasta,
                                   local_output_dir)
def process_bam_per_individual(bam_files, global_exon_junctions_file,
                               PTC_exon_junctions_file, out_folder, PTC_file,
                               syn_nonsyn_file, out_prefix,
                               exon_junctions_bam_output_folder, kw_dict):
    '''
    Do all of the processing on individual bams, from filtering out low
    quality data to mapping reads to exon-exon junctions.
    For each bam, a file with the number of reads falling at the different
    exon-exon junctions is written to out_folder; nothing is returned.
    A bam whose output file already exists is skipped.

    Args:
        bam_files (list): paths to the bam files to process
        global_exon_junctions_file (str): bed file with all exon-exon junctions
        PTC_exon_junctions_file (str): bed file with the junctions flanking
            PTC-containing exons
        out_folder (str): folder for the per-sample junction count files
        PTC_file (str): PTC SNP file (only used when phasing)
        syn_nonsyn_file (str): other SNPs file (only used when phasing)
        out_prefix (str): prefix used to name the intermediate files folder
        exon_junctions_bam_output_folder (str): folder holding the bams
            intersected with all junctions and the cached read counts
        kw_dict (dict): optional settings, passed as a dictionary to make the
            function easier to parallelize. Recognised keys:
            "ptc_snp_simulation" (default False),
            "simulation_instance_folder" (default None),
            "simulation_number" (default None),
            "overwrite_intersect" (default False),
            "phase" (default False).
    '''
    ptc_snp_simulation = kw_dict.get("ptc_snp_simulation", False)
    simulation_instance_folder = kw_dict.get("simulation_instance_folder")
    simulation_number = kw_dict.get("simulation_number")
    overwrite_intersect = kw_dict.get("overwrite_intersect", False)
    phase = kw_dict.get("phase", False)

    bam_file_number = len(bam_files)
    for pos, bam_file in enumerate(bam_files):

        #Process:
        # 1. get the number of reads in bam
        # 2. Filter out reads that don't overlap exon-exon junctions
        # 3. Filter out reads that don't overlap exon-exon junctions flanking PTC-containing exons
        # 4. Filter bams by quality
        # This gives us a set of "good" quality reads.
        # 5. scale down total read number proportionally to how many reads were lost in the quality filtering
        # 6. Count reads either skipping or including each exon

        print("{0}/{1}: {2}".format(pos, bam_file_number, bam_file))

        sample_name = (bam_file.split("/")[-1]).split(".")[0]
        if ptc_snp_simulation:
            output_file = "{0}/{1}_simulation_{2}.txt".format(
                out_folder, sample_name, simulation_number)
        else:
            output_file = "{0}/{1}.txt".format(out_folder, sample_name)

        #folder that will contain all of the intermediate steps in the processing of the bam file
        if ptc_snp_simulation:
            proc_folder = "{0}/bam_proc_files".format(
                simulation_instance_folder)
        else:
            proc_folder = "{0}__analysis_bam_proc_files".format(out_prefix)
        gen.create_output_directories(proc_folder)

        bam_file_parts = os.path.split(bam_file)
        mapq_filtered_bam = "{0}/{1}_filtered_mapq.bam".format(
            proc_folder, bam_file_parts[1])
        mapq_flag_filtered_bam = "{0}_flag.bam".format(mapq_filtered_bam[:-4])
        mapq_flag_xt_filtered_bam = "{0}_xt.bam".format(
            mapq_flag_filtered_bam[:-4])
        mapq_flag_xt_nm_filtered_bam = "{0}_nm.bam".format(
            mapq_flag_xt_filtered_bam[:-4])

        if os.path.isfile(output_file):
            # this sample has already been processed
            continue

        #1: We get a count of the total reads in the sample which can be used for normalisation
        #It is initialized to None for safety. That way, if the process fails,
        #it won't just silently go with whatever the value was at the end of the previous loop.
        #The count is also written down because this bit takes a long time.
        #The cache file is named after the sample: it used to be called
        #"read_count_sample_name.txt" for every bam, so that all samples
        #picked up the read count of whichever sample was counted first.
        read_count_file_name = "{0}/read_count_{1}.txt".format(
            exon_junctions_bam_output_folder, sample_name)
        read_count = None
        if os.path.isfile(read_count_file_name):
            with open(read_count_file_name) as read_count_file:
                read_count = int("".join(read_count_file))
        else:
            read_count = int(
                gen.run_process(["samtools", "view", "-c", bam_file]))
            with open(read_count_file_name, "w") as read_count_file:
                read_count_file.write(str(read_count))

        #2: intersect the bam with all exon-exon junctions
        #only has to be done once for each bam
        global_intersect_bam = "{0}/{1}_exon_junctions.bam".format(
            exon_junctions_bam_output_folder, bam_file_parts[1][:-4])
        if not os.path.isfile(global_intersect_bam) or overwrite_intersect:
            #intersect the bam and the global exon junctions file
            bmo.intersect_bed(bam_file,
                              global_exon_junctions_file,
                              output_file=global_intersect_bam,
                              intersect_bam=True)

        #3: filter to relevant exon-exon junctions
        ##Intersect junctions and .bam, and write down the overlapping .bam alignments, without counting.
        #this uses intersect bed, with the intersect bam parameter
        intersect_bam = "{0}/{1}_exon_junction_bam_intersect.bam".format(
            proc_folder, bam_file_parts[1][:-4])
        bmo.intersect_bed(global_intersect_bam,
                          PTC_exon_junctions_file,
                          output_file=intersect_bam,
                          intersect_bam=True)

        #count how many reads there are in the sample after filtering to relevant exon-exon junctions but before quality filtering
        read_count_junctions_no_filter = int(
            gen.run_process(["samtools", "view", "-c", intersect_bam]))

        #4. filter .bam alignments by quality.
        #takes both upper and lower bam thresholds
        #this is done once per interval and the results merged, so that both
        #intervals used by Geuvadis are covered
        #set the mapq filter parameters here
        mapq_intervals = [[251, 255], [175, 181]]
        mapq_filter_filelist = []

        for lower_threshold, upper_threshold in mapq_intervals:
            mapq_filter_file = "{0}/{1}_mapq_filter_{2}_{3}.bam".format(
                proc_folder, bam_file_parts[1][:-4], lower_threshold,
                upper_threshold)
            mapq_filter_filelist.append(mapq_filter_file)
            ##run the mapq filter
            bmo.bam_quality_filter(
                intersect_bam,
                mapq_filter_file,
                quality_greater_than_equal_to=lower_threshold,
                quality_less_than_equal_to=upper_threshold)

        ##merge files in filelist
        bmo.merge_bams(mapq_filter_filelist, mapq_filtered_bam)

        ##filter by flags: get all mapped reads
        #Leaves: mapped unpaired and paired reads
        bmo.bam_flag_filter(mapq_filtered_bam,
                            mapq_flag_filtered_bam,
                            get_mapped_reads=True)

        ##filter bam by xt tag XT=U
        bmo.bam_xt_filter(mapq_flag_filtered_bam,
                          mapq_flag_xt_filtered_bam,
                          xt_filter="U")

        ##filter bam by nm tag NM<=6
        bmo.bam_nm_filter(mapq_flag_xt_filtered_bam,
                          mapq_flag_xt_nm_filtered_bam,
                          nm_less_equal_to=6)

        #5. scale down the initial count of reads in the sample by the proportion lost during quality filtering
        read_count_junctions_filter = int(
            gen.run_process(
                ["samtools", "view", "-c", mapq_flag_xt_nm_filtered_bam]))
        prop_kept = np.divide(read_count_junctions_filter,
                              read_count_junctions_no_filter)
        read_count = prop_kept * read_count

        #convert to sam format and phase reads
        intersect_sam = "{0}_phased.sam".format(
            mapq_flag_xt_nm_filtered_bam[:-4])
        if phase:
            temp_snp_file = "temp_data/snps{0}.txt".format(random.random())
            so.merge_and_header(PTC_file, syn_nonsyn_file, temp_snp_file)
            bmo.phase_bams(temp_snp_file, mapq_flag_xt_nm_filtered_bam,
                           sample_name, intersect_sam)
            gen.remove_file(temp_snp_file)
        else:
            gen.run_process(
                ["samtools", "view", mapq_flag_xt_nm_filtered_bam],
                file_for_output=intersect_sam)

        #6. count the number of reads supporting either the skipping or the inclusion of each exon
        junctions = bmo.read_exon_junctions(PTC_exon_junctions_file)
        bmo.count_junction_reads(intersect_sam, junctions, output_file,
                                 read_count)
def main():
    """
    Command-line entry point: check whether PTCs are associated with greater
    rates of exon skipping.

    The arguments are parsed with gen.parse_arguments; the first ten are
    values (number_of_simulations is parsed as an int) and the remaining ones
    are flags that switch the individual pipeline stages on or off. Most
    outputs are named from out_prefix, which is extended with
    "_out_of_frame" and/or the motif set name as the run proceeds, so the
    order of the steps below matters.
    """

    description = "Check whether PTCs are associated with greater rates of exon skipping."
    argument_names = [
        "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
        "out_prefix", "bam_analysis_folder", "number_of_simulations",
        "simulation_output_folder", "motif_file", "filter_genome_data",
        "get_SNPs", "process_bams", "simulate_ptc_snps", "motif_complement",
        "overwrite_intersect", "use_old_sims", "out_of_frame",
        "simulate_ptcs_with_monomorphic", "generate_monomorphic_indices",
        "ignore_determine_snp_type", "ignore_psi_calculation",
        "ptc_location_analysis"
    ]
    args = gen.parse_arguments(
        description,
        argument_names,
        flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        ints=[7])

    gtf = args.gtf
    genome_fasta = args.genome_fasta
    bams_folder = args.bams_folder
    vcf_folder = args.vcf_folder
    panel_file = args.panel_file
    out_prefix = args.out_prefix
    bam_analysis_folder = args.bam_analysis_folder
    number_of_simulations = args.number_of_simulations
    simulation_output_folder = args.simulation_output_folder
    motif_file = args.motif_file
    filter_genome_data = args.filter_genome_data
    get_SNPs = args.get_SNPs
    process_bams = args.process_bams
    simulate_ptc_snps = args.simulate_ptc_snps
    motif_complement = args.motif_complement
    overwrite_intersect = args.overwrite_intersect
    use_old_sims = args.use_old_sims
    out_of_frame = args.out_of_frame
    simulate_ptcs_with_monomorphic = args.simulate_ptcs_with_monomorphic
    generate_monomorphic_indices = args.generate_monomorphic_indices
    ignore_determine_snp_type = args.ignore_determine_snp_type
    ignore_psi_calculation = args.ignore_psi_calculation
    ptc_location_analysis = args.ptc_location_analysis

    start = time.time()

    # create any necessary output directories: everything in out_prefix up
    # to the last "/" is the directory part
    gen.create_output_directories(out_prefix.rpartition('/')[0])
    gen.create_directory('temp_data/')

    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)

    if filter_genome_data:
        families_file = "{0}_families.txt".format(out_prefix)

        # extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf,
                       CDS_bed,
                       CDS_fasta,
                       genome_fasta,
                       all_checks=True,
                       uniquify=True,
                       clean_chrom_only=True,
                       full_chr_name=True)
        gen.get_time(start)

        # group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        cds_names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl(
            "../source_data/GRCh37_ensembl_protein_families.txt", cds_names,
            families_file)
        gen.get_time(start)

        print("Extracting and filtering exons...")
        # extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        # only leave exons from transcripts that passed quality control in
        # the extract_cds step above; also only leave a single gene per family
        bo.filter_bed_from_fasta(exon_bed,
                                 CDS_fasta,
                                 filtered_exon_bed,
                                 families_file=families_file)
        gen.get_time(start)

        # extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed,
                                  exon_junctions_file,
                                  window_of_interest=2)
        gen.get_time(start)

        # Make another exons bed that only contains fully coding exons,
        # because the final analysis should only consider those. The
        # junctions had to be taken from the full exons file first, since a
        # fully protein-coding exon may be flanked by exons that are not.
        print(
            "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..."
        )
        bo.check_coding(filtered_exon_bed,
                        CDS_bed,
                        coding_exon_bed,
                        remove_overlapping=True)
        gen.get_time(start)

    # the SNP file keeps the plain prefix; everything named after this point
    # carries the out of frame suffix when that mode is on
    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    if out_of_frame:
        out_prefix = out_prefix + "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    cds_root, cds_extension = os.path.splitext(CDS_fasta)
    # not referenced again within this function
    CDS_interval_file = "{0}_intervals{1}".format(cds_root, cds_extension)

    # check which individuals were included in Geuvadis
    full_sample_names = [
        entry for entry in os.listdir(bams_folder)
        if entry.endswith(".bam") and "proc" not in entry
    ]
    sample_names = [entry.split(".")[0] for entry in full_sample_names]
    sample_names = [name for name in sample_names if name]
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))

    # For some reason, 17 of the samples from Geuvadis don't appear in the
    # 1000genomes vcf. This still needs to be investigated; for now they are
    # simply filtered out.
    with open("../source_data/samples_in_vcf.txt") as vcf_samples:
        samples_in_vcf = [line.rstrip("\n") for line in vcf_samples]
    sample_names = [name for name in sample_names if name in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))
    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing all sequences for exons with snp
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed,
                            coding_exons_fasta,
                            genome_fasta,
                            names=True)

    if get_SNPs:
        # get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file,
                           sample_names, sample_file, intersect_file,
                           out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file,
                             out_prefix)
        gen.get_time(start)

    if not ignore_determine_snp_type:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file,
                                 CDS_fasta,
                                 PTC_file,
                                 syn_nonsyn_file,
                                 out_of_frame=out_of_frame,
                                 ref_check=True,
                                 headers=True)
        gen.get_time(start)

    # filter the exon junctions file to only leave those junctions that
    # flank exons retained in the previous step
    print(
        "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..."
    )
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(
        out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file,
                             PTC_exon_junctions_file)

    # list the .bam files of the retained samples with their full path
    # rather than just the file name
    bam_files = []
    for entry in full_sample_names:
        if entry.split(".")[0] in sample_names:
            bam_files.append("{0}/{1}".format(bams_folder, entry))

    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)

    # in parallel, do the processing on individual .bam files
    if process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            # strip the out of frame suffix from the last path component of
            # the junction bams folder
            path_parts = exon_junctions_bam_output_folder.split('/')
            path_parts[-1] = path_parts[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(path_parts)
        gen.create_directory(exon_junctions_bam_output_folder)
        # flags can't be passed into run_in_parallel directly, so they
        # travel in a dictionary
        keyword_dict = {"overwrite_intersect": overwrite_intersect}
        worker_arguments = [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict
        ]
        processes = gen.run_in_parallel(bam_files,
                                        worker_arguments,
                                        nao.process_bam_per_individual,
                                        workers=36)
        for process in processes:
            process.get()
        gen.get_time(start)

    # if required, filter PTCs to only leave ones that overlap motifs from a
    # specified set
    if motif_file != "None":
        print(
            "Filtering SNPs based on whether or not they overlap a motif from the specified set..."
        )
        motif_suffix = motif_file.split("/")[-1].split(".")[0]
        if motif_complement:
            out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix)
        else:
            out_prefix = "{0}_{1}".format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta,
                             PTC_file,
                             motif_file,
                             filtered_ptc,
                             complement=motif_complement)
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    if not ignore_psi_calculation:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    # run the simulation that swaps ptcs for nonsynonymous snps
    if simulate_ptc_snps:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix,
                               simulation_output_folder,
                               PTC_file,
                               syn_nonsyn_file,
                               exon_junctions_file,
                               bam_files,
                               number_of_simulations,
                               exon_junctions_bam_output_folder,
                               use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if simulate_ptcs_with_monomorphic:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix,
            simulation_output_folder,
            sample_file,
            genome_fasta,
            PTC_file,
            syn_nonsyn_file,
            coding_exon_bed,
            coding_exon_fasta,
            exon_junctions_file,
            bam_files,
            number_of_simulations,
            generate_indices=generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(
            out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(
            out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        required_inputs_present = (
            os.path.exists(coding_exon_fasta)
            and os.path.exists(snp_relative_exon_position_file)
            and os.path.exists(PTC_file))
        if not required_inputs_present:
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file,
                         ptc_location_analysis_output_file)
def main():
    """Extract genome feature sequences as requested on the command line.

    Parses the command-line arguments through ``gen.parse_arguments``,
    creates the output directory and then runs each extraction step whose
    switch was given:

    * ``extract_protein_coding`` -- ``cont.extract_clean_sequences``
    * ``extract_exons`` -- ``cont.extract_exons`` followed by
      ``sequo.clean_feature_file``
    * ``extract_coding_exons`` / ``extract_non_coding_exons`` -- exon
      coordinates plus a fasta built with ``fo.fasta_from_intervals``
    * ``extract_introns`` -- intron coordinates plus fasta
    * ``extract_non_transcribed_regions`` --
      ``seqo.get_non_transcribed_regions``
    * ``extract_lincrna_seqs`` -- ``fo.extract_seqs``

    All output paths are derived from ``output_directory``.
    """
    # The argument names are grouped by kind so that the index lists handed
    # to gen.parse_arguments are computed rather than hand-counted.
    # "genome_fasta" was previously listed twice; presumably each name is
    # registered as an argparse option, in which case the duplicate would be
    # rejected as a conflicting option string, so it is listed once here.
    positional = ["output_directory"]
    valued_options = [
        "genome_gtf",
        "genome_fasta",
        "ortholog_gtf",
        "ortholog_fasta",
        "input_file",
        "mapping_file",
        "codes_file",
        "ensembl_links",
    ]
    switches = [
        "extract_protein_coding",
        "extract_exons",
        "extract_introns",
        "extract_coding_exons",
        "extract_non_coding_exons",
        "extract_non_transcribed_regions",
        "extract_lincrna_seqs",
        "clean_run",
    ]
    arguments = positional + valued_options + switches
    first_switch = len(positional) + len(valued_options)

    description = ""
    args = gen.parse_arguments(
        description,
        arguments,
        opt_flags=list(range(len(positional), first_switch)),
        flags=list(range(first_switch, len(arguments))),
    )

    output_directory = args.output_directory
    genome_gtf, genome_fasta = args.genome_gtf, args.genome_fasta
    ortholog_gtf, ortholog_fasta = args.ortholog_gtf, args.ortholog_fasta
    input_file = args.input_file
    mapping_file, codes_file = args.mapping_file, args.codes_file
    ensembl_links = args.ensembl_links
    extract_protein_coding = args.extract_protein_coding
    extract_exons = args.extract_exons
    extract_introns = args.extract_introns
    extract_coding_exons = args.extract_coding_exons
    extract_non_coding_exons = args.extract_non_coding_exons
    extract_non_transcribed_regions = args.extract_non_transcribed_regions
    extract_lincrna_seqs = args.extract_lincrna_seqs
    clean_run = args.clean_run

    # set a start time
    start = time.time()

    # create the output_directory if it doesn't already exist
    gen.create_output_directories(output_directory)

    # get the sequences
    if extract_protein_coding:
        # genome_gtf / genome_fasta describe genome 1, ortholog_gtf /
        # ortholog_fasta describe genome 2, ensembl_links is the orthologs file
        cont.extract_clean_sequences(genome_gtf,
                                     genome_fasta,
                                     ortholog_gtf,
                                     ortholog_fasta,
                                     ensembl_links,
                                     output_directory,
                                     clean_run=clean_run)

    full_exon_file = "{0}/genome_sequences/human/human.exons.bed".format(
        output_directory)
    if extract_exons:
        cont.extract_exons(genome_gtf,
                           genome_fasta,
                           output_directory,
                           full_exon_file,
                           clean_run=clean_run)
        sequo.clean_feature_file(full_exon_file)

    # These paths are defined unconditionally because later steps (introns,
    # non-coding exons) read them even when the coding exon step is skipped.
    exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_filtered_exons.bed".format(
        output_directory, "human")
    coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.bed".format(
        output_directory, "human")
    coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.fasta".format(
        output_directory, "human")
    if extract_coding_exons:
        sequo.get_coding_exon_coordinates(full_exon_file, exons_bed,
                                          coding_exons_bed)
        fo.fasta_from_intervals(coding_exons_bed,
                                coding_exons_fasta,
                                genome_fasta,
                                names=True)

    if extract_non_coding_exons:
        non_coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.bed".format(
            output_directory, "human")
        non_coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.fasta".format(
            output_directory, "human")
        sequo.get_non_coding_exon_coordinates(full_exon_file, exons_bed,
                                              non_coding_exons_bed)
        fo.fasta_from_intervals(non_coding_exons_bed,
                                non_coding_exons_fasta,
                                genome_fasta,
                                names=True)

    if extract_introns:
        intron_bed = "{0}/genome_sequences/human/human.clean_introns.bed".format(
            output_directory)
        intron_fasta = "{0}/genome_sequences/human/human.clean_introns.fasta".format(
            output_directory)
        sequo.get_intron_coordinates(coding_exons_bed, intron_bed)
        fo.fasta_from_intervals(intron_bed,
                                intron_fasta,
                                genome_fasta,
                                names=True)

    if extract_non_transcribed_regions:
        all_features_bed = "{0}/genome_sequences/human/human.all_features.bed".format(
            output_directory)
        non_transcribed_bed = "{0}/genome_sequences/human/human.non_transcribed.bed".format(
            output_directory)
        non_transcribed_fasta = "{0}/genome_sequences/human/human.non_transcribed.fasta".format(
            output_directory)
        seqo.get_non_transcribed_regions(genome_gtf, genome_fasta,
                                         all_features_bed,
                                         non_transcribed_bed,
                                         non_transcribed_fasta,
                                         output_directory)

    # extract sequences from source file
    if extract_lincrna_seqs:
        # set up the output fasta to contain the exon seqs
        lincrna_exons_bed = "{0}/lincRNA_exons.bed".format(output_directory)
        lincrna_exons_fasta = "{0}/lincRNA_exons.fasta".format(
            output_directory)
        lincrna_seqs_fasta = "{0}/lincRNA_seqs.fasta".format(output_directory)
        print("Extracting lincRNA seqs...")
        # NOTE(review): hg38 and NONCODE are not assigned anywhere in this
        # function; this call raises NameError unless they exist as
        # module-level names -- confirm where they are meant to come from.
        fo.extract_seqs(input_file,
                        genome_fasta,
                        lincrna_exons_bed,
                        lincrna_exons_fasta,
                        lincrna_seqs_fasta,
                        mapping_file,
                        codes_file,
                        exclude_XY=True,
                        hg38=hg38,
                        NONCODE=NONCODE)
        print("Use lincRNA_misc.py to do further filtering...")