def get_unique_rel_pos(unique_ptcs, disease_snps_relative_exon_positions, kgenomes_ptcs_file, kgenomes_ptcs_exon_positions, unique_ptcs_rel_pos_file, kgenomes_ptcs_rel_pos_file):
    '''
    Rewrite two PTC files with field 11 replaced by a looked-up relative position.

    For each pair of files, a lookup keyed on int(field 7) -> int(field 11)
    is built from the "relative exon positions" file, then every row of the
    PTC file has its field 11 overwritten with the looked-up value and is
    written tab-separated to the output path.

    Args:
        unique_ptcs (str): path to the tab-separated unique PTC file
        disease_snps_relative_exon_positions (str): path to the file the
            disease lookup is built from (every row is used)
        kgenomes_ptcs_file (str): path to the tab-separated 1000 genomes PTC file
        kgenomes_ptcs_exon_positions (str): path to the file the 1000 genomes
            lookup is built from (the first row is skipped)
        unique_ptcs_rel_pos_file (str): output path for the unique PTCs
        kgenomes_ptcs_rel_pos_file (str): output path for the 1000 genomes PTCs
    '''

    def build_lookup(rows):
        # A position that was never recorded yields an empty defaultdict
        # instead of raising KeyError.
        lookup = collections.defaultdict(lambda: collections.defaultdict())
        for fields in rows:
            lookup[int(fields[7])] = int(fields[11])
        return lookup

    def annotate(lookup, source_path, target_path):
        # Read the whole input before the output file is opened.
        records = gen.read_many_fields(source_path, "\t")
        with open(target_path, "w") as handle:
            for record in records:
                record[11] = lookup[int(record[7])]
                handle.write("{0}\n".format("\t".join(gen.stringify(record))))

    # disease snps: every row contributes to the lookup
    disease_rows = gen.read_many_fields(disease_snps_relative_exon_positions, "\t")
    annotate(build_lookup(disease_rows), unique_ptcs, unique_ptcs_rel_pos_file)

    # 1000 genomes: the first row of the positions file is skipped
    kgenomes_rows = gen.read_many_fields(kgenomes_ptcs_exon_positions, "\t")
    annotate(build_lookup(kgenomes_rows[1:]), kgenomes_ptcs_file, kgenomes_ptcs_rel_pos_file)
def get_introns(exon_bed, output_file):
    '''
    Write the introns lying between consecutively numbered exons.

    Rows of exon_bed are grouped by transcript id and exon number (as parsed
    by Define_Exon); rows whose type is "stop_codon" are stored under the
    number 999999. For every stored row whose parsed exon number + 1 is also
    present for the transcript, one bed line is written:
    chr, intron_start, intron_stop, "<transcript>.<n>-<n+1>", ".", strand.

    Args:
        exon_bed (str): path to the tab-separated exon file
        output_file (str): path to the intron output file
    '''
    records_by_transcript = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict()))

    # index the raw rows by transcript and exon number
    for record in gen.read_many_fields(exon_bed, "\t"):
        parsed = Define_Exon(record)
        if parsed.type == "stop_codon":
            parsed.exon_no = 999999
        records_by_transcript[parsed.transcript_id][parsed.exon_no] = record

    with open(output_file, "w") as outfile:
        for transcript in records_by_transcript:
            for stored_no in sorted(records_by_transcript[transcript]):
                exon = Define_Exon(records_by_transcript[transcript][stored_no])
                following_no = exon.exon_no + 1
                # skip unless the next exon exists (ids are assumed to stay below 999999)
                if following_no not in records_by_transcript[transcript]:
                    continue
                next_exon = Define_Exon(records_by_transcript[exon.transcript_id][following_no])
                if exon.strand == "-":
                    intron_start, intron_stop = next_exon.stop, exon.start
                else:
                    intron_start, intron_stop = exon.stop, next_exon.start
                intron_id = "{0}.{1}-{2}".format(exon.transcript_id, exon.exon_no, next_exon.exon_no)
                fields = gen.stringify([exon.chr, intron_start, intron_stop, intron_id, ".", exon.strand])
                outfile.write("{0}\n".format("\t".join(fields)))
def get_genome_bed_from_fasta_index(features_bed, fasta_index, output_file):
    """
    Given a list of features, get the genome coordinates as a bed file.

    For every sequence in the fasta index whose name matches a chromosome
    used in features_bed, two bed lines spanning the full sequence
    (0 .. length) are written: one for the "+" strand, one for the "-".

    Args:
        features_bed (str): path to bed file containing features
        fasta_index (str): path to fasta index file
        output_file (str): path to output file
    """

    def drop_chr_prefix(name):
        # str.strip("chr") removes any of the characters c/h/r from BOTH ends
        # (e.g. "hs37d5" -> "s37d5"); only a leading "chr" should be removed.
        return name[3:] if name.startswith("chr") else name

    # get all the chromosomes required (first column of the features file)
    first_column_text = gen.run_process(["awk", "{print $1}"], file_for_input=features_bed)
    required_chromosomes = {
        drop_chr_prefix(name) for name in first_column_text.split('\n') if name
    }

    # get index lines: field 0 is the sequence name, field 1 its length
    index = gen.read_many_fields(fasta_index, "\t")
    with open(output_file, "w") as outfile:
        for entry in index:
            if entry[0] not in required_chromosomes:
                continue
            start = 0
            length = int(entry[1])
            out_info = [entry[0], start, start + length, ".", "."]
            # same coordinates emitted once per strand
            outfile.write("{0}\t+\n{0}\t-\n".format("\t".join(gen.stringify(out_info))))
def entries_to_bed(source_path, output_file, exclude_XY=None, hg38=None, NONCODE=None):
    """
    Generate a file containing the exon info from a bed file

    Each input row is parsed with gen.Bed_Feature and expanded into one
    output row per block (featureStarts / featureSizes). Rows whose number
    of starts or sizes does not match featureCount are reported and skipped.

    Args:
        source_path (str): the source path for the origin .bed file
        output_file (str): output .bed file to contain the exon info
        exclude_XY (bool): if true, exclude cases on X and Y chr
        hg38 (bool): if true, using hg38 (a leading "chr" is removed from
            the chromosome name)
        NONCODE (bool): if true, the name is cut at its first "." and only
            names containing "NONHSAT" are written
    """
    # Both spellings are listed because hg38 removes the "chr" prefix before
    # the X/Y check is made.
    sex_chromosomes = ("chrX", "chrY", "X", "Y")

    # read the file in
    lines = gen.read_many_fields(source_path, "\t")
    with open(output_file, "w") as outfile:
        for line in lines:
            features = gen.Bed_Feature(line)
            # the comma-separated lists are expected to end with a trailing comma
            starts = [int(start) for start in features.featureStarts.split(',')[:-1]]
            sizes = [int(size) for size in features.featureSizes.split(',')[:-1]]
            expected = features.featureCount
            if len(sizes) != expected or len(starts) != expected:
                print('Error in the number of features')
                continue

            # per-row normalisation, done once, not once per block
            write = True
            if hg38 and features.chr.startswith("chr"):
                # remove only the prefix; str.strip("chr") would also eat
                # trailing c/h/r characters
                features.chr = features.chr[3:]
            if NONCODE:
                features.name = features.name.split(".")[0]
                # only add if a transcript, used for NONCODE sequences
                if "NONHSAT" not in features.name:
                    write = False
            if exclude_XY and features.chr in sex_chromosomes:
                write = False
            if not write:
                continue

            for i, (offset, size) in enumerate(zip(starts, sizes)):
                # block offsets are relative to the feature start
                start_pos = features.start + offset
                end_pos = start_pos + size
                output_list = [
                    features.chr, start_pos, end_pos,
                    "{0}.{1}".format(features.name, i + 1), ".", features.strand,
                    features.thickStart, features.thickEnd, ".",
                    features.featureCount, ".", ".",
                ]
                # add the info if exists
                if hasattr(features, "info"):
                    output_list.extend(features.info)
                outfile.write("{0}\n".format("\t".join(gen.stringify(output_list))))
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content

    Simulated motif sets are generated into <output_directory>/random_hexamers
    when fewer than required_simulations files exist there. Purine and
    nucleotide content of the real motifs and of every simulated set are
    written to output_file as csv, then the hexamer directory is removed.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run;
            if None no new simulations are generated
        families_file (str): if set, path to families file
    """
    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)

    # get the motifs
    motifs = sequo.read_motifs(motif_file)

    # if there are not enough simulations, generate them
    # (None must be guarded: comparing an int with None raises TypeError)
    existing_simulations = len(os.listdir(hexamers_dir))
    if required_simulations is not None and existing_simulations < required_simulations:
        required = list(range(required_simulations - existing_simulations))
        # read the intron sequences from the input_fasta argument
        names, seqs = gen.read_fasta(input_fasta)
        # group sequences by the part of the name before the first "."
        seqs_list = collections.defaultdict(list)
        for name, seq in zip(names, seqs):
            seqs_list[name.split(".")[0]].append(seq)
        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)
        all_seqs = []
        for seq_id in seqs_list:
            all_seqs.extend(seqs_list[seq_id])
        # sequences are concatenated with "X" as the separator character
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        # nucleotide columns are emitted in sorted key order
        real_nts = gen.stringify([real_nt_content[nt] for nt in sorted(real_nt_content)])
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(real_nts)))
        for sim_no, (purine, nt_content) in enumerate(zip(test_purine_content, test_nt_content), start=1):
            sim_nts = gen.stringify([nt_content[nt] for nt in sorted(nt_content)])
            outfile.write("{0},{1},{2}\n".format(sim_no, purine, ",".join(sim_nts)))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
def write_features_to_bed(feature_list, output_file):
    """
    Write every feature held in a dictionary to a tab-separated bed file.

    Before a feature is written, its field 3 is replaced by
    "<field 3>.<field 4>" and its field 4 by ".". The change is made on the
    feature itself, so the entries of feature_list are modified in place.

    Args:
        feature_list (dict): dictionary containing features with transcripts as keys
        output_file (str): path to output file
    """
    with open(output_file, "w") as outfile:
        for transcript_features in feature_list.values():
            for fields in transcript_features:
                # right-hand side is evaluated first, so the old field 4 is
                # still available when the combined id is built
                fields[3], fields[4] = "{0}.{1}".format(fields[3], fields[4]), "."
                row = "\t".join(gen.stringify(fields))
                outfile.write("{0}\n".format(row))
def get_filtered_exons():
    '''
    Create a ptc file containing only the filtered final ptcs, and a list of
    the large effect cases that overlap disease.

    All paths are fixed, built from "results/clean_run_2/clean_run" and
    "results/clinvar". Two files are written:
      * <prefix>_large_effect_disease_overlap.txt: field 3 of every ptc that
        is both a large effect exon and present in the disease intersect
      * <prefix>_ptc_file_filtered_no_disease.txt: full rows of every ptc in
        the filtered set that is absent from the disease intersect
    '''
    output_prefix = "results/clean_run_2/clean_run"
    disease_prefix = "results/clinvar"

    ptc_file = "{0}_ptc_file.txt".format(output_prefix)
    final_output_file = "{0}__analysis_final_output.txt".format(output_prefix)
    intersect_file = "{0}/ptc_intersect.bed".format(disease_prefix)

    # the exon id sits four fields from the end of each intersect row
    intersect_list = [row[-4] for row in gen.read_many_fields(intersect_file, "\t")]

    # large effect exons are derived from the full filtered entries ...
    filtered_entries = get_filtered_skipped_exons(final_output_file)
    large_effects, _non_large_effects = get_large_effect_overlaps(filtered_entries, 5, 0.025)
    # ... whereas membership tests below only need the first field of each entry
    filtered_list = [entry[0] for entry in get_filtered_skipped_exons(final_output_file)]

    ptcs = gen.read_many_fields(ptc_file, "\t")
    ptc_list = [
        ptc for ptc in ptcs
        if ptc[3] in filtered_list and ptc[3] not in intersect_list
    ]
    le_overlap = [
        ptc for ptc in ptcs
        if ptc[3] in large_effects and ptc[3] in intersect_list
    ]

    large_effect_disease_overlap = "{0}_large_effect_disease_overlap.txt".format(output_prefix)
    with open(large_effect_disease_overlap, "w") as outfile:
        for ptc in le_overlap:
            outfile.write('{0}\n'.format(ptc[3]))

    filtered_no_overlaps = "{0}_ptc_file_filtered_no_disease.txt".format(output_prefix)
    with open(filtered_no_overlaps, "w") as outfile:
        for ptc in ptc_list:
            outfile.write("{0}\n".format("\t".join(gen.stringify(ptc))))
def get_relative_exon_positions(relative_positions_file, large_effect_exons, ptc_file, output_file="results/clean_run_2/clean_run_PTC_relative_exon_positions.bed"):
    '''
    Copy a ptc file, replacing field 11 of each row with a looked-up position.

    The lookup maps field 8 (the id) to int(field 11) for every row of
    relative_positions_file. The first row of ptc_file is treated as a
    header and copied unchanged; every later row has field 11 replaced by
    the value stored for its id.

    Args:
        relative_positions_file (str): path to the tab-separated file the
            id -> position lookup is built from
        large_effect_exons: not used by this function
        ptc_file (str): path to the tab-separated ptc file (with header row)
        output_file (str): path to the output file; defaults to the location
            that was previously fixed in the code

    Raises:
        KeyError: if a ptc id is missing from relative_positions_file
    '''
    snp_pos = {}
    for snp in gen.read_many_fields(relative_positions_file, "\t"):
        snp_pos[snp[8]] = int(snp[11])

    # Read the input before opening the output for writing, so an input that
    # shares the output path is not truncated before it has been read.
    ptcs = gen.read_many_fields(ptc_file, "\t")

    with open(output_file, "w") as outfile:
        if not ptcs:
            # nothing to copy, leave an empty file
            return
        # the header needs its own line terminator, otherwise it is fused
        # with the first data row
        outfile.write('{0}\n'.format("\t".join(ptcs[0])))
        for ptc in ptcs[1:]:
            ptc[11] = snp_pos[ptc[8]]
            outfile.write('{0}\n'.format("\t".join(gen.stringify(ptc))))
def get_introns_from_bed(input_bed, output_file):
    '''
    Write the introns between consecutively numbered exons of a bed file.

    Field 3 of each row is read as "<transcript>.<exon number>", where the
    number may be followed by a "(...)" suffix that is discarded. For each
    transcript, every exon n for which exon n + 1 is also present yields one
    line: chr, intron_start, intron_end, "<transcript>.<n>-<n+1>", ".", strand.
    The chromosome and strand used are those of the last row seen for the
    transcript.

    Args:
        input_bed (str): path to the tab-separated exon bed file
        output_file (str): path to the intron output file
    '''
    coordinates = collections.defaultdict(
        lambda: collections.defaultdict(lambda: []))
    strand_of = {}
    chromosome_of = {}

    for fields in gen.read_many_fields(input_bed, "\t"):
        name_parts = fields[3].split(".")
        transcript = name_parts[0]
        # keep only what precedes an optional "(" in the exon number
        exon_number = int(name_parts[1].split("(")[0])
        strand_of[transcript] = fields[5]
        chromosome_of[transcript] = fields[0]
        coordinates[transcript][exon_number] = [int(fields[1]), int(fields[2])]

    with open(output_file, "w") as outfile:
        for transcript, exons in coordinates.items():
            strand = strand_of[transcript]
            for exon_number in sorted(exons):
                following = exon_number + 1
                # an intron is only defined when the next exon exists
                if following not in exons:
                    continue
                if strand == "+":
                    intron_start = exons[exon_number][1]
                    intron_end = exons[following][0]
                else:
                    intron_start = exons[following][1]
                    intron_end = exons[exon_number][0]
                intron_id = "{0}.{1}-{2}".format(transcript, exon_number, following)
                row = [chromosome_of[transcript], intron_start, intron_end, intron_id, ".", strand]
                outfile.write("{0}\n".format("\t".join(gen.stringify(row))))
def ese_hit_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_file, window_start, window_end, exclude_cpg, clinvar=None):
    '''
    Simulate ese hits strictly within a region.

    The real ese hit count for ptcs inside the window is calculated, then
    `simulations` simulated counts are produced in parallel with
    simulate_ese_hits. Results are written to output_file as csv with the
    columns simulation, ese_hit_count, cant_count; the first data row is the
    real count.

    Args:
        rel_pos_file (str): path to the ptc relative positions file
        coding_exons_fasta (str): path to the coding exons fasta
        simulations (int): number of simulations to run
        output_file (str): path to the csv output file
        ese_file (str): path to the file containing eses
        window_start: start of the window, passed to get_ptcs_in_window
        window_end: end of the window; twice this value is passed to
            get_long_exons
        exclude_cpg: not used by this function
        clinvar: not used by this function
    '''
    # relative positions of the ptcs, the eses and the coding exons
    relative_positions_list = get_relative_position_list(rel_pos_file)
    ese_list = get_eses_from_file(ese_file)
    coding_exons = get_coding_exons(coding_exons_fasta)

    # restrict to ptcs lying in the window of sufficiently long exons
    long_exons = get_long_exons(relative_positions_list, coding_exons, window_end*2)
    window_ptcs = get_ptcs_in_window(long_exons, window_start, window_end, coding_exons)
    real_ese_hits = get_ese_hits(window_ptcs, coding_exons, ese_list)

    # simulate the hit counts for nt matched mutations
    simulation_ids = list(range(simulations))
    worker_args = ["foo", simulations, window_ptcs, coding_exons_fasta, ese_list, window_start, window_end]
    processes = gen.run_in_parallel(simulation_ids, worker_args, simulate_ese_hits)

    # merge the per-process dictionaries, later results win on shared keys
    simulation_outputs = {}
    for process in processes:
        simulation_outputs.update(process.get())

    with open(output_file, "w") as outfile:
        outfile.write("simulation,ese_hit_count,cant_count\n")
        outfile.write("real,{0},0\n".format(real_ese_hits))
        for simulation in sorted(simulation_outputs):
            hits = simulation_outputs[simulation]
            # simulation ids are reported 1-based
            row = gen.stringify([simulation+1, hits[0], hits[1]])
            outfile.write("{0}\n".format(",".join(row)))
def motif_codon_densities(motif_file, codon_combinations_file, motif_controls_directory, required_simulations, output_file):
    '''
    Compare codon set densities of a real motif set with control motif sets.

    calculate_motif_densities is run for the real motif file and for the
    first required_simulations files listed in motif_controls_directory.
    For each codon set, one csv row is written to output_file with the
    columns codons, gc_content, purine_content, density, nd, where
    nd = (real - mean(simulated)) / mean(simulated).

    Args:
        motif_file (str): path to the real motif file
        codon_combinations_file (str): path to the tab-separated codon sets file
        motif_controls_directory (str): directory holding control motif files
        required_simulations (int): number of control files to use
        output_file (str): path to the csv output file
    '''
    # the real file is keyed "real", controls are keyed by their list index
    filelist = {"real": motif_file}
    control_names = os.listdir(motif_controls_directory)[:required_simulations]
    for index, control_name in enumerate(control_names):
        filelist[index] = "{0}/{1}".format(motif_controls_directory, control_name)
    file_ids = list(filelist)

    codon_sets = gen.read_many_fields(codon_combinations_file, "\t")

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    result_files = simopc.run_simulation_function(
        file_ids, [filelist, codon_sets, temp_dir], calculate_motif_densities, sim_run = False)

    real_density_list = {}
    sim_density_list = collections.defaultdict(lambda: [])
    for result_file in result_files:
        rows = gen.read_many_fields(result_file, "\t")
        # a result whose path contains "real" holds the real densities
        if "real" in result_file:
            for row in rows:
                real_density_list[row[0]] = float(row[1])
        else:
            for row in rows:
                sim_density_list[row[0]].append(float(row[1]))

    with open(output_file, "w") as outfile:
        outfile.write("codons,gc_content,purine_content,density,nd\n")
        for codon_set in sorted(real_density_list):
            real_density = real_density_list[codon_set]
            sim_mean = np.mean(sim_density_list[codon_set])
            nd = np.divide(real_density - sim_mean, sim_mean)
            row = [
                codon_set,
                seqo.calc_gc_seqs_combined(codon_set.split("_")),
                sequo.calc_purine_content(codon_set.split("_")),
                real_density,
                nd,
            ]
            outfile.write("{0}\n".format(",".join(gen.stringify(row))))

    gen.remove_directory(temp_dir)
def ptc_location_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_overlap_output_file, ese_file=None, only_ese=None, exclude_cpg=None):
    '''
    Simulate mutation locations of PTCs.

    Take the exon in which each PTC is located and randomly pick a site with
    the same nt composition. Locations of these matched mutations are used
    for the null. The ese overlap counts are always written to
    ese_overlap_output_file; the positions are written to output_file unless
    only_ese is set.

    Args:
        rel_pos_file (str): path to the ptc relative positions file
        coding_exons_fasta (str): path to the coding exons fasta
        simulations (int): number of simulations to run
        output_file (str): path to the csv file for the positions
        ese_overlap_output_file (str): path to the csv file for the ese overlaps
        ese_file: passed to get_eses_from_file
        only_ese: if set, the positions file is not written
        exclude_cpg: forwarded to simulate_mutation_locations
    '''
    # NOTE(review): a second function named ptc_location_simulation is
    # defined directly after this one and appears to replace it at import
    # time. Confirm which definition callers are meant to reach.
    relative_positions_list = get_relative_position_list(rel_pos_file)
    ese_list = get_eses_from_file(ese_file)
    coding_exons = get_coding_exons(coding_exons_fasta)

    # real values: positions of the ptcs and the number with ese overlaps
    real_positions = get_ptc_positions(relative_positions_list, coding_exons)
    real_ese_overlap = get_ptc_ese_overlap(relative_positions_list, coding_exons, ese_list)

    # now do the simulations, numbered from 1
    simulant_list = list(range(1, simulations+1))
    worker_args = ["foo", simulations, relative_positions_list, coding_exons_fasta, ese_list, exclude_cpg]
    processes = gen.run_in_parallel(simulant_list, worker_args, simulate_mutation_locations)

    # each process returns (positions by simulant, ese overlaps by simulant)
    position_list = {}
    ese_overlap_list = {}
    for process in processes:
        result = process.get()
        position_list.update(result[0])
        ese_overlap_list.update(result[1])

    def write_table(path, real_values, simulated_values):
        with open(path, "w") as outfile:
            outfile.write('simulation,0.2,3.69,70+\n')
            outfile.write('real,{0}\n'.format(",".join(gen.stringify(real_values))))
            for simulant in simulated_values:
                outfile.write("{0},{1}\n".format(simulant, ",".join(gen.stringify(simulated_values[simulant]))))

    # ignore writing this to file if we just want the ese overlap
    if not only_ese:
        write_table(output_file, real_positions, position_list)
    write_table(ese_overlap_output_file, real_ese_overlap, ese_overlap_list)


#
def ptc_location_simulation(snp_file, full_bed, cds_fasta, possible_positions_dir, output_directory, required_simulations, coding_exons_file):
    '''
    Simulate the snp location.

    For each snp, pick another site that has the same reference allele and
    that would generate a ptc with the mutated allele. Repeat n times.

    Args:
        snp_file: forwarded to run_location_simulations
        full_bed: not used by this function
        cds_fasta: not used by this function
        possible_positions_dir (str): directory holding the files
            possible_ptc_locations_<nt>.fasta for nt in A, C, G, T
        output_directory: forwarded to run_location_simulations
        required_simulations: number of simulations (converted with int())
        coding_exons_file (str): path to the tab-separated coding exons file
    '''
    # possible locations, indexed exon -> first allele character -> last allele character
    possible_locations = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: [])))
    for nt in ["A", "C", "G", "T"]:
        location_file = "{0}/possible_ptc_locations_{1}.fasta".format(possible_positions_dir, nt)
        entry_names, entry_locations = gen.read_fasta(location_file)
        for index, name in enumerate(entry_names):
            # entry names look like "<exon>:<alleles>"
            name_parts = name.split(':')
            exon = name_parts[0]
            aa = name_parts[1][0]
            ma = name_parts[1][-1]
            possible_locations[exon][aa][ma].append(entry_locations[index])

    # exon lengths keyed by field 3, computed as field 2 - field 1
    exon_list = {
        exon[3]: int(exon[2]) - int(exon[1])
        for exon in gen.read_many_fields(coding_exons_file, "\t")
    }

    # create a list of required simulations, numbered from 1
    simulations = list(range(1, int(required_simulations) + 1))
    run_location_simulations(simulations, snp_file, possible_locations, exon_list, output_directory)
def get_output(entry):
    '''
    Flatten the values of a dictionary of iterables into one stringified list.

    Args:
        entry (dict): dictionary whose values are iterables

    Returns:
        the result of gen.stringify on the concatenated values, taken in the
        iteration order of the dictionary keys
    '''
    flattened = [value for key in entry for value in entry[key]]
    return gen.stringify(flattened)
# NOTE(review): these statements read names that are not defined in the
# visible code (cds_fasta, multi_utr, families_file, motif_file,
# single_cds_list, calc_values); they presumably belong to an enclosing
# scope. Confirm before moving them.

# map each cds name to its sequence
cds_names, cds_seqs = gen.read_fasta(cds_fasta)
cds_list = {name: cds_seqs[i] for i, name in enumerate(cds_names)}
multi_cds_names, multi_cds_seqs = gen.read_fasta(multi_utr)
# keep sequences longer than 50 whose name also occurs in cds_list; each
# value is wrapped in a single-element list
multi_cds_list = {
    name: [multi_cds_seqs[i]]
    for i, name in enumerate(multi_cds_names)
    if len(multi_cds_seqs[i]) > 50 and name in cds_list
}
multi_cds_list = sequo.pick_random_family_member(families_file, multi_cds_list)
#
# 6086
multi_densities, multi_nds = calc_values(multi_cds_list)
motifs = sequo.read_motifs(motif_file)
# take the first sequence stored for each entry
multi_seqs = [multi_cds_list[i][0] for i in multi_cds_list]
single_seqs = [single_cds_list[i][0] for i in single_cds_list]
# one motif density per sequence
single_ese_desities = [
    seqo.calc_motif_density([i], motifs) for i in single_seqs
]
multi_ese_densities = [
    seqo.calc_motif_density([i], motifs) for i in multi_seqs
]
# two csv rows are written: "single,<densities>" and "multi,<densities>"
output_file = "clean_run/utr_ese_densities.csv"
with open(output_file, "w") as outfile:
    outfile.write("single,{0}\n".format(",".join(
        gen.stringify(single_ese_desities))))
    outfile.write("multi,{0}\n".format(",".join(
        gen.stringify(multi_ese_densities))))