def get_MSA_gene_list(coords, coords_file, method, species_set, version, query_species, MSA_file):
    '''
    Given a dictionary of lists of lists of CDS coordinates, retrieve the Compara MSAs.
    '''
    with open(coords_file, "w") as file:
        for trans in coords:
            for exon in coords[trans]:
                phase = exon[1]
                current_coords = exon[0]
                current_coords = [str(i) for i in current_coords]
                current_coords.append(str(phase))
                current_coords = "|".join(current_coords)
                file.write(current_coords)
                file.write("\n")
    remove_file(MSA_file)
    run_process(["perl", "MSA_list.pl", method, species_set, version, coords_file, query_species, MSA_file])
    with open(MSA_file) as file:
        string = "".join(file)
    #join sequence lines that were wrapped across several lines
    string = re.sub("([a-z])\n([a-z])", "\\1\\2", string)
    with open(MSA_file, "w") as file:
        file.write(string)
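# Hedged usage sketch (the transcript ID, coordinates and file paths below are
# hypothetical, not from the pipeline): coords maps each transcript to
# [coordinate_list, phase] pairs, one per exon, which get serialized as
# pipe-separated lines for MSA_list.pl. The "EPO"/"primates"/85/"homo_sapiens"
# arguments mirror how the function is invoked in get_CpG_dicts below.
# coords = {"ENST00000000001": [[["1", 1000, 1200], 0], [["1", 1500, 1700], 2]]}
# get_MSA_gene_list(coords, "temp_data/coords.txt", "EPO", "primates", 85,
#                   "homo_sapiens", "temp_data/MSA_chr1.txt")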
def get_pairwise_alignment(coords, coords_file, query_species, other_species, version, output_file):
    '''
    Given a list of feature coordinates and two species, get the corresponding pairwise alignments from Compara.
    '''
    #write the coordinates to file in a way that can be read by the downstream perl script
    with open(coords_file, "w") as file:
        for feature in coords:
            feature = [str(i) for i in feature]
            feature = "|".join(feature)
            file.write(feature)
            file.write("\n")
    remove_file(output_file)
    #get the alignments from the database
    run_process(["perl", "pairwise_from_ensembl.pl", coords_file, query_species, other_species, output_file, version])
    #parse them from the output file produced by the perl script
    with open(output_file) as file:
        string = "".join(file)
    string = string.split("***")
    string = [(i.rstrip("\n")).lstrip("\n") for i in string]
    string = [i.split("|||") for i in string]
    string = [[j for j in i if len(j) > 0] for i in string]
    #get rid of cases where there are multiple GABs
    string = flatten([i for i in string if len(i) == 1])
    #write alignments to a pretty FASTA
    with open(output_file, "w") as file:
        for feature in string:
            temp = feature.split("\n")
            name = temp[0]
            name = name.split("|")
            antisense = False
            if name[6] == "-":
                antisense = True
            try:
                alignments = [temp[2], temp[3]]
                alignments = [i.split(" ") for i in alignments]
                #convert to upper case
                alignments = [([j for j in i if j][1]).upper() for i in alignments]
                #only keep alignments with no ambiguous bases in either sequence
                if "N" not in alignments[0] and "N" not in alignments[1]:
                    #reverse complement, if necessary
                    if antisense:
                        alignments = [str(Seq(i, IUPAC.unambiguous_dna).reverse_complement()) for i in alignments]
                    file.write(">{0}\n".format("|".join(name)))
                    file.write("|".join(alignments))
                    file.write("\n")
            except IndexError:
                pass
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''
    #This is done via a temp file because that way you can specify the same file as input and output file
    #and thus overwrite the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    hk.run_process(["sort-bed", input_file_name], file_for_output = temp_file_name)
    hk.run_process(["mv", temp_file_name, output_file_name])
    hk.remove_file(temp_file_name)
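# Hedged usage sketch (the path is hypothetical): because the sort goes through a
# temp file, the same name can safely be passed as both input and output, so the
# unsorted file is overwritten in place.
# sort_bed("data/reads.bed", "data/reads.bed")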
def peak_pos_in_exon(exon_starts_file, peaks_file, from_end = False, reads_file = False, reads_mode = False):
    """
    Given a set of exons and a set of peaks, make a dictionary with the peaks overlapping each exon.
    :param exon_starts_file: BED file with the starting regions of those exons that have been chosen for study
    :param peaks_file: BED file of peaks
    :param from_end: if True, distances will be calculated from the ends of exons rather than the starts
    :param reads_mode: if True, assume that the peaks file contains reads rather than peaks. The difference
    is that peaks are flat: every position that overlaps with a peak is a 1. Whereas with reads, if a position
    overlaps with more than one read, then you should count it as more than one.
    :return: dictionary with junction IDs as keys and a list of all the positions that overlap a peak
    (relative to the start of the exon, counting in the direction of transcription). Also return a second
    dictionary with the centres of the peaks (median rounded down to the nearest integer).
    """
    # intersect the exon starts and the peaks
    intersect_output = "{0}_{1}".format(exon_starts_file[:-4], peaks_file.split("/")[-1])
    intersect_bed(exon_starts_file, peaks_file, write_both=True, output_file=intersect_output,
                  force_strand=True, no_dups=False, write_zero=True)
    plus = "+"
    if from_end:
        plus = "-"
    out_dict = {}
    with open(intersect_output) as file:
        for line in file:
            line = line.split("\t")
            junction = line[3]
            hk.add_key(junction, [], out_dict)
            # if this exon overlaps with peaks
            if line[6] != ".":
                out_dict[junction].append([])
                peak_start = int(line[7])
                peak_end = int(line[8])
                if line[5] == plus:
                    exon_start = int(line[1])
                    # loop over the nucleotides in the peak
                    for nt in range(peak_start, peak_end):
                        out_dict[junction][-1].append(nt - exon_start)
                else:
                    exon_end = int(line[2])
                    for nt in range(peak_start, peak_end):
                        out_dict[junction][-1].append(exon_end - nt - 1)
    # calculate the centres of the peaks
    out_dict_centres = {i: [int(np.median(j)) for j in out_dict[i]] for i in out_dict}
    # break up separate peaks and just record all the positions once
    if reads_mode:
        out_dict = {i: sorted(list(hk.flatten(out_dict[i]))) for i in out_dict}
    else:
        out_dict = {i: sorted(list(set(hk.flatten(out_dict[i])))) for i in out_dict}
    hk.remove_file(intersect_output)
    return(out_dict, out_dict_centres)
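# Standalone worked example of the strand-aware arithmetic above (an illustration,
# not part of the pipeline): on the minus strand, a peak covering absolute
# positions 105..107 (BED half-open interval 105-108) in an exon ending at 110
# maps to positions 4, 3 and 2 relative to the exon start in the direction of
# transcription.
def _demo_minus_strand_peak_positions():
    exon_end = 110
    peak_start, peak_end = 105, 108
    # same formula as in peak_pos_in_exon's minus-strand branch
    return [exon_end - nt - 1 for nt in range(peak_start, peak_end)]  # [4, 3, 2]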
def get_reads_per_pos(reads_file, transcript_bed):
    """
    Given a BED file of reads and a BED file of transcript coordinates, make a dictionary with
    transcript IDs as keys and number of reads per position, as well as the absolute coordinates
    of the nucleotides, as values.
    :param reads_file: BED file with read coordinates
    :param transcript_bed: BED file with transcript coordinates
    :return: dictionary with numbers of reads per position
    """
    # intersect the transcripts and the reads, so you'd have an output file where
    # the transcript coordinates are followed by the overlapping read
    intermediate_file = "{0}_{1}read_per_pos_intermediate.bed".format(
        reads_file[:-4], transcript_bed.split("/")[-1][:-4])
    co.intersect_bed(transcript_bed, reads_file, force_strand=True, write_both=True,
                     no_dups=False, write_zero=False, output_file=intermediate_file)
    reads_per_pos = {}
    total = hk.line_count(intermediate_file)
    print("Calculating the number of reads per position in each transcript...")
    with open(intermediate_file, newline="") as file:
        file_reader = csv.reader(file, delimiter="\t")
        for pos, line in enumerate(file_reader):
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, total))
            # prefix the chromosome and the strand to the transcript name cause you'll
            # need it later
            trans_name = line[3]
            trans_name = "{0}.{1}.{2}".format(line[0], line[5], trans_name)
            reads_per_pos = hk.add_key(trans_name, {"reads": {}}, reads_per_pos)
            strand = line[5]
            if strand == "+":
                position = int(line[8]) - 1
            else:
                position = int(line[7])
            reads_per_pos[trans_name]["reads"] = hk.add_key(position, 0, reads_per_pos[trans_name]["reads"])
            reads_per_pos[trans_name]["reads"][position] = reads_per_pos[trans_name]["reads"][position] + 1
            reads_per_pos[trans_name] = hk.add_key("coords", (int(line[1]), int(line[2])), reads_per_pos[trans_name])
    hk.remove_file(intermediate_file)
    return reads_per_pos
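# Hedged usage sketch (file names and the transcript ID are hypothetical): keys
# are "chrom.strand.transcript" strings; for a + strand read the recorded
# position is the 0-based coordinate of its 3' end (line[8] - 1), for a - strand
# read it is the read start (line[7]).
# reads_per_pos = get_reads_per_pos("data/NET_seq_reads.bed", "data/transcripts.bed")
# reads_per_pos["chr1.+.ENST00000000001"]["reads"]   # {position: read count, ...}
# reads_per_pos["chr1.+.ENST00000000001"]["coords"]  # (transcript start, transcript end)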
def intersect_bed(bed_file1, bed_file2, use_bedops = False, overlap = False, overlap_rec = False,
                  write_both = False, sort = False, output_file = None, force_strand = False,
                  force_opposite_strand = False, no_name_check = False, no_dups = True, chrom = None,
                  intersect = False, hit_count = False, bed_path = None, intersect_bam = None,
                  write_zero = False, write_bed = False, exclude = False):
    '''Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1
    (EX: 0.8 means that the overlap has to be at least 80% of the intervals in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2 be at least as high
    as the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged onto the end, also the
    interval from bed file 2 that it overlaps (only valid when using bedtools).
    exclude: if True, report intervals that DON'T overlap
    sort: sort bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same strand (only valid with bedtools)
    force_opposite_strand: if True, check that the feature and the interval are on OPPOSITE strands
    no_name_check: if set to False, checks whether the chromosome names are the same in the two bed files
    (only valid with bedtools)
    no_dups: if True, only returns each interval once. If set to False, intervals in bed file 1 that overlap
    several intervals in bed file 2 will be returned several times (as many times as there are overlaps
    with different elements in bed file 2)
    chrom: limit search to a specific chromosome (only valid with bedops, can help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of the interval that overlaps
    an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it overlaps in bed file 2
    (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires the bam file to be passed first.
    write_zero: like write_both but also write A intervals that don't overlap with any B intervals.
    write_bed: when intersecting a bam file, write output as bed.'''
    if force_strand and force_opposite_strand:
        raise Exception("force_strand and force_opposite_strand can't both be True")
    hk.make_dir("temp_data/")
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    #have it write the output to a temporary file
    if use_bedops:
        bedtools_output = run_bedops(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both,
                                     chrom, overlap, sort, output_file = temp_file_name, intersect = intersect,
                                     hit_number = hit_count, no_dups = no_dups, intersect_bam = intersect_bam,
                                     overlap_rec = overlap_rec)
    else:
        bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both,
                                       chrom, overlap, sort, no_name_check, no_dups, output_file = temp_file_name,
                                       intersect = intersect, hit_number = hit_count, bed_path = bed_path,
                                       intersect_bam = intersect_bam, write_zero = write_zero,
                                       overlap_rec = overlap_rec, write_bed = write_bed, exclude = exclude)
    #move it to a permanent location only if you want to keep it
    if output_file:
        hk.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = rw.read_many_fields(temp_file_name, "\t")
    hk.remove_file(temp_file_name)
    return(bedtools_output)
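# Hedged usage sketch (paths are hypothetical): keep exon intervals that overlap
# at least one same-strand peak, writing the overlapping peak alongside each exon,
# as in peak_pos_in_exon above.
# intersect_bed("data/exons.bed", "data/peaks.bed", write_both=True,
#               force_strand=True, no_dups=False, output_file="data/exons_with_peaks.bed")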
def get_sim_p_core(simulations, hits, controls, fasta, correspondances, alignments, method, statistic,
                   reverse_site_numbers, degen_hits_file, degen_controls_file):
    '''
    Core function for get_sim_p.
    '''
    sim_norm_ds = []
    counter = 0
    for sim in simulations:
        counter = update_counter(counter, 10)
        #shuffle hits and controls
        if not reverse_site_numbers:
            temp_hits, temp_controls = shuffle_dictionaries(hits, controls)
        else:
            temp_controls, temp_hits = shuffle_dictionaries(hits, controls)
        hit_phylip = "temp_data/temp{0}.phy".format(random.random())
        control_phylip = "temp_data/temp{0}.phy".format(random.random())
        #write phylip alignments with the pseudo-hit and pseudo-control positions
        conservation.write_hits_to_phylip(fasta, temp_hits, hit_phylip, correspondances, alignments, degen_hits_file)
        conservation.write_hits_to_phylip(fasta, temp_controls, control_phylip, correspondances, alignments, degen_controls_file)
        #get PAML estimates
        hit_ds = conservation.run_codeml(hit_phylip, "temp_data/temp_{0}.phy".format(random.random()),
                                         method=method)[statistic]
        control_ds = conservation.run_codeml(control_phylip, "temp_data/temp_{0}.phy".format(random.random()),
                                             method=method)[statistic]
        sim_norm_ds.append((hit_ds - control_ds) / control_ds)
        remove_file(hit_phylip)
        remove_file(control_phylip)
    return (sim_norm_ds)
def get_lambda(lambda_file_outroot, phy_file, subst_model, min_inf = None):
    '''
    Calculate the lambda input parameter for INSIGHT.
    '''
    lambda_file = "{0}.mod".format(lambda_file_outroot)
    #to make sure you catch it if the phyloFit process fails
    remove_file(lambda_file)
    #from UCSC
    tree_file = "DFE/UCSC_model.mod"
    #subst_model is JC69, for instance
    #scale-only, cause you don't want it to estimate a new tree, just to scale the whole thing
    arguments = ["phyloFit", "--init-model", tree_file, "--out-root", lambda_file_outroot,
                 "--subst-mod", subst_model, "--msa-format", "PHYLIP", "--scale-only", phy_file]
    #must be set to False for testing
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    with open(lambda_file) as file:
        lambda_b = file.read()
    lambda_b = re.findall(lambda_regex, lambda_b)[0]
    return(lambda_b)
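# Hedged usage sketch (outroot and alignment paths are hypothetical): phyloFit
# rescales the fixed UCSC tree against the alignment, and the scaling factor is
# parsed out of the resulting .mod file (as a string) via lambda_regex.
# lambda_b = get_lambda("temp_data/lambda_run", "temp_data/alignment.phy", "JC69")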
def main(): description = "Run mDFEest with shuffled input to check the false positive rate." args = parse_arguments(description, [ "hits_file", "controls_file", "output_file", "n_sim", "SNP_file", "SNP_number", "hit_reduce", "control_reduce", "const_pop" ], ints=[3, 5], floats=[6, 7], flags=[8]) hits_file, controls_file, output_file, n_sim, SNP_file, SNP_number, hit_reduce, control_reduce, const_pop = args.hits_file, args.controls_file, args.output_file, args.n_sim, args.SNP_file, args.SNP_number, args.hit_reduce, args.control_reduce, args.const_pop with open(output_file, "w") as file: for sim in range(n_sim): print(sim) temp_hits_file = "temp_data/hits_file{0}.txt".format( random.random()) temp_controls_file = "temp_data/controls_file{0}.txt".format( random.random()) temp_input_file = "temp_data/input_file{0}.txt".format( random.random()) #shuffle hits and controls for negative control run_process([ "python3", "shuffle_hits_and_controls.py", hits_file, controls_file, temp_hits_file, temp_controls_file, hit_reduce, control_reduce ]) #generate multiDFEest input file run_process([ "python3", "mDFEest_input.py", temp_hits_file, temp_controls_file, SNP_file, SNP_number, temp_input_file ]) output = mDFEest("beta", temp_input_file, pop_change=True) print(output) print(output["Nes_0.0_0.1"]) print(output["Nes_0.1_1.0"]) file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"], output["Nes_0.1_1.0"])) #if you also want to run with fixed population size if const_pop: output = mDFEest("beta", temp_input_file, pop_change=False) file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"], output["Nes_0.1_1.0"])) file.write("\n") remove_file(temp_hits_file) remove_file(temp_controls_file) remove_file(temp_input_file)
def get_ancestral_CG(outroot, subst_model, phy_files, model_file, tuples_mapping_dict, anc_CG_file_name,
                     high_CG = None, min_inf = None, macaque = False, comprehensive = False, from_model = False):
    '''
    Get a dictionary that says for each transcript which positions were ancestrally CpG/GpC.
    '''
    #if a file name hasn't been supplied or if the file with the supplied name doesn't exist, determine
    #CpG positions again, otherwise just read them in from the file
    if not anc_CG_file_name or anc_CG_file_name == "None" or not os.path.exists(anc_CG_file_name):
        #you need several in case you have a high_CG dictionary
        pps = []
        for phy_file in phy_files:
            if subst_model == "JC69" or from_model:
                #use an existing substitution model
                arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot,
                             "--subst-mod", subst_model, "--msa-format", "PHYLIP",
                             "--post-probs", "--scale-only", phy_file]
            else:
                #estimate a new model
                arguments = ["phyloFit", "--out-root", outroot, "--subst-mod", subst_model,
                             "--msa-format", "PHYLIP", "--tree", "DFE/full_tree.tree",
                             "--post-probs", phy_file]
            if subst_model == "JC69":
                block_size = 4
                tuple_pos_lim = 2
                shift_in_tuple = 0
            else:
                #for dinucleotide models
                block_size = 16
                tuple_pos_lim = 3
                shift_in_tuple = 9
            #turn off when testing
            if min_inf:
                arguments.extend(["-I", min_inf])
            results = run_process(arguments)
            #read in posterior probabilities of having various nucleotides ancestrally
            pp_file = "{0}.postprob".format(outroot)
            pp = rw.read_many_fields(pp_file, " ")
            pp = [[j for j in i if j] for i in pp]
            pp = pp[2:]
            #the posterior probability that you had a CpG at a position has to be greater
            #than threshold for a position to be counted as ancestrally CpG
            threshold = 0.5
            #will be over-written if you're doing big tree
            human_pos = 0
            #the outgroup nodes are labelled from the outside in, starting from 1
            if macaque:
                #it's to know whether we're doing big tree or little tree
                if len(pp[0]) == 14:
                    #little tree, mononucleotide
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (3 * block_size): len(i) - (2 * block_size)]] for i in pp}
                elif len(pp[0]) > 14:
                    #big tree/dinucleotide (i.e. it'll give you nonsense if you're trying to do context with the little tree)
                    #the shift_in_tuple is to do with the fact that if you're doing U2S, you want the second tuple and not the first
                    human_pos = 3 + shift_in_tuple
                    if comprehensive:
                        #you want to get all nodes except for node 0, which is the outgroup-ingroup ancestor
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (j * block_size): len(i) - ((j - 1) * block_size)] for j in range(1, 7)] for i in pp}
                    else:
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (6 * block_size): len(i) - (5 * block_size)]] for i in pp}
                else:
                    #for tests etc. where you might only have, say, two species
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            else:
                pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            pps.append(pp)
        anc_CG = {}
        #just to get the length
        example_pp = pps[0][list(pps[0].keys())[0]]
        for trans in tuples_mapping_dict:
            #tuples_mapping_dict has the alignment tuple corresponding to each position
            #because the phyloFit output is organized by tuples, not by positions
            anc_CG[trans] = []
            for node_pos in range(len(example_pp)):
                #if you're using dinucleotides
                if subst_model != "JC69":
                    for pos in sorted(tuples_mapping_dict[trans].keys())[1:]:
                        try:
                            pp_number = 0
                            #if you're gonna produce different output dictionaries for high and low GC regions
                            if high_CG:
                                if pos in high_CG[trans]:
                                    pp_number = 1
                            current_tuple = tuples_mapping_dict[trans][pos]
                            #don't consider positions where there is an alignment gap for human
                            if current_tuple[human_pos] != "*":
                                if current_tuple in pps[pp_number]:
                                    current_pp = pps[pp_number][current_tuple][node_pos]
                                else:
                                    current_pp = pps[abs(pp_number - 1)][current_tuple][node_pos]
                                #because it can be either GC or CG, hence 6 or 9
                                if float(current_pp[6]) > threshold or float(current_pp[9]) > threshold:
                                    #you're always testing the second member in the dinucleotide
                                    anc_CG[trans].append(pos - 1)
                                    anc_CG[trans].append(pos)
                        except KeyError:
                            if pos % 100 == 0:
                                pass
                            else:
                                raise KeyError
                else:
                    #if you're using mononucleotides, you have to keep track of what the previous nucleotide was
                    C_prev = False
                    G_prev = False
                    for pos in sorted(tuples_mapping_dict[trans].keys()):
                        pp_number = 0
                        if high_CG:
                            if pos in high_CG[trans]:
                                pp_number = 1
                        current_C = False
                        current_G = False
                        current_tuple = tuples_mapping_dict[trans][pos]
                        if current_tuple[human_pos] != "*":
                            current_pp = pps[pp_number][current_tuple][node_pos]
                            #if current is C and previous was G
                            if float(current_pp[1]) > threshold:
                                if G_prev:
                                    anc_CG[trans].append(G_pos)
                                    anc_CG[trans].append(pos)
                                current_C = True
                            #if current is G and previous was C
                            if float(current_pp[2]) > threshold:
                                if C_prev:
                                    anc_CG[trans].append(C_pos)
                                    anc_CG[trans].append(pos)
                                current_G = True
                        C_prev = False
                        G_prev = False
                        if current_C:
                            C_prev = True
                            #you need to specify the position explicitly because it's not necessarily
                            #the last one if there were dashes
                            C_pos = pos
                        if current_G:
                            G_prev = True
                            G_pos = pos
            anc_CG[trans] = sorted(list(set(anc_CG[trans])))
        remove_file(pp_file)
        if anc_CG_file_name and anc_CG_file_name != "None":
            with open(anc_CG_file_name, "w") as file:
                for trans in anc_CG:
                    to_write = "\t".join([trans, ",".join([str(i) for i in anc_CG[trans]])])
                    file.write(to_write)
                    file.write("\n")
    else:
        #parse
        anc_CG = rw.read_many_fields(anc_CG_file_name, "\t")
        anc_CG = [i for i in anc_CG if len(i) == 2]
        anc_CG = list_to_dict(anc_CG, 0, 1)
        anc_CG = {i: [int(i) for i in anc_CG[i].split(",") if i != ""] for i in anc_CG}
    return(anc_CG)
def get_new_method_results(hit_file, control_file, hit_phylip, control_phylip, correspondances,
                           alignments, fasta, baseml=False, return_CpG=False, global_fasta=None,
                           return_overall=False, motifs=None, fs=None, regions=False):
    '''
    Calculate normalized dS.
    '''
    #if you're meant to ignore degenerate substitutions,
    #the degeneracy file will have been supplied as the hit file
    #and the real hit file name can be derived from the name of the
    #degeneracy file
    if "_degen.txt" in hit_file:
        degen_hits_file = hit_file
        degen_controls_file = control_file
        hit_file = hit_file[:-10]
        control_file = control_file[:-10]
    else:
        degen_hits_file = None
        degen_controls_file = None
    #read in hit and control positions
    hits = parse_basinhoppin_pos(hit_file)
    controls = parse_basinhoppin_pos(control_file)
    try:
        #write control and hit sequences to PHYLIP files
        conservation.write_hits_to_phylip(fasta, hits, hit_phylip, correspondances, alignments,
                                          degen_hits_file, baseml=baseml, fs=fs, regions=regions,
                                          global_fasta=global_fasta)
        conservation.write_hits_to_phylip(fasta, controls, control_phylip, correspondances, alignments,
                                          degen_controls_file, baseml=baseml, fs=fs, regions=regions,
                                          global_fasta=global_fasta)
        #if you're doing nucleotide-based rather than codon-based analysis
        if baseml:
            method = "baseml"
            statistic = "tree length"
        else:
            method = "gy"
            statistic = "dS"
        #if you want to return the density * normalized dS statistic, you need the density
        if return_overall:
            density = nc.get_sequence_set_density(fasta, None, motifs, None, False,
                                                  "temp_data/temp_dens1.txt", "temp_data/temp_dens2.txt",
                                                  "temp_data/temp_pos.txt", None, feature_set=fs,
                                                  concat=True, positions=False)["density"]
            print("Density: {0}.".format(density))
        #get dS estimates from PAML
        hit_ds = conservation.run_codeml(hit_phylip, "temp_data/temp_{0}.phy".format(random.random()),
                                         method=method)[statistic]
        control_ds = conservation.run_codeml(control_phylip, "temp_data/temp_{0}.phy".format(random.random()),
                                             method=method)[statistic]
        remove_file(control_phylip)
        #report CpG frequency in hits vs controls
        hit_freq, control_freq = CpG_frequency(fasta, hits, controls)
        print("Hit dS: {0}.".format(hit_ds))
        print("Control dS: {0}.".format(control_ds))
        norm_ds = (hit_ds - control_ds) / control_ds
        print("Normalized dS: {0}.\n".format(norm_ds))
        if return_overall:
            overall = norm_ds * density
            print("Overall decrease: {0}.\n".format(overall))
            return (norm_ds, density, overall)
        if return_CpG:
            return (norm_ds, hit_freq, control_freq)
        return ((hit_ds - control_ds) / control_ds)
    except conservation.NoDataException:
        print("No input sequence available.")
        if return_CpG:
            return (None, None, None)
        return (None)
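# Worked example of the statistic computed above (plain arithmetic, no PAML run):
# a hit dS of 0.08 against a control dS of 0.10 gives
# (0.08 - 0.10) / 0.10 = -0.2, i.e. a 20% reduction in the substitution rate at
# hits relative to controls; with return_overall, this is further multiplied by
# the motif density.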
def main(): description = "Directly compare the frequency of segregating sites/mean allele frequency between hits and controls." args = parse_arguments(description, [ "hit_file", "control_file", "INSIGHT_hit_file", "INSIGHT_control_file", "SFS_file", "trial_file", "trials", "shuffle" ], ints=[6], flags=[7]) hit_file, control_file, INSIGHT_hit_file, INSIGHT_control_file, SFS_file, trial_file, trials, shuffle = args.hit_file, args.control_file, args.INSIGHT_hit_file, args.INSIGHT_control_file, args.SFS_file, args.trial_file, args.trials, args.shuffle true_hits = rw.read_pos(hit_file) true_controls = rw.read_pos(control_file) #to store the original data in case this is a negative control and you will be shuffling #hits and controls original_INSIGHT_hit_file = INSIGHT_hit_file original_INSIGHT_control_file = INSIGHT_control_file print(hit_file) with open(trial_file, "w") as file: file.write( "trial\tpoly_fraction_hits - poly_fraction_controls\tmedian_hit_MAF - median_control_MAF\n" ) for trial in range(trials): to_write = "{0}\t".format(trial) #if this is a negative control if shuffle: INSIGHT_hit_file = re.sub("_0_", "_{0}_".format(trial), original_INSIGHT_hit_file) INSIGHT_control_file = re.sub("_0_", "_{0}_".format(trial), original_INSIGHT_control_file) temp_hits_file = "temp_data/temp_hits{0}.txt".format( random.random()) temp_controls_file = "temp_data/temp_controls{0}.txt".format( random.random()) #shuffle hits and controls temp_hits, temp_controls = shuffle_dictionaries( true_hits, true_controls) rw.write_pos(temp_hits, temp_hits_file) rw.write_pos(temp_controls, temp_controls_file) SFS_file = "temp_data/temp_SFS_file{0}.txt".format( random.random()) #generate an ISNIGHT input file that you could then use for the manual analysis run_process([ "python3", "mDFEest_input.py", temp_hits_file, temp_controls_file, "general/1000genomes/filtered_hg38_85_pc_multiexon_Yoruban_SNPs_relative.txt", 216, SFS_file ]) remove_file(temp_hits_file) remove_file(temp_controls_file) hit_data = get_data(INSIGHT_hit_file) control_data = get_data(INSIGHT_control_file) poly_ratio_diff = get_chisq_site_freq(hit_data, control_data) to_write = to_write + "{0}\t".format(poly_ratio_diff) temp, median_diff = get_mean_freq(SFS_file) to_write = to_write + "{0}\n".format(median_diff) if shuffle: remove_file(SFS_file) file.write(to_write)
def main(): description = "Calculate the normalized dS of a dataset." args = parse_arguments(description, [ "dataset", "feature_set", "genome", "families_file", "fasta", "hit_file_prefix", "motifs_file", "correspondances", "alignments", "suffix", "trials", "trial_file", "old_trial_file", "region_fasta", "old_motif_format", "nonsense", "no_families", "newest_only", "top_set_only", "calc_p", "reverse_site_numbers", "matched", "degen", "regions" ], ints=[10], flags=[14, 15, 16, 17, 18, 19, 20, 21, 22, 23]) dataset, feature_set, genome, families_file, fasta, hit_file_prefix, motifs_file, correspondances, alignments, suffix, trials, trial_file, old_trial_file, region_fasta, old_motif_format, nonsense, no_families, newest_only, top_set_only, calc_p, reverse_site_numbers, matched, degen, regions = args.dataset, args.feature_set, args.genome, args.families_file, args.fasta, args.hit_file_prefix, args.motifs_file, args.correspondances, args.alignments, args.suffix, args.trials, args.trial_file, args.old_trial_file, args.region_fasta, args.old_motif_format, args.nonsense, args.no_families, args.newest_only, args.top_set_only, args.calc_p, args.reverse_site_numbers, args.matched, args.degen, args.regions n_sim = 1000 print(suffix) #set up feature set and families fs = Feature_Set(feature_set, genome) fs.set_dataset(dataset) if no_families: picked = fs.names else: families = rw.read_families(families_file) fs.add_families(families) picked = fs.pick_random_members() hit_phylip = "temp_data/temp_{0}.phy".format(random.random()) control_phylip = "temp_data/temp_control_{0}.phy".format(random.random()) if not nonsense: if old_motif_format: motifs = rw.read_names(motifs_file)[1:] else: motifs = rw.read_motifs(motifs_file) if top_set_only: summary_data = rw.read_many_fields( "RBP/RBP_hg38_introncontaining_new.txt", "\t") summary_dict = list_to_dict(summary_data, 0, 4, floatify=True) motifs = { RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) } motifs = list(set(flatten(motifs.values()))) if reverse_site_numbers: site_number_suffix = "_reversed_site_numbers_" else: site_number_suffix = "" if matched: matched_suff = "_matched" else: matched_suff = "" if degen: degen_suff = "_degen.txt" else: degen_suff = "" with open(trial_file, "w") as trial_out: trial_out.write( "trial\tA\tT\tC\tG\told\told_no_hum_CG\tnew_no_human_CG\tnew_no_hum_no_anc_CG\tnew_w_CG\tnew_no_anc_CG\tnew_no_anc_CG_macaque\tnewer_no_human_CG\tnewer_no_hum_no_anc_CG\tnewer_w_CG\tnewer_no_anc_CG\n" ) if old_trial_file != "None": old_trials = rw.read_many_fields(old_trial_file, "\t") old_trials = old_trials[1:] old_trials = [i[1:5] for i in old_trials] seed_kmers = 1 else: seed_kmers = None #you can do this for loads of trials #useful as a negative control if you're generating a new set of nonsense motifs #each time for trial in range(trials): print(trial) trial_output = [trial] #if you're meant to generate a load of nonsense motifs rather than using real motifs if nonsense: if old_trial_file != "None": #read in the intended nucleotide composition of the nonsense #motifs from file scaled_comp = [float(i) for i in old_trials[trial]] else: #pick nonsense motifs nucleotide composition by chance comp = [random.random() for i in range(4)] scaled_comp = [i / np.sum(comp) for i in comp] comp_dict = { i: scaled_comp[pos] for pos, i in enumerate(nc._canon_bases_) } motifs, obtained_dict = nc.kmers_from_nc(6, 50, comp_dict=comp_dict, return_freqs=True, seed=seed_kmers) motifs = ["motifs"] + motifs trial_output = trial_output + [ 
obtained_dict[i] for i in nc._canon_bases_ ] temp_motifs_file = "temp_data/temp_motifs.txt" rw.write_names(motifs, temp_motifs_file) print( "===NEW METHOD WITH NO ANCESTRAL CpG (MACAQUE, BIG TREE, CONTEXT), REPLACEMENT CONTROL===" ) hit_file = "{0}_hits_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format( hit_file_prefix, matched_suff, degen_suff) control_file = "{0}_controls_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format( hit_file_prefix, matched_suff, degen_suff) if nonsense: hit_file = "temp_data/temp_hits{0}.txt".format(random.random()) control_file = "temp_data/temp_controls{0}.txt".format( random.random()) error_file = "temp_data/temp_error{0}.txt".format( random.random()) get_control_sites( fasta, genome, feature_set, families_file, dataset, temp_motifs_file, hit_file, control_file, error_file, "DFE/for_everybody/filtered_hg38_85_pc_multiexon_anc_CG_big_context_threshold05.txt", [ "--leave_CG", "--context", "--remove_ancestral_CpG", "--macaque_anc", "--big_tree", "--replacement_control" ]) get_density(fasta, motifs, fs) norm_ds = get_new_method_results(hit_file, control_file, hit_phylip, control_phylip, correspondances, alignments, fasta, regions=regions, global_fasta=region_fasta, fs=fs) trial_output.append(norm_ds) if calc_p: p, low_CI, high_CI, sd, Z = get_sim_p( norm_ds, hit_file, control_file, correspondances, alignments, fasta, n_sim, reverse_site_numbers=reverse_site_numbers, sim_ds_file= "{0}{1}_sim_norm_ds_no_anc_CG_only_macaque_big_context{2}_replace.txt{3}" .format(hit_file_prefix, site_number_suffix, matched_suff, degen_suff)) trial_output = "\t".join([str(i) for i in trial_output]) trial_out.write(trial_output) trial_out.write("\n") remove_file(hit_phylip)
def get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta,
                  anc_CG_file_name, high_CG_file_name, fs, macaque_anc = False, pseudoCG = False,
                  comprehensive = False, subst_model = None, return_tuples = False, regions = False):
    '''
    Get two dictionaries, one that says for each transcript which positions are CpG/GpC in macaque
    and one which positions were likely CpG/GpC in the human-macaque ancestor.
    '''
    names, seqs = rw.read_fasta(fasta)
    #if you're gonna determine ancestral CpG positions from scratch rather than reading them in from an existing file
    #if you want to have the name of the file determined automatically
    if (not anc_CG_file_name) or (anc_CG_file_name == "None"):
        new_CG = True
        phy_file = "temp_data/temp_anc_CG{0}.txt".format(random.random())
    #if you want to give the file a name yourself
    elif not os.path.exists(anc_CG_file_name):
        new_CG = True
    else:
        new_CG = False
    if new_CG:
        print("Will get new CpG data...")
        if len(phylip_data) < 8 and comprehensive:
            print("Comprehensive CpG filtering only in big tree mode!")
            raise Exception
        #if you want to pretend that some other dinucleotide is CpG
        if pseudoCG:
            #the hyphens are there in case the two nucleotides are separated by an indel
            CG_kmers = ["C[\-]*T", "A[\-]*G"]
        else:
            CG_kmers = ["C[\-]*G", "G[\-]*C"]
        CG_kmers = [re.compile(i) for i in CG_kmers]
        macaque_CG_dict = {}
        anc_CG_concat_full = [[[""]], [[""]]]
        tuples_mapping_dict_full = {}
        for chrom in chroms:
            print(chrom)
            #only leave those CDSs that are on the current chromosome
            current_CDSs = {i: CDSs[i] for i in CDSs if CDSs[i][0][0][0] == chrom}
            coords_file = "temp_data/coords_file{0}.txt".format(random.random())
            #check if the MSA is already at the specified location, otherwise retrieve it
            MSA_file = "{0}_{1}.txt".format(MSA_file_name_prefix, chrom)
            if not os.path.isfile(MSA_file):
                print("Obtaining MSA...")
                eo.get_MSA_gene_list(current_CDSs, coords_file, "EPO", "primates", 85, "homo_sapiens", MSA_file)
                os.remove(coords_file)
                eo.flush_tables("localhost", "mysql", "fackel")
            MSA_raw = eo.parse_MSA_output(MSA_file)
            if high_CG_file_name != "None":
                high_CG = rw.read_many_fields(high_CG_file_name, "\t")
                high_CG = {i[0]: [int(j) for j in i[1:]] for i in high_CG}
            else:
                high_CG = None
            #get concatenated sequences (for determining ancestral CpG positions) and macaque CpG information for this chromosome
            anc_CG_concat, macaque_CG_dict, tuples_mapping_dict = get_CpG_dicts_core(MSA_raw, lengths, phylip_data,
                                                                                     CG_kmers, macaque_anc,
                                                                                     macaque_CG_dict, high_CG,
                                                                                     comprehensive = comprehensive,
                                                                                     subst_model = subst_model)
            remove_file(coords_file)
            #add that information to the global dictionaries
            anc_CG_concat_full, tuples_mapping_dict_full = update_anc_CG(anc_CG_concat_full, anc_CG_concat,
                                                                         tuples_mapping_dict_full, tuples_mapping_dict)
        phy_files = write_anc_CG(anc_CG_concat_full, anc_CG_file_name, clean_names, macaque_CG_dict)
        pp_file = anc_CG_file_name
    else:
        print("Will read in existing CpG data...")
        pp_file = None
        phy_files = "None"
        high_CG = None
        tuples_mapping_dict_full = None
        macaque_CG_file_name = "{0}_macaque.txt".format(anc_CG_file_name[:-4])
        macaque_CG_dict = rw.read_many_fields(macaque_CG_file_name, "\t")
        macaque_CG_dict = [i for i in macaque_CG_dict if len(i) == 2]
        macaque_CG_dict = list_to_dict(macaque_CG_dict, 0, 1)
        macaque_CG_dict = {i: [int(i) for i in macaque_CG_dict[i].split(",") if i != ""] for i in macaque_CG_dict}
    anc_CG_dict = get_ancestral_CG(pp_file, subst_model, phy_files, "DFE/UCSC_model.mod",
                                   tuples_mapping_dict_full, anc_CG_file_name, high_CG = high_CG,
                                   macaque = macaque_anc, comprehensive = comprehensive)
    [remove_file(i) for i in phy_files]
    #if you're looking at exon cores/flanks rather than full CDSs
    if regions:
        #you need to have matching bed/fasta files for this to work (with the records in the same order)
        bed = fasta.replace("fasta", "bed")
        transcripts = fs.get_transcripts()
        #for each flank/core, figure out what positions it covers in the full CDS
        mapping_dict = conservation.map_regions_to_CDS(fasta, bed, fs, transcripts, CDSs, trans_ids = True)
        anc_CG_dict = region_CpG(mapping_dict, anc_CG_dict)
    if return_tuples:
        return(anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full)
    else:
        return(anc_CG_dict, macaque_CG_dict)
def main(): description = "Run mDFEest." args = parse_arguments(description, ["hit_file", "control_file", "SNP_file", "SNP_number", "input_file", "output_file", "seed", "fixed_model", "new_input", "shuffle", "fix_pop_change"], ints = [3], flags = [8, 9, 10]) hit_file, control_file, SNP_file, SNP_number, input_file, output_file, seed, fixed_model, new_input, shuffle, fix_pop_change = args.hit_file, args.control_file, args.SNP_file, args.SNP_number, args.input_file, args.output_file, args.seed, args.fixed_model, args.new_input, args.shuffle, args.fix_pop_change #if you want to generate a new input file rather than reading in an existing one if new_input: remove_file("../multidfe/{0}".format(input_file.split("/")[-1])) arguments = ["python3", "mDFEest_input.py", hit_file, control_file, SNP_file, SNP_number, input_file] if shuffle: arguments.append("--shuffle") run_process(arguments) if seed == "None": seed = None else: seed = float(seed) #if you want to run it only with a population size change model, #rather than both a model assuming population size change and a fixed population #size model if fix_pop_change: pop_change = [True] else: pop_change = [False, True] if fixed_model == "None": #all possible models allowed = ["lognormal", "gamma", "beta", "spikes", "steps", "fixed six spikes"] spike_range = [2, 6] else: #only the spcified model allowed = [fixed_model] #only two-spike models spike_range = [2, 3] with open(output_file, "w") as file: file.write("model\tpop_change\tAIC\tNes_0.0_0.1\tNes_0.1_1.0\tNes_1.0_10.0\tNes_10.0_100.0\traw\n") for change_mode in pop_change: print("\nPopulation expansion: {0}.".format(str(change_mode))) if "lognormal" in allowed: print("lognormal model:") output = mDFEest("lognormal", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode) if "gamma" in allowed: print("gamma model:") output = mDFEest("gamma", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode) if "beta" in allowed: print("beta model:") output = mDFEest("beta", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode) for spike_number in range(spike_range[0], spike_range[1]): if "spikes" in allowed: print("{0}-spikes model:".format(spike_number)) output = mDFEest("spikes", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode) print(output) write_mDFEest_output(output, file, change_mode) if "steps" in allowed: print("{0}-steps model:".format(spike_number)) output = mDFEest("steps", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode) print(output) write_mDFEest_output(output, file, change_mode) if "fixed six spikes" in allowed: print("fixed six spikes model:") output = mDFEest("six_spikes", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode)
def main(): description = "Given a BED file of reads, filter out reads whose " \ "3' end maps to the last nucleotide of an intron or" \ "the last nucleotide of an exon." args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"]) reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile print("Getting intron lariat positions...") # read in exon coordinates exons = rw.read_gtf(gtf, element="exon", gene=False) # make a BED file with the last positions of introns intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4]) co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True) # intersect the reads with intron lariat positions intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4]) co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name) hk.remove_file(intron_lariat_bed) intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4]) # check that the reads end exactly at intron lariat positions check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file) hk.remove_file(intron_lariat_intersect_file_name) # write BED with the last positions of exons splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4]) co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True) print("Getting splice intermediate positions.") # intersect the reads with splice intermediate positions splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4]) co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name) hk.remove_file(splice_intermediate_bed) SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4]) # check that the reads end exactly at the end of the exon check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file) hk.remove_file(splice_intermediate_intersect_file_name) print("Concatenating the two files.") # concatenate the IL and SI read files so you could exclude both in one go combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4]) hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file) hk.remove_file(SI_reads_file) hk.remove_file(intron_lariat_reads_file) # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the # putative intron lariat reads from the main reads file co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile) hk.remove_file(combined_file)
def mDFEest(model, input_file, n_spikes = None, repetitions = None, fold_SFS = True, pop_change = False, seed = None):
    '''
    Wraps a call to multiDFEest.
    '''
    flags = []
    if fold_SFS:
        fold_SFS = 1
    else:
        fold_SFS = 0
    #this looks weird but is normal: this value will be the value of conpop in the multiDFE call,
    #meaning it'll be 1 with constant population size
    if pop_change:
        pop_change = 0
    else:
        pop_change = 1
    #convert the English distribution names into multiDFEest model codes
    if model == "lognormal":
        model_code = 4
        #parameter number for calculating AIC
        par_number = 2
    elif model == "gamma":
        model_code = 2
        par_number = 2
    elif model == "beta":
        model_code = 3
        par_number = 2
    elif model == "spikes":
        model_code = 0
        if not n_spikes:
            print("To be able to use a spikes model, you need to specify the number of spikes.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "steps":
        model_code = 1
        if not n_spikes:
            print("To be able to use a steps model, you need to specify the number of steps.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "six_spikes":
        model_code = 5
        par_number = 5
        flags = ["-ranrep", repetitions]
    else:
        print("{0} is not a valid model name!".format(model))
        raise Exception
    input_file_short = input_file.split("/")
    input_file_short = input_file_short[-1]
    #do the analysis in the directory where multiDFEest is stored
    if not os.path.exists("../multidfe/{0}".format(input_file_short)):
        run_process(["cp", input_file, "../multidfe"])
    MDE_output = "{0}.MAXL.out".format(input_file_short)
    current_dir = os.getcwd()
    os.chdir("../multidfe")
    arguments = ["./MultiDFE", "-N1", 100, "-conpop", pop_change, "-sfsfold", fold_SFS,
                 "-selmode", model_code, "-file", input_file_short]
    if seed:
        seed_string = "GSL_RNG_SEED={0}".format(seed)
        arguments = [seed_string] + arguments
    arguments.extend(flags)
    print(" ".join([str(i) for i in arguments]))
    #run multiDFEest
    run_process(arguments)
    #parse the output
    output = rw.read_many_fields(MDE_output, "\t")[0]
    output = [i.split(":") for i in output if ":" in i]
    output = {i[0]: float(i[1]) for i in output}
    #get the log likelihood and calculate AIC
    ll = output["L"]
    print("\n")
    print(par_number)
    print(ll)
    AIC = (2 * par_number) - (2 * ll)
    output["AIC"] = AIC
    if n_spikes:
        output["model"] = "{0}_{1}".format(model, n_spikes)
    else:
        output["model"] = model
    remove_file(MDE_output)
    os.chdir(current_dir)
    return(output)
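# Worked example of the AIC computed above: a two-parameter model (e.g. gamma)
# with log likelihood -1500.0 gives AIC = (2 * 2) - (2 * -1500.0) = 3004.0.
# Hedged usage sketch (the input path is hypothetical):
# output = mDFEest("gamma", "temp_data/mDFEest_input.txt", pop_change=True)
# output["model"], output["AIC"]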
def get_ss_strength(exons, genome_file, upstream=True, five=True, exonic=3, intronic=6):
    """
    Given a set of exons, get an estimate of splice site strength.
    :param exons: Dictionary of CDS lines.
    :param genome_file: File with genome sequence.
    :param upstream: evaluate the (5' or 3') splice site of the upstream intron (rather than downstream)
    :param five: evaluate the 5' splice site (rather than 3')
    :param exonic: how many nucleotides to include from the exon
    :param intronic: how many nucleotides to include from the intron
    :return: a dictionary with the splice site strength for each exon
    """
    # will contain the splice site strengths
    out_dict = {}
    # will contain the names of the exons so that later on, we'd know which
    # splice site strength value goes with which exon
    names = []
    # write splice site coordinates to GTF
    hk.make_dir("temp_data")
    temp_file_name = "temp_data/ss_sequences.gtf"
    with open(temp_file_name, "w") as temp_file:
        writer = csv.writer(temp_file, delimiter="\t")
        for transcript in exons:
            curr_exons = exons[transcript]
            for pos, exon in enumerate(curr_exons):
                # don't analyze first exons
                if (pos != 0):
                    # cause you can't do the downstream intron of the last exon
                    if (upstream or (pos != len(curr_exons) - 1)):
                        if five:
                            if upstream:
                                template = curr_exons[pos - 1].copy()
                            else:
                                template = exon.copy()
                            if template[6] == "+":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                            elif template[6] == "-":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                        else:
                            if upstream:
                                template = exon.copy()
                            else:
                                template = curr_exons[pos + 1].copy()
                            if template[6] == "+":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                            elif template[6] == "-":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                        # this is for scaffolds etc.
                        if template[3] >= 0:
                            # so you'd know the order of the values in the MaxEntScan output
                            names.append("{0}.{1}".format(transcript, pos - 1))
                            writer.writerow(template)
    # make a FASTA with the splice site sequences
    temp_fasta_file_name = "{0}.fasta".format(temp_file_name[:-4])
    hk.run_process(["bedtools", "getfasta", "-fi", genome_file, "-bed", temp_file_name,
                    "-fo", temp_fasta_file_name, "-s"])
    # filter the FASTA for Ns
    fasta_lines = []
    with open(temp_fasta_file_name) as fasta:
        for line in fasta:
            if line[0] == ">":
                curr_name = line
            else:
                if "N" not in line:
                    fasta_lines.append(curr_name)
                    fasta_lines.append(line)
    with open(temp_fasta_file_name, "w") as fasta:
        for line in fasta_lines:
            fasta.write(line)
    # run MaxEntScan on the FASTA
    # lazy hardcoded path, replace as appropriate...
    mes_direct = "/Users/rsavisaar/Software/MaxEntScan/fordownload"
    if five:
        cmd = "{0}/score5.pl".format(mes_direct)
    else:
        cmd = "{0}/score3.pl".format(mes_direct)
    temp_mes_file_name = "{0}_mes.txt".format(temp_file_name[:-4])
    hk.run_process(["perl", cmd, temp_fasta_file_name], file_for_output=temp_mes_file_name, verbose=True)
    hk.remove_file(temp_fasta_file_name)
    hk.remove_file(temp_file_name)
    # read in the splice site scores and store them in the output dictionary
    with open(temp_mes_file_name, newline="") as mes_file:
        reader = csv.reader(mes_file, delimiter="\t")
        for pos, line in enumerate(reader):
            out_dict[names[pos]] = float(line[1])
    hk.remove_file(temp_mes_file_name)
    return (out_dict)
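# Hedged usage sketch (paths and the transcript ID are hypothetical): keys are
# "transcript.(pos - 1)" strings, so "ENST00000000001.0" would be the score for
# the splice site associated with the second exon of that transcript.
# exons = rw.read_gtf("data/annotation.gtf", element="exon", gene=False)
# ss_strength = get_ss_strength(exons, "data/genome.fa", upstream=True, five=True)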