def main(): description = "Pick out the multi-exon genes from a dataset and generate families." args = parse_arguments(description, ["features_file", "genome", "dataset", "fasta"]) [features_file, genome, dataset, fasta] = [args.features_file, args.genome, args.dataset, args.fasta] #set up global feature set and get relevant sequence features from it fs = Feature_Set(features_file, genome) fs.set_dataset(dataset) exons = fs.get_exons() exon_numbers = fs.get_exon_numbers(exons) output_fasta_name = "{0}_multiexon.fasta".format(fasta[:-6]) #get multi-exon genes multi_exon = [i for i in exon_numbers if exon_numbers[i] > 1] #create a new feature set for multi-exon genes only fs_new = Feature_Set(features_file, genome) fs_new.create_dataset("{0}_multiexon".format(dataset), input_list = multi_exon) fs_new.set_dataset("{0}_multiexon".format(dataset)) #also write a fasta with the ORF sequences names, seqs = rw.read_fasta(fasta) seqs = [seqs[pos] for pos, i in enumerate(names) if i in multi_exon] names = [i for i in names if i in multi_exon] rw.write_to_fasta(names, seqs, output_fasta_name) #find paralogous families transcripts = fs_new.get_transcripts() gene_name_dict = fs_new.get_gene_name_dict(transcripts) conservation.find_families(output_fasta_name, "general/{0}_multiexon".format(dataset))
def main(): description = "Write the median motif lengths of a series of motif sets to file." args = parse_arguments(description, ["input_file", "output_file"]) [input_file, output_file] = [args.input_file, args.output_file] #parse motifs from FASTA names, motifs = rw.read_fasta(input_file) motifs = [i.split("|") for i in motifs] motif_lengths = [[len(j) for j in i] for i in motifs] #write down and print out motif lengths with open(output_file, "w") as file: for pos, lengths_list in enumerate(motif_lengths): file.write("{0}\t{1}\n".format(names[pos], np.median(lengths_list))) print(np.median(lengths_list))
def CpG_frequency(fasta, hits, controls):
    '''
    Compare the CpG frequency at hit vs control sites.
    '''
    #parse fasta into dictionary
    names, seqs = rw.read_fasta(fasta)
    seqs = {names[i]: seqs[i] for i in range(len(names))}

    hit_site_counter = 0
    hit_CpG_counter = 0
    control_site_counter = 0
    control_CpG_counter = 0

    #for each transcript
    for name in hits:
        seq = seqs[name]
        #get all dinucleotides in hits/controls
        current_true_dints = [seq[i - 1:i + 1] for i in hits[name] if i != 0] + [seq[i:i + 2] for i in hits[name] if i != (len(seq) - 1)]
        current_control_dints = [seq[i - 1:i + 1] for i in controls[name] if i != 0] + [seq[i:i + 2] for i in controls[name] if i != (len(seq) - 1)]
        #store total number of sites
        hit_site_counter = hit_site_counter + len(current_true_dints)
        control_site_counter = control_site_counter + len(current_control_dints)
        #check how many are CpG
        hit_CpG_counter = hit_CpG_counter + len([i for i in current_true_dints if i == "CG" or i == "GC"])
        control_CpG_counter = control_CpG_counter + len([i for i in current_control_dints if i == "CG" or i == "GC"])

    #calculate overall frequency
    hit_freq = hit_CpG_counter / hit_site_counter
    control_freq = control_CpG_counter / control_site_counter
    print("Hit CpG frequency: {0}.".format(hit_freq))
    print("Control CpG frequency: {0}.".format(control_freq))
    return (hit_freq, control_freq)
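#Illustrative usage sketch for CpG_frequency (not part of the original module; the file name and positions
#are made up). The hits/controls dictionaries are assumed to map transcript names (matching the fasta
#headers) to lists of 0-based positions within each sequence, which is how the dinucleotides are sliced above.
def _CpG_frequency_example():
    hits = {"ENST00000000001": [4, 10, 11]}
    controls = {"ENST00000000001": [20, 35, 36]}
    hit_freq, control_freq = CpG_frequency("example_CDSs.fasta", hits, controls)
    return hit_freq, control_freq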
def main(): parser = argparse.ArgumentParser(description="Prepare a clean dataset of protein-coding genes.") parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features") parser.add_argument("ortholog_features_file_name", type = str, help = "name of GTF file with genome features for the orthologous genome") parser.add_argument("genome", type = str, help = "genome assembly name") parser.add_argument("ortholog_genome", type = str, help = "ortholog genome assembly name") parser.add_argument("dataset_name", type = str, help = "dataset name") parser.add_argument("ortholog_dataset_name", type = str, help = "ortholog dataset name") parser.add_argument("orthologs_file_name", type = str, help = "csv with orthologous pairs") parser.add_argument("dS_threshold", type = float, help = "csv with orthologus pair") parser.add_argument("alignment_folder", type = str, help = "folder where phy alignment files will be stored") parser.add_argument("raw_orth_seq_file", type = str, help = "file with the raw ortholog CDS sequences (downloaded via ensembl biomart)") args = parser.parse_args() [features_file_name, ortholog_features_file_name, genome, ortholog_genome, dataset_name, ortholog_dataset_name, orthologs_file_name, dS_threshold, alignment_folder, raw_orth_seq_file] = [args.features_file_name, args.ortholog_features_file_name, args.genome, args.ortholog_genome, args.dataset_name, args.ortholog_dataset_name, args.orthologs_file_name, args.dS_threshold, args.alignment_folder, args.raw_orth_seq_file] make_dir(alignment_folder) trans_id_pattern = re.compile("ENS\w*T\d*") ids_to_keep = [] #loop over an ensembl GTF file with open(features_file_name) as features_file: #skip the metadata for i in range(5): features_file.readline() for i in features_file: #only consider features that have been localized to chromosomes and that are from protein-coding genes if "PATCH" not in i and "gene_biotype \"protein_coding\"" in i and i[0] in "123456789XY" and i[1] in "0123456789XY\t": trans_id_obj = re.search(trans_id_pattern, i) if trans_id_obj: trans_id = trans_id_obj.group(0) #store the transcript ID ids_to_keep.append(trans_id) #make a list of the unique transcript IDs you got in the previous step ids_to_keep = list(set(ids_to_keep)) #create a feature set object from the transcript IDs, #that is to say, make a file that has all the associated gene feature annotations fs = Feature_Set(features_file_name, genome) #the dataset only needs to be created if it didn't exist previously ## fs.create_dataset(dataset_name, input_list = ids_to_keep) fs.set_dataset(dataset_name) print("Created dataset with {0} transcripts.".format(len(fs.names))) #this file will have the mappings between genes from the focal species and genes from the orthologus species final_pairs_file_name = "general/{0}_{1}_pc_pairs.csv".format(genome, ortholog_genome) CDS = fs.get_CDS() CDS = {i: CDS[i] for i in CDS if CDS[i]} #write the full ORF sequences of the genes to FASTA, filtering based on reading frame integrity. Also check that #there are no premature termination codons. fs.write_full_CDS(CDS, check_ORF = True, bare_name = True, PTC_check = True) ids_to_keep = rw.read_fasta("{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset))[0] print("{0} transcripts pass the check for ORF integrity.".format(len(ids_to_keep))) transcripts = fs.get_transcripts() transcripts = {i: transcripts[i] for i in ids_to_keep} #for genes with several associated transcript IDs, only keep the longest. 
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    ids_to_keep = []
    for gene in gene_name_dict:
        current_CDS = [CDS[j] for j in gene_name_dict[gene]]
        current_lengths = [sum([j[0][3] - j[0][2] + 1 for j in k]) for k in current_CDS]
        id_to_keep = gene_name_dict[gene][current_lengths.index(max(current_lengths))]
        ids_to_keep.append(id_to_keep)
    print("After only keeping one transcript per gene (the longest), {0} transcripts remain.".format(len(ids_to_keep)))

    #this is a file that has the orthologs of your genes from Ensembl biomart
    orth_data = rw.read_many_fields(orthologs_file_name, ",")

    #make a dictionary for the gene-to-ortholog mapping
    pairs_dict = {}
    for line in orth_data:
        if line[1] not in pairs_dict:
            pairs_dict[line[1]] = []
        pairs_dict[line[1]].append(line[2])

    #only keep genes for which there is an ortholog in the comparator species
    #transcript identifiers
    ids_to_keep = [i for i in ids_to_keep if i in pairs_dict]
    #gene identifiers
    orth_ids_to_keep = list(pairs_dict.values())
    orth_ids_to_keep = list(set(flatten(orth_ids_to_keep)))

    #create a feature set for the other species based on the genes that are orthologous to the genes in your focal set
    orth_fs = Feature_Set(ortholog_features_file_name, ortholog_genome)
##    orth_fs.create_dataset(ortholog_dataset_name, input_list = orth_ids_to_keep, input_type = "gene")
    orth_fs.set_dataset(ortholog_dataset_name)
    orth_CDS = orth_fs.get_CDS()
    orth_CDS = {i: orth_CDS[i] for i in orth_CDS if orth_CDS[i]}
    #write the ortholog ORFs to FASTA. Filter based on reading frame integrity and PTC content.
    orth_fs.write_full_CDS(orth_CDS, check_ORF = True, bare_name = True, PTC_check = True)
    orth_full_CDS_file = "{0}_{1}_full_CDS.fasta".format(ortholog_features_file_name[:-4], ortholog_dataset_name)

    #in some cases, if the genome assembly for the ortholog is not very good, it can take forever to get the sequences using faidx.
    #In that case, you can get the sequences via biomart instead. Uncomment the code below!
##    rw.write_names(list(orth_CDS.keys()), "general/{0}_trans_IDs.txt".format(ortholog_dataset_name))
##    with open(raw_orth_seq_file) as file:
##        raw_orth_seq = "".join(file)
##    raw_orth_seq = re.sub("([A-Z])\n([A-Z])", "\\1\\2", raw_orth_seq)
##    raw_orth_seq = raw_orth_seq.split("\n")
##    raw_orth_seq = [i for i in raw_orth_seq if len(i) > 0]
##    raw_orth_names = [i for i in raw_orth_seq if i[0] == ">"]
##    raw_orth_seq = [i for i in raw_orth_seq if i[0] != ">"]
##    with open(orth_full_CDS_file, "w") as file:
##        for pos, seq in enumerate(raw_orth_seq):
##            ORF_check = check_ORF_integrity(seq, PTC_check = True)
##            if ORF_check[0]:
##                file.write("{0}\n".format(raw_orth_names[pos]))
##                file.write("{0}\n".format(seq))
##            else:
##                print(pos)
##                print(ORF_check[1])
##                print(raw_orth_names[pos])
##                print(seq)
##                print("\n")

    #read in the full ORF sequences from both species
    CDS_names, CDS_seq = rw.read_fasta("{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset))
    orth_CDS_names, orth_CDS_seq = rw.read_fasta(orth_full_CDS_file)

    orth_transcripts = orth_fs.get_transcripts()
    orth_gene_name_dict = orth_fs.get_gene_name_dict(orth_transcripts)

    final_pairs = {}
    counter = 0

    #loop over the remaining genes
    for i in ids_to_keep:
        if counter % 1000 == 0:
            print(counter)
        counter = counter + 1
        #get the IDs of the orthologous genes in the ortholog species
        orth_ids = pairs_dict[i]
        #get all the associated transcript identifiers
        orth_ids_trans = [[orth_gene_name_dict[j][k] for k in range(len(orth_gene_name_dict[j]))] for j in orth_ids if j in orth_gene_name_dict]
        orth_ids_trans = flatten(orth_ids_trans)
        CDS = CDS_seq[CDS_names.index(i)]
        orth_CDS = []
        ids_to_remove = []
        #get all the ortholog ORF sequences
        for j in orth_ids_trans:
            try:
                current_CDS = orth_CDS_seq[orth_CDS_names.index(j)]
                orth_CDS.append(current_CDS)
            #this is because some of the transcripts produced from the gene might be non-coding or have a wonky ORF and therefore not appear in the CDS fasta
            except ValueError:
                ids_to_remove.append(j)
        orth_ids_trans = [j for j in orth_ids_trans if j not in ids_to_remove]
        #check that the sequence from the focal species aligns to an ortholog with dN/dS below 0.5 and dS below the specified threshold
        if orth_ids_trans:
            conservation_check = keep_conserved_pc(i, orth_ids_trans, CDS, orth_CDS, dS_threshold, alignment_folder)
            if conservation_check[0]:
                #also store which ortholog transcript gave the lowest dS in the alignment
                final_pairs[i] = conservation_check[1]

    print("After filtering by conservation, {0} transcripts remain.".format(len(list(final_pairs.values()))))

    #write the final retained ortholog gene pairs to file
    with open(final_pairs_file_name, "w") as file:
        output_writer = csv.writer(file, delimiter = ",")
        for i in final_pairs:
            output_writer.writerow([i, final_pairs[i]])

    print("Wrote ortholog pairs to {0}.".format(final_pairs_file_name))

    #write the remaining ORF sequences to fasta
    CDS_seq = [i for pos, i in enumerate(CDS_seq) if CDS_names[pos] in final_pairs]
    CDS_names = [i for i in CDS_names if i in final_pairs]
    rw.write_to_fasta(CDS_names, CDS_seq, "general/filtered_{0}_wo_low_omega.fasta".format(dataset_name))

    #create a feature set with the remaining genes
    filtered_fs = Feature_Set(features_file_name, genome)
    filtered_fs.create_dataset("filtered_{0}".format(dataset_name), input_list = list(final_pairs.keys()))

    print("All done.")
def main(): description = "Calculate the combined density of a set of motif sets." args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "correspondances_file_name", "alignment_folder_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "output_suffix", "validity_folder_name", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "gene_families", "newer_filters", "baseml"], ints = [7, 12], flags = [15, 16, 17, 18, 19, 20, 21, 22]) [motifs_file_name, summary_file_name, dataset_name, correspondances_file_name, alignment_folder_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, output_suffix, validity_folder_name, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, gene_families, newer_filters, baseml] = [args.motifs_file_name, args.summary_file_name, args.dataset_name, args.correspondances_file_name, args.alignment_folder_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.output_suffix, args.validity_folder_name, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.gene_families, args.newer_filters, args.baseml] #make a dictionary with RBPs as keys and ND/p values as values. if summary_file_name != "None": summary_data = rw.read_many_fields(summary_file_name, "\t") #because some of the files are tab-separated, while others are comma-separated and have a header row if len(summary_data[0]) == 1: summary_data = rw.read_many_fields(summary_file_name, ",") summary_data = summary_data[1:] summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True) #make a dictionary with RBPs as keys and lists of associated motifs as values motifs = rw.read_motifs(motifs_file_name) #if you only want to be using a subset of the motifs if not full_set: #which RBPs fulfill the necessary information content criteria? 
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(validity_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
        #motifs with negative ND
        if negative_ND:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0) and (validity[RBP] == "True")]
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) and (validity[RBP] == "True")]
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] > 0.9) and (validity[RBP] == "True")]
        #motifs with positive ND
        else:
            motifs = [motifs[RBP] for RBP in motifs if (summary_dict[RBP] >= 0) and (validity[RBP] == "True")]
    else:
        motifs = list(motifs.values())

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(motifs)))

    make_dir(output_folder_name)

    #prepare a Feature_Set object (a genome gtf associated to a particular genome and to a set of transcript identifiers)
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        transcripts = fs.get_transcripts()
        CDS = fs.get_CDS()
        #paralogous families
        families = rw.read_families(families_file_name)
        #the families file might use gene identifiers, whereas the Feature_Set object uses transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        #pick a random member from each paralogous family
        picked_trans = fs.pick_random_members()
        names = rw.read_fasta(fasta_name)[0]
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    if baseml:
        method = "baseml"
    else:
        method = "gy"

    #write the input data for the conservation analysis into a file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_name, input_dict_file_name, picked = picked)

    with open(output_file_name, "w") as file:
        file.write(",".join(["real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        #make n_sim simulant sets for the motifs, filtering the simulants based on different sets of criteria
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1, no_duplicates = True, concat = False)
        else:
            simulants = nc.make_simulants(motifs, n_sim, seed = 100)
        #file where the simulant dS values will be stored
        sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, output_suffix)
        #calculate dS within motifs and simulants
        output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
        print(output_dict)
        print("\n")
        #write to output file
        if output_dict != None:
            file.write(",".join([str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
        else:
            file.write(",".join([str(None), str(None), str(None), str(None), str(None)]))

    os.remove(input_dict_file_name)
def test_get_GC4(self):
    names, sequences = rw.read_fasta("tests/test_get_GC4_input.fasta")
    expected = [1.0, 1.0, 1.0, 0.5, 2/3]
    phases = [2, 0, 0, 1, 0]
    observed = [get_GC4(sequences[i], phases[i]) for i in range(len(sequences))]
    self.assertEqual(observed, expected)
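#Minimal sketch of the statistic the test above exercises (this is NOT the actual get_GC4 implementation,
#just an illustration under the assumption that GC4 is the G/C fraction at third positions of fourfold
#degenerate codons, with phase taken as the offset of the first complete codon).
def _GC4_sketch(sequence, phase):
    #codon prefixes whose third position is fourfold degenerate (Leu, Val, Ser, Pro, Thr, Ala, Arg, Gly)
    fourfold_prefixes = {"CT", "GT", "TC", "CC", "AC", "GC", "CG", "GG"}
    third_bases = []
    for i in range(phase, len(sequence) - 2, 3):
        codon = sequence[i:i + 3]
        if codon[:2] in fourfold_prefixes:
            third_bases.append(codon[2])
    if not third_bases:
        return None
    return (third_bases.count("G") + third_bases.count("C")) / len(third_bases)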
def main(): parser = argparse.ArgumentParser(description="Calculate the conservation level of a series of RBP motifs.") parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features") parser.add_argument("dataset_name", type = str, help = "dataset name") parser.add_argument("genome", type = str, help = "genome assembly name") parser.add_argument("RBP_file_name", type = str, help = "name of file with RBP motifs") parser.add_argument("correspondances_file_name", type = str, help = "name of file with correspondances between genes in dataset and orthologs") parser.add_argument("fasta_file_name", type = str, help = "name of fasta file with the sequences") parser.add_argument("families_file_name", type = str, help = "name of file that contains families") parser.add_argument("output_file_name", type = str, help = "file for output data") parser.add_argument("output_folder_name", type = str, help = "folder that will contain simulated dS scores") parser.add_argument("alignment_folder_name", type = str, help = "name of folder that contains alignments") parser.add_argument("n_sim", type = int, help = "number of simulants") parser.add_argument("--valid_file", nargs = "?", const = "False") parser.add_argument("--gene_families", action = "store_true", help = "does the families file use gene identifiers?") parser.add_argument("--markov", dest = "markov", action = "store_true", help = "Should simulants be generated using a Markov model?") parser.add_argument("--new_filters", dest = "new_filters", action = "store_true", help = "Should simulants be generated using the old method but capping mononucleotide runs and removing existing motifs?") parser.add_argument("--newer_filters", dest = "newer_filters", action = "store_true", help = "Like new_filters but without concatenation and without allowing duplicates within simulant sets.") parser.add_argument("--goldman_yang", dest = "goldman_yang", action = "store_true", help = "Should Goldman & Yang's method be used for calculating dS?") parser.add_argument("--baseml", dest = "baseml", action = "store_true", help = "Should baseml be used instead of codeml?") args = parser.parse_args() [features_file_name, dataset_name, genome, RBP_file_name, correspondances_file_name, output_folder_name, fasta_file_name, families_file_name, output_file_name, output_folder_name, alignment_folder_name, n_sim, valid_file, gene_families, markov, new_filters, newer_filters, goldman_yang, baseml] = [args.features_file_name, args.dataset_name, args.genome, args.RBP_file_name, args.correspondances_file_name, args.output_folder_name, args.fasta_file_name, args.families_file_name, args.output_file_name, args.output_folder_name, args.alignment_folder_name, args.n_sim, args.valid_file, args.gene_families, args.markov, args.new_filters, args.newer_filters, args.goldman_yang, args.baseml] #pick a random member from each paralogous family if features_file_name != "None": fs = Feature_Set(features_file_name, genome) fs.set_dataset(dataset_name) families = rw.read_families(families_file_name) #if the families file uses gene identifiers rather than transcript identifiers if gene_families: families = fs.convert_families_to_ENST(families, transcripts) fs.add_families(families) picked_trans = fs.pick_random_members() #if the fasta uses gene identifiers but the feature set uses transcript identifiers names = rw.read_fasta(fasta_file_name)[0] if picked_trans[0] not in names: transcripts = fs.get_transcripts() picked = [] for i in picked_trans: 
                picked.append(fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG"))
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    motif_dict = rw.read_motifs(RBP_file_name)

    #valid_file says which proteins pass information content criteria. Only analyze the ones that do.
    if not valid_file:
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
    elif valid_file == "None":
        validity = {i: "True" for i in motif_dict}
    else:
        validity = rw.read_many_fields(valid_file, "\t")
        validity = list_to_dict(validity, 0, 1)
    protein_names = sorted([name for name in list(motif_dict.keys()) if validity[name] == "True"])

    #whether to use PAML baseml, codeml (Goldman & Yang) or yn00
    if baseml:
        method = "baseml"
    elif goldman_yang:
        method = "gy"
    else:
        method = "yn"

    #write the input data for the conservation analysis to file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_file_name, input_dict_file_name, picked = picked)

    with open(output_file_name, "w") as file:
        file.write(",".join(["protein_name", "real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
        file.write("\n")
        for protein in protein_names:
            print(protein)
            motifs = motif_dict[protein]
            #use one of several different methods to generate simulant motifs
            if markov:
                simulants = nc.make_simulants_markov(motifs, n_sim)
            elif new_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True)
            elif newer_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, no_duplicates = True, concat = False, seed = 1)
            else:
                simulants = nc.make_simulants(motifs, n_sim)
            sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, protein)
            #determine the conservation parameters of the current protein
            output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
            print(output_dict)
            print("\n")
            if output_dict != None:
                file.write(",".join([protein, str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
            else:
                file.write(",".join([protein, str(None), str(None), str(None), str(None), str(None)]))
            file.write("\n")

    os.remove(input_dict_file_name)
def main(): parser = argparse.ArgumentParser(description="Calculate the density of a series of RBP motifs in any type of sequence.") parser.add_argument("RBP_file_name", type = str, help = "name of file with RBP motifs") parser.add_argument("output_folder_name", type = str, help = "name of folder that will contain analysis results") parser.add_argument("output_file_name", type = str, help = "name of file that will contain analysis results") parser.add_argument("input_file_name", type = str, help = "name of fasta file with the sequences") parser.add_argument("n_sim", type = int, help = "number of simulants") parser.add_argument("features_file_name", type = str, help = "name of GTF file") parser.add_argument("genome", type = str, help = "genome name") parser.add_argument("dataset_name", type = str, help = "dataset name") parser.add_argument("families_file_name", type = str, help = "families file name") parser.add_argument("--simulants_within", dest = "simulants_within", action = "store_true", help = "Should simulants be generated only from dinucleotides within each particular motif?") parser.add_argument("--sequence_control", dest = "sequence_control", action = "store_true", help = "Should shuffled sequences be used as control?") parser.add_argument("--remove_stops", dest = "remove_stops", action = "store_true", help = "Should simulant motifs not incldue motifs that contain stop codon sequences? (boolean)") parser.add_argument("--markov", dest = "markov", action = "store_true", help = "Should simulants be generated using a Markov model?") parser.add_argument("--new_filters", dest = "new_filters", action = "store_true", help = "Should simulants be generated using the old method but capping mononucleotide runs and removing existing motifs?") parser.add_argument("--no_concat", dest = "no_concat", action = "store_true", help = "Should a density be calculated for each gene?") parser.add_argument("--newer_filters", dest = "newer_filters", action = "store_true", help = "Like new_filters, but also not allowing duplicates in the simulants and without concatenation.") parser.add_argument("--two_seqs", dest = "two_seqs", action = "store_true", help = "Set to true if the sequence fasta has two sequences separated by a pipe in each line.") args = parser.parse_args() [RBP_file_name, output_folder_name, output_file_name, input_file_name, n_sim, features_file_name, genome, dataset_name, families_file_name, simulants_within, sequence_control, remove_stops, markov, new_filters, no_concat, newer_filters, two_seqs] = [args.RBP_file_name, args.output_folder_name, args.output_file_name, args.input_file_name, args.n_sim, args.features_file_name, args.genome, args.dataset_name, args.families_file_name, args.simulants_within, args.sequence_control, args.remove_stops, args.markov, args.new_filters, args.no_concat, args.newer_filters, args.two_seqs] make_dir(output_folder_name) #if you want to average over families if features_file_name != "None": fs = Feature_Set(features_file_name, genome) fs.set_dataset(dataset_name) families = rw.read_families(families_file_name) fs.add_families(families) else: fs = None #if concat, sum motif hit base counts across sequences and divide by the total sequence length, #otherwise produce a density estimate separately for each sequence and use the median as the final statistic if no_concat: concat = False else: concat = True #read in RBP motifs RBPs, motifs = rw.read_fasta(RBP_file_name) with open(output_file_name, "w") as output_file: for pos, RBP in enumerate(RBPs): curr_motifs = 
motifs[pos].split("|") #if, as control, you want to shuffle the codons within sequences if sequence_control: current_simulants = 3 output_suffix = "_sequence_control" #if, as control, you want to calculate the density of simulant motifs else: #generate simulant motifs, applying different sets of filters onto the simulant motifs output_suffix = "" if simulants_within: current_simulants = nc.make_simulants_within(curr_motifs, n_sim) elif markov: current_simulants = nc.make_simulants_markov(curr_motifs, n_sim, remove_stops = remove_stops, remove_existing = True) elif new_filters: current_simulants = nc.make_simulants(curr_motifs, n_sim, remove_stops = remove_stops, remove_existing = True, cap_runs = True) elif newer_filters: current_simulants = nc.make_simulants(curr_motifs, n_sim, remove_stops = remove_stops, remove_existing = True, cap_runs = True, no_duplicates = True, concat = False) else: current_simulants = nc.make_simulants(curr_motifs, n_sim, remove_stops = remove_stops) #get raw density, normalized density, p, Z... for current RBP current_dict = nc.get_sequence_set_density(input_file_name, None, curr_motifs, current_simulants, n_sim, "{0}/{1}_{2}_density.csv".format(output_folder_name, RBP, output_suffix), "{0}/{1}_{2}_sim_density.csv".format(output_folder_name, RBP, output_suffix), "{0}/{1}_{2}_positions.csv".format(output_folder_name, RBP, output_suffix), "{0}/{1}_{2}_sim_positions".format(output_folder_name, RBP, output_suffix), concat = concat, positions = False, feature_set = fs, two_seqs = two_seqs) if concat: current_record = [RBP, str(current_dict["density"]), str(np.mean(current_dict["simulated densities"])), str(current_dict["ND"]), str(current_dict["effective p"]), str(current_dict["Z"]), str(current_dict["depletion p"]), str(len(curr_motifs)), str(current_dict["simulant sd"])] else: current_record = [RBP, str(current_dict["median density"]), str(np.mean(current_dict["simulated densities"])), str(current_dict["median ND"]), str(current_dict["effective p"]), str(current_dict["Z"]), str(current_dict["depletion p"]), str(len(curr_motifs)), str(current_dict["simulant sd"])] output_file.write("\t".join(current_record)) output_file.write("\n") print(current_record)
def main(): description = "Run INSIGHT on a set of sequences and a set of sites." args = parse_arguments(description, ["fasta", "genome", "features_file", "families_file", "suffix", "dataset", "output_folder", "freq_threshold", "n", "hit_file", "control_file", "SNP_file_name_prefix", "CDS_SNP_file_name_prefix", "MSA_file_name_prefix", "trial_file", "trials", "hit_degen_file", "control_degen_file", "hit_reduce", "control_reduce", "new_SNPs", "new_MSA", "shuffle", "nonsyn_hits", "remove_GT", "big_tree"], floats = [7, 18, 19], ints = [8, 15], flags = [20, 21, 22, 23, 24, 25]) fasta, genome, features_file, families_file, suffix, dataset, general_output_folder, freq_threshold, n, hit_file, control_file, SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix, trial_file, trials, hit_degen_file, control_degen_file, hit_reduce, control_reduce, new_SNPs, new_MSA, shuffle, nonsyn_hits, remove_GT, big_tree = args.fasta, args.genome, args.features_file, args.families_file, args.suffix, args.dataset, args.output_folder, args.freq_threshold, args.n, args.hit_file, args.control_file, args.SNP_file_name_prefix, args.CDS_SNP_file_name_prefix, args.MSA_file_name_prefix, args.trial_file, args.trials, args.hit_degen_file, args.control_degen_file, args.hit_reduce, args.control_reduce, args.new_SNPs, args.new_MSA, args.shuffle, args.nonsyn_hits, args.remove_GT, args.big_tree output_folder = "{0}/{1}_{2}".format(general_output_folder, dataset, suffix) names, seqs = rw.read_fasta(fasta) #prepare feature set and family information fs = Feature_Set(features_file, genome) fs.set_dataset(dataset) if families_file == "None": conservation.find_families(fasta, "general/{0}".format(dataset)) families_file = "general/{0}_families.txt".format(dataset) families = rw.read_families(families_file) fs.add_families(families) make_dir(output_folder) general_folder = "DFE/for_everybody" make_dir(general_folder) if MSA_file_name_prefix == "None": MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset) #read in degeneracy information if hit_degen_file != "None": degen_hits = parse_degen(hit_degen_file) degen_controls = parse_degen(control_degen_file) else: degen_hits = None degen_controls = None #get relevant genome features transcripts = fs.get_transcripts() CDSs = fs.get_CDS() lengths = fs.get_lengths(CDSs, CDS = True) #filter out sex chromosomes from the analysis sex_chromosomes = ["X", "Y"] chrom_dict = {i: transcripts[i][0] for i in transcripts if transcripts[i][0] not in sex_chromosomes} chroms = list(set(list(chrom_dict.values()))) clean_names = ["h**o", "pan", "pongo", "macaca"] #if you're running several trials #if just one, it'll still make a single trial file if trial_file == "None": trial_file = "{0}_{1}_{2}.txt".format(trial_file, suffix, trials) with open(trial_file, "w") as o_file: print(suffix) #output file header o_file.write("rho\teta\tgamma\tDp\tPw\talpha\ttau\trhose\tetase\tgammase\trholl\tetall\tgammall\n") for trial in range(trials): print("==========TRIAL {0}==========\n".format(trial)) #get INSIGHT input data as a string based on divergence and SNP data hit_output, neutral_output, chroms_to_keep, hit_counts, control_counts = get_MSA(chroms, chrom_dict, control_file, hit_file, CDSs, lengths, names, seqs, clean_names, freq_threshold, dataset, suffix, genome, output_folder, general_folder, n, SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix, new_SNPs, new_MSA, shuffle, remove_GT, big_tree, hit_reduce = hit_reduce, control_reduce = control_reduce, degen_hits = 
degen_hits, degen_controls = degen_controls) print("Writing output files...") neutral_output_file = "{0}/{1}_{2}_{3}_neutral_input.txt".format(output_folder, dataset, suffix, trial) hit_output_file = "{0}/{1}_{2}_{3}_hit_input.txt".format(output_folder, dataset, suffix, trial) write_output_file(neutral_output_file, neutral_output, n) write_output_file(hit_output_file, hit_output, n) print("Running INSIGHT...") conservation.INSIGHT(neutral_output_file, hit_output_file, freq_threshold, "../Software/INSIGHT", "{0}_{1}".format(dataset, suffix)) print("Counting positions on chromosomes...") with open("{0}/{1}_{2}_pos_per_chrom.csv".format(output_folder, dataset, suffix), "w") as file: file.write("chrom\thits\tcontrols\n") for chrom in sorted(chroms_to_keep): file.write("{0}\t{1}\t{2}\n".format(chrom, hit_counts[chrom], control_counts[chrom])) INSIGHT_output = "../Software/INSIGHT/{0}_{1}.ins.log".format(dataset, suffix) #parse the INSIGHT output and do simple significance testing try: parsed_output = parse_INSIGHT_output(INSIGHT_output) estimates = parsed_output["estimates"] SE = parsed_output["SEs"] lls = parsed_output["chi_sq"] print("\n") print("Chisq statistics: {0}".format(" ".join([str(i) for i in lls]))) rho_pL = scipy.stats.chi2.sf(lls[0], 3) print("pL(rho): {0}".format(rho_pL)) eta_pL = scipy.stats.chi2.sf(lls[1], 1) print("pL(eta): {0}".format(eta_pL)) gamma_pL = scipy.stats.chi2.sf(lls[2], 1) print("pL(gamma): {0}".format(gamma_pL)) lls = "\t".join([str(i) for i in lls]) estimates = "\t".join(estimates) SE = "\t".join(SE) o_file.write(estimates) o_file.write("\t") o_file.write(SE) o_file.write("\t") o_file.write(lls) o_file.write("\n") #skip trials where INSIGHT failed to produce a full output except IndexError: print("Skipping...") pass
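#Illustrative worked example of the significance test used above: the parsed INSIGHT log provides
#likelihood-ratio statistics, and scipy.stats.chi2.sf converts a statistic into a p value given the
#degrees of freedom. For instance, a statistic of 7.81 tested with 3 degrees of freedom (as for rho above)
#gives p of roughly 0.05.
def _chi2_p_example():
    import scipy.stats
    return scipy.stats.chi2.sf(7.81, 3)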
def main(): description = "Construct a site frequency spectrum that only considers motif-disrupting SNPs." args = parse_arguments(description, ["fasta", "output_file", "motif_file", "anc_file", "control_file", "SNPs_file", "N", "old_motif_format", "human", "ancestral"], ints = [6], flags = [7, 8, 9]) fasta, output_file, motif_file, anc_file, control_file, SNPs_file, N, old_motif_format, human, ancestral = args.fasta, args.output_file, args.motif_file, args.anc_file, args.control_file, args.SNPs_file, args.N, args.old_motif_format, args.human, args.ancestral names, seqs = rw.read_fasta(fasta) #I use two different formats for storing sequence motifs, #got to know which on it is if old_motif_format: motifs = rw.read_names(motif_file)[1:] print(len(motifs)) else: motifs = rw.read_motifs(motif_file) motifs = sorted(list(set(flatten(list(motifs.values()))))) #get the lengths of the motifs and compile lookahead regexes #that recognize the whole motif but only store the position of the first bases #these will be needed when searchin for the motifs motif_lengths = [len(i) for i in motifs] motif_regex = nc.motif_to_regex(motifs) #I'm gonna treat CG and GC as two 2-bp motifs, use the same code as wehn searching for, say, #ESE motifs CG_2mers = ["CG", "GC"] CG_lengths = [2, 2] CG_regex = nc.motif_to_regex(CG_2mers) motifs = [list(i) for i in motifs] if ancestral: anc_pos = rw.read_pos(anc_file) #read in hit and control positions controls = rw.read_pos(control_file) hit_file = re.sub("controls", "hits", control_file) hits = rw.read_pos(hit_file) #read in SNP data SNPs = rw.read_many_fields(SNPs_file, "\t") #the second column in the SNPs file contains positions that need to be discarded from analysis because they contain unanalyzable SNP data to_remove = list_to_dict(SNPs, 0, 2) to_remove = {i: to_remove[i].split(",") for i in to_remove} to_remove = {i: [int(j) for j in to_remove[i] if j not in ["error", ""]] for i in to_remove} SNPs = list_to_dict(SNPs, 0, 1) #all the SNPs associated to a transcript full_SNPs = {} #disruptive SNPs only clean_SNPs = {} minor_alleles = {} #the number of hit positions where, say, a T could theoretically substitute to an A (i.e. 
all T positions) transitions_total = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_} #the same as above but only counting those substitutions that would turn a motif into a non-motif transitions_disr = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_} #this block of code filters the true SNPs to only leave those that are disruptive #and also calculates the probability of being disruptive for all potential SNPs with open("{0}_degen.txt".format(hit_file), "w") as hit_degen_file: counter = 0 for trans in names: counter = update_counter(counter, 1000) if trans in controls: if trans in SNPs: trans_SNPs = SNPs[trans] else: trans_SNPs = [] trans_SNPs, clean_SNPs, full_SNPs, minor_alleles = parse_SNPs(trans_SNPs, clean_SNPs, full_SNPs, minor_alleles, trans) current_seq = seqs[names.index(trans)] fourfold_pos = nc.get_4fold_deg(current_seq) #CpG filtering if human: CG_pos = nc.get_motif_set_density(CG_regex, CG_lengths, current_seq, concat = True)["positions"] fourfold_pos = [i for i in fourfold_pos if i not in CG_pos] if ancestral: fourfold_pos = [i for i in fourfold_pos if i not in anc_pos[trans]] all_sites, clean_SNPs, transitions_total, transitions_disr, hit_degen_file = check_disruption(motif_regex, current_seq, motifs, motif_lengths, fourfold_pos, full_SNPs, clean_SNPs, minor_alleles, trans, transitions_total, transitions_disr, hit_degen_file, to_remove) hit_degen_file.write("\n") to_remove = {i: [j for j in to_remove[i] if j not in full_SNPs[i]] for i in to_remove if i in controls} hit_SFS = get_SFS(hits, clean_SNPs, to_remove, N) transitions = get_transitions(transitions_disr, transitions_total) print(transitions) #this block randomly assigns certain SNPs at simulant positions to be disruptive, #with the probability of that happening proportional to the frequency with which potential substitutions #of that nucleotide composition would be disruptive for true (motif) sites with open("{0}_degen.txt".format(control_file), "w") as control_degen_file: control_SNPs = {} counter = 0 for trans in controls: control_degen_file.write("{0}\t".format(trans)) counter = update_counter(counter, 1000) control_SNPs[trans] = {} trans_SNPs = full_SNPs[trans] current_seq = seqs[names.index(trans)] for site in controls[trans]: if trans not in to_remove or site not in to_remove[trans]: ref_allele = current_seq[site] disrupt_bases = get_disrupt_bases(ref_allele, transitions) control_degen_file.write("{0}:{1},".format(site, "|".join(disrupt_bases))) if site in trans_SNPs: minor_allele = minor_alleles[trans][site] if minor_allele in disrupt_bases: control_SNPs[trans][site] = trans_SNPs[site] control_degen_file.write("\n") control_SFS = get_SFS(controls, control_SNPs, to_remove, N) with open(output_file, "w") as file: file.write("{0}\n".format(N)) file.write(" ".join([str(i) for i in hit_SFS])) file.write("\n") file.write(" ".join([str(i) for i in control_SFS])) file.write("\n")
def main(): description = "Calculate the conservation of k-mers that are a single point mutation away from being part of a set of motifs." args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "output_folder_name", "p_column", "alignment_folder_name", "correspondances_file_name", "output_file_name", "dataset_name", "features_file_name", "n_sim", "output_suffix", "sequences_file_name", "families_file_name", "genome", "by_RBP"], ints = [3, 9], flags = [14]) [motifs_file_name, summary_file_name, output_folder_name, p_column, alignment_folder_name, correspondances_file_name, output_file_name, dataset_name, features_file_name, n_sim, output_suffix, sequences_file_name, families_file_name, genome, by_RBP] = [args.motifs_file_name, args.summary_file_name, args.output_folder_name, args.p_column, args.alignment_folder_name, args.correspondances_file_name, args.output_file_name, args.dataset_name, args.features_file_name, args.n_sim, args.output_suffix, args.sequences_file_name, args.families_file_name, args.genome, args.by_RBP] RBPs = rw.read_motifs(motifs_file_name) #only leave those RBPs hat pass information content criteria validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t") validity = list_to_dict(validity, 0, 1) RBPs = {i: RBPs[i] for i in RBPs if validity[i] == "True"} #if you're not doing this by RBP, pool motifs from the most significantly depleted sets if not by_RBP: summary_data = rw.read_many_fields(summary_file_name, "\t") if len(summary_data[0]) == 1: summary_data = rw.read_many_fields(summary_file_name, ",") summary_dict = list_to_dict(summary_data, 0, p_column, floatify = True) RBPs = {i: RBPs[i] for i in RBPs if summary_dict[i] > 0.9} motifs = list(set(flatten(list(RBPs.values())))) RBPs = {"all": motifs} #randomly pick one gene from each paralogous family fs = Feature_Set(features_file_name, genome) fs.set_dataset(dataset_name) transcripts = fs.get_transcripts() families = rw.read_families(families_file_name) families = fs.convert_families_to_ENST(families, transcripts) fs.add_families(families) picked_from_families = fs.pick_random_members() gene_name_dict = fs.get_gene_name_dict(transcripts) picked = [fs.convert_between_ENST_and_ENSG(i, gene_name_dict, "ENSG") for i in picked_from_families] names, CDS = rw.read_fasta(sequences_file_name) #make a dictionary where the keys are genes from the focal species and the values are orthologs from another species correspondances = rw.read_many_fields(correspondances_file_name, ",") correspondance_dict = {} for i in correspondances: correspondance_dict[i[0]] = i[1] output_dict = {} #loop over the RBPs for protein in sorted(RBPs): #fetch the current motifs print(protein) motifs = RBPs[protein] print("There are {0} motifs.".format(len(motifs))) #generate all unique motifs that are a single base substitution away from one of the motifs but are not actually in the set neighbours = nc.get_neighbours(motifs) print("There are {0} neighbours.".format(len(neighbours))) #make simulants for the motifs. don't allow simulants to be part of the set of neighbours. 
        simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, exclude = neighbours, no_duplicates = True, concat = False)
        neighbour_lengths = [len(i) for i in neighbours]
        neighbours = nc.motif_to_regex(neighbours)

        #determine the true frequency at which fourfold degenerate sites that are a single substitution away from a motif in human actually contain the base that
        #would give rise to the motif in the orthologous species
        site_number = 0
        mutation_score = 0
        motifs = [list(i) for i in motifs]
        true_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, motifs, neighbours, neighbour_lengths], get_mutation_to_motif)
        for i in true_result:
            current = i.get()
            site_number = site_number + current[0]
            mutation_score = mutation_score + current[1]
        if site_number > 0:
            real_fraction = mutation_score / site_number
        else:
            real_fraction = None
        print("Real fraction:")
        print(real_fraction)

        neighbours = ""

        sim_site_numbers = np.zeros((n_sim))
        sim_mutation_scores = np.zeros((n_sim))
        #obtain this estimate also for each simulant set
        #I'm doing this in this awkward manner because I don't have enough RAM to hold all the simulated neighbours in memory at once
        for sim in range(n_sim):
            if sim % 10 == 0:
                print(sim)
            current_simulants = simulants[sim]
            current_neighbours = nc.get_neighbours(current_simulants)
            current_neighbour_lengths = [len(i) for i in current_neighbours]
            current_neighbours = nc.motif_to_regex(current_neighbours)
            current_simulants = [list(i) for i in current_simulants]
            current_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, current_simulants, current_neighbours, current_neighbour_lengths], get_mutation_to_motif)
            for i in current_result:
                current = i.get()
                sim_site_numbers[sim] = sim_site_numbers[sim] + current[0]
                sim_mutation_scores[sim] = sim_mutation_scores[sim] + current[1]

        #normalize the real fraction, calculate p
        sim_fractions = np.divide(sim_mutation_scores, sim_site_numbers)
        sim_fractions = [i for i in sim_fractions if i != np.inf]
        p = ms.calc_eff_p(real_fraction, sim_fractions, greater = False)
        norm_fraction = ms.normalize(real_fraction, sim_fractions)

        output_dict[protein] = [protein, mutation_score, site_number, real_fraction, np.mean(sim_fractions), p, norm_fraction]
        print(output_dict[protein])

    with open(output_file_name, "w") as output_file:
        #write header to output file
        output_file.write("protein\tmutation score\tsite number\treal fraction\tmean sim fraction\tp\tnormalized fraction\n")
        #write the rest of the output data
        for protein in sorted(list(output_dict.keys())):
            to_write = output_dict[protein]
            to_write = [str(i) for i in to_write]
            output_file.write("\t".join(to_write))
            output_file.write("\n")
def get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = False, pseudoCG = False, comprehensive = False, subst_model = None, return_tuples = False, regions = False):
    '''
    Get two dictionaries, one that says for each transcript which positions are CpG/GpC in macaque
    and one which positions were likely CpG/GpC in the human-macaque ancestor.
    '''
    names, seqs = rw.read_fasta(fasta)

    #if you're gonna determine ancestral CpG positions from scratch rather than reading them in from an existing file
    #if you want to have the name of the file determined automatically
    if (not anc_CG_file_name) or (anc_CG_file_name == "None"):
        new_CG = True
        phy_file = "temp_data/temp_anc_CG{0}.txt".format(random.random())
    #if you want to give the file a name yourself
    elif not os.path.exists(anc_CG_file_name):
        new_CG = True
    else:
        new_CG = False

    if new_CG:
        print("Will get new CpG data...")

        if len(phylip_data) < 8 and comprehensive:
            print("Comprehensive CpG filtering only in big tree mode!")
            raise Exception

        #if you want to pretend that some other dinucleotides are CpG
        if pseudoCG:
            #the hyphens are there in case the two nucleotides are separated by an indel
            CG_kmers = ["C[\-]*T", "A[\-]*G"]
        else:
            CG_kmers = ["C[\-]*G", "G[\-]*C"]
        CG_kmers = [re.compile(i) for i in CG_kmers]

        macaque_CG_dict = {}
        anc_CG_concat_full = [[[""]], [[""]]]
        tuples_mapping_dict_full = {}

        for chrom in chroms:
            print(chrom)
            #only leave those CDSs that are on the current chromosome
            current_CDSs = {i: CDSs[i] for i in CDSs if CDSs[i][0][0][0] == chrom}
            coords_file = "temp_data/coords_file{0}.txt".format(random.random())
            #check if the MSA is already at the specified location, otherwise retrieve it
            MSA_file = "{0}_{1}.txt".format(MSA_file_name_prefix, chrom)
            if not os.path.isfile(MSA_file):
                print("Obtaining MSA...")
                eo.get_MSA_gene_list(current_CDSs, coords_file, "EPO", "primates", 85, "homo_sapiens", MSA_file)
                os.remove(coords_file)
                eo.flush_tables("localhost", "mysql", "fackel")
            MSA_raw = eo.parse_MSA_output(MSA_file)
            if high_CG_file_name != "None":
                high_CG = rw.read_many_fields(high_CG_file_name, "\t")
                high_CG = {i[0]: [int(j) for j in i[1:]] for i in high_CG}
            else:
                high_CG = None
            #get concatenated sequences (for determining ancestral CpG positions) and macaque CpG information for this chromosome
            anc_CG_concat, macaque_CG_dict, tuples_mapping_dict = get_CpG_dicts_core(MSA_raw, lengths, phylip_data, CG_kmers, macaque_anc, macaque_CG_dict, high_CG, comprehensive = comprehensive, subst_model = subst_model)
            remove_file(coords_file)
            #add that information to the global dictionaries
            anc_CG_concat_full, tuples_mapping_dict_full = update_anc_CG(anc_CG_concat_full, anc_CG_concat, tuples_mapping_dict_full, tuples_mapping_dict)

        phy_files = write_anc_CG(anc_CG_concat_full, anc_CG_file_name, clean_names, macaque_CG_dict)
        pp_file = anc_CG_file_name

    else:

        print("Will read in existing CpG data...")

        pp_file = None
        phy_files = "None"
        high_CG = None
        tuples_mapping_dict_full = None

        macaque_CG_file_name = "{0}_macaque.txt".format(anc_CG_file_name[:-4])
        macaque_CG_dict = rw.read_many_fields(macaque_CG_file_name, "\t")
        macaque_CG_dict = [i for i in macaque_CG_dict if len(i) == 2]
        macaque_CG_dict = list_to_dict(macaque_CG_dict, 0, 1)
        macaque_CG_dict = {i: [int(j) for j in macaque_CG_dict[i].split(",") if j != ""] for i in macaque_CG_dict}

    anc_CG_dict = get_ancestral_CG(pp_file, subst_model, phy_files, "DFE/UCSC_model.mod", tuples_mapping_dict_full, anc_CG_file_name, high_CG = high_CG, macaque = macaque_anc, comprehensive = comprehensive)

    [remove_file(i) for i in phy_files]

    #if you're looking at exon cores/flanks rather than full CDSs
    if regions:
        #you need to have matching bed/fasta files for this to work (with the records in the same order)
        bed = fasta.replace("fasta", "bed")
        transcripts = fs.get_transcripts()
        #for each flank/core, figure out what positions it covers in the full CDS
        mapping_dict = conservation.map_regions_to_CDS(fasta, bed, fs, transcripts, CDSs, trans_ids = True)
        anc_CG_dict = region_CpG(mapping_dict, anc_CG_dict)

    if return_tuples:
        return(anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full)
    else:
        return(anc_CG_dict, macaque_CG_dict)
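#Illustrative sketch (hypothetical file name) of the per-transcript CpG position file parsed in the
#"read in existing CpG data" branch above: one transcript per line, tab-separated, with the second field
#a comma-separated list of integer positions. This mirrors the read_many_fields/list_to_dict/split(",")
#parsing of macaque_CG_file_name without depending on the rw helpers.
def _parse_macaque_CG_sketch(file_name = "example_anc_CG_macaque.txt"):
    out = {}
    with open(file_name) as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) == 2:
                out[fields[0]] = [int(i) for i in fields[1].split(",") if i != ""]
    return out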