def main():
    """Record the distribution of peaks for different exons.

    Reads the command-line arguments, computes a length for every analysed
    junction, writes a coverage BED file and finally writes two distance
    matrices through ``write_dist_mat``: one for all peak positions and one
    for the peak centres.
    """
    description = "Record the distribution of peaks for different exons."
    arg_names = ["peaks_file", "gtf", "exon_starts_file", "output_file", "reads_file",
                 "from_end", "intronic", "limit", "nts_before_start", "noncoding", "reads_mode"]
    args = hk.parse_arguments(description, arg_names, flags = [5, 6, 9, 10], ints = [7, 8])
    peaks_file = args.peaks_file
    gtf = args.gtf
    exon_starts_file = args.exon_starts_file
    output_file = args.output_file
    reads_file = args.reads_file
    from_end = args.from_end
    intronic = args.intronic
    limit = args.limit
    nts_before_start = args.nts_before_start
    noncoding = args.noncoding
    reads_mode = args.reads_mode

    # with --noncoding the "exon" features are read from the GTF, otherwise the "CDS" ones
    feature_type = "exon" if noncoding else "CDS"
    exons = rw.read_gtf(gtf, feature_type, gene=False)

    # the 3' ss that will be analyzed; field 3 of each record is the transcript ID
    junction_records = rw.read_many_fields(exon_starts_file, "\t")
    valid_junctions = [record[3] for record in junction_records]

    lengths_dict = co.get_lengths(exons, valid_junctions, intronic=intronic)
    if nts_before_start:
        # every length is increased by the requested number of upstream nucleotides
        lengths_dict = {key: lengths_dict[key] + nts_before_start for key in lengths_dict}

    # coverage file name: junction file stem + reads file base name (both minus 4-char extension)
    reads_stem = reads_file.split("/")[-1][:-4]
    coverage_file_name = "{0}_{1}_coverage.bed".format(exon_starts_file[:-4], reads_stem)
    co.get_coverage(exon_starts_file, reads_file, coverage_file_name)

    peak_distances_all, peak_centres = co.peak_pos_in_exon(
        exon_starts_file, peaks_file, from_end = from_end, reads_mode = reads_mode)

    output_stem = output_file[:-4]
    write_dist_mat(peak_distances_all, limit, output_file, lengths_dict,
                   "{0}_intron_names.txt".format(output_stem), None)
    write_dist_mat(peak_centres, limit, "{0}_centres.txt".format(output_stem), lengths_dict,
                   "{0}_centres_intron_names.txt".format(output_stem), None)
def main():
    """Shuffle which positions are hits and which are controls, gene by gene.

    If ``hit_reduce`` is greater than 0, no shuffling is done: the hit and
    control dictionaries are passed through ``reduce_dict`` (with
    ``hit_reduce`` and ``control_reduce`` respectively) and written out.
    Otherwise, for every gene, the hit and control positions are pooled, as
    many positions as there were hits are drawn at random without
    replacement to become the new hits, and the remainder become the new
    controls.
    """
    description = "Take a hits file and a control file and shuffle which elements are in which."
    args = parse_arguments(description, [
        "input_hits", "input_controls", "output_hits", "output_controls",
        "hit_reduce", "control_reduce"
    ], floats=[4, 5])
    input_hits, input_controls, output_hits, output_controls, hit_reduce, control_reduce = \
        args.input_hits, args.input_controls, args.output_hits, args.output_controls, \
        args.hit_reduce, args.control_reduce

    hits = rw.read_pos(input_hits)
    controls = rw.read_pos(input_controls)

    # if you need to reduce the hit and control position dictionary sizes by a specified proportion
    if hit_reduce > 0:
        hits = reduce_dict(hits, hit_reduce)
        controls = reduce_dict(controls, control_reduce)
        rw.write_pos(hits, output_hits)
        rw.write_pos(controls, output_controls)
    else:
        with open(output_hits, "w") as hits_o, open(output_controls, "w") as controls_o:
            for gene in hits:
                hit_length = len(hits[gene])
                combined = hits[gene] + controls[gene]
                # the keyword is `size`; the previous `reduce_dictsize` is not an
                # argument of numpy.random.choice and raised a TypeError
                current_hits_o = sorted(
                    np.random.choice(combined, size=hit_length, replace=False))
                # set lookup instead of scanning the hit list for every position
                chosen = set(current_hits_o)
                current_controls_o = sorted(
                    [i for i in combined if i not in chosen])
                hits_o.write("{0}\t{1}\n".format(
                    gene, ",".join([str(i) for i in current_hits_o])))
                controls_o.write("{0}\t{1}\n".format(
                    gene, ",".join([str(i) for i in current_controls_o])))
def main():
    """Pick out the multi-exon genes from a dataset and generate families.

    Creates a ``<dataset>_multiexon`` dataset containing only the entries
    with more than one exon, writes a FASTA restricted to those entries
    (``<fasta minus 6-char extension>_multiexon.fasta``) and runs
    ``conservation.find_families`` on that FASTA.
    """
    description = "Pick out the multi-exon genes from a dataset and generate families."
    args = parse_arguments(description, ["features_file", "genome", "dataset", "fasta"])
    features_file, genome, dataset, fasta = args.features_file, args.genome, args.dataset, args.fasta

    # set up global feature set and get relevant sequence features from it
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    exons = fs.get_exons()
    exon_numbers = fs.get_exon_numbers(exons)
    output_fasta_name = "{0}_multiexon.fasta".format(fasta[:-6])

    # get multi-exon genes
    multi_exon = [i for i in exon_numbers if exon_numbers[i] > 1]

    # create a new feature set for multi-exon genes only
    multiexon_dataset = "{0}_multiexon".format(dataset)
    fs_new = Feature_Set(features_file, genome)
    fs_new.create_dataset(multiexon_dataset, input_list = multi_exon)
    fs_new.set_dataset(multiexon_dataset)

    # also write a fasta with the ORF sequences; membership is tested against a
    # set so that filtering is linear in the number of FASTA records
    names, seqs = rw.read_fasta(fasta)
    multi_exon_lookup = set(multi_exon)
    kept = [(name, seq) for name, seq in zip(names, seqs) if name in multi_exon_lookup]
    names = [name for name, _ in kept]
    seqs = [seq for _, seq in kept]
    rw.write_to_fasta(names, seqs, output_fasta_name)

    # find paralogous families
    # NOTE(review): the two results below are not used afterwards; the calls are
    # kept in case they have side effects on fs_new -- confirm before removing
    transcripts = fs_new.get_transcripts()
    gene_name_dict = fs_new.get_gene_name_dict(transcripts)
    conservation.find_families(output_fasta_name, "general/{0}_multiexon".format(dataset))
def main():
    """Run mDFEest with shuffled input to check the false positive rate.

    For each of ``n_sim`` simulations the hits and controls are shuffled by
    ``shuffle_hits_and_controls.py``, an input file is built by
    ``mDFEest_input.py`` and ``mDFEest`` is run on it. One line per
    simulation is written to ``output_file``: the simulation index and the
    ``Nes_0.0_0.1`` / ``Nes_0.1_1.0`` estimates, followed (with
    ``const_pop``) by the same three fields for a run with
    ``pop_change=False``. All fields are tab-separated.
    """
    description = "Run mDFEest with shuffled input to check the false positive rate."
    args = parse_arguments(description, [
        "hits_file", "controls_file", "output_file", "n_sim", "SNP_file",
        "SNP_number", "hit_reduce", "control_reduce", "const_pop"
    ], ints=[3, 5], floats=[6, 7], flags=[8])
    hits_file, controls_file, output_file, n_sim, SNP_file, SNP_number, hit_reduce, control_reduce, const_pop = \
        args.hits_file, args.controls_file, args.output_file, args.n_sim, args.SNP_file, \
        args.SNP_number, args.hit_reduce, args.control_reduce, args.const_pop

    with open(output_file, "w") as file:
        for sim in range(n_sim):
            print(sim)
            # random suffixes keep the temporary files of different runs apart
            temp_hits_file = "temp_data/hits_file{0}.txt".format(random.random())
            temp_controls_file = "temp_data/controls_file{0}.txt".format(random.random())
            temp_input_file = "temp_data/input_file{0}.txt".format(random.random())

            # shuffle hits and controls for negative control
            run_process([
                "python3", "shuffle_hits_and_controls.py", hits_file,
                controls_file, temp_hits_file, temp_controls_file, hit_reduce,
                control_reduce
            ])
            # generate multiDFEest input file
            run_process([
                "python3", "mDFEest_input.py", temp_hits_file,
                temp_controls_file, SNP_file, SNP_number, temp_input_file
            ])

            output = mDFEest("beta", temp_input_file, pop_change=True)
            print(output)
            print(output["Nes_0.0_0.1"])
            print(output["Nes_0.1_1.0"])
            file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                              output["Nes_0.1_1.0"]))

            # if you also want to run with fixed population size
            if const_pop:
                output = mDFEest("beta", temp_input_file, pop_change=False)
                # leading tab: without it the last value of the first record and
                # the simulation index of this one were fused into one field
                file.write("\t{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                                    output["Nes_0.1_1.0"]))
            file.write("\n")

            remove_file(temp_hits_file)
            remove_file(temp_controls_file)
            remove_file(temp_input_file)
def main():
    """Filter an osc file to the wanted samples and write it out as a BED file.

    Header lines (starting with ``#``) are scanned for the wanted sample
    descriptions to collect sample IDs; the ``00Anno`` column-header line is
    used to find the column indices of those samples; every data line
    (starting with ``chr``) is written as a six-column BED record whose name
    field is the ``|``-joined data values. Without ``filter_samples`` all
    columns after the coordinates are kept.
    """
    description = "Filter an osc file to only contain the samples that you want and format it as a bed file so that you could lift over the coordinates."
    arguments = ["input_file_name", "output_file_name", "filter_samples"]
    args = parse_arguments(description, arguments, flags = [2])
    input_file_name, output_file_name, filter_samples = [args.input_file_name, args.output_file_name, args.filter_samples]

    # this is all the pooled ones except for all the brain subregion ones which I removed
    # because otherwise like 11/40 would have been brain tissues. I left in the retina though.
    ones_I_want = ['of adipose tissue, adult, pool1', 'of adrenal gland, adult, pool1', 'of aorta, adult, pool1',
                   'of bladder, adult, pool1', 'of blood, adult, pool1', 'of brain, adult, pool1',
                   'of cervix, adult, pool1', 'of colon, adult, pool1', 'of esophagus, adult, pool1',
                   'of heart, adult, pool1', 'of kidney, adult, pool1', 'of liver, adult, pool1',
                   'of lung, adult, pool1', 'of ovary, adult, pool1', 'of placenta, adult, pool1',
                   'of prostate, adult, pool1', 'of retina, adult, pool1', 'of salivary gland, adult, pool1',
                   'of skeletal muscle, adult, pool1', 'of small intestine, adult, pool1',
                   'of smooth muscle, adult, pool1', 'of spleen, adult, pool1', 'of testis, adult, pool1',
                   'of thymus, adult, pool1', 'of thyroid, adult, pool1', 'of tonsil, adult, pool1',
                   'of trachea, adult, pool1', 'of uterus, adult, pool1']

    # raw string: the pattern is unchanged, but "\d", "\.", "\-" and "\w" are
    # invalid escapes in a normal string literal; compiled once, outside the loop
    sample_ID_regex = re.compile(r"CNhs[\d\.\-\w]*")

    IDs = []
    indices = []
    full_IDs = []
    counter = 0
    with open(input_file_name) as file, open(output_file_name, "w") as output_file:
        for line in file:
            # progress report every 1000 lines
            counter = counter + 1
            if counter % 1000 == 0:
                print(counter)
            if line[0] == "#":
                if filter_samples:
                    # cheap pre-check before comparing against every wanted sample
                    if "adult, pool1" in line:
                        for search in ones_I_want:
                            if search in line:
                                ID = sample_ID_regex.findall(line)[0]
                                IDs.append(ID)
            elif line[:6] == "00Anno":
                if filter_samples:
                    # record which columns belong to the wanted samples
                    line = line.split("\t")
                    for pos, elem in enumerate(line):
                        for ID in IDs:
                            if ID in elem:
                                indices.append(pos)
                                full_IDs.append(elem)
            elif line[:3] == "chr":
                # I'm going to pretend that the actual data bit is just
                # the name of the bed record so it would survive the CrossMapping
                line = line.split("\t")
                coords = line[0]
                line[-1] = line[-1].rstrip("\n")
                if filter_samples:
                    line = [line[i] for i in indices]
                else:
                    line = line[1:]
                # the coordinate field is parsed as chrom:start..end,strand
                coords = coords.split("..")
                chrom = coords[0].split(":")[0]
                start = coords[0].split(":")[1]
                end = coords[1].split(",")[0]
                strand = coords[1].split(",")[1]
                name = "|".join(line)
                output_line = [chrom, start, end, name, ".", strand]
                output_file.write("\t".join(output_line))
                output_file.write("\n")
def main():
    """Calculate the conservation of a set of motifs separately for each dinucleotide.

    Without ``flanks`` the analysis uses one randomly picked member of every
    paralogous family; with ``flanks`` the ``fasta_file_name`` argument is
    treated as a prefix to which ``.bed`` and ``.fasta`` are appended and the
    regions are mapped to CDS coordinates. The per-dinucleotide rates and
    frequencies are written to ``output_file_name`` and the weighted overall
    estimate is printed.
    """
    description = "Calculate the conservation of a set of motifs separately for each dinucleotide."
    args = parse_arguments(description, ["features_file_name", "dataset_name", "genome", "RBP_file_name", "correspondances_file_name", "fasta_file_name", "families_file_name", "output_file_name", "alignment_folder_name", "flanks"], flags = [9])
    features_file_name = args.features_file_name
    dataset_name = args.dataset_name
    genome = args.genome
    RBP_file_name = args.RBP_file_name
    correspondances_file_name = args.correspondances_file_name
    fasta_file_name = args.fasta_file_name
    families_file_name = args.families_file_name
    output_file_name = args.output_file_name
    alignment_folder_name = args.alignment_folder_name
    flanks = args.flanks

    # prepare an object for storing the genome annotations associated to the sequences in the sequence file
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    # make a dictionary with RBPs as keys and lists of associated motifs as values
    motif_dict = rw.read_motifs(RBP_file_name)

    transcripts = fs.get_transcripts()
    gene_name_dict = fs.get_gene_name_dict(transcripts)

    # if working with full CDSs
    if not flanks:
        # pick a random member from each paralogous family
        families = rw.read_families(families_file_name)
        families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        picked_trans = fs.pick_random_members()
        # index the gene_name_dict keys by the identifier stored at [0][4] so the
        # picked members are looked up directly instead of rescanning the whole
        # dictionary for each of them; the resulting order is unchanged
        names_by_trans = {}
        for name in gene_name_dict:
            names_by_trans.setdefault(gene_name_dict[name][0][4], []).append(name)
        picked = []
        for trans in picked_trans:
            picked.extend(names_by_trans.get(trans, []))
        print(len(picked))
        map_from_regions = None
    # if working with exon subregions
    # my exon subregions file already has regions from only one transcript per paralogous family
    else:
        picked = None
        CDS = fs.get_CDS()
        bed_file_name = "{}.bed".format(fasta_file_name)
        fasta_file_name = "{0}.fasta".format(fasta_file_name)
        map_from_regions = conservation.map_regions_to_CDS(fasta_file_name, bed_file_name, fs, gene_name_dict, CDS)

    # generate all possible DNA dinucleotides
    dinucl = nc.generate_all_kmers(2)

    motifs = flatten(list(motif_dict.values()))

    with open(output_file_name, "w") as file:
        file.write("dinucleotide\tmotif rate\tmotif frequency\tnonmotif rate\tnonmotif frequency\n")
        # calculate the rate of evolution within vs outside of motifs separately for each dinucleotide
        freqs_dict = conservation.cons_by_dinucl(fasta_file_name, motifs, correspondances_file_name, alignment_folder_name, dinucl, picked = picked, map_from_regions = map_from_regions)
        for dint in sorted(freqs_dict.keys()):
            current = freqs_dict[dint]
            # dinucleotides lacking either substitution estimate are left out of the file
            if (current["subst. in motifs"] is not None) and (current["subst. in non-motifs"] is not None):
                to_write = [dint, current["subst. in motifs"], current["frequency in motifs"], current["subst. in non-motifs"], current["frequency in non-motifs"]]
                to_write = "\t".join([str(i) for i in to_write])
                file.write(to_write)
                file.write("\n")

    # get an over-all estimate by taking a weighted average (weighted by dinucleotide frequency)
    # of the frequencies of all the different dinucleotides
    output_dict = conservation.weight_cons_by_dinucl(freqs_dict, dinucl)
    print(output_dict)
def main():
    """Remove putative intron lariat and splice intermediate reads from a BED file.

    Reads whose 3' end coincides with the last position of an intron (intron
    lariat, IL) or of an exon (splice intermediate, SI) are collected into
    one file, and the reads file is then intersected against it in exclude
    mode to produce ``outfile``. All intermediate files are named after the
    reads file (minus its 4-character extension) and are removed once used.
    """
    # the trailing space after "or" was missing, so the help text read "orthe"
    description = "Given a BED file of reads, filter out reads whose " \
                  "3' end maps to the last nucleotide of an intron or " \
                  "the last nucleotide of an exon."
    args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"])
    reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile
    prefix = reads_file[:-4]

    print("Getting intron lariat positions...")

    # read in exon coordinates
    exons = rw.read_gtf(gtf, element="exon", gene=False)
    # make a BED file with the last positions of introns
    intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(prefix)
    co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True)
    # intersect the reads with intron lariat positions
    intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(prefix)
    co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True,
                     no_dups=False, output_file=intron_lariat_intersect_file_name)
    hk.remove_file(intron_lariat_bed)
    intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(prefix)
    # check that the reads end exactly at intron lariat positions
    check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file)
    hk.remove_file(intron_lariat_intersect_file_name)

    # write BED with the last positions of exons
    splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(prefix)
    co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True)

    print("Getting splice intermediate positions.")

    # intersect the reads with splice intermediate positions
    splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(prefix)
    co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True,
                     no_dups=False, output_file=splice_intermediate_intersect_file_name)
    hk.remove_file(splice_intermediate_bed)
    SI_reads_file = "{0}_SI_reads_all_exons.bed".format(prefix)
    # check that the reads end exactly at the end of the exon
    check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file)
    hk.remove_file(splice_intermediate_intersect_file_name)

    print("Concatenating the two files.")

    # concatenate the IL and SI read files so you could exclude both in one go
    combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(prefix)
    hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file)
    hk.remove_file(SI_reads_file)
    hk.remove_file(intron_lariat_reads_file)

    # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the
    # putative intron lariat reads from the main reads file
    co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True,
                     no_dups=False, exclude=True, output_file=outfile)
    hk.remove_file(combined_file)
def main():
    """Check if nucleotide composition at the 5' ends of NET-seq reads is biased.

    Converts the input to BED when its name does not end in "bed", extends
    every read, extracts the sequences with ``fastaFromBed`` and builds one
    PPM for the window at 0-10 and one for the control window at 30-40 of
    the extended reads. The PPMs are written to ``<output_file>.true`` and
    ``<output_file>.control``.
    """
    # Get arguments.
    description = "Check if nucleotide composition at the 5' ends of NET-seq reads is biased."
    arg_names = ["input_file", "output_file", "genome_fasta", "gtf", "three_prime"]
    args = hk.parse_arguments(description, arg_names, flags=[4])
    input_file = args.input_file
    output_file = args.output_file
    genome_fasta = args.genome_fasta
    gtf = args.gtf
    three_prime = args.three_prime

    # Convert to .bed, if not already .bed
    if not input_file.endswith("bed"):
        print("Converting input file to .bed...")
        bed_name = "{0}bed".format(input_file[:-3])
        hk.convert2bed(input_file, bed_name)
        input_file = bed_name

    # Make an extended version of each read that extends 5 nt 5prime and 35 nt 3prime
    print("Extending the reads...")
    suffix = "_three_prime" if three_prime else ""
    temp_bed = "{0}_extended_for_bias{1}.bed".format(input_file[:-4], suffix)
    co.extend_intervals(input_file, temp_bed, 5, 35, remove_chr=True, add_chr=False,
                        three_prime=three_prime)

    # Make a FASTA file from the BED file.
    print("Extracting sequences...")
    fasta_name = "{0}fasta".format(temp_bed[:-3])
    fasta_command = ["fastaFromBed", "-bed", temp_bed, "-fi", genome_fasta,
                     "-fo", fasta_name, "-s"]
    hk.run_process(fasta_command)
    print("Number of lines in FASTA:")
    line_count = hk.run_process(["wc", "-l", fasta_name])
    print(line_count)

    # Store the sequences at -5:+5 and 30:40 in a 2D array
    print("Storing sequences in arrays...")
    true_window = (0, 10)
    control_window = (30, 40)
    occ_mat_true, occ_mat_control = extract_true_and_control_string(
        fasta_name, true_window, control_window)

    # Make a PPM for either column
    bases = ["A", "T", "C", "G"]
    print("Making PPMs...\n")
    print("TRUE:")
    PPM_wrapper(occ_mat_true, bases, "{0}.true".format(output_file))
    print("CONTROL:")
    PPM_wrapper(occ_mat_control, bases, "{0}.control".format(output_file))
def main():
    """Write the median motif lengths of a series of motif sets to file.

    Every FASTA record is one motif set: its sequence line holds the motifs
    separated by ``|``. For each record a ``name<TAB>median length`` line is
    written to ``output_file`` and the median is also printed.
    """
    description = "Write the median motif lengths of a series of motif sets to file."
    args = parse_arguments(description, ["input_file", "output_file"])
    input_file = args.input_file
    output_file = args.output_file

    # parse motifs from FASTA
    names, motif_strings = rw.read_fasta(input_file)
    motif_lengths = []
    for motif_string in motif_strings:
        motif_lengths.append([len(motif) for motif in motif_string.split("|")])

    # write down and print out motif lengths
    with open(output_file, "w") as file:
        for name, lengths_list in zip(names, motif_lengths):
            median_length = np.median(lengths_list)
            file.write("{0}\t{1}\n".format(name, median_length))
            print(median_length)
def main():
    """Make a file with intron lariat read counts per exon.

    Converts the intron lariat reads to a ``_snr.bed`` file with
    ``co.snr_bed`` and intersects the regions file with it in hit-count
    mode, writing ``<regions_file minus 4-char extension>_il_counts.bed``.
    """
    description = "Make a file with intron lariat read counts per exon."
    args = hk.parse_arguments(description, ["intron_lariat_file", "regions_file"])
    intron_lariat_file = args.intron_lariat_file
    regions_file = args.regions_file

    # the intron_lariat_file contains only those reads whose
    # 3' ends map to the last position of an intron
    snr_name = "{0}_snr.bed".format(intron_lariat_file[:-4])
    co.snr_bed(intron_lariat_file, snr_name)

    counts_file = "{0}_il_counts.bed".format(regions_file[:-4])
    co.intersect_bed(regions_file, snr_name, force_strand=True, hit_count=True,
                     no_dups=False, output_file=counts_file)
def main():
    """Convert hit positions within exonic subregions to full CDS indices.

    Loads the feature set and the positions file, delegates the conversion
    to ``convert_region_to_CDS_func`` and writes the converted positions to
    ``output_file``.
    """
    description = "Take a positions file with hits within exonic subregions and convert them to full CDS indices."
    arg_names = ["positions_file", "bed_file", "genome", "features_file", "dataset", "output_file"]
    args = parse_arguments(description, arg_names)
    positions_file = args.positions_file
    bed_file = args.bed_file
    genome = args.genome
    features_file = args.features_file
    dataset = args.dataset
    output_file = args.output_file

    # set up data
    fs = setup(features_file, genome, dataset)
    CDSs = fs.get_CDS()
    pos_dict = rw.read_pos(positions_file)

    # do actual work
    converted_pos = convert_region_to_CDS_func(pos_dict, bed_file, CDSs)

    # write output to file
    rw.write_pos(converted_pos, output_file)
def main():
    """Extract exon end/core regions from feature set.

    One random member is picked from every family and, for those members,
    either only the exon beginnings (``start_only``) or the exon cores and
    both flanks are extracted and written to FASTA files prefixed with
    ``general/<dataset>``.
    """
    description = "Extract exon end/core regions from feature set."
    args = parse_arguments(
        description,
        ["genome", "features_file", "families_file", "dataset", "start_only"],
        flags=[4])
    genome, features_file, families_file, dataset, start_only = \
        args.genome, args.features_file, args.families_file, args.dataset, args.start_only
    # The parsed arguments used to be overwritten here by hard-coded values
    # (genome "hg38", features file "general/Homo_sapiens.GRCh38.85.gtf",
    # families file "general/filtered_hg38_85_pc_multiexon_families.txt",
    # dataset "filtered_hg38_85_pc_multiexon"), so the command line was
    # ignored. Pass those values explicitly to reproduce the old behaviour.

    # prepare feature set, get necessary genomic features
    fs = setup(features_file, genome, dataset, families_file=families_file)
    exons = fs.get_exons()
    CDS = fs.get_CDS()

    # pick a random member from each family
    picked = fs.pick_random_members()
    exons = {i: exons[i] for i in picked}
    CDS = {i: CDS[i] for i in picked}

    file_prefix = "general/{0}".format(dataset)
    if start_only:
        # only the 5' end
        fs.get_exon_beginnings(exons, CDS, file_prefix=file_prefix, write_to_fasta=True)
    else:
        # both flanks and the core
        fs.get_exon_cores_and_flanks(exons, CDS, file_prefix=file_prefix, write_to_fasta=True)
def main():
    """Construct a site frequency spectrum that only considers motif-disrupting SNPs.

    The hits file name is derived from ``control_file`` by replacing
    "controls" with "hits". In a first pass over the sequences the real SNPs
    are filtered down to the disruptive ones (``check_disruption``) and the
    counts of potential and disruptive substitutions are accumulated. In a
    second pass SNPs at control positions are kept only if their minor
    allele is among the bases returned by ``get_disrupt_bases``. The output
    file holds ``N``, the hit SFS and the control SFS on three lines.
    """
    description = "Construct a site frequency spectrum that only considers motif-disrupting SNPs."
    args = parse_arguments(description, ["fasta", "output_file", "motif_file", "anc_file", "control_file", "SNPs_file", "N", "old_motif_format", "human", "ancestral"], ints = [6], flags = [7, 8, 9])
    fasta, output_file, motif_file, anc_file, control_file, SNPs_file, N, old_motif_format, human, ancestral = \
        args.fasta, args.output_file, args.motif_file, args.anc_file, args.control_file, \
        args.SNPs_file, args.N, args.old_motif_format, args.human, args.ancestral

    names, seqs = rw.read_fasta(fasta)
    # name -> sequence lookup (first occurrence wins, like names.index did), so that
    # fetching a sequence no longer scans the whole names list for every transcript
    seq_by_name = {}
    for name, seq in zip(names, seqs):
        seq_by_name.setdefault(name, seq)

    # I use two different formats for storing sequence motifs,
    # got to know which one it is
    if old_motif_format:
        motifs = rw.read_names(motif_file)[1:]
        print(len(motifs))
    else:
        motifs = rw.read_motifs(motif_file)
        motifs = sorted(list(set(flatten(list(motifs.values())))))

    # get the lengths of the motifs and compile lookahead regexes
    # that recognize the whole motif but only store the position of the first bases
    # these will be needed when searching for the motifs
    motif_lengths = [len(i) for i in motifs]
    motif_regex = nc.motif_to_regex(motifs)

    # I'm gonna treat CG and GC as two 2-bp motifs, use the same code as when searching for, say,
    # ESE motifs
    CG_2mers = ["CG", "GC"]
    CG_lengths = [2, 2]
    CG_regex = nc.motif_to_regex(CG_2mers)

    motifs = [list(i) for i in motifs]

    if ancestral:
        anc_pos = rw.read_pos(anc_file)

    # read in hit and control positions
    controls = rw.read_pos(control_file)
    hit_file = re.sub("controls", "hits", control_file)
    hits = rw.read_pos(hit_file)

    # read in SNP data
    SNPs = rw.read_many_fields(SNPs_file, "\t")
    # the field at index 2 of the SNPs file contains positions that need to be discarded
    # from analysis because they contain unanalyzable SNP data
    to_remove = list_to_dict(SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    to_remove = {i: [int(j) for j in to_remove[i] if j not in ["error", ""]] for i in to_remove}
    SNPs = list_to_dict(SNPs, 0, 1)

    # all the SNPs associated to a transcript
    full_SNPs = {}
    # disruptive SNPs only
    clean_SNPs = {}
    minor_alleles = {}

    # the number of hit positions where, say, a T could theoretically substitute to an A (i.e. all T positions)
    transitions_total = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}
    # the same as above but only counting those substitutions that would turn a motif into a non-motif
    transitions_disr = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}

    # this block of code filters the true SNPs to only leave those that are disruptive
    # and also calculates the probability of being disruptive for all potential SNPs
    with open("{0}_degen.txt".format(hit_file), "w") as hit_degen_file:
        counter = 0
        for trans in names:
            counter = update_counter(counter, 1000)
            if trans in controls:
                if trans in SNPs:
                    trans_SNPs = SNPs[trans]
                else:
                    trans_SNPs = []
                trans_SNPs, clean_SNPs, full_SNPs, minor_alleles = parse_SNPs(trans_SNPs, clean_SNPs, full_SNPs, minor_alleles, trans)
                current_seq = seq_by_name[trans]
                fourfold_pos = nc.get_4fold_deg(current_seq)
                # CpG filtering
                if human:
                    CG_pos = nc.get_motif_set_density(CG_regex, CG_lengths, current_seq, concat = True)["positions"]
                    fourfold_pos = [i for i in fourfold_pos if i not in CG_pos]
                if ancestral:
                    fourfold_pos = [i for i in fourfold_pos if i not in anc_pos[trans]]
                all_sites, clean_SNPs, transitions_total, transitions_disr, hit_degen_file = check_disruption(motif_regex, current_seq, motifs, motif_lengths, fourfold_pos, full_SNPs, clean_SNPs, minor_alleles, trans, transitions_total, transitions_disr, hit_degen_file, to_remove)
                hit_degen_file.write("\n")

    # positions that carry a parsed SNP are taken off the removal list; only transcripts
    # that have control positions are kept
    to_remove = {i: [j for j in to_remove[i] if j not in full_SNPs[i]] for i in to_remove if i in controls}

    hit_SFS = get_SFS(hits, clean_SNPs, to_remove, N)

    transitions = get_transitions(transitions_disr, transitions_total)
    print(transitions)

    # this block randomly assigns certain SNPs at simulant positions to be disruptive,
    # with the probability of that happening proportional to the frequency with which potential substitutions
    # of that nucleotide composition would be disruptive for true (motif) sites
    with open("{0}_degen.txt".format(control_file), "w") as control_degen_file:
        control_SNPs = {}
        counter = 0
        for trans in controls:
            control_degen_file.write("{0}\t".format(trans))
            counter = update_counter(counter, 1000)
            control_SNPs[trans] = {}
            trans_SNPs = full_SNPs[trans]
            current_seq = seq_by_name[trans]
            for site in controls[trans]:
                if trans not in to_remove or site not in to_remove[trans]:
                    ref_allele = current_seq[site]
                    disrupt_bases = get_disrupt_bases(ref_allele, transitions)
                    control_degen_file.write("{0}:{1},".format(site, "|".join(disrupt_bases)))
                    if site in trans_SNPs:
                        minor_allele = minor_alleles[trans][site]
                        if minor_allele in disrupt_bases:
                            control_SNPs[trans][site] = trans_SNPs[site]
            control_degen_file.write("\n")

    control_SFS = get_SFS(controls, control_SNPs, to_remove, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")
def main():
    """Generate a NET-seq control set matched to the true reads in local sequence.

    The transcripts of transcriptionally active genes are written to BED
    (once as they are, once extended by 3 nt on either side), the PolII reads
    are filtered to those transcripts, 6-mers are mapped to transcript
    positions and ``pick_random_positions`` writes the control positions to
    ``outfile``. Finally a single-nucleotide-resolution file is made from
    ``outfile`` and intersected, in exclude mode, with a fixed exon-ends
    file.
    """
    # the trailing space after "nucleotides" was missing ("nucleotidesas")
    description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides " \
                  "as the true set."
    args = hk.parse_arguments(description, [
        "active_genes_file", "gtf", "PolII_file", "fasta", "outfile",
        "chrom_sizes"
    ])
    active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = \
        args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes

    chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t")
    chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True)

    # get transcriptionally active genes and make a BED file with their coordinates
    print("Getting the coordinates of transcriptionally active genes...")
    # first row is skipped; field 3 is the identifier matched against field 3 of the
    # transcripts BED. A set is used because it is only queried for membership, once per line.
    trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:]
    trans_active_genes = {i[3] for i in trans_active_genes}
    transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)

    transcripts_dict = {}

    # this will be used for getting the k-mers in the transcripts
    filtered_transcripts_file_plus2 = "{0}_trans_act_only_plus3.bed".format(
        transcripts_file[:-4])
    # this will be used for filtering the reads
    filtered_transcripts_file = "{0}_trans_act_only.bed".format(
        transcripts_file[:-4])

    with open(filtered_transcripts_file, "w") as ft_file, \
            open(transcripts_file) as t_file, \
            open(filtered_transcripts_file_plus2, "w") as ft_file2:
        reader = csv.reader(t_file, delimiter="\t")
        writer = csv.writer(ft_file, delimiter="\t")
        writer2 = csv.writer(ft_file2, delimiter="\t")
        for line in reader:
            if line[3] in trans_active_genes:
                writer.writerow(line)
                # this is because if a read falls at the first position, you will need to know the
                # preceding two bases. Same if it falls at the last position.
                line[1] = str((int(line[1])) - 3)
                line[2] = str((int(line[2])) + 3)
                writer2.writerow(line)
                # note that it is the extended record that is stored
                transcripts_dict[line[3]] = line

    print("Filtering reads to the transcripts...")
    # filter reads to only ones that overlap these transcripts
    transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4])
    co.intersect_bed(PolII_file, filtered_transcripts_file, force_strand=True,
                     output_file=transcripts_PolII)

    print("Extracting FASTA from the transcript coordinates...")
    # the genome FASTA is formatted as N rather than chrN
    filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format(
        transcripts_file[:-4])
    hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus2],
                   file_for_output=filtered_transcripts_file_no_chr)
    filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format(
        transcripts_file[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed",
        filtered_transcripts_file_no_chr, "-fo",
        filtered_transcripts_fasta_no_chr, "-s", "-name"
    ])

    print("Mapping kmers to transcript positions...")
    kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr, k=6, focal_pos=3)

    print("Extracting the starting dinucleotide for each read...")
    starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format(
        PolII_file[:-4])
    starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format(
        PolII_file[:-4])
    co.extend_intervals(transcripts_PolII, starting_dints_PolII, 3, 3, remove_chr=True)
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII,
        "-fo", starting_dints_PolII_fasta, "-s"
    ])

    print("Picking random control positions...")
    pick_random_positions(transcripts_PolII, starting_dints_PolII_fasta, outfile,
                          kmer_dict, transcripts_dict, chrom_sizes=chrom_sizes)

    print("Making single nucleotide resolution file...")
    snr_file = "{0}_snr.bed".format(outfile[:-4])
    co.snr_bed(outfile, snr_file)

    print(
        "Removing reads that overlap potential splice intermediate positions..."
    )
    no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4])
    # the output name was computed but never handed to intersect_bed, so the
    # filtered result was not written to it
    # NOTE(review): the exon-ends file is a fixed dm6 path -- confirm that is intended
    co.intersect_bed(snr_file,
                     "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf",
                     force_strand=True, exclude=True, no_dups=False,
                     output_file=no_si_snr_file)
def _add_SNPs_from_file(SNPs_file, controls, SNPs, to_remove_all):
    """Merge the SNP data of one tab-separated SNPs file into the running dictionaries.

    Only transcripts present in ``controls`` are considered. ``SNPs`` receives
    a position -> allele count dictionary per transcript (empty when the
    transcript has no SNPs) and ``to_remove_all`` receives the list of
    positions to discard. Both dictionaries are modified in place.
    """
    current_SNPs = rw.read_many_fields(SNPs_file, "\t")
    # field 2: comma-separated positions to discard
    to_remove = list_to_dict(current_SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    # field 1: "|"-separated SNPs, each a comma-separated record
    current_SNPs = list_to_dict(current_SNPs, 0, 1)
    for trans in current_SNPs:
        if trans in controls:
            SNPs[trans] = {}
            trans_SNPs = current_SNPs[trans]
            if trans_SNPs:
                trans_SNPs = [i.split(",") for i in trans_SNPs.split("|")]
                # this is where you get the allele count
                trans_SNPs = list_to_dict(trans_SNPs, 0, 3)
                trans_SNPs = {int(i): int(trans_SNPs[i]) for i in trans_SNPs}
                SNPs[trans] = trans_SNPs
            to_remove_all[trans] = [
                int(i) for i in to_remove[trans] if i not in ["error", ""]
            ]


def main():
    """Prepare input file for running MultiDFEest.

    Reads hit and control positions (optionally shuffling them), collects the
    SNP allele counts and the positions to discard either from one combined
    file or from per-chromosome files ``<prefix>1.bed`` ... ``<prefix>22.bed``
    and writes ``N``, the hit SFS and the control SFS on three lines.
    """
    description = "Prepare input file for running MultiDFEest."
    args = parse_arguments(description, [
        "hit_file", "control_file", "SNPs_file_prefix", "N", "output_file",
        "per_chrom_files", "shuffle"
    ], ints=[3], flags=[5, 6])
    hit_file, control_file, SNPs_file_prefix, N, output_file, per_chrom_files, shuffle = \
        args.hit_file, args.control_file, args.SNPs_file_prefix, args.N, args.output_file, \
        args.per_chrom_files, args.shuffle

    hits = parse_pos(hit_file)
    controls = parse_pos(control_file)

    if shuffle:
        hits, controls = shuffle_dictionaries(hits, controls)

    SNPs = {}
    to_remove_all = {}

    # if the data is stored chromosome by chromosome, rather than all combined
    if per_chrom_files:
        for chrom in range(1, 23):
            SNPs_file = "{0}{1}.bed".format(SNPs_file_prefix, str(chrom))
            try:
                _add_SNPs_from_file(SNPs_file, controls, SNPs, to_remove_all)
            except FileNotFoundError:
                # chromosomes without a SNPs file are deliberately skipped
                pass
    else:
        _add_SNPs_from_file(SNPs_file_prefix, controls, SNPs, to_remove_all)

    hit_SFS = get_SFS(hits, SNPs, to_remove_all, N)
    control_SFS = get_SFS(controls, SNPs, to_remove_all, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")
def main():
    """Calculate dS within a pooled set of motifs and compare it to simulants.

    The motif sets read from motifs_file_name are optionally filtered by the
    value stored for each RBP in column ND_column of the summary file (and by
    the information content validity file), pooled into one list of unique
    motifs, and passed to conservation.dS_from_hits together with n_sim sets
    of simulants. A one-row CSV summary is written to output_file_name.
    """
    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "correspondances_file_name", "alignment_folder_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "output_suffix", "validity_folder_name", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "gene_families", "newer_filters", "baseml"], ints = [7, 12], flags = [15, 16, 17, 18, 19, 20, 21, 22])
    [motifs_file_name, summary_file_name, dataset_name, correspondances_file_name, alignment_folder_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, output_suffix, validity_folder_name, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, gene_families, newer_filters, baseml] = [args.motifs_file_name, args.summary_file_name, args.dataset_name, args.correspondances_file_name, args.alignment_folder_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.output_suffix, args.validity_folder_name, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.gene_families, args.newer_filters, args.baseml]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]
        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)

    #make a dictionary with RBPs as keys and lists of associated motifs as values
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    #(the filtered result must stay a dictionary because .values() is called on it below)
    if not full_set:
        #which RBPs fulfill the necessary information content criteria?
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(validity_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
        #motifs with negative ND
        if negative_ND:
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0) and (validity[RBP] == "True")}
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) and (validity[RBP] == "True")}
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] > 0.9) and (validity[RBP] == "True")}
        #motifs with positive ND
        else:
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] >= 0) and (validity[RBP] == "True")}

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    make_dir(output_folder_name)

    #prepare a Feature_Set object (a genome gtf associated to a particular genome and to a set of transcript identifiers)
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        transcripts = fs.get_transcripts()
        #the result is not used below; the call is kept in case it has side effects on fs
        CDS = fs.get_CDS()
        #paralogous families
        families = rw.read_families(families_file_name)
        #the families file might use gene identifiers, whereas the Feature_Set object uses transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        #pick a random member from each paralogous family
        picked_trans = fs.pick_random_members()
        names = rw.read_fasta(fasta_name)[0]
        #if the fasta does not use the picked identifiers, convert them to gene identifiers
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    if baseml:
        method = "baseml"
    else:
        method = "gy"

    #write the input data for the conservation analysis into a file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_name, input_dict_file_name, picked = picked)

    #make sure the temporary input file is removed even if the analysis fails
    try:
        with open(output_file_name, "w") as file:
            file.write(",".join(["real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
            file.write("\n")
            #make n_sim simulant sets for the motifs, filtering the simulants based on different sets of criteria
            if new_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1)
            elif newer_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1, no_duplicates = True, concat = False)
            else:
                simulants = nc.make_simulants(motifs, n_sim, seed = 100)
            #file where the simulants dS values will be stored
            sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, output_suffix)
            #calculate dS within motifs and simulants
            output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
            print(output_dict)
            print("\n")
            #write to output file
            if output_dict is not None:
                file.write(",".join([str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
            else:
                file.write(",".join([str(None), str(None), str(None), str(None), str(None)]))
    finally:
        os.remove(input_dict_file_name)
def main():
    """Measure conservation at sites one substitution away from a motif.

    For every RBP (or for one pooled set called "all" when by_RBP is not set)
    the script totals, over one picked gene per paralogous family, the site
    number and mutation score returned by get_mutation_to_motif for the true
    motifs and for each of n_sim simulant sets. The real fraction, the mean
    simulated fraction, an effective p and a normalized fraction are written
    to output_file_name as a tab-separated table with one row per protein.
    """
    description = "Calculate the conservation of k-mers that are a single point mutation away from being part of a set of motifs."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "output_folder_name", "p_column", "alignment_folder_name", "correspondances_file_name", "output_file_name", "dataset_name", "features_file_name", "n_sim", "output_suffix", "sequences_file_name", "families_file_name", "genome", "by_RBP"], ints = [3, 9], flags = [14])
    motifs_file_name = args.motifs_file_name
    summary_file_name = args.summary_file_name
    output_folder_name = args.output_folder_name
    p_column = args.p_column
    alignment_folder_name = args.alignment_folder_name
    correspondances_file_name = args.correspondances_file_name
    output_file_name = args.output_file_name
    dataset_name = args.dataset_name
    features_file_name = args.features_file_name
    n_sim = args.n_sim
    output_suffix = args.output_suffix
    sequences_file_name = args.sequences_file_name
    families_file_name = args.families_file_name
    genome = args.genome
    by_RBP = args.by_RBP

    RBPs = rw.read_motifs(motifs_file_name)

    #discard the RBPs that fail the information content criteria
    validity_file = "{0}/sufficient_information_fraction05.csv".format(output_folder_name)
    validity = list_to_dict(rw.read_many_fields(validity_file, "\t"), 0, 1)
    RBPs = {name: RBPs[name] for name in RBPs if validity[name] == "True"}

    #when not working RBP by RBP, pool the motifs of the sets whose summary value exceeds 0.9
    if not by_RBP:
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #a single field per row means the file was comma-separated rather than tab-separated
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
        summary_dict = list_to_dict(summary_data, 0, p_column, floatify = True)
        RBPs = {name: RBPs[name] for name in RBPs if summary_dict[name] > 0.9}
        pooled = list(set(flatten(list(RBPs.values()))))
        RBPs = {"all": pooled}

    #one randomly picked gene per paralogous family
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    transcripts = fs.get_transcripts()
    families = fs.convert_families_to_ENST(rw.read_families(families_file_name), transcripts)
    fs.add_families(families)
    picked_from_families = fs.pick_random_members()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    picked = [fs.convert_between_ENST_and_ENSG(member, gene_name_dict, "ENSG") for member in picked_from_families]

    names, CDS = rw.read_fasta(sequences_file_name)

    #first field of each row -> second field (focal species gene -> ortholog from another species)
    correspondance_dict = {row[0]: row[1] for row in rw.read_many_fields(correspondances_file_name, ",")}

    output_dict = {}

    for protein in sorted(RBPs):
        print(protein)
        motifs = RBPs[protein]
        print("There are {0} motifs.".format(len(motifs)))

        #unique k-mers one base substitution away from a motif that are not themselves in the set
        neighbours = nc.get_neighbours(motifs)
        print("There are {0} neighbours.".format(len(neighbours)))

        #simulants are not allowed to coincide with any of the neighbours
        simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, exclude = neighbours, no_duplicates = True, concat = False)

        neighbour_lengths = [len(neighbour) for neighbour in neighbours]
        neighbours = nc.motif_to_regex(neighbours)

        #true frequency with which fourfold degenerate sites a single substitution away from a motif
        #carry, in the orthologous species, the base that would create the motif
        motifs = [list(motif) for motif in motifs]
        true_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, motifs, neighbours, neighbour_lengths], get_mutation_to_motif)
        true_counts = [job.get() for job in true_result]
        site_number = sum(counts[0] for counts in true_counts)
        mutation_score = sum(counts[1] for counts in true_counts)

        real_fraction = mutation_score/site_number if site_number > 0 else None
        print("Real fraction:")
        print(real_fraction)

        #drop the reference to the compiled neighbours before the simulations start
        neighbours = ""

        sim_site_numbers = np.zeros((n_sim))
        sim_mutation_scores = np.zeros((n_sim))

        #the same estimate for every simulant set; neighbours are regenerated one set at a time
        #because holding all the simulated neighbours in memory at once takes too much RAM
        for sim in range(n_sim):
            if sim%10 == 0:
                print(sim)
            current_simulants = simulants[sim]
            current_neighbours = nc.get_neighbours(current_simulants)
            current_neighbour_lengths = [len(neighbour) for neighbour in current_neighbours]
            current_neighbours = nc.motif_to_regex(current_neighbours)
            current_simulants = [list(simulant) for simulant in current_simulants]
            current_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, current_simulants, current_neighbours, current_neighbour_lengths], get_mutation_to_motif)
            for job in current_result:
                counts = job.get()
                sim_site_numbers[sim] += counts[0]
                sim_mutation_scores[sim] += counts[1]

        #normalize the real fraction, calculate p
        sim_fractions = np.divide(sim_mutation_scores, sim_site_numbers)
        sim_fractions = [fraction for fraction in sim_fractions if fraction != np.inf]
        p = ms.calc_eff_p(real_fraction, sim_fractions, greater = False)
        norm_fraction = ms.normalize(real_fraction, sim_fractions)

        output_dict[protein] = [protein, mutation_score, site_number, real_fraction, np.mean(sim_fractions), p, norm_fraction]
        print(output_dict[protein])

    with open(output_file_name, "w") as output_file:
        #header row
        output_file.write("protein\tmutation score\tsite number\treal fraction\tmean sim fraction\tp\tnormalized fraction\n")
        #one row per protein, in sorted order
        for protein in sorted(output_dict):
            output_file.write("\t".join(str(value) for value in output_dict[protein]))
            output_file.write("\n")
def main():
    """Calculate the combined density of a pooled set of motifs.

    The motif sets read from motifs_file_name are optionally filtered by the
    value stored for each RBP in column ND_column of the summary file, pooled
    into one list of unique motifs and passed, together with n_sim simulant
    sets, to nc.get_sequence_set_density. A single tab-separated record with
    the resulting density statistics is written to output_file_name.
    """
    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "seed", "output_suffix", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "newer_filters", "two_seqs"], ints = [5, 10, 11], flags = [13, 14, 15, 16, 17, 18, 19])
    [motifs_file_name, summary_file_name, dataset_name, output_folder_name, output_file_name, n_sim, features_file_name, genome, families_file_name, fasta_name, ND_column, seed, output_suffix, negative_ND, new_filters, upper_quarter, lower_quarter, full_set, newer_filters, two_seqs] = [args.motifs_file_name, args.summary_file_name, args.dataset_name, args.output_folder_name, args.output_file_name, args.n_sim, args.features_file_name, args.genome, args.families_file_name, args.fasta_name, args.ND_column, args.seed, args.output_suffix, args.negative_ND, args.new_filters, args.upper_quarter, args.lower_quarter, args.full_set, args.newer_filters, args.two_seqs]

    #make a dictionary with RBPs as keys and ND/p values as values.
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]
        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)

    #make a dictionary with RBPs as keys and lists of associated motifs as values
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    #(the filtered result must stay a dictionary because .values() is called on it below)
    if not full_set:
        #motifs with negative ND
        if negative_ND:
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0}
        #the most significantly enriched motifs
        elif upper_quarter:
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0.1}
        #the most significantly depleted motifs
        elif lower_quarter:
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] > 0.9}
        #motifs with positive ND
        else:
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] >= 0}

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))
    print(len(motifs))

    make_dir(output_folder_name)

    #if you want to average over families
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        families = rw.read_families(families_file_name)
        fs.add_families(families)
    else:
        fs = None

    #generate 100 1000 bp long random sequences based on the hg38 mononucleotide composition and use that as your sequence fasta
    if fasta_name == "random":
        names = list(range(100))
        seqs = nc.kmers_from_nc(1000, 100, genome_comp = True)
        fasta_name = "RBP/random_sequences_from_genome_comp.fasta"
        rw.write_to_fasta(names, seqs, fasta_name)

    with open(output_file_name, "w") as output_file:
        #generate n_sim sets of simulant motifs (constraining the space of simulants based on different sets of filters)
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed, concat = False, no_duplicates = True)
        else:
            #must be bound to the same name as in the other branches: it is read right below
            simulants = nc.make_simulants(motifs, n_sim, seed = seed)

        #calculate the density parameters of the motifs in the sequence fasta
        output_dict = nc.get_sequence_set_density(fasta_name, None, motifs, simulants, n_sim, "{0}/overall_density_{1}.csv".format(output_folder_name, output_suffix), "{0}/overall_sim_density_{1}.csv".format(output_folder_name, output_suffix), "{0}/overall_positions.csv_{1}".format(output_folder_name, output_suffix), "{0}/overall_sim_positions_{1}".format(output_folder_name, output_suffix), concat = False, positions = False, feature_set = fs, verbose = True, two_seqs = two_seqs)

        record = [str(output_dict["median density"]), str(np.mean(output_dict["simulated densities"])), str(output_dict["median ND"]), str(output_dict["effective p"]), str(output_dict["Z"]), str(output_dict["depletion p"]), str(len(motifs)), str(output_dict["simulant sd"])]

        #write to output file
        output_file.write("\t".join(record))
        print(record)
def main():
    """Call peaks in a BED file of NET-seq reads.

    For each of `runs` repetitions the script writes raw peaks per transcript
    (write_raw_peaks), merges peaks closer than `merge` nucleotides
    (co.merge_bed) and filters the merged peaks by read count and length
    (filter_peaks). Intermediate file names are derived from the reads file,
    the GTF name and the analysis parameters; final peaks and a stats file are
    written next to output_file, numbered by run.
    """
    description = "Call peaks in a BED file of NET-seq reads."
    help_info = [
        "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read.).",
        "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!",
        "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file.",
        "Name of the output file (BED file with peak coordinates).",
        "Alpha value for calling a position as having a significantly higher local read denisty than expected by chance. Default: 0.01.",
        "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.",
        "Minimum reads per peak. Default: 10.",
        "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, however, they also make the programme very slow. Default: 5.",
        "Minimum length of a peak in nucleotides. Default: 5.",
        "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21",
        "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.",
        "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.",
        "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.",
        "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.",
        "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).",
        "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position).)"
    ]
    defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1}
    args = hk.parse_arguments(description, [
        "reads_file", "gtf", "trans_active_file", "output_file",
        "significance_threshold", "merge", "min_reads_per_peak", "iterations",
        "min_peak_length", "window_size", "runs", "neg_control", "no_slide",
        "exclude_focal", "with_ups_intron", "no_PCR_filter"
    ], floats=[4], ints=[5, 6, 7, 8, 9, 10],
        flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        detailed_help=help_info, defaults=defaults)
    reads_file, gtf, trans_active_file, output_file, significance_threshold, merge, min_reads_per_peak, iterations, min_peak_length, window_size, runs, neg_control, no_slide, exclude_focal, with_ups_intron, no_PCR_filter = args.reads_file, args.gtf, args.trans_active_file, args.output_file, args.significance_threshold, args.merge, args.min_reads_per_peak, args.iterations, args.min_peak_length, args.window_size, args.runs, args.neg_control, args.no_slide, args.exclude_focal, args.with_ups_intron, args.no_PCR_filter

    print("Merge distance: {0}".format(merge))
    print("Minimum number of reads per peak: {0}".format(min_reads_per_peak))
    print("Minimum peak length: {0}".format(min_peak_length))
    print("Window size: {0}".format(window_size))
    print("Significance level: {0}".format(significance_threshold))
    print("Randomization iterations to perform: {0}".format(iterations))
    print("Runs: {0}".format(runs))
    # NOTE(review): significance_threshold is only printed here; it is not
    # passed to write_raw_peaks or filter_peaks below - confirm that the
    # callee's own default is what is intended.

    # suffixes that encode the chosen options in the intermediate file names
    neg_str = ""
    if neg_control:
        neg_str = "_neg_control"
    slide_str = ""
    if no_slide:
        slide_str = "_no_slide"
    intron_str = ""
    if with_ups_intron:
        intron_str = "w_ups_intr"

    # 0. make a BED file with the coordinates of transcripts
    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    exons = rw.read_gtf(gtf, "exon")

    # 1. intersect the two files, loop over the result and make a
    # dictionary of reads per pos for each transcript, which has reads
    reads_per_pos = get_reads_per_pos(reads_file, transcripts_file)

    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs; a set keeps the membership
    # test below constant-time per transcript
    trans_active_genes = {i[3] for i in trans_active_genes}
    reads_per_pos = {
        i: reads_per_pos[i]
        for i in reads_per_pos if i.split(".")[-1] in trans_active_genes
    }

    # file name of the GTF without directories and extension; constant over runs
    gtf_stem = gtf.split("/")[-1][:-4]

    for sim in range(runs):
        print("**********{0}**********".format(sim))

        # 2. for each transcript, randomly reshuffle the reads and calculate the
        # nth percentile depending on what the significance threshold is
        # keep positions that are higher than that threshold and write to BED file
        raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4], gtf_stem, iterations, min_reads_per_peak,
            window_size, neg_str, intron_str, slide_str, sim)
        read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format(
            reads_file[:-4], gtf_stem, iterations, window_size, neg_str,
            intron_str, sim)
        new_reads_file = write_raw_peaks(reads_per_pos, raw_peak_bed,
                                         read_count_file, exons,
                                         iterations=iterations,
                                         min_read_count=min_reads_per_peak,
                                         window_size=window_size,
                                         neg_control=neg_control,
                                         no_slide=no_slide,
                                         exclude_focal=exclude_focal,
                                         with_ups_intron=with_ups_intron)
        # in a negative control the reads file returned by write_raw_peaks
        # replaces the input reads file for the rest of this run (and for
        # the file names of all later runs)
        if neg_control:
            reads_file = new_reads_file

        # 3. merge peaks
        merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4], gtf_stem, iterations, window_size, merge,
            neg_str, slide_str, intron_str, sim)
        co.merge_bed(raw_peak_bed, merged_peak_bed, merge)
        print("Before filtering, there are {0} peaks.".format(
            hk.line_count(merged_peak_bed)))

        # 4. filter out peaks that don't have enough reads or are too short.
        # Write final results to file and also write a stats file with the size,
        # read count and overlapping transcript of the peaks
        stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim)
        filter_peaks(merged_peak_bed, reads_file, read_count_file,
                     "{0}_{1}_sim.bed".format(output_file[:-4], sim),
                     min_reads_per_peak, min_peak_length, stats_file,
                     no_PCR_filter=no_PCR_filter)
def main():
    """Write TES windows for transcriptionally active genes and a read metagene.

    For every transcript listed in trans_act_file that has both exon and CDS
    records in the GTF, a window reaching start_coord nucleotides upstream and
    end_coord nucleotides downstream of the transcript end site (TES) is
    written to outname, and the 500 nt downstream of the TES to a second BED
    file. Transcripts whose downstream region intersects a transcript mapped
    to a different gene are listed in a "_filt.txt" file. Finally the read
    distribution over the TES windows is written with write_dist_mat.
    """
    # the trailing space matters: the two literals are concatenated as they are
    description = "Prepare a BED file with the TES coordinates of transcriptionally " \
                  "active genes and make a metagene of reads within this region."
    args = hk.parse_arguments(description, ["trans_act_file", "gtf", "start_coord", "end_coord", "outname", "reads_file"], ints = [2, 3])
    trans_act_file, gtf, start_coord, end_coord, outname, reads_file = args.trans_act_file, args.gtf, args.start_coord, args.end_coord, args.outname, args.reads_file

    # the fourth field of every row is the transcript ID; a set gives
    # constant-time membership tests below
    trans_act_genes = set()
    with open(trans_act_file) as f:
        reader = csv.reader(f, delimiter = "\t")
        for line in reader:
            trans_act_genes.add(line[3])

    exons = rw.read_gtf(gtf, "exon")
    CDSs = rw.read_gtf(gtf, "CDS")
    # transcriptionally active and protein-coding only
    exons = {i: exons[i] for i in exons if i in trans_act_genes and i in CDSs}

    ds_500 = "{0}_ds_500.bed".format(outname[:-4])

    with open(outname, "w") as out, open(ds_500, "w") as out_ds:
        writer = csv.writer(out, delimiter="\t")
        writer_ds = csv.writer(out_ds, delimiter="\t")
        for trans in exons:
            strand = exons[trans][0][6]
            # chromosome name as it appears in the GTF records (no "chr" prefix added)
            bare_chrom = exons[trans][0][0]
            chrom = "chr{0}".format(bare_chrom)
            if strand == "+":
                TES = exons[trans][-1][4]
                new_start = TES - start_coord
                new_end = TES + end_coord
                new_start_ds = TES
                new_end_ds = TES + 500
            else:
                # mirror image of the plus strand: downstream (end_coord) lies
                # at lower coordinates, upstream (start_coord) at higher ones
                TES = exons[trans][-1][3]
                new_start = TES - end_coord - 1
                new_end = TES + start_coord - 1
                new_start_ds = TES - 500 - 1
                new_end_ds = TES - 1
            writer.writerow([chrom, new_start, new_end, trans, "0", strand])
            # the downstream file uses the unprefixed name; str.lstrip("chr")
            # is avoided because it strips any leading 'c'/'h'/'r' characters
            writer_ds.writerow([str(bare_chrom), new_start_ds, new_end_ds, trans, "0", strand])

    intersect = "{0}_ds500_intersect.bed".format(outname[:-4])
    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    # the transcripts file has to be written before it can be intersected
    co.get_transcripts(gtf, transcripts_file, with_detail=True)
    co.intersect_bed(ds_500, transcripts_file, write_both = True, force_strand=False, no_dups = False, output_file=intersect)
    mapping = co.transcript_mapping(transcripts_file)

    # transcripts whose downstream region overlaps a transcript that maps to another gene
    to_exclude = []
    with open(intersect) as int_file:
        reader = csv.reader(int_file, delimiter = "\t")
        for line in reader:
            # field 3: name from the downstream BED; field 9: name of the
            # intersecting record from the transcripts file
            curr_gene = mapping[line[3]]
            other_gene = mapping[line[9]]
            if curr_gene != other_gene:
                to_exclude.append(line[3])

    filtered_out_name = "{0}_filt.txt".format(outname[:-4])
    with open(filtered_out_name, "w") as filt_f:
        for name in to_exclude:
            filt_f.write("{0}\n".format(name))

    final_out_name = "{0}_distrib.bed".format(outname[:-4])
    distances = co.peak_pos_in_exon(outname, reads_file, from_end = True, reads_mode = True)[0]
    write_dist_mat(distances, start_coord + end_coord, final_out_name, None, "{0}_names.txt".format(final_out_name[:-4]), None)
def main():
    """Compare polymorphism between hits and controls over a number of trials.

    Each trial writes one tab-separated row to trial_file: the trial number,
    the value returned by get_chisq_site_freq for the INSIGHT hit and control
    data, and the second value returned by get_mean_freq for the SFS file.
    With the shuffle flag set, every trial is a negative control: hits and
    controls are shuffled, a fresh SFS file is generated by mDFEest_input.py
    and the INSIGHT file names are switched to the ones numbered by trial.
    """
    description = "Directly compare the frequency of segregating sites/mean allele frequency between hits and controls."
    args = parse_arguments(description, [
        "hit_file", "control_file", "INSIGHT_hit_file", "INSIGHT_control_file",
        "SFS_file", "trial_file", "trials", "shuffle"
    ], ints=[6], flags=[7])
    hit_file = args.hit_file
    control_file = args.control_file
    INSIGHT_hit_file = args.INSIGHT_hit_file
    INSIGHT_control_file = args.INSIGHT_control_file
    SFS_file = args.SFS_file
    trial_file = args.trial_file
    trials = args.trials
    shuffle = args.shuffle

    true_hits = rw.read_pos(hit_file)
    true_controls = rw.read_pos(control_file)

    #keep the names as given, since negative control trials substitute the trial number into them
    original_INSIGHT_hit_file = INSIGHT_hit_file
    original_INSIGHT_control_file = INSIGHT_control_file

    print(hit_file)

    with open(trial_file, "w") as file:
        file.write(
            "trial\tpoly_fraction_hits - poly_fraction_controls\tmedian_hit_MAF - median_control_MAF\n"
        )
        for trial in range(trials):
            #negative control: work on shuffled hits/controls and trial-specific INSIGHT files
            if shuffle:
                trial_tag = "_{0}_".format(trial)
                INSIGHT_hit_file = re.sub("_0_", trial_tag, original_INSIGHT_hit_file)
                INSIGHT_control_file = re.sub("_0_", trial_tag, original_INSIGHT_control_file)
                temp_hits_file = "temp_data/temp_hits{0}.txt".format(random.random())
                temp_controls_file = "temp_data/temp_controls{0}.txt".format(random.random())
                #swap elements between hits and controls
                temp_hits, temp_controls = shuffle_dictionaries(true_hits, true_controls)
                rw.write_pos(temp_hits, temp_hits_file)
                rw.write_pos(temp_controls, temp_controls_file)
                SFS_file = "temp_data/temp_SFS_file{0}.txt".format(random.random())
                #produce the SFS input file for the shuffled data
                run_process([
                    "python3", "mDFEest_input.py", temp_hits_file,
                    temp_controls_file,
                    "general/1000genomes/filtered_hg38_85_pc_multiexon_Yoruban_SNPs_relative.txt",
                    216, SFS_file
                ])
                remove_file(temp_hits_file)
                remove_file(temp_controls_file)

            poly_ratio_diff = get_chisq_site_freq(get_data(INSIGHT_hit_file),
                                                  get_data(INSIGHT_control_file))
            #only the second returned value is reported
            _, median_diff = get_mean_freq(SFS_file)

            #the SFS file of a negative control trial is temporary
            if shuffle:
                remove_file(SFS_file)

            file.write("{0}\t{1}\t{2}\n".format(trial, poly_ratio_diff, median_diff))
def main(): ''' Read in a series of input files on the sequence specificities of RBPs, filter the data and write a set of motifs for each RBP. Arguments (see Methods for further details on the input data files): upper_threshold, lower_threshold: the longest and shortest a motif is allowed to be, respectively RBPDB_experiments: path to RBPDB experiments file RBPDB proteins: path to RBPDB proteins file RBPDB_PWMs: path to file containing RBPDB PWM identifier to RBP mapping pwm_dir: path to directory containing RBPDB PWMs RBPmap_PSSMs: path to directory containing RBPmap PSSMs SFmap_proteins: path to file containing motifs from SFmap RNAcompete_information: path to summary file from CIS-BP RNA RNAcompete_PWMs: path to directory containing CIS-BP RNA PWMs final_motifs_file_name: name for output file plot_name: file for plot displaying the distribution of motif set sizes species: the species for which motifs are required ''' description = "Compile a set of motifs putatively recognized by RNA-binding proteins." 
args = parse_arguments(description, ["upper_threshold", "lower_threshold", "RBPDB_experiments", "RBPDB_proteins", "RBPDB_PWMs", "pwm_dir", "RBPmap_PSSMs", "SFmap_proteins", "RNAcompete_information", "RNAcompete_PWMs", "final_motifs_file_name", "plot_name", "species"], ints = [0, 1]) [upper_threshold, lower_threshold, RBPDB_experiments, RBPDB_proteins, RBPDB_PWMs, pwm_dir, RBPmap_PSSMs, SFmap_proteins, RNAcompete_information, RNAcompete_PWMs, final_motifs_file_name, plot_name, species] = [args.upper_threshold, args.lower_threshold, args.RBPDB_experiments, args.RBPDB_proteins, args.RBPDB_PWMs, args.pwm_dir, args.RBPmap_PSSMs, args.SFmap_proteins, args.RNAcompete_information, args.RNAcompete_PWMs, args.final_motifs_file_name, args.plot_name, args.species] db_fields = rw.read_many_fields(RBPDB_experiments, ",") db_fields = db_fields[1:] print("There are {0} RBPDB experiments.".format(len(db_fields))) db_proteins = rw.read_many_fields(RBPDB_proteins, ",") #species is "H**o sapiens" or "Mus musculus" db_proteins = [i for i in db_proteins if i[6] == species] protein_names = sorted(list(set([i[4] for i in db_proteins]))) db_fields = [i for i in db_fields if i[3] in protein_names] protein_number_before = (len(list(set([i[3] for i in db_fields])))) print("{0} were performed in {1}.\n".format(len(db_fields), species)) db_fields = [i for i in db_fields if i[2] != ""] protein_number_after = (len(list(set([i[3] for i in db_fields])))) db_fields = [[i[3], "RBPDB", i[0], i[1], i[2]] for i in db_fields] print("After removing experiments with no reported motif, {0} proteins remain of the initial {1}.\n".format(protein_number_after, protein_number_before)) bases = np.array(["A", "C", "G", "U"]) db_pwm_list = rw.read_many_fields(RBPDB_PWMs, "\t") for i in db_pwm_list: if i[1] in protein_names: current_file_name = "{0}/{1}.pwm".format(pwm_dir, i[0]) current_PWM = rw.read_many_fields(current_file_name, delimiter = " ") for j in range(len(current_PWM)): current_PWM[j] = [float(k) for k 
in current_PWM[j] if k != ""] consensus = nc.consensus_from_PWM(current_PWM, bases, 0) PMID = i[0].split("_") PMID = PMID[1] new_record = [i[1], "RBPDB_PWM", PMID, "SELEX", consensus] db_fields.append(new_record) protein_number_after = (len(list(set([i[0] for i in db_fields])))) print("After adding additional sequences from SELEX PWMs (RBPDB), there are {0} proteins.\n".format(protein_number_after)) if species == "Mus musculus": RBPmap_proteins = rw.read_many_fields("RBP/RBPmap_proteins.csv", ",") RBPmap_proteins = list_to_dict(RBPmap_proteins, 0, 1) RNAc_source = [i for i in RBPmap_proteins if "23846655" in RBPmap_proteins[i]] else: RNAc_source = [] for file_name in os.listdir(RBPmap_PSSMs): #RBPmap and SFmap don't distinguish between human and mouse motifs if "human" in file_name: file_name_split = file_name.split("_") protein_name = file_name_split[0] if protein_name not in RNAc_source: initial_pssm = rw.read_many_fields(os.path.join(RBPmap_PSSMs, file_name), delimiter = "\t") current_pssm = initial_pssm[1:] current_pssm = [i[1:] for i in current_pssm] for i in range(len(current_pssm)): current_pssm[i] = [float(j) for j in current_pssm[i]] consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True) protein_name = list(protein_name) if protein_name[:4] == ["S", "R", "S", "F"]: protein_name[:4] = ["S", "F", "R", "S"] protein_name = "".join(protein_name) new_record = [protein_name, "RBPmap_PWM", "NULL", "various", consensus] db_fields.append(new_record) protein_number_after = (len(list(set([i[0] for i in db_fields])))) print("After adding additional sequences from RBPmap PSSMs, there are {0} proteins.\n".format(protein_number_after)) SFmap_data = rw.read_many_fields(SFmap_proteins, delimiter = ",") for i in SFmap_data: if "," in i[1]: temp_split = i[1].split(", ") temp_split = [j.upper() for j in temp_split] i[1] = ";".join(temp_split) else: i[1] = i[1].upper() new_record = [i[0], "SFmap", "NULL", "various", i[1]] db_fields.append(new_record) 
protein_number_after = (len(list(set([i[0] for i in db_fields])))) print("After adding motifs from SFmap, there are {0} proteins.\n".format(protein_number_after)) RNAc = rw.read_many_fields(RNAcompete_information, delimiter = "\t") RNAc = [i for i in RNAc[1:] if i] if species == "H**o sapiens": RNAc = [i for i in RNAc if i[3] != "." and i[8] == "D"] if species == "Mus musculus": RNAc = [i for i in RNAc if i[3] != "."] PSSM_folder = RNAcompete_PWMs for record in RNAc: motif_name = record[3] initial_pssm = rw.read_many_fields(os.path.join(PSSM_folder, "{0}.txt".format(motif_name)), delimiter = "\t") if initial_pssm == []: if record[19] == "21036867":#RBPDB paper pass else: print(record) else: current_pssm = initial_pssm[1:] current_pssm = [i[1:] for i in current_pssm] for i in range(len(current_pssm)): current_pssm[i] = [float(j) for j in current_pssm[i]] consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True) protein_name = record[6] new_record = [protein_name, "CIS-BP_RNA_PWM", record[19], record[14], consensus] db_fields.append(new_record) protein_number_after = (len(list(set([i[0] for i in db_fields])))) print("After adding motifs from CIS-BP RNA, there are {0} proteins.\n".format(protein_number_after)) to_delete = [] for pos, i in enumerate(db_fields): if ";" in i[4]: if "; " in i[4]: temp_split = i[4].split("; ") else: temp_split = i[4].split(";") temp_split = [((j.upper()).lstrip("N")).rstrip("N") for j in temp_split] temp_split = [j for j in temp_split if len(j) <= upper_threshold and len(j) >= lower_threshold and "(" not in j] if temp_split: db_fields[pos][4] = temp_split[0] for j in temp_split[1:]: db_fields.append([i[0], i[1], i[2], i[3], j]) else: to_delete.append(pos) else: i[4] = (((i[4]).upper()).rstrip("N")).lstrip("N") if len(i[4]) > upper_threshold or len(i[4]) < lower_threshold or "(" in i[4]: to_delete.append(pos) else: db_fields[pos][4] = i[4] db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete] 
protein_number_after = (len(list(set([i[0] for i in db_fields])))) print("After only keeping motifs of length {0}-{1} bp, {2} proteins remain.\n".format(lower_threshold, upper_threshold, protein_number_after)) protein_names = list(set([i[0] for i in db_fields])) if species == "Mus musculus": protein_names_file = "RBP/RBP_names_for_checking.txt" with open(protein_names_file, "w") as file: for name in protein_names: file.write("{0}\n".format(name)) MGI_file = "RBP/MGI_correspondances.txt" MGI = rw.read_many_fields(MGI_file, "\t") MGI_names_all = [i[0] for i in MGI[1:]] found = [i[0] for i in MGI if i[0] == i[3]] MGI = {i[0]: i[3] for i in MGI[1:] if i[0] not in found} to_delete = [] for pos, i in enumerate(db_fields): if species == "Mus musculus": db_fields[pos][0] = "".join([db_fields[pos][0][0].upper(), db_fields[pos][0][1:].lower()]) #will get rid of Hnrnpcl1, which didn't return anything in the MGI search. if db_fields[pos][0] not in MGI_names_all: to_delete.append(pos) else: if db_fields[pos][0] not in found: db_fields[pos][0] = MGI[db_fields[pos][0]] elif species == "H**o sapiens": if i[0] == "A2BP1" or i[0] == "FOX1": db_fields[pos][0] = "RBFOX1" elif i[0] == "SFRS13A": db_fields[pos][0] = "SRSF10" elif i[0][:6] == "BRUNOL": db_fields[pos][0] = "CELF{0}".format(i[0][-1]) elif i[0] == "CUGBP": db_fields[pos][0] = "CELF1" elif i[0] == "Fusip1": db_fields[pos][0] = "SRSF10" elif i[0][:4] == "SFRS": db_fields[pos][0] = "SRSF{0}".format(i[0][4:]) elif i[0] == "HuR": db_fields[pos][0] = "ELAVL1" elif i[0] == "MBNL": db_fields[pos][0] = "MBNL1" elif i[0] == "PTB": db_fields[pos][0] = "PTBP1" elif i[0] == "QK1": db_fields[pos][0] = "QKI" elif i[0] == "RBM9": db_fields[pos][0] = "RBFOX2" elif i[0] == "STAR-PAP": db_fields[pos][0] = "TUT1" elif i[0] == "YB-1": db_fields[pos][0] = "YBX1" elif i[0] == "hnRNPK": db_fields[pos][0] = "HNRNPK" elif i[0] == "hnRNPLL" or i[0] == "HNRPLL": db_fields[pos][0] = "HNRNPLL" db_fields = [i for pos, i in enumerate(db_fields) if pos not 
in to_delete] protein_names = list(set([i[0] for i in db_fields])) protein_number_after = (len(list(set([i[0] for i in db_fields])))) print("After cleaning up protein IDs, {0} proteins remain.\n".format(protein_number_after)) protein_dict = {} for i in db_fields: if i[0] not in protein_dict.keys(): protein_dict[i[0]] = [i] else: protein_dict[i[0]].append(i) if species == "H**o sapeins": del protein_dict["PPIE"] del protein_dict["MIR1236"] del protein_dict["PABPC4"] print("After removing PPIE, PABPC4 and MIR1236, {0} proteins remain.\n".format(len(protein_dict))) elif species == "Mus musculus": del protein_dict["Pabpc4"] print("After removing Pabpc4, {0} proteins remain.\n".format(len(protein_dict))) for i in protein_dict: if i == "ELAVL1": protein_dict[i].append(['ELAVL1', 'synthetic', 'synthetic', 'synthetic', 'UUWGDUU']) elif i == "ELAVL2": protein_dict[i].append(['ELAVL2', 'synthetic', 'synthetic', 'synthetic', 'RWUUYAUUUWR']) protein_dict[i] = sorted(protein_dict[i], key = lambda x:x[4]) current_motifs = [j[4] for j in protein_dict[i]] to_delete = [] for j in range(1, len(current_motifs)): if current_motifs[j] == current_motifs[j-1]: for k in range(1, 4): protein_dict[i][j][k] = ",".join([protein_dict[i][j][k], protein_dict[i][j - 1][k]]) to_delete.append(j - 1) protein_dict[i] = [protein_dict[i][j] for j in range(len(protein_dict[i])) if j not in to_delete] for i in protein_dict: protein_dict[i] = [[j[0], j[4], j[1], j[2], j[3]] for j in protein_dict[i]] print("\n") print("Writing motifs to {0}.\n".format(final_motifs_file_name)) motif_numbers = [] with open(final_motifs_file_name, "w") as final_motifs_file: for i in sorted(list(protein_dict.keys())): final_motifs_file.write(">{0}\n".format(i)) current_motifs = [j[1] for j in protein_dict[i]] DNA_motifs = [nc.DNA_RNA_conversion(j) for j in current_motifs] unravelled_motifs = [nc.unravel_consensus(j) for j in DNA_motifs] unravelled_motifs = flatten(unravelled_motifs) unravelled_motifs = 
list(set(unravelled_motifs)) print("Writing {0} motifs for {1}.".format(len(unravelled_motifs), i)) motif_numbers.append(len(unravelled_motifs)) unravelled_motifs = "|".join(unravelled_motifs) final_motifs_file.write("{0}\n".format(unravelled_motifs)) plt.figure(1) plotting.histogram(motif_numbers, 50, x_lab = "Motif number", y_lab = "Frequency", title = None) plotting.save_and_show([10, 10], 100, plot_name)
def main():
    """Run mDFEest under one or several DFE models and tabulate the results.

    Command-line arguments are read through ``parse_arguments``. When
    ``new_input`` is set, the existing copy of the input file under
    ``../multidfe`` is removed and ``mDFEest_input.py`` is run to regenerate
    it. Every requested model is then fitted, once per population-size
    setting, and each result is appended to ``output_file`` via
    ``write_mDFEest_output``.
    """
    description = "Run mDFEest."
    arg_names = ["hit_file", "control_file", "SNP_file", "SNP_number", "input_file", "output_file", "seed", "fixed_model", "new_input", "shuffle", "fix_pop_change"]
    args = parse_arguments(description, arg_names, ints = [3], flags = [8, 9, 10])
    hit_file = args.hit_file
    control_file = args.control_file
    SNP_file = args.SNP_file
    SNP_number = args.SNP_number
    input_file = args.input_file
    output_file = args.output_file
    seed = args.seed
    fixed_model = args.fixed_model
    new_input = args.new_input
    shuffle = args.shuffle
    fix_pop_change = args.fix_pop_change

    # build a fresh input file rather than reading in an existing one
    if new_input:
        remove_file("../multidfe/{0}".format(input_file.split("/")[-1]))
        command = ["python3", "mDFEest_input.py", hit_file, control_file, SNP_file, SNP_number, input_file]
        if shuffle:
            command.append("--shuffle")
        run_process(command)

    # the seed arrives as text; the literal "None" stands for "no seed"
    seed = None if seed == "None" else float(seed)

    # either fit only the population-size-change model, or fit both the
    # fixed-population-size model and the population-size-change model
    pop_change = [True] if fix_pop_change else [False, True]

    if fixed_model == "None":
        # every available model, with 2 to 5 spikes/steps
        allowed = ["lognormal", "gamma", "beta", "spikes", "steps", "fixed six spikes"]
        spike_range = [2, 6]
    else:
        # only the specified model, and only its two-spike/two-step variant
        allowed = [fixed_model]
        spike_range = [2, 3]

    # (model name passed to mDFEest, label printed before fitting)
    simple_models = [
        ("lognormal", "lognormal model:"),
        ("gamma", "gamma model:"),
        ("beta", "beta model:"),
    ]
    # models that are fitted once per spike/step number
    multi_models = [
        ("spikes", "{0}-spikes model:"),
        ("steps", "{0}-steps model:"),
    ]

    with open(output_file, "w") as out:
        out.write("model\tpop_change\tAIC\tNes_0.0_0.1\tNes_0.1_1.0\tNes_1.0_10.0\tNes_10.0_100.0\traw\n")
        for change_mode in pop_change:
            print("\nPopulation expansion: {0}.".format(str(change_mode)))
            for model, label in simple_models:
                if model in allowed:
                    print(label)
                    output = mDFEest(model, input_file, pop_change = change_mode, seed = seed)
                    print(output)
                    write_mDFEest_output(output, out, change_mode)
            for spike_number in range(spike_range[0], spike_range[1]):
                for model, label in multi_models:
                    if model in allowed:
                        print(label.format(spike_number))
                        output = mDFEest(model, input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                        print(output)
                        write_mDFEest_output(output, out, change_mode)
            if "fixed six spikes" in allowed:
                print("fixed six spikes model:")
                output = mDFEest("six_spikes", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, out, change_mode)
def main():
    """Collect per-junction statistics into one tab-separated table.

    The table starts as a single "junction" column holding the sorted names
    found in column 3 of ``exon_start_coords``. Each step below computes one
    statistic and appends it as a new column through ``add_to_array``. The
    finished table is written to ``output_file``, one row per line.
    """
    description = "Aggregate various statistics on the splicing events you're studying."
    args = hk.parse_arguments(description, [
        "gtf", "polII_bed", "exon_start_coords", "truncated_exons_file",
        "genome_file", "output_file"
    ])
    gtf = args.gtf
    polII_bed = args.polII_bed
    exon_start_coords = args.exon_start_coords
    truncated_exons_file = args.truncated_exons_file
    genome_file = args.genome_file
    output_file = args.output_file

    CDSs = rw.read_gtf(gtf, "CDS", gene=False)
    exons = rw.read_gtf(gtf, "exon", gene=False)

    # index the exon start records by the name in their fourth field
    exon_start_records = rw.read_many_fields(exon_start_coords, skip_header=False, delimiter="\t")
    exon_starts = {record[3]: record for record in exon_start_records}

    # first column of the table: a header cell followed by the sorted names
    name_column = np.array(sorted(exon_starts.keys()), dtype="str")
    name_column = name_column.reshape((len(exon_starts.keys()), 1))
    out_array = np.vstack((["junction"], name_column))

    #1. exon size
    exon_sizes = co.get_lengths(CDSs, exon_starts.keys())
    out_array = add_to_array(out_array, exon_sizes, "exon_size")
    print("Exon size done.")

    #2. exon number
    exon_numbers = co.get_exon_number(exons, exon_starts.keys())
    out_array = add_to_array(out_array, exon_numbers, "exon_number")
    print("Exon number done.")

    #3. exon rank (from start and end)
    exon_rank_start, exon_rank_end = co.get_exon_rank(exons, exon_starts)
    out_array = add_to_array(out_array, exon_rank_start, "exon_rank_from_start")
    out_array = add_to_array(out_array, exon_rank_end, "exon_rank_from_end")
    print("Exon rank done.")

    #4. upstream and downstream intron size
    upstream_sizes = co.get_upstream_intron_size(exons, exon_rank_start)
    out_array = add_to_array(out_array, upstream_sizes, "upstream_intron_size")
    downstream_sizes = co.get_upstream_intron_size(exons, exon_rank_start, downstream=True)
    out_array = add_to_array(out_array, downstream_sizes, "downstream_intron_size")
    print("Intron size done.")

    #5. Pol II density per transcript (skipped when no truncated exons file is given)
    if truncated_exons_file != "None":
        dens_per_trans_file = "{0}_dens_per_trans.txt".format(polII_bed[:-4])
        # out_array[1:, 0] is the name column without its header cell
        dens_per_trans_junctions = get_dens_per_trans(truncated_exons_file, polII_bed, dens_per_trans_file, out_array[1:, 0])
        out_array = add_to_array(out_array, dens_per_trans_junctions, "polII_dens_per_trans")
        print("Pol II density done.")

    #6. exon GC4 and GC content
    genome = Fasta(genome_file)
    exon_GC4 = get_exon_GC4(CDSs, exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, exon_GC4, "exon_GC4")
    exon_GC = get_exon_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, exon_GC, "exon_GC")
    print("Exon GC done.")

    #7. upstream intron GC content
    intron_GC = get_upstream_intron_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, intron_GC, "upstream_intron_GC")
    print("Intron GC done.")

    #8. splice site strength
    # (column name, upstream, five, number of intronic bases)
    ss_settings = [
        ("upstream_5ss_strength", True, True, 6),
        ("upstream_3ss_strength", True, False, 20),
        ("downstream_5ss_strength", False, True, 6),
    ]
    for column_name, upstream, five, intronic in ss_settings:
        strengths = nc.get_ss_strength(exons, genome_file, upstream=upstream, five=five, exonic=3, intronic=intronic)
        out_array = add_to_array(out_array, strengths, column_name)
    print("Splice site strength done.")

    # one table row per output line, cells separated by tabs
    with open(output_file, "w") as out:
        for row in out_array:
            out.write("\t".join([str(cell) for cell in row]))
            out.write("\n")
def main():
    """Run INSIGHT for one or more trials and collect the parameter estimates.

    For every trial the INSIGHT input is built with ``get_MSA``, the neutral
    and hit input files are written, INSIGHT is run, the number of hit and
    control positions per chromosome is written out, and the INSIGHT log is
    parsed. Estimates, standard errors and chi-square statistics of each
    successfully parsed trial are appended as one row to the trial file.
    Trials whose log parsing raises ``IndexError`` are skipped.
    """
    description = "Run INSIGHT on a set of sequences and a set of sites."
    args = parse_arguments(description, ["fasta", "genome", "features_file", "families_file", "suffix", "dataset", "output_folder", "freq_threshold", "n", "hit_file", "control_file", "SNP_file_name_prefix", "CDS_SNP_file_name_prefix", "MSA_file_name_prefix", "trial_file", "trials", "hit_degen_file", "control_degen_file", "hit_reduce", "control_reduce", "new_SNPs", "new_MSA", "shuffle", "nonsyn_hits", "remove_GT", "big_tree"], floats = [7, 18, 19], ints = [8, 15], flags = [20, 21, 22, 23, 24, 25])
    (fasta, genome, features_file, families_file, suffix, dataset,
     general_output_folder, freq_threshold, n, hit_file, control_file,
     SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix,
     trial_file, trials, hit_degen_file, control_degen_file, hit_reduce,
     control_reduce, new_SNPs, new_MSA, shuffle, nonsyn_hits, remove_GT,
     big_tree) = (args.fasta, args.genome, args.features_file,
                  args.families_file, args.suffix, args.dataset,
                  args.output_folder, args.freq_threshold, args.n,
                  args.hit_file, args.control_file,
                  args.SNP_file_name_prefix, args.CDS_SNP_file_name_prefix,
                  args.MSA_file_name_prefix, args.trial_file, args.trials,
                  args.hit_degen_file, args.control_degen_file,
                  args.hit_reduce, args.control_reduce, args.new_SNPs,
                  args.new_MSA, args.shuffle, args.nonsyn_hits,
                  args.remove_GT, args.big_tree)

    output_folder = "{0}/{1}_{2}".format(general_output_folder, dataset, suffix)
    names, seqs = rw.read_fasta(fasta)

    #prepare feature set and family information
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    if families_file == "None":
        # no families file supplied: generate one for this dataset
        conservation.find_families(fasta, "general/{0}".format(dataset))
        families_file = "general/{0}_families.txt".format(dataset)
    families = rw.read_families(families_file)
    fs.add_families(families)

    make_dir(output_folder)
    general_folder = "DFE/for_everybody"
    make_dir(general_folder)
    if MSA_file_name_prefix == "None":
        MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset)

    #read in degeneracy information
    if hit_degen_file != "None":
        degen_hits = parse_degen(hit_degen_file)
        degen_controls = parse_degen(control_degen_file)
    else:
        degen_hits = None
        degen_controls = None

    #get relevant genome features
    transcripts = fs.get_transcripts()
    CDSs = fs.get_CDS()
    lengths = fs.get_lengths(CDSs, CDS = True)

    #filter out sex chromosomes from the analysis
    sex_chromosomes = ["X", "Y"]
    chrom_dict = {i: transcripts[i][0] for i in transcripts if transcripts[i][0] not in sex_chromosomes}
    chroms = list(set(chrom_dict.values()))

    # genus names handed to get_MSA; the first entry was the masked string
    # "h**o", which is not a genus name like the other three
    clean_names = ["homo", "pan", "pongo", "macaca"]

    #if you're running several trials
    #if just one, it'll still make a single trial file
    if trial_file == "None":
        # NOTE(review): trial_file is the string "None" here, so the default
        # name becomes "None_<suffix>_<trials>.txt" - confirm this is intended
        trial_file = "{0}_{1}_{2}.txt".format(trial_file, suffix, trials)

    with open(trial_file, "w") as o_file:
        print(suffix)
        #output file header
        o_file.write("rho\teta\tgamma\tDp\tPw\talpha\ttau\trhose\tetase\tgammase\trholl\tetall\tgammall\n")
        for trial in range(trials):
            print("==========TRIAL {0}==========\n".format(trial))
            #get INSIGHT input data as a string based on divergence and SNP data
            hit_output, neutral_output, chroms_to_keep, hit_counts, control_counts = get_MSA(chroms, chrom_dict, control_file, hit_file, CDSs, lengths, names, seqs, clean_names, freq_threshold, dataset, suffix, genome, output_folder, general_folder, n, SNP_file_name_prefix, CDS_SNP_file_name_prefix, MSA_file_name_prefix, new_SNPs, new_MSA, shuffle, remove_GT, big_tree, hit_reduce = hit_reduce, control_reduce = control_reduce, degen_hits = degen_hits, degen_controls = degen_controls)

            print("Writing output files...")
            neutral_output_file = "{0}/{1}_{2}_{3}_neutral_input.txt".format(output_folder, dataset, suffix, trial)
            hit_output_file = "{0}/{1}_{2}_{3}_hit_input.txt".format(output_folder, dataset, suffix, trial)
            write_output_file(neutral_output_file, neutral_output, n)
            write_output_file(hit_output_file, hit_output, n)

            print("Running INSIGHT...")
            conservation.INSIGHT(neutral_output_file, hit_output_file, freq_threshold, "../Software/INSIGHT", "{0}_{1}".format(dataset, suffix))

            print("Counting positions on chromosomes...")
            # the file name has no trial number, so each trial overwrites it
            with open("{0}/{1}_{2}_pos_per_chrom.csv".format(output_folder, dataset, suffix), "w") as file:
                file.write("chrom\thits\tcontrols\n")
                for chrom in sorted(chroms_to_keep):
                    file.write("{0}\t{1}\t{2}\n".format(chrom, hit_counts[chrom], control_counts[chrom]))

            INSIGHT_output = "../Software/INSIGHT/{0}_{1}.ins.log".format(dataset, suffix)
            #parse the INSIGHT output and do simple significance testing
            try:
                parsed_output = parse_INSIGHT_output(INSIGHT_output)
                estimates = parsed_output["estimates"]
                SE = parsed_output["SEs"]
                lls = parsed_output["chi_sq"]
                print("\n")
                print("Chisq statistics: {0}".format(" ".join([str(i) for i in lls])))
                # upper-tail chi-square probabilities: 3 degrees of freedom
                # for rho, 1 for eta and 1 for gamma
                rho_pL = scipy.stats.chi2.sf(lls[0], 3)
                print("pL(rho): {0}".format(rho_pL))
                eta_pL = scipy.stats.chi2.sf(lls[1], 1)
                print("pL(eta): {0}".format(eta_pL))
                gamma_pL = scipy.stats.chi2.sf(lls[2], 1)
                print("pL(gamma): {0}".format(gamma_pL))
                lls = "\t".join([str(i) for i in lls])
                estimates = "\t".join(estimates)
                SE = "\t".join(SE)
                o_file.write(estimates)
                o_file.write("\t")
                o_file.write(SE)
                o_file.write("\t")
                o_file.write(lls)
                o_file.write("\n")
            #skip trials where INSIGHT failed to produce a full output
            except IndexError:
                print("Skipping...")
def main():
    """Classify junction-overlapping reads and record their splicing distances.

    Steps, in order:

    1. Write read-length statistics for the BED version of the input.
    2. Read CDS coordinates, keep only the transcripts listed in
       ``trans_active_file`` and, unless ``leave_terminal`` is set, drop the
       last exon of every transcript.
    3. Extract the 3' splice sites and write the window and splicing
       intermediate BED files derived from them.
    4. Count reads over the splicing intermediate positions.
    5. Intersect the reads with the exon junctions, classify each remaining
       read with ``NGS.analyze_cigar`` and write spliced and unspliced reads
       to separate BED files while collecting their splice distances per
       intron.
    6. Write the distance matrices for spliced and unspliced reads.
    """
    description = "Record splicing distance."
    args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7])
    input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    # file name of the input without its directory and last four characters
    input_stem = bare_input_path[:-4]

    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, input_stem)
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs; a set makes the membership
    # test below constant-time instead of a scan per transcript
    trans_active_genes = {i[3] for i in trans_active_genes}
    exons = {i: exons[i] for i in exons if i in trans_active_genes}

    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""

    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)

    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, input_stem, intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)

    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)

    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)

    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True, output_file=exon_junction_bed, force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)

    sr_distances = {}
    ur_distances = {}
    found_count = 0
    # number of lines read from the intersection file; stays 0 if it is empty
    total_reads = 0
    file_size = hk.line_count(exon_junction_bed)

    # will store all the intron names for which there are
    # either spliced or unspliced reads
    # (the list keeps first-seen order, the set makes the "already seen"
    # check constant-time instead of a scan per read)
    valid_junctions = []
    seen_junctions = set()

    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):
            total_reads = pos + 1
            # progress report
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")
            line = line.split("\t")
            # reads that end at the last nucleotide of an exon
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]
            if intermediate_read:
                continue
            # check that it ends within the exon just downstream of
            # the 3' ss that is being analyzed
            in_dwns_exon = NGS.check_position_in_exon(line, exons)
            if not in_dwns_exon:
                continue
            # 'spliced', 'unspliced' or 'None' (=can't analyze)
            read_type = NGS.analyze_cigar(line, overhang = 5)
            if not read_type:
                continue
            if intron_name not in seen_junctions:
                seen_junctions.add(intron_name)
                valid_junctions.append(intron_name)
            splice_dist = NGS.get_splice_dist(line)
            # the last field still carries the line break, so no newline
            # is added when the read is written back out
            if read_type == "S":
                sfile.write("\t".join([str(i) for i in line]))
                found_count = found_count + 1
                sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
            else:
                ufile.write("\t".join([str(i) for i in line]))
                ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    # an empty intersection file used to fail here with a NameError because
    # the loop variable was never bound
    if total_reads:
        print("Proportion of spliced reads: {0}.".format(found_count / total_reads))
    else:
        print("No reads found in {0}.".format(exon_junction_bed))

    # for each valid junction, calculate the length of the exonic sequence
    # afterwards, so that you wouldn't consider intronic sequence in the distance
    # matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, input_stem, outsuffix, terminal_suff))
    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, input_stem, outsuffix, terminal_suff))
def main():
    """Compute the normalized dS of a motif set over one or more trials.

    With ``nonsense`` unset, the motifs are read from ``motifs_file`` once and
    the pre-computed hit and control files named after ``hit_file_prefix``
    are used. With ``nonsense`` set, every trial draws a fresh set of k-mers
    (with a random nucleotide composition, or with the composition read from
    ``old_trial_file``) and control sites are picked anew through
    ``get_control_sites``. Every trial appends one row to ``trial_file``.
    """
    description = "Calculate the normalized dS of a dataset."
    args = parse_arguments(description, [
        "dataset", "feature_set", "genome", "families_file", "fasta",
        "hit_file_prefix", "motifs_file", "correspondances", "alignments",
        "suffix", "trials", "trial_file", "old_trial_file", "region_fasta",
        "old_motif_format", "nonsense", "no_families", "newest_only",
        "top_set_only", "calc_p", "reverse_site_numbers", "matched", "degen",
        "regions"
    ], ints=[10], flags=[14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    (dataset, feature_set, genome, families_file, fasta, hit_file_prefix,
     motifs_file, correspondances, alignments, suffix, trials, trial_file,
     old_trial_file, region_fasta, old_motif_format, nonsense, no_families,
     newest_only, top_set_only, calc_p, reverse_site_numbers, matched, degen,
     regions) = (args.dataset, args.feature_set, args.genome,
                 args.families_file, args.fasta, args.hit_file_prefix,
                 args.motifs_file, args.correspondances, args.alignments,
                 args.suffix, args.trials, args.trial_file,
                 args.old_trial_file, args.region_fasta,
                 args.old_motif_format, args.nonsense, args.no_families,
                 args.newest_only, args.top_set_only, args.calc_p,
                 args.reverse_site_numbers, args.matched, args.degen,
                 args.regions)

    # number of simulations handed to get_sim_p
    n_sim = 1000

    print(suffix)

    #set up feature set and families
    fs = Feature_Set(feature_set, genome)
    fs.set_dataset(dataset)
    if no_families:
        picked = fs.names
    else:
        families = rw.read_families(families_file)
        fs.add_families(families)
        picked = fs.pick_random_members()

    # randomly named temporary alignment files
    hit_phylip = "temp_data/temp_{0}.phy".format(random.random())
    control_phylip = "temp_data/temp_control_{0}.phy".format(random.random())

    # real motifs are read once, up front
    if not nonsense:
        if old_motif_format:
            # drop the first entry of the names file
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            if top_set_only:
                # keep only the RBPs whose summary value is below 0.1
                summary_data = rw.read_many_fields(
                    "RBP/RBP_hg38_introncontaining_new.txt", "\t")
                summary_dict = list_to_dict(summary_data, 0, 4, floatify=True)
                motifs = {
                    RBP: motifs[RBP]
                    for RBP in motifs if (summary_dict[RBP] < 0.1)
                }
            # pool the motifs of all RBPs and remove duplicates
            motifs = list(set(flatten(motifs.values())))

    # suffixes used when composing the input/output file names
    site_number_suffix = "_reversed_site_numbers_" if reverse_site_numbers else ""
    matched_suff = "_matched" if matched else ""
    degen_suff = "_degen.txt" if degen else ""

    with open(trial_file, "w") as trial_out:
        # NOTE(review): the header lists more columns than the rows written
        # below contain - confirm whether downstream readers rely on it
        trial_out.write(
            "trial\tA\tT\tC\tG\told\told_no_hum_CG\tnew_no_human_CG\tnew_no_hum_no_anc_CG\tnew_w_CG\tnew_no_anc_CG\tnew_no_anc_CG_macaque\tnewer_no_human_CG\tnewer_no_hum_no_anc_CG\tnewer_w_CG\tnewer_no_anc_CG\n"
        )

        if old_trial_file != "None":
            # columns 1-4 of every non-header row of the old trial file
            old_trials = rw.read_many_fields(old_trial_file, "\t")
            old_trials = [row[1:5] for row in old_trials[1:]]
            seed_kmers = 1
        else:
            seed_kmers = None

        #you can do this for loads of trials
        #useful as a negative control if you're generating a new set of nonsense motifs
        #each time
        for trial in range(trials):
            print(trial)
            trial_output = [trial]

            #if you're meant to generate a load of nonsense motifs rather than using real motifs
            if nonsense:
                if old_trial_file != "None":
                    #read in the intended nucleotide composition of the nonsense
                    #motifs from file
                    scaled_comp = [float(i) for i in old_trials[trial]]
                else:
                    #pick nonsense motifs nucleotide composition by chance
                    comp = [random.random() for i in range(4)]
                    comp_total = np.sum(comp)
                    scaled_comp = [i / comp_total for i in comp]
                comp_dict = {
                    base: scaled_comp[pos]
                    for pos, base in enumerate(nc._canon_bases_)
                }
                motifs, obtained_dict = nc.kmers_from_nc(6,
                                                         50,
                                                         comp_dict=comp_dict,
                                                         return_freqs=True,
                                                         seed=seed_kmers)
                # the motifs file starts with a header entry
                motifs = ["motifs"] + motifs
                trial_output = trial_output + [
                    obtained_dict[base] for base in nc._canon_bases_
                ]
                temp_motifs_file = "temp_data/temp_motifs.txt"
                rw.write_names(motifs, temp_motifs_file)

            print(
                "===NEW METHOD WITH NO ANCESTRAL CpG (MACAQUE, BIG TREE, CONTEXT), REPLACEMENT CONTROL==="
            )
            hit_file = "{0}_hits_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            control_file = "{0}_controls_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)

            if nonsense:
                # nonsense motifs have no pre-computed hits/controls, so
                # generate them into randomly named temporary files
                hit_file = "temp_data/temp_hits{0}.txt".format(random.random())
                control_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                error_file = "temp_data/temp_error{0}.txt".format(
                    random.random())
                get_control_sites(
                    fasta, genome, feature_set, families_file, dataset,
                    temp_motifs_file, hit_file, control_file, error_file,
                    "DFE/for_everybody/filtered_hg38_85_pc_multiexon_anc_CG_big_context_threshold05.txt",
                    [
                        "--leave_CG", "--context", "--remove_ancestral_CpG",
                        "--macaque_anc", "--big_tree", "--replacement_control"
                    ])

            get_density(fasta, motifs, fs)

            norm_ds = get_new_method_results(hit_file,
                                             control_file,
                                             hit_phylip,
                                             control_phylip,
                                             correspondances,
                                             alignments,
                                             fasta,
                                             regions=regions,
                                             global_fasta=region_fasta,
                                             fs=fs)
            trial_output.append(norm_ds)

            if calc_p:
                sim_ds_file = "{0}{1}_sim_norm_ds_no_anc_CG_only_macaque_big_context{2}_replace.txt{3}".format(
                    hit_file_prefix, site_number_suffix, matched_suff,
                    degen_suff)
                # the returned statistics are not written to the trial file
                p, low_CI, high_CI, sd, Z = get_sim_p(
                    norm_ds,
                    hit_file,
                    control_file,
                    correspondances,
                    alignments,
                    fasta,
                    n_sim,
                    reverse_site_numbers=reverse_site_numbers,
                    sim_ds_file=sim_ds_file)

            trial_output = "\t".join([str(i) for i in trial_output])
            trial_out.write(trial_output)
            trial_out.write("\n")

    remove_file(hit_phylip)
def main():
    """Write a BED file with the region around the TSS of each input gene.

    ``genes_file`` is treated as a BED file when its name ends in "bed", and
    as a list of gene symbols (first tab-separated field of each line)
    otherwise.

    * Gene symbols: the transcripts of the gene are looked up with ``grep``
      in a transcripts GTF derived from ``gtf``. The most common transcript
      start (end on the minus strand) is used as the TSS.
    * BED input: the TSS is taken from the start (end on the minus strand)
      of each record. A line whose first field is "chrom" is skipped.

    The region reaches ``start_coord`` bases upstream of the TSS and at most
    ``end_coord`` bases downstream, capped at the length of the feature.

    Raises:
        Exception: if a strand other than "+" or "-" is encountered.
    """
    description = "Write out a BED file with the region surrounding the TSS for a set of genes."
    args = hk.parse_arguments(
        description,
        ["genes_file", "gtf", "outfile", "start_coord", "end_coord"],
        ints=[3, 4])
    genes_file, gtf, outfile, start_coord, end_coord = args.genes_file, args.gtf, args.outfile, args.start_coord, args.end_coord

    need_to_seek = False
    if genes_file[-3:] != "bed":
        # it means that you got a list of gene symbols rather than a
        # BED file with coordinates
        need_to_seek = True
        transcript_file = "{0}_transcripts.gtf".format(gtf[:-4])
        co.get_transcripts(gtf,
                           transcript_file,
                           add_chr=False,
                           with_detail=False,
                           output_gtf=True)

    with open(genes_file) as gf, open(outfile, "w") as of:
        reader = csv.reader(gf, delimiter="\t")
        writer = csv.writer(of, delimiter="\t")
        for line in reader:
            if need_to_seek:
                gene = line[0]
                print(gene)
                # find every transcript record of this gene
                possibilities = hk.run_process([
                    "grep", "gene_symbol \"\"{0}\"\"".format(gene),
                    transcript_file
                ])
                # one list of fields per record; the element after the final
                # line break is dropped
                possibilities = [
                    i.split("\t") for i in possibilities.split("\n")[:-1]
                ]
                chrom = "chr{0}".format(possibilities[0][0])
                strand = possibilities[0][6]
                starts = [i[3] for i in possibilities]
                ends = [i[4] for i in possibilities]
                if strand == "+":
                    # most common transcript start, paired with the end of
                    # the first transcript that has that start
                    counts = Counter(starts)
                    start = int(counts.most_common()[0][0])
                    end = ends[starts.index(str(start))]
                    length = int(end) - start
                    new_end_coord = min(length, end_coord)
                    new_start = str(start - start_coord - 1)
                    new_end = str(start + (new_end_coord - 1))
                elif strand == "-":
                    # mirror image: most common transcript end
                    counts = Counter(ends)
                    end = int(counts.most_common()[0][0])
                    start = starts[ends.index(str(end))]
                    length = end - int(start)
                    new_end_coord = min(length, end_coord)
                    new_start = str(end - new_end_coord)
                    new_end = str(end + start_coord)
                else:
                    # without this branch the coordinates of the previous
                    # gene would be silently reused (or a NameError raised
                    # on the first gene)
                    raise Exception("Invalid strand!")
                new_line = [chrom, new_start, new_end, gene, ".", strand]
                writer.writerow(new_line)
            else:
                # skip the header line
                if line[0] != "chrom":
                    length = int(line[2]) - int(line[1])
                    curr_end_coord = min(length, end_coord)
                    if line[5] == "+":
                        new_start = str(int(line[1]) - start_coord)
                        new_end = str(int(line[1]) + curr_end_coord)
                    elif line[5] == "-":
                        new_start = str(int(line[2]) - curr_end_coord)
                        new_end = str(int(line[2]) + start_coord)
                    else:
                        raise Exception("Invalid strand!")
                    new_line = line.copy()
                    new_line[1] = new_start
                    new_line[2] = new_end
                    # to make it a BED6
                    new_line = new_line[:-2]
                    writer.writerow(new_line)
def main():
    """Pick roughly nucleotide-matched control sites for a set of motif hits.

    All inputs come from the command line (see the argument list passed to
    parse_arguments below). The function reads the motifs, sets up a
    Feature_Set for the dataset, optionally prepares the CpG dictionaries and
    then hands everything over to one of the two control-picking routines in
    nc (with or without replacement).
    """
    description = "Pick roughly nucleotide-matched control sites for a set of motif hits."
    argument_names = [
        "fasta", "genome", "features_file", "families_file", "dataset",
        "motifs_file", "run_number", "hit_file", "niter", "stepsize",
        "control_file", "error_file", "MSA_file_name_prefix",
        "anc_CG_file_name", "high_CG_file_name", "exclude_file",
        "brute_mapping", "verbose", "old_motif_format", "nonsyn_hits",
        "top_set_only", "remove_GT", "leave_CG", "remove_ancestral_CpG",
        "replacement_control", "macaque_anc", "remove_macaque_CpG",
        "big_tree", "pseudoCG", "comprehensive", "context", "prone_sites",
        "CG_gene_filter", "match_size", "raw", "regions",
    ]
    # everything from "brute_mapping" (index 16) onwards is a flag
    args = parse_arguments(description, argument_names, ints = [6, 8, 9],
                           flags = list(range(16, 36)))

    # input and output files
    fasta, genome, features_file = args.fasta, args.genome, args.features_file
    families_file, dataset, motifs_file = args.families_file, args.dataset, args.motifs_file
    hit_file, control_file, error_file = args.hit_file, args.control_file, args.error_file
    MSA_file_name_prefix = args.MSA_file_name_prefix
    anc_CG_file_name, high_CG_file_name = args.anc_CG_file_name, args.high_CG_file_name
    exclude_file = args.exclude_file
    # integer settings
    run_number, niter, stepsize = args.run_number, args.niter, args.stepsize
    # flags
    brute_mapping, verbose = args.brute_mapping, args.verbose
    old_motif_format, nonsyn_hits = args.old_motif_format, args.nonsyn_hits
    top_set_only, remove_GT, leave_CG = args.top_set_only, args.remove_GT, args.leave_CG
    remove_ancestral_CpG, remove_macaque_CpG = args.remove_ancestral_CpG, args.remove_macaque_CpG
    replacement_control, macaque_anc = args.replacement_control, args.macaque_anc
    big_tree, pseudoCG, comprehensive = args.big_tree, args.pseudoCG, args.comprehensive
    context, prone_sites = args.context, args.prone_sites
    CG_gene_filter, match_size = args.CG_gene_filter, args.match_size
    raw, regions = args.raw, args.regions

    # "None" arrives as a string from the command line
    if anc_CG_file_name == "None":
        anc_CG_file_name = None

    # motif data comes in one of two formats
    if old_motif_format:
        motifs = rw.read_names(motifs_file)[1:]
    else:
        # dictionary format: the values are flattened into one list of
        # unique motifs further down
        motifs = rw.read_motifs(motifs_file)
        # if you're doing RBP motifs and only want motifs that were found
        # to be enriched in Savisaar and Hurst 2017
        if top_set_only:
            summary_data = rw.read_many_fields("RBP/RBP_hg38_introncontaining_new.txt", "\t")
            summary_dict = list_to_dict(summary_data, 0, 4, floatify = True)
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1)}
        motifs = list(set(flatten(list(motifs.values()))))

    # set up the Feature_Set and, unless told to ignore them (used when
    # analyzing exon flanks/cores), attach the paralogous families to it
    fs = Feature_Set(features_file, genome)
    fs.set_dataset(dataset)
    if families_file == "None":
        conservation.find_families(fasta, "general/{0}".format(dataset))
        families_file = "general/{0}_families.txt".format(dataset)
    if families_file != "ignore":
        families = rw.read_families(families_file)
        fs.add_families(families)

    general_folder = "DFE/for_everybody"
    make_dir(general_folder)
    # fall back on the default location if no MSA prefix was given
    if MSA_file_name_prefix == "None":
        MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, dataset)

    # admin
    transcripts = fs.get_transcripts()
    CDSs = fs.get_CDS()
    lengths = fs.get_lengths(CDSs, CDS = True)

    # only consider genes that are not on the sex chromosomes
    sex_chromosomes = ["X", "Y"]
    chrom_dict = {name: transcripts[name][0] for name in transcripts
                  if transcripts[name][0] not in sex_chromosomes}
    chroms = list(set(chrom_dict.values()))

    # U2S is a dinucleotide-based substitution model, JC69 is
    # mononucleotide-based
    subst_model = "U2S" if context else "JC69"

    # names used in the MSA (there's a character restriction in the phylip
    # files so you can't use the full name)
    # NOTE(review): "h**o" looks like a masked "homo" - confirm against the
    # rest of the project before changing it
    if big_tree:
        clean_names = ["calli", "chloro", "gorilla", "h**o", "macaca", "pan", "papio", "pongo"]
        phylip_data = {"gorilla_gorilla": [], "callithrix_jacchus": [],
                       "papio_anubis": [], "chlorocebus_sabaeus": [],
                       "homo_sapiens": [], "pongo_abelii": [],
                       "macaca_mulatta": [], "pan_troglodytes": []}
    else:
        clean_names = ["h**o", "pan", "pongo", "macaca"]
        phylip_data = {"homo_sapiens": [], "pongo_abelii": [],
                       "macaca_mulatta": [], "pan_troglodytes": []}

    # the CpG dictionaries are only needed for the CpG-related filters
    if remove_ancestral_CpG or remove_macaque_CpG or CG_gene_filter:
        anc_CG_dict, macaque_CG_dict = get_CpG_dicts(
            CDSs, chroms, MSA_file_name_prefix, lengths, clean_names,
            phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs,
            macaque_anc = macaque_anc, pseudoCG = pseudoCG,
            comprehensive = comprehensive, subst_model = subst_model,
            regions = regions)
    else:
        anc_CG_dict, macaque_CG_dict = None, None

    # keyword arguments shared by both control-picking routines
    shared_settings = dict(
        family_seed = 5, CG_gene_filter = CG_gene_filter, niter = niter,
        verbose = verbose, brute_mapping = brute_mapping,
        stepsize = stepsize, write_errors = error_file, fs = fs,
        nonsyn_hits = nonsyn_hits, leave_CG = leave_CG,
        remove_ancestral_CpG = remove_ancestral_CpG,
        remove_macaque_CpG = remove_macaque_CpG, pseudoCG = pseudoCG,
        prone_sites = prone_sites, match_size = match_size)

    if replacement_control:
        # only the replacement routine is given raw and exclude_file
        nc.fit_control_pos_to_hits_replacement(
            fasta, motifs, run_number, hit_file, control_file, anc_CG_dict,
            macaque_CG_dict, raw = raw, exclude_file = exclude_file,
            **shared_settings)
    else:
        nc.fit_control_pos_to_hits_wrapper(
            fasta, motifs, run_number, hit_file, control_file, anc_CG_dict,
            macaque_CG_dict, **shared_settings)