def get_MSA_gene_list(coords, coords_file, method, species_set, version, query_species, MSA_file):
    '''
    Given a dictionary of lists of lists of CDS coordinates, retrieve the Compara MSAs.
    '''
    with open(coords_file, "w") as file:
        for trans in coords:
            for exon in coords[trans]:
                phase = exon[1]
                current_coords = exon[0]
                current_coords = [str(i) for i in current_coords]
                current_coords.append(str(phase))
                current_coords = "|".join(current_coords)
                file.write(current_coords)
                file.write("\n")
    remove_file(MSA_file)
    run_process(["perl", "MSA_list.pl", method, species_set, version, coords_file, query_species, MSA_file])
    with open(MSA_file) as file:
        string = "".join(file)
    string = re.sub("([a-z])\n([a-z])", "\\1\\2", string)
    with open(MSA_file, "w") as file:
        file.write(string)
def get_pairwise_alignment(coords, coords_file, query_species, other_species, version, output_file):
    '''
    Given a list of feature coordinates and two species, get the corresponding pairwise alignments from Compara.
    '''
    #write the coordinates to file in a way that can be read by the downstream perl script
    with open(coords_file, "w") as file:
        for feature in coords:
            feature = [str(i) for i in feature]
            feature = "|".join(feature)
            file.write(feature)
            file.write("\n")
    remove_file(output_file)
    #get the alignments from the database
    run_process(["perl", "pairwise_from_ensembl.pl", coords_file, query_species, other_species, output_file, version])
    #parse them from the output file produced by the perl script
    with open(output_file) as file:
        string = "".join(file)
    string = string.split("***")
    string = [(i.rstrip("\n")).lstrip("\n") for i in string]
    string = [i.split("|||") for i in string]
    string = [[j for j in i if len(j) > 0] for i in string]
    #get rid of cases where there are multiple GABs
    string = flatten([i for i in string if len(i) == 1])
    #write alignments to a pretty FASTA
    with open(output_file, "w") as file:
        for feature in string:
            temp = feature.split("\n")
            name = temp[0]
            name = name.split("|")
            antisense = False
            if name[6] == "-":
                antisense = True
            try:
                alignments = [temp[2], temp[3]]
                alignments = [i.split(" ") for i in alignments]
                #convert to upper case
                alignments = [([j for j in i if j][1]).upper() for i in alignments]
                #only keep alignments with no ambiguous bases in either sequence
                if "N" not in alignments[0] and "N" not in alignments[1]:
                    #reverse complement, if necessary
                    if antisense:
                        alignments = [str(Seq(i, IUPAC.unambiguous_dna).reverse_complement()) for i in alignments]
                    file.write(">{0}\n".format("|".join(name)))
                    file.write("|".join(alignments))
                    file.write("\n")
            except IndexError:
                pass
def run_bedtools(A_file, B_file, force_strand = False, force_opposite_strand = False, write_both = False, chrom = None, overlap = None, sort = False, no_name_check = False, no_dups = True, hit_number = False, output_file = None, intersect = False, bed_path = None, overlap_rec = None, intersect_bam = None, write_zero = None, write_bed = False, exclude = False):
    '''
    See intersect_bed for details.
    '''
    if write_zero:
        write_option = "-wao"
    elif hit_number:
        write_option = "-c"
    elif write_both:
        write_option = "-wo"
    else:
        write_option = "-wa"
    if sort:
        sort_bed(A_file, A_file)
        sort_bed(B_file, B_file)
    bedtools_args = ["bedtools", "intersect", "-a", A_file, "-b", B_file, write_option]
    if intersect:
        del bedtools_args[-1]
    if overlap:
        bedtools_args.extend(["-f", str(overlap)])
    if overlap_rec:
        bedtools_args.append("-r")
    if force_strand:
        bedtools_args.append("-s")
    elif force_opposite_strand:
        bedtools_args.append("-S")
    if no_name_check:
        bedtools_args.append("-nonamecheck")
    if no_dups:
        bedtools_args.append("-u")
    if chrom:
        print("Bedtools cannot be restricted to a single chromosome. Use bedops!")
        raise Exception
    if hit_number and no_dups:
        print("When counting hits, each interval in the first bed file is only reported once by default. Set no_dups to False!")
        raise Exception
    if bed_path:
        bedtools_args[0] = "{0}{1}".format(bed_path, bedtools_args[0])
    if exclude:
        bedtools_args.append("-v")
    if intersect_bam:
        if A_file[-4:] != ".bam":
            print("The first file must be a BAM file!")
            raise Exception
        if B_file[-4:] == ".bam":
            print("Only the first file can be a BAM file!")
            raise Exception
        bedtools_args = ["intersectBed", write_option, "-abam", A_file, "-b", B_file]
        if write_bed:
            bedtools_args.append("-bed")
    try:
        bedtools_output = hk.run_process(bedtools_args, file_for_output = output_file)
    except FileNotFoundError:
        bedtools_args[0] = "intersectBed"
        bedtools_output = hk.run_process(bedtools_args, file_for_output = output_file)
    return(bedtools_output)
def main(): description = "Run mDFEest with shuffled input to check the false positive rate." args = parse_arguments(description, [ "hits_file", "controls_file", "output_file", "n_sim", "SNP_file", "SNP_number", "hit_reduce", "control_reduce", "const_pop" ], ints=[3, 5], floats=[6, 7], flags=[8]) hits_file, controls_file, output_file, n_sim, SNP_file, SNP_number, hit_reduce, control_reduce, const_pop = args.hits_file, args.controls_file, args.output_file, args.n_sim, args.SNP_file, args.SNP_number, args.hit_reduce, args.control_reduce, args.const_pop with open(output_file, "w") as file: for sim in range(n_sim): print(sim) temp_hits_file = "temp_data/hits_file{0}.txt".format( random.random()) temp_controls_file = "temp_data/controls_file{0}.txt".format( random.random()) temp_input_file = "temp_data/input_file{0}.txt".format( random.random()) #shuffle hits and controls for negative control run_process([ "python3", "shuffle_hits_and_controls.py", hits_file, controls_file, temp_hits_file, temp_controls_file, hit_reduce, control_reduce ]) #generate multiDFEest input file run_process([ "python3", "mDFEest_input.py", temp_hits_file, temp_controls_file, SNP_file, SNP_number, temp_input_file ]) output = mDFEest("beta", temp_input_file, pop_change=True) print(output) print(output["Nes_0.0_0.1"]) print(output["Nes_0.1_1.0"]) file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"], output["Nes_0.1_1.0"])) #if you also want to run with fixed population size if const_pop: output = mDFEest("beta", temp_input_file, pop_change=False) file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"], output["Nes_0.1_1.0"])) file.write("\n") remove_file(temp_hits_file) remove_file(temp_controls_file) remove_file(temp_input_file)
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''
    #This is done via a temp file because that way you can specify the same file as input
    #and output file and thus overwrite the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    hk.run_process(["sort-bed", input_file_name], file_for_output = temp_file_name)
    hk.run_process(["mv", temp_file_name, output_file_name])
    hk.remove_file(temp_file_name)
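#Example (a minimal usage sketch with a hypothetical file name): sort a BED
#file in place by passing the same path as input and output:
#    sort_bed("temp_data/exons.bed", "temp_data/exons.bed")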
def get_coverage(regions_file, reads_file, output_file_name):
    """
    Given a BED file with regions and a BED/BAM file of reads, count how many reads
    overlap each region and output in a new BED file.
    :param regions_file: BED file
    :param reads_file: BED/BAM file
    :param output_file_name: name for output BED file
    :return: None
    """
    hk.run_process(["bedtools", "coverage", "-a", regions_file, "-b", reads_file, "-counts", "-s"],
                   file_for_output=output_file_name)
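#Example (hypothetical file names): strand-specifically count NET-seq reads
#overlapping each exon; the counts are appended as an extra column:
#    get_coverage("exons.bed", "NET_seq_reads.bed", "exon_read_counts.bed")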
def main(): description = "Given a BED file of reads, filter out reads whose " \ "3' end maps to the last nucleotide of an intron or" \ "the last nucleotide of an exon." args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"]) reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile print("Getting intron lariat positions...") # read in exon coordinates exons = rw.read_gtf(gtf, element="exon", gene=False) # make a BED file with the last positions of introns intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4]) co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True) # intersect the reads with intron lariat positions intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4]) co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name) hk.remove_file(intron_lariat_bed) intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4]) # check that the reads end exactly at intron lariat positions check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file) hk.remove_file(intron_lariat_intersect_file_name) # write BED with the last positions of exons splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4]) co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True) print("Getting splice intermediate positions.") # intersect the reads with splice intermediate positions splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4]) co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name) hk.remove_file(splice_intermediate_bed) SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4]) # check that the reads end exactly at the end of the exon check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file) hk.remove_file(splice_intermediate_intersect_file_name) print("Concatenating the two files.") # concatenate the IL and SI read files so you could exclude both in one go combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4]) hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file) hk.remove_file(SI_reads_file) hk.remove_file(intron_lariat_reads_file) # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the # putative intron lariat reads from the main reads file co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile) hk.remove_file(combined_file)
def main():
    # Get arguments.
    description = "Check if nucleotide composition at the 5' ends of NET-seq reads is biased."
    args = hk.parse_arguments(description, ["input_file", "output_file", "genome_fasta", "gtf", "three_prime"], flags=[4])
    input_file, output_file, genome_fasta, gtf, three_prime = args.input_file, args.output_file, args.genome_fasta, args.gtf, args.three_prime

    # Convert to .bed, if not already .bed
    if input_file[-3:] != "bed":
        print("Converting input file to .bed...")
        input_file_new_name = "{0}bed".format(input_file[:-3])
        hk.convert2bed(input_file, input_file_new_name)
        input_file = input_file_new_name

    # Make an extended version of each read that extends 5 nt 5' and 35 nt 3'
    print("Extending the reads...")
    suffix = ""
    if three_prime:
        suffix = "_three_prime"
    temp_bed = "{0}_extended_for_bias{1}.bed".format(input_file[:-4], suffix)
    co.extend_intervals(input_file, temp_bed, 5, 35, remove_chr=True, add_chr=False, three_prime=three_prime)

    # Make a FASTA file from the BED file.
    print("Extracting sequences...")
    fasta_name = "{0}fasta".format(temp_bed[:-3])
    hk.run_process(["fastaFromBed", "-bed", temp_bed, "-fi", genome_fasta, "-fo", fasta_name, "-s"])
    print("Number of lines in FASTA:")
    print(hk.run_process(["wc", "-l", fasta_name]))

    # Store the sequences at -5:+5 and 30:40 in a 2D array
    print("Storing sequences in arrays...")
    occ_mat_true, occ_mat_control = extract_true_and_control_string(fasta_name, (0, 10), (30, 40))

    # Make a PPM for each matrix
    bases = ["A", "T", "C", "G"]
    print("Making PPMs...\n")
    print("TRUE:")
    PPM_wrapper(occ_mat_true, bases, "{0}.true".format(output_file))
    print("CONTROL:")
    PPM_wrapper(occ_mat_control, bases, "{0}.control".format(output_file))
def merge_bed(in_bed, out_bed, distance):
    """
    Strand-specifically merge a BED file. Sorts the input BED file first.
    :param in_bed: input file name
    :param out_bed: output file name
    :param distance: maximum distance between two elements that are to be merged
    :return: None
    """
    sorted_bed = hk.run_process(["sortBed", "-i", in_bed])
    # the -c and the -o are so that the strand would end up in the right column
    hk.run_process(["mergeBed", "-s", "-d", distance, "-c", "4,5,6", "-o", "distinct,distinct,distinct"],
                   input_to_pipe=sorted_bed, file_for_output=out_bed)
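#Example (hypothetical file names): merge same-strand elements that lie within
#5 nt of each other:
#    merge_bed("peaks.bed", "peaks_merged.bed", 5)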
def read_gtf(file_name, element, gene=False, filter_parameter=None, filter_value=None):
    '''
    Read in all rows that contain coordinates of type _element_ from a gtf file.
    Will make a dictionary with either gene or transcript IDs as keys (depending on
    what _gene_ is set to). Will also convert start and end coordinates to integers.
    If filter_parameter and filter_value have been specified, only lines with the
    specified value for that parameter will be returned.
    '''
    filter_pattern = ""
    if filter_parameter:
        if not filter_value:
            raise Exception("If filter_parameter has been specified, then filter_value must be too!")
        filter_pattern = "{0} \"{1}\"".format(filter_parameter, filter_value)
    output = {}
    if gene:
        pattern = re.compile("(?<=gene_id \")[\d\w]*")
    else:
        pattern = re.compile("(?<=transcript_id \")[\d\w]*")
    # check if you're on Linux or Mac to know whether you need the -P flag
    platform = sys.platform
    if platform == "linux" or platform == "linux2":
        relevant_lines = hk.run_process(["grep", "-P", r"\t{0}\t".format(element), file_name]).rstrip("\n").split("\n")
    elif platform == "darwin":
        relevant_lines = hk.run_process(["grep", r"\t{0}\t".format(element), file_name]).rstrip("\n").split("\n")
    for line in relevant_lines:
        line = line.split("\t")
        if len(line) > 1:
            # if you need to filter by parameter
            if filter_pattern in line[8]:
                # convert start and end coordinates to integers
                line[3] = int(line[3])
                line[4] = int(line[4])
                # get identifier (transcript or gene)
                idn = re.search(pattern, line[8]).group()
                if idn not in output:
                    output[idn] = []
                output[idn].append(line)
    return(output)
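#Example (hypothetical file name): read in all exon lines of protein-coding
#genes, keyed by transcript ID:
#    exons = read_gtf("annotation.gtf", "exon", gene=False,
#                     filter_parameter="gene_biotype", filter_value="protein_coding")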
def get_control_sites(fasta, genome, feature_set, families_file, dataset, temp_motifs_file, hit_file, control_file, error_file, anc_CG_file, high_CG_file, flags):
    '''
    Given motifs and sequences, pick control sites using the optimization method.
    '''
    arguments = ["python3", "pick_control_sites.py", fasta, genome, feature_set, families_file, dataset, temp_motifs_file, 10, hit_file, 500, 10, control_file, error_file, "None", anc_CG_file, high_CG_file, "--old_motif_format"]
    arguments = arguments + flags
    run_process(arguments)
def get_MSA(coords, method, species_set, query_species, version, force_strand = True):
    '''
    Get the genome alignments that overlap a particular sequence region.
    '''
    reverse = False
    if coords[6] == "-" and force_strand:
        reverse = True
    MSA = run_process(["perl", "MSA.pl", method, species_set, version, coords[0], coords[2], coords[3], query_species])
    MSA = MSA.split("|||")
    MSA = [i.split(">") for i in MSA if i]
    MSA = [[j.split("\n") for j in i if j] for i in MSA]
    MSA_dict = {}
    for gab in MSA:
        for species in gab:
            name = species[0]
            temp_name = name.split("/")
            true_name = temp_name[0]
            species_coords = "-".join(temp_name[1:])
            if true_name not in MSA_dict:
                MSA_dict[true_name] = {}
            current_seq = "".join(species[1:]).upper()
            if reverse:
                current_seq = Seq(current_seq, IUPAC.unambiguous_dna)
                current_seq = current_seq.reverse_complement()
                current_seq = str(current_seq)
            MSA_dict[true_name][species_coords] = current_seq
    return(MSA_dict)
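#Example (a sketch only; the coordinate list is assumed to follow the format
#used elsewhere in this module, with the chromosome in field 0, start/end in
#fields 2/3 and the strand in field 6; the method/species set/version values
#are hypothetical):
#    coords = ["1", "ENST00000000001", 11869, 12227, ".", ".", "+"]
#    MSA_dict = get_MSA(coords, "EPO", "primates", "homo_sapiens", 85)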
def get_pp(outroot, subst_model, phy_file, model_file, separate_to_concat_mapping, combined_dict, tuples_mapping, min_inf = None, parse_output = True):
    '''
    Get prior probabilities for all the bases at the different sites in an MSA.
    Note that for phyloFit these are posterior probabilities but they are priors for INSIGHT.
    '''
    #you don't want to compute a tree, just get the posterior probabilities for an existing tree,
    #hence all the flags from --post-probs onwards
    arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                 "--msa-format", "PHYLIP", "--post-probs", "--scale-only", "--no-rates", "--no-freqs", phy_file]
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    #parse into a convenient dictionary
    if parse_output:
        pp_file = "{0}.postprob".format(outroot)
        pp = rw.read_many_fields(pp_file, " ")
        pp = [[j for j in i if j] for i in pp]
        #the outgroup nodes are labelled from the inside out, starting from 1
        pp = {i[1]: i[-4:] for i in pp}
        pp_final = {}
        #map from coordinates in the concatenated alignment to positions in individual CDSs
        for trans in separate_to_concat_mapping:
            pp_final[trans] = {}
            for position in combined_dict[trans]:
                pp_final[trans][position] = pp[tuples_mapping[separate_to_concat_mapping[trans][position]]]
        return(pp_final)
def MSA_names(method, species_set, version):
    '''
    Given a Compara WGA method, a species set name and an ensembl db version,
    get the names of all the species in the set.
    '''
    names = run_process(["perl", "MSA_names.pl", method, species_set, version])
    names = names.rstrip(",")
    names = names.split(",")
    return(names)
def intersect_bed(bed_file1, bed_file2, use_bedops = False, overlap = False, overlap_rec = False, write_both = False, sort = False, output_file = None, force_strand = False, force_opposite_strand = False, no_name_check = False, no_dups = True, chrom = None, intersect = False, hit_count = False, bed_path = None, intersect_bam=None, write_zero = False, write_bed = False, exclude = False):
    '''
    Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1
    (EX: 0.8 means that the overlap has to be at least 80% of the intervals in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2
    be at least as high as the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged onto the end,
    also the interval from bed file 2 that it overlaps (only valid when using bedtools).
    exclude: if True, report intervals that DON'T overlap
    sort: sort bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same strand (only valid with bedtools)
    force_opposite_strand: if True, check that the feature and the interval are on OPPOSITE strands
    no_name_check: if set to False, checks whether the chromosome names are the same in the two bed files (only valid with bedtools)
    no_dups: if True, only returns each interval once. If set to False, intervals in bed file 1 that
    overlap several intervals in bed file 2 will be returned several times
    (as many times as there are overlaps with different elements in bed file 2)
    chrom: limit search to a specific chromosome (only valid with bedops, can help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of the interval
    that overlaps an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it overlaps in bed file 2 (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires the bam file to be passed first.
    write_zero: like write_both but also write A intervals that don't overlap with any B intervals.
    write_bed: when intersecting a bam file, write output as bed.
    '''
    if force_strand and force_opposite_strand:
        raise Exception("force_strand and force_opposite_strand can't both be True")
    hk.make_dir("temp_data/")
    #have it write the output to a temporary file
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    if use_bedops:
        bedtools_output = run_bedops(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, no_dups = no_dups, intersect_bam = intersect_bam, overlap_rec = overlap_rec)
    else:
        bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, no_name_check, no_dups, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, bed_path = bed_path, intersect_bam = intersect_bam, write_zero = write_zero, overlap_rec = overlap_rec, write_bed = write_bed, exclude = exclude)
    #move it to a permanent location only if you want to keep it
    if output_file:
        hk.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = rw.read_many_fields(temp_file_name, "\t")
    hk.remove_file(temp_file_name)
    return(bedtools_output)
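#Example usage (hypothetical file names; a minimal sketch): report each read
#that overlaps an exon on the same strand, writing the result to a file:
#    intersect_bed("reads.bed", "exons.bed", force_strand=True,
#                  output_file="reads_in_exons.bed")
#Omit output_file to get the overlapping intervals back as a list of fields
#instead:
#    overlapping = intersect_bed("reads.bed", "exons.bed", force_strand=True)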
def chi_test(observed, expected):
    '''
    Given a series of observed and expected values, conduct a chi-squared test.
    '''
    observed = ",".join([str(i) for i in observed])
    expected = ",".join([str(i) for i in expected])
    string_to_R = "|".join([observed, expected])
    output = run_process(["Rscript", "R_scripts/chi_test.r", string_to_R])
    output = output.split(" ")
    output = [i for i in output if i != ""]
    chi = float((output[1].lstrip("\"")).rstrip("\""))
    p = float(((output[4].lstrip("\"")).rstrip("\n")).rstrip("\""))
    return({"chi": chi, "p": p})
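#Example with toy counts: test whether the observed counts deviate from the
#expected ones:
#    result = chi_test([90, 110], [100, 100])
#    print(result["chi"], result["p"])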
def fishers_exact_test(observed, expected):
    '''
    Perform a Fisher's exact test on an observed and an expected proportion.
    '''
    string_to_R = ",".join([str(observed[0]), str(observed[1]), str(expected[0]), str(expected[1]), "greater"])
    results = run_process(["Rscript", "R_scripts/fisher_test.r", string_to_R])
    results = results.rstrip("\"\n")
    #sometimes there's a space between the quotation marks and the newline
    results = results.rstrip("\" \n")
    results = results.split(" ")
    results = [(i.rstrip("\"")).lstrip("\"") for i in results if i != ""]
    results = results[1:]
    results = [float(i) if "Inf" not in i else i for i in results]
    return(results)
def correct_multiple_testing(p_values, method):
    '''
    Given a list of p-values, correct them for multiple testing.
    '''
    p_values = [str(i) for i in p_values]
    p_values.append(method)
    string_to_R = ",".join(p_values)
    corrected_values = run_process(["Rscript", "R_scripts/holm_correct.r", string_to_R])
    corrected_values = re.findall("[\d\.]*", corrected_values, re.MULTILINE)
    corrected_values = [float(i) for i in corrected_values if "." in i]
    #the method string was appended to p_values above, hence the - 1
    if (len(p_values) - 1) != len(corrected_values):
        print("Problem correcting for multiple comparisons!")
        print(p_values)
        print(corrected_values)
        sys.exit()
    return(corrected_values)
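#Example: correct three p-values for multiple testing (the method string is
#passed through to the R script, so presumably any method that R's p.adjust
#accepts, e.g. "holm"):
#    corrected = correct_multiple_testing([0.01, 0.04, 0.3], "holm")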
def wilcoxon_signed_rank_test(vector1, vector2, alt):
    '''
    Perform a Wilcoxon signed-rank test to compare two paired samples.
    The alternative must be one of "greater", "less" and "two.tailed".
    '''
    vector1 = ",".join([str(i) for i in vector1])
    vector2 = ",".join([str(i) for i in vector2])
    vectors = "|".join([vector1, vector2])
    string_to_R = "_".join([vectors, alt])
    results = run_process(["Rscript", "R_scripts/wilcoxon_signed_rank_test.r", string_to_R])
    results = results.rstrip("\n")
    results = results.split(" ")
    results = [i for i in results if i != ""]
    results = (results[1].rstrip("\"")).lstrip("\"")
    results = float(results)
    return(results)
def run_bedops(A_file, B_file, force_strand = False, force_opposite_strand = False, write_both = False, chrom = None, overlap = None, sort = False, output_file = None, intersect = False, hit_number = None, no_dups = False, overlap_rec = None, intersect_bam = None):
    '''
    See intersect_bed for details.
    '''
    if intersect:
        command = "--intersect"
    else:
        command = "--element-of"
    if sort:
        sort_bed(A_file, A_file)
        sort_bed(B_file, B_file)
    bedops_args = ["bedops", "--chrom", "foo", command, "1", A_file, B_file]
    if overlap:
        bedops_args[4] = overlap
    if chrom:
        bedops_args[2] = chrom
        if intersect:
            del bedops_args[4]
    else:
        del bedops_args[1:3]
        if intersect:
            del bedops_args[2]
    if force_strand:
        print("Bedops can't search by strand! Either use bedtools or separate input data by strand!")
        raise Exception
    if force_opposite_strand:
        print("Bedops can't search by strand! Either use bedtools or separate input data by strand!")
        raise Exception
    if write_both:
        print("Bedops can't write both features!")
        raise Exception
    if hit_number:
        print("Bedops hasn't been set up to count the number of overlapping elements. Use bedtools!")
        raise Exception
    if no_dups:
        print("Bedops doesn't print duplicates by default!")
    if overlap_rec:
        print("Bedops hasn't been set up to filter by overlap in the second file!")
    if intersect_bam:
        print("Use bedtools to intersect bam and bed!")
        raise Exception
    bedops_output = hk.run_process(bedops_args, file_for_output = output_file)
    return(bedops_output)
def get_lambda(lambda_file_outroot, phy_file, subst_model, min_inf = None):
    '''
    Calculate the lambda input parameter for INSIGHT.
    '''
    lambda_file = "{0}.mod".format(lambda_file_outroot)
    #to make sure you catch it if the phyloFit process fails
    remove_file(lambda_file)
    #from UCSC
    tree_file = "DFE/UCSC_model.mod"
    #subst_model is JC69, for instance
    #scale-only, because you don't want it to estimate a new tree, just to scale the whole thing
    arguments = ["phyloFit", "--init-model", tree_file, "--out-root", lambda_file_outroot, "--subst-mod", subst_model,
                 "--msa-format", "PHYLIP", "--scale-only", phy_file]
    #must be set to False for testing
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    with open(lambda_file) as file:
        lambda_b = file.read()
    lambda_b = re.findall(lambda_regex, lambda_b)[0]
    return(lambda_b)
def fishers_exact_test_enrichment(element, sample, population, alt):
    '''
    Perform a Fisher's exact test to check whether a given element is enriched
    in a sample when compared to a population.
    '''
    N = len(population)
    n = len(sample)
    if len(sample) >= len(population):
        print("The sample has to be smaller than the population!")
        raise Exception
    K = population.count(element)
    k = sample.count(element)
    string_to_R = ",".join([str(k), str(n - k), str(K), str(N - K), alt])
    results = run_process(["Rscript", "R_scripts/fisher_test.r", string_to_R])
    results = results.rstrip("\"\n")
    #sometimes there's a space between the quotation marks and the newline
    results = results.rstrip("\" \n")
    results = results.split(" ")
    results = [(i.rstrip("\"")).lstrip("\"") for i in results if i != ""]
    results = results[1:]
    results = [float(i) if "Inf" not in i else i for i in results]
    return(results)
def get_ancestral_CG(outroot, subst_model, phy_files, model_file, tuples_mapping_dict, anc_CG_file_name, high_CG = None, min_inf = None, macaque = False, comprehensive = False, from_model = False):
    '''
    Get a dictionary that says for each transcript which positions were ancestrally CpG/GpC.
    '''
    #if a file name hasn't been supplied or if the file with the supplied name doesn't exist,
    #determine CpG positions again, otherwise just read them in from the file
    if not anc_CG_file_name or anc_CG_file_name == "None" or not os.path.exists(anc_CG_file_name):
        #you need several in case you have a high_CG dictionary
        pps = []
        for phy_file in phy_files:
            if subst_model == "JC69" or from_model:
                #use an existing substitution model
                arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                             "--msa-format", "PHYLIP", "--post-probs", "--scale-only", phy_file]
            else:
                #estimate a new model
                arguments = ["phyloFit", "--out-root", outroot, "--subst-mod", subst_model,
                             "--msa-format", "PHYLIP", "--tree", "DFE/full_tree.tree", "--post-probs", phy_file]
            if subst_model == "JC69":
                block_size = 4
                tuple_pos_lim = 2
                shift_in_tuple = 0
            else:
                #for dinucleotide models
                block_size = 16
                tuple_pos_lim = 3
                shift_in_tuple = 9
            #turn off when testing
            if min_inf:
                arguments.extend(["-I", min_inf])
            results = run_process(arguments)
            #read in posterior probabilities of having various nucleotides ancestrally
            pp_file = "{0}.postprob".format(outroot)
            pp = rw.read_many_fields(pp_file, " ")
            pp = [[j for j in i if j] for i in pp]
            pp = pp[2:]
            #the posterior probability that you had a CpG at a position has to be greater
            #than this threshold for a position to be counted as ancestrally CpG
            threshold = 0.5
            #will be over-written if you're doing big tree
            human_pos = 0
            #the outgroup nodes are labelled from the outside in, starting from 1
            if macaque:
                #this is to know whether we're doing big tree or little tree
                if len(pp[0]) == 14:
                    #little tree, mononucleotide
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (3 * block_size): len(i) - (2 * block_size)]] for i in pp}
                elif len(pp[0]) > 14:
                    #big tree/dinucleotide (i.e. it'll give you nonsense if you're trying to do context with the little tree)
                    #the shift_in_tuple is to do with the fact that if you're doing U2S,
                    #you want the second tuple and not the first
                    human_pos = 3 + shift_in_tuple
                    if comprehensive:
                        #you want to get all nodes except for node 0, which is the outgroup-ingroup ancestor
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (j * block_size): len(i) - ((j - 1) * block_size)] for j in range(1, 7)] for i in pp}
                    else:
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (6 * block_size): len(i) - (5 * block_size)]] for i in pp}
                else:
                    #for tests etc. where you might only have, say, two species
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            else:
                pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            pps.append(pp)
        anc_CG = {}
        #just to get the length
        example_pp = pps[0][list(pps[0].keys())[0]]
        for trans in tuples_mapping_dict:
            #tuples_mapping_dict has the alignment tuple corresponding to each position
            #because the phyloFit output is organized by tuples, not by positions
            anc_CG[trans] = []
            for node_pos in range(len(example_pp)):
                #if you're using dinucleotides
                if subst_model != "JC69":
                    for pos in sorted(tuples_mapping_dict[trans].keys())[1:]:
                        try:
                            pp_number = 0
                            #if you're gonna produce different output dictionaries for high and low GC regions
                            if high_CG:
                                if pos in high_CG[trans]:
                                    pp_number = 1
                            current_tuple = tuples_mapping_dict[trans][pos]
                            #don't consider positions where there is an alignment gap for human
                            if current_tuple[human_pos] != "*":
                                if current_tuple in pps[pp_number]:
                                    current_pp = pps[pp_number][current_tuple][node_pos]
                                else:
                                    current_pp = pps[abs(pp_number - 1)][current_tuple][node_pos]
                                #because it can be either GC or CG, hence 6 or 9
                                if float(current_pp[6]) > threshold or float(current_pp[9]) > threshold:
                                    #you're always testing the second member in the dinucleotide
                                    anc_CG[trans].append(pos - 1)
                                    anc_CG[trans].append(pos)
                        except KeyError:
                            if pos % 100 == 0:
                                pass
                            else:
                                raise KeyError
                else:
                    #if you're using mononucleotides, you have to keep track of what the previous nucleotide was
                    C_prev = False
                    G_prev = False
                    for pos in sorted(tuples_mapping_dict[trans].keys()):
                        pp_number = 0
                        if high_CG:
                            if pos in high_CG[trans]:
                                pp_number = 1
                        current_C = False
                        current_G = False
                        current_tuple = tuples_mapping_dict[trans][pos]
                        if current_tuple[human_pos] != "*":
                            current_pp = pps[pp_number][current_tuple][node_pos]
                            #if current is C and previous was G
                            if float(current_pp[1]) > threshold:
                                if G_prev:
                                    anc_CG[trans].append(G_pos)
                                    anc_CG[trans].append(pos)
                                current_C = True
                            #if current is G and previous was C
                            if float(current_pp[2]) > threshold:
                                if C_prev:
                                    anc_CG[trans].append(C_pos)
                                    anc_CG[trans].append(pos)
                                current_G = True
                        C_prev = False
                        G_prev = False
                        if current_C:
                            C_prev = True
                            #you need to specify the position explicitly because it's not necessarily
                            #the last one if there were dashes
                            C_pos = pos
                        if current_G:
                            G_prev = True
                            G_pos = pos
            anc_CG[trans] = sorted(list(set(anc_CG[trans])))
        remove_file(pp_file)
        if anc_CG_file_name and anc_CG_file_name != "None":
            with open(anc_CG_file_name, "w") as file:
                for trans in anc_CG:
                    to_write = "\t".join([trans, ",".join([str(i) for i in anc_CG[trans]])])
                    file.write(to_write)
                    file.write("\n")
    else:
        #parse the existing file
        anc_CG = rw.read_many_fields(anc_CG_file_name, "\t")
        anc_CG = [i for i in anc_CG if len(i) == 2]
        anc_CG = list_to_dict(anc_CG, 0, 1)
        anc_CG = {i: [int(j) for j in anc_CG[i].split(",") if j != ""] for i in anc_CG}
    return(anc_CG)
def MSA_filter_by_anatomy(input_file, output_file, version):
    '''
    Given an output file from get_MSA_concat_list, filter the CDSs based on
    whether the exon coordinates have been conserved.
    '''
    run_process(["perl", "MSA_CDSs.pl", version, input_file, output_file])
def main(): description = "Directly compare the frequency of segregating sites/mean allele frequency between hits and controls." args = parse_arguments(description, [ "hit_file", "control_file", "INSIGHT_hit_file", "INSIGHT_control_file", "SFS_file", "trial_file", "trials", "shuffle" ], ints=[6], flags=[7]) hit_file, control_file, INSIGHT_hit_file, INSIGHT_control_file, SFS_file, trial_file, trials, shuffle = args.hit_file, args.control_file, args.INSIGHT_hit_file, args.INSIGHT_control_file, args.SFS_file, args.trial_file, args.trials, args.shuffle true_hits = rw.read_pos(hit_file) true_controls = rw.read_pos(control_file) #to store the original data in case this is a negative control and you will be shuffling #hits and controls original_INSIGHT_hit_file = INSIGHT_hit_file original_INSIGHT_control_file = INSIGHT_control_file print(hit_file) with open(trial_file, "w") as file: file.write( "trial\tpoly_fraction_hits - poly_fraction_controls\tmedian_hit_MAF - median_control_MAF\n" ) for trial in range(trials): to_write = "{0}\t".format(trial) #if this is a negative control if shuffle: INSIGHT_hit_file = re.sub("_0_", "_{0}_".format(trial), original_INSIGHT_hit_file) INSIGHT_control_file = re.sub("_0_", "_{0}_".format(trial), original_INSIGHT_control_file) temp_hits_file = "temp_data/temp_hits{0}.txt".format( random.random()) temp_controls_file = "temp_data/temp_controls{0}.txt".format( random.random()) #shuffle hits and controls temp_hits, temp_controls = shuffle_dictionaries( true_hits, true_controls) rw.write_pos(temp_hits, temp_hits_file) rw.write_pos(temp_controls, temp_controls_file) SFS_file = "temp_data/temp_SFS_file{0}.txt".format( random.random()) #generate an ISNIGHT input file that you could then use for the manual analysis run_process([ "python3", "mDFEest_input.py", temp_hits_file, temp_controls_file, "general/1000genomes/filtered_hg38_85_pc_multiexon_Yoruban_SNPs_relative.txt", 216, SFS_file ]) remove_file(temp_hits_file) remove_file(temp_controls_file) hit_data = get_data(INSIGHT_hit_file) control_data = get_data(INSIGHT_control_file) poly_ratio_diff = get_chisq_site_freq(hit_data, control_data) to_write = to_write + "{0}\t".format(poly_ratio_diff) temp, median_diff = get_mean_freq(SFS_file) to_write = to_write + "{0}\n".format(median_diff) if shuffle: remove_file(SFS_file) file.write(to_write)
def main(): description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides" \ "as the true set." args = hk.parse_arguments(description, [ "active_genes_file", "gtf", "PolII_file", "fasta", "outfile", "chrom_sizes" ]) active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t") chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True) # get transcriptionally active genes and make a BED file with their coordinates print("Getting the coordinates of transcriptionally active genes...") trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:] trans_active_genes = [i[3] for i in trans_active_genes] transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4]) co.get_transcripts(gtf, transcripts_file, add_chr=True) transcripts_dict = {} # this will be used for getting the k-mers in the transcripts filtered_transcripts_file_plus2 = "{0}_trans_act_only_plus3.bed".format( transcripts_file[:-4]) # this will be used for filtering the reads filtered_transcripts_file = "{0}_trans_act_only.bed".format( transcripts_file[:-4]) with open(filtered_transcripts_file, "w") as ft_file, open(transcripts_file) as t_file, open( filtered_transcripts_file_plus2, "w") as ft_file2: reader = csv.reader(t_file, delimiter="\t") writer = csv.writer(ft_file, delimiter="\t") writer2 = csv.writer(ft_file2, delimiter="\t") for line in reader: if line[3] in trans_active_genes: # if line[0][0] not in ["G", "K"]: # line[0] = "chr{0}".format(line[0]) writer.writerow(line) # this is because if a read falls at the first position, you will need to know the # preceding two bases. Same if it falls at the last position. 
line[1] = str((int(line[1])) - 3) line[2] = str((int(line[2])) + 3) writer2.writerow(line) transcripts_dict[line[3]] = line print("Filtering reads to the transcripts...") # filter reads to only ones that overlap these transcripts transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4]) co.intersect_bed(PolII_file, filtered_transcripts_file, force_strand=True, output_file=transcripts_PolII) print("Extracting FASTA from the transcript coordinates...") # the genome FASTA is formatted as N rather than chrN filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format( transcripts_file[:-4]) hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus2], file_for_output=filtered_transcripts_file_no_chr) filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format( transcripts_file[:-4]) hk.run_process([ "bedtools", "getfasta", "-fi", fasta, "-bed", filtered_transcripts_file_no_chr, "-fo", filtered_transcripts_fasta_no_chr, "-s", "-name" ]) print("Mapping kmers to transcript positions...") kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr, k=6, focal_pos=3) print("Extracting the starting dinucleotide for each read...") starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format( PolII_file[:-4]) starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format( PolII_file[:-4]) co.extend_intervals(transcripts_PolII, starting_dints_PolII, 3, 3, remove_chr=True) hk.run_process([ "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII, "-fo", starting_dints_PolII_fasta, "-s" ]) print("Picking random control positions...") pick_random_positions(transcripts_PolII, starting_dints_PolII_fasta, outfile, kmer_dict, transcripts_dict, chrom_sizes=chrom_sizes) print("Making single nucleotide resolution file...") snr_file = "{0}_snr.bed".format(outfile[:-4]) co.snr_bed(outfile, snr_file) print( "Removing reads that overlap potential splice intermediate positions..." ) no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4]) co.intersect_bed(snr_file, "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf", force_strand=True, exclude=True, no_dups=False)
def main(): description = "Write out a BED file with the region surrounding the TSS for a set of genes." args = hk.parse_arguments( description, ["genes_file", "gtf", "outfile", "start_coord", "end_coord"], ints=[3, 4]) genes_file, gtf, outfile, start_coord, end_coord = args.genes_file, args.gtf, args.outfile, args.start_coord, args.end_coord need_to_seek = False if genes_file[-3:] != "bed": # it means that you got a list of gene symbols rather than a # BED file with coordinates need_to_seek = True transcript_file = "{0}_transcripts.gtf".format(gtf[:-4]) co.get_transcripts(gtf, transcript_file, add_chr=False, with_detail=False, output_gtf=True) with open(genes_file) as gf, open(outfile, "w") as of: reader = csv.reader(gf, delimiter="\t") writer = csv.writer(of, delimiter="\t") for line in reader: if need_to_seek: gene = line[0] print(gene) possibilities = hk.run_process([ "grep", "gene_symbol \"\"{0}\"\"".format(gene), transcript_file ]) possibilities = [ i.split("\t") for i in possibilities.split("\n")[:-1] ] chrom = "chr{0}".format(possibilities[0][0]) strand = possibilities[0][6] starts = [i[3] for i in possibilities] ends = [i[4] for i in possibilities] if strand == "+": counts = Counter(starts) start = int(counts.most_common()[0][0]) end = ends[starts.index(str(start))] length = int(end) - start new_end_coord = min(length, end_coord) new_start = str(start - start_coord - 1) new_end = str(start + (new_end_coord - 1)) elif strand == "-": counts = Counter(ends) end = int(counts.most_common()[0][0]) start = starts[ends.index(str(end))] length = end - int(start) new_end_coord = min(length, end_coord) new_start = str(end - new_end_coord) new_end = str(end + start_coord) new_line = [chrom, new_start, new_end, gene, ".", strand] writer.writerow(new_line) else: if line[0] != "chrom": length = int(line[2]) - int(line[1]) curr_end_coord = min(length, end_coord) if line[5] == "+": new_start = str(int(line[1]) - start_coord) new_end = str(int(line[1]) + curr_end_coord) elif line[5] == "-": new_start = str(int(line[2]) - curr_end_coord) new_end = str(int(line[2]) + start_coord) else: raise Exception("Invalid strand!") new_line = line.copy() new_line[1] = new_start new_line[2] = new_end # to make it a BED6 new_line = new_line[:-2] writer.writerow(new_line)
def main(): description = "Run mDFEest." args = parse_arguments(description, ["hit_file", "control_file", "SNP_file", "SNP_number", "input_file", "output_file", "seed", "fixed_model", "new_input", "shuffle", "fix_pop_change"], ints = [3], flags = [8, 9, 10]) hit_file, control_file, SNP_file, SNP_number, input_file, output_file, seed, fixed_model, new_input, shuffle, fix_pop_change = args.hit_file, args.control_file, args.SNP_file, args.SNP_number, args.input_file, args.output_file, args.seed, args.fixed_model, args.new_input, args.shuffle, args.fix_pop_change #if you want to generate a new input file rather than reading in an existing one if new_input: remove_file("../multidfe/{0}".format(input_file.split("/")[-1])) arguments = ["python3", "mDFEest_input.py", hit_file, control_file, SNP_file, SNP_number, input_file] if shuffle: arguments.append("--shuffle") run_process(arguments) if seed == "None": seed = None else: seed = float(seed) #if you want to run it only with a population size change model, #rather than both a model assuming population size change and a fixed population #size model if fix_pop_change: pop_change = [True] else: pop_change = [False, True] if fixed_model == "None": #all possible models allowed = ["lognormal", "gamma", "beta", "spikes", "steps", "fixed six spikes"] spike_range = [2, 6] else: #only the spcified model allowed = [fixed_model] #only two-spike models spike_range = [2, 3] with open(output_file, "w") as file: file.write("model\tpop_change\tAIC\tNes_0.0_0.1\tNes_0.1_1.0\tNes_1.0_10.0\tNes_10.0_100.0\traw\n") for change_mode in pop_change: print("\nPopulation expansion: {0}.".format(str(change_mode))) if "lognormal" in allowed: print("lognormal model:") output = mDFEest("lognormal", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode) if "gamma" in allowed: print("gamma model:") output = mDFEest("gamma", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode) if "beta" in allowed: print("beta model:") output = mDFEest("beta", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode) for spike_number in range(spike_range[0], spike_range[1]): if "spikes" in allowed: print("{0}-spikes model:".format(spike_number)) output = mDFEest("spikes", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode) print(output) write_mDFEest_output(output, file, change_mode) if "steps" in allowed: print("{0}-steps model:".format(spike_number)) output = mDFEest("steps", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode) print(output) write_mDFEest_output(output, file, change_mode) if "fixed six spikes" in allowed: print("fixed six spikes model:") output = mDFEest("six_spikes", input_file, pop_change = change_mode, seed = seed) print(output) write_mDFEest_output(output, file, change_mode)
def mDFEest(model, input_file, n_spikes = None, repetitions = None, fold_SFS = True, pop_change = False, seed = None):
    '''
    Wraps a call to multiDFEest.
    '''
    flags = []
    if fold_SFS:
        fold_SFS = 1
    else:
        fold_SFS = 0
    #this looks weird but is normal: this value will be the value of conpop in the
    #multiDFE call, meaning it'll be 1 with constant population size
    if pop_change:
        pop_change = 0
    else:
        pop_change = 1
    #convert the English distribution names into multiDFEest model codes
    if model == "lognormal":
        model_code = 4
        #parameter number for calculating AIC
        par_number = 2
    elif model == "gamma":
        model_code = 2
        par_number = 2
    elif model == "beta":
        model_code = 3
        par_number = 2
    elif model == "spikes":
        model_code = 0
        if not n_spikes:
            print("To be able to use a spikes model, you need to specify the number of spikes.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "steps":
        model_code = 1
        if not n_spikes:
            print("To be able to use a steps model, you need to specify the number of steps.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "six_spikes":
        model_code = 5
        par_number = 5
        flags = ["-ranrep", repetitions]
    else:
        print("{0} is not a valid model name!".format(model))
        raise Exception
    input_file_short = input_file.split("/")
    input_file_short = input_file_short[-1]
    #do the analysis in the directory where multiDFEest is stored
    if not os.path.exists("../multidfe/{0}".format(input_file_short)):
        run_process(["cp", input_file, "../multidfe"])
    MDE_output = "{0}.MAXL.out".format(input_file_short)
    current_dir = os.getcwd()
    os.chdir("../multidfe")
    arguments = ["./MultiDFE", "-N1", 100, "-conpop", pop_change, "-sfsfold", fold_SFS, "-selmode", model_code, "-file", input_file_short]
    if seed:
        seed_string = "GSL_RNG_SEED={0}".format(seed)
        arguments = [seed_string] + arguments
    arguments.extend(flags)
    print(" ".join([str(i) for i in arguments]))
    #run multiDFEest
    run_process(arguments)
    #parse output
    output = rw.read_many_fields(MDE_output, "\t")[0]
    output = [i.split(":") for i in output if ":" in i]
    output = {i[0]: float(i[1]) for i in output}
    #get the log likelihood and calculate AIC
    ll = output["L"]
    print("\n")
    print(par_number)
    print(ll)
    AIC = (2 * par_number) - (2 * ll)
    output["AIC"] = AIC
    if n_spikes:
        output["model"] = "{0}_{1}".format(model, n_spikes)
    else:
        output["model"] = model
    remove_file(MDE_output)
    os.chdir(current_dir)
    return(output)
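#Example (hypothetical input file name): fit a beta-distributed DFE, allowing
#for a population size change, and inspect the fit and the estimated fraction
#of nearly neutral sites:
#    output = mDFEest("beta", "temp_data/mDFEest_input.txt", pop_change=True)
#    print(output["AIC"], output["Nes_0.0_0.1"])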
def get_ss_strength(exons, genome_file, upstream=True, five=True, exonic=3, intronic=6):
    """
    Given a set of exons, get an estimate of splice site strength.
    :param exons: Dictionary of CDS lines.
    :param genome_file: File with genome sequence.
    :param upstream: evaluate the (5' or 3') splice site of the upstream intron (rather than downstream)
    :param five: evaluate the 5' splice site (rather than 3')
    :param exonic: how many nucleotides to include from the exon
    :param intronic: how many nucleotides to include from the intron
    :return: a dictionary with the splice site strength for each exon
    """
    # will contain the splice site strengths
    out_dict = {}
    # will contain the names of the exons so that later on, we'd know which
    # splice site strength value goes with which exon
    names = []
    # write splice site coordinates to GTF
    hk.make_dir("temp_data")
    temp_file_name = "temp_data/ss_sequences.gtf"
    with open(temp_file_name, "w") as temp_file:
        writer = csv.writer(temp_file, delimiter="\t")
        for transcript in exons:
            curr_exons = exons[transcript]
            for pos, exon in enumerate(curr_exons):
                # don't analyze first exons
                if (pos != 0):
                    # because you can't do the downstream intron of the last exon
                    if (upstream or (pos != len(curr_exons) - 1)):
                        if five:
                            if upstream:
                                template = curr_exons[pos - 1].copy()
                            else:
                                template = exon.copy()
                            if template[6] == "+":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                            elif template[6] == "-":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                        else:
                            if upstream:
                                template = exon.copy()
                            else:
                                template = curr_exons[pos + 1].copy()
                            if template[6] == "+":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                            elif template[6] == "-":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                        # this is for scaffolds etc.
                        if template[3] >= 0:
                            # so you'd know the order of the values in the MaxEntScan output
                            names.append("{0}.{1}".format(transcript, pos - 1))
                            writer.writerow(template)
    # make a FASTA with splice site sequences
    temp_fasta_file_name = "{0}.fasta".format(temp_file_name[:-4])
    hk.run_process(["bedtools", "getfasta", "-fi", genome_file, "-bed", temp_file_name, "-fo", temp_fasta_file_name, "-s"])
    # filter FASTA for Ns
    fasta_lines = []
    with open(temp_fasta_file_name) as fasta:
        for line in fasta:
            if line[0] == ">":
                curr_name = line
            else:
                if "N" not in line:
                    fasta_lines.append(curr_name)
                    fasta_lines.append(line)
    with open(temp_fasta_file_name, "w") as fasta:
        for line in fasta_lines:
            fasta.write(line)
    # run MaxEntScan on the FASTA
    # lazy hardcoded path, replace as appropriate...
    if five:
        cmd = "/Users/rsavisaar/Software/MaxEntScan/fordownload/score5.pl"
    else:
        cmd = "/Users/rsavisaar/Software/MaxEntScan/fordownload/score3.pl"
    temp_mes_file_name = "{0}_mes.txt".format(temp_file_name[:-4])
    hk.run_process(["perl", cmd, temp_fasta_file_name], file_for_output=temp_mes_file_name, verbose=True)
    hk.remove_file(temp_fasta_file_name)
    hk.remove_file(temp_file_name)
    # read in splice site scores and store them in the output dictionary
    with open(temp_mes_file_name, newline="") as mes_file:
        reader = csv.reader(mes_file, delimiter="\t")
        for pos, line in enumerate(reader):
            out_dict[names[pos]] = float(line[1])
    hk.remove_file(temp_mes_file_name)
    return(out_dict)
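#Example (hypothetical file names): score the 5' splice sites of the introns
#upstream of each internal exon, using 3 exonic and 6 intronic nucleotides
#(the 9-mer expected by MaxEntScan's score5.pl):
#    exons = rw.read_gtf("annotation.gtf", "CDS", gene=False)
#    ss_strength = get_ss_strength(exons, "genome.fa", upstream=True, five=True,
#                                  exonic=3, intronic=6)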