def get_methylation_levels_DMRfind(input_tsv_file,
                                   output,
                                   input_allc_files,
                                   samples,
                                   mc_type=None,
                                   num_procs=1,
                                   buffer_line_number=100000):
    """Append one methylation-level column per sample to a collapsed DMR table.

    The header of ``input_tsv_file`` is copied to ``output`` and extended with
    one ``methylation_level_<sample>`` column per sample. One call to
    ``get_methylation_level_DMRfind_worker`` is made per (allc file, sample)
    pair; afterwards every remaining row of ``input_tsv_file`` is written to
    ``output`` followed by the next line read from each sample's temp file
    ``<output without ".tsv">_<sample>_temp_methylation_levels.tsv``.
    The temp files are deleted once the merge has completed.

    Parameters
    ----------
    input_tsv_file : path to the tab-separated file containing collapsed DMR
        results. Its first line is treated as a header.
        NOTE(review): the original author's TODO says the file should be
        sorted by coordinate; this is not verified here.
    output : path of the file the extended table is written to.
    input_allc_files : list of allc file paths, in the same order as
        ``samples``.
    samples : list of sample names to compute methylation levels for.
    mc_type : list of mC nucleotide contexts, expanded with
        ``expand_nucleotide_code``. Defaults to ``["CGN"]``.
    num_procs : number of processes to use. When it is not 1, a pool of
        ``min(num_procs, len(samples))`` processes is used, so this function
        parallelises up to the number of samples.
    buffer_line_number : forwarded unchanged to the worker.
        NOTE(review): presumably a read-buffer size in lines; confirm against
        ``get_methylation_level_DMRfind_worker``.

    Raises
    ------
    ValueError
        If ``input_allc_files`` and ``samples`` differ in length.
    """
    import os

    # None sentinel instead of a shared mutable list as the default value.
    if mc_type is None:
        mc_type = ["CGN"]

    # zip() would otherwise silently drop the unmatched files/samples.
    if len(input_allc_files) != len(samples):
        raise ValueError(
            "input_allc_files and samples must have the same length")

    mc_class = expand_nucleotide_code(mc_type)

    # The temp-file naming must stay identical to what the worker is given
    # through `output`, so the ".tsv" stripping is kept exactly as before.
    temp_prefix = output.replace(".tsv", "")
    temp_paths = [
        temp_prefix + "_" + sample + "_temp_methylation_levels.tsv"
        for sample in samples
    ]

    with open(output, 'w') as g, open(input_tsv_file, 'r') as f:
        # Header: original columns plus one new column per sample.
        header = f.readline().rstrip("\n")
        g.write(header + "\t" + "\t".join(
            ["methylation_level_" + sample for sample in samples]) + "\n")

        if num_procs == 1:
            for allc_file, sample in zip(input_allc_files, samples):
                get_methylation_level_DMRfind_worker(input_tsv_file,
                                                     allc_file, sample,
                                                     output, mc_class,
                                                     buffer_line_number)
        else:
            pool = Pool(min(num_procs, len(samples)))
            try:
                pending = [
                    pool.apply_async(get_methylation_level_DMRfind_worker,
                                     (input_tsv_file, allc_file, sample,
                                      output, mc_class, buffer_line_number))
                    for allc_file, sample in zip(input_allc_files, samples)
                ]
                pool.close()
                pool.join()
                # Re-raise worker failures here rather than failing later
                # with a confusing "temp file not found" error.
                for job in pending:
                    job.get()
            finally:
                pool.terminate()

        temp_files = []
        try:
            for path in temp_paths:
                temp_files.append(open(path, 'r'))
            # `f` is already positioned just after the header line, so the
            # remaining lines are exactly the data rows.
            for line in f:
                g.write(line.rstrip("\n"))
                for handle in temp_files:
                    g.write("\t" + handle.readline().rstrip("\n"))
                g.write("\n")
        finally:
            for handle in temp_files:
                handle.close()

    # Only reached after a successful merge; temp files are kept on failure.
    for path in temp_paths:
        os.remove(path)
def DMRfind(allc_files,
            samples,
            mc_type,
            chroms,
            output_prefix,
            num_procs=1,
            min_cov=0,
            keep_temp_files=False,
            mc_max_dist=0,
            dmr_max_dist=250,
            resid_cutoff=.01,
            sig_cutoff=.01,
            num_sims=3000,
            num_sig_tests=100,
            seed=-1,
            min_num_dms=0,
            collapse_samples=False,
            sample_category=False,
            min_cluster=0,
            max_iterations=1000,
            convergence_diff=1,
            buffer_line_number=100000):
    """Look for differentially methylated regions (DMRs) in a set of allc files.

    Pipeline, in order:

    1. Validate the arguments (any violation terminates via ``exit``).
    2. Scan every allc file once and record, per sample, the file offset
       (from ``tell()``) at which each chromosome's lines start.
    3. For every chromosome, split the allc files with
       ``split_files_by_position`` and run ``run_rms_tests`` on each chunk,
       writing ``<output_prefix>_rms_results_for_<chrom>_chunk_<n>.tsv``.
    4. Merge the per-chunk files into ``<output_prefix>_rms_results.tsv``.
    5. Derive a p-value cutoff with ``histogram_correction_DMRfind`` and a
       residual cutoff with ``get_resid_cutoff``.
    6. Collapse significant sites into regions with ``collapse_dmr_windows``
       and add per-sample methylation levels with
       ``get_methylation_levels_DMRfind``; the final table is
       ``<output_prefix>_rms_results_collapsed.tsv``.

    Note that in the output file, -1 is used to represent missing data.

    Parameters
    ----------
    allc_files : list of paths to allc files, in the same order as
        ``samples``. These are tab separated files containing methylation
        information for all C nucleotides in the genome, with the columns:
        chromosome, position, strand, nucleotide context, reads with a
        methylated C at this position, total reads covering this position,
        and a binary value 0/1 indicating whether or not this site was
        statistically significantly methylated.
    samples : list of sample names, one per allc file.
    mc_type : list of mC nucleotide contexts in which to look for DMRs (a
        single string is wrapped into a list). These may use the wildcards
        "H" (anything but a G) and "N" (any nucleotide); for example, CAH
        looks for DMRs in methylated cytosines followed by AT, AC, or AA.
    chroms : list of chromosomes to process. Entries are converted to
        strings and every "chr" substring is removed before they are used in
        file names.
    output_prefix : prefix for all result files produced by this function.
    num_procs : number of processes used to parallelise the run (>= 1). It
        is also the number of chunks each chromosome is processed in.
    min_cov : minimum number of reads that must cover a site for it to be
        considered in DMR finding.
    keep_temp_files : keep the intermediate files generated along the way.
        This can be useful for debugging.
    mc_max_dist : maximum distance two sites can be from one another for
        their methylation counts to be combined (>= 0). This helps with low
        coverage experiments where you may want to leverage the correlation
        of methylation between sites to get more statistical power.
    dmr_max_dist : maximum distance two significant sites can be apart to be
        included in the same block (>= 0). Passed to
        ``collapse_dmr_windows``.
    resid_cutoff : percentile (0.00-1.00) in the distribution of residuals
        at sites with non-significant p-values, used to determine a residual
        cutoff. A result then not only has to be significant to be included
        in a window, but must also show deviations in the contingency table
        in the same direction as the rest of the window, at least as extreme
        as the cutoff.
    sig_cutoff : significance cutoff; passed as ``target_fdr`` to
        ``histogram_correction_DMRfind``.
    num_sims : number of permutation tests to run to estimate the p-values
        of the differential methylation tests.
    num_sig_tests : how many permutations can return a result more
        significant than the original statistic before permutation testing
        is abandoned for that particular site.
    seed : seed for the random number generator used in permutation testing.
        Only change this if you are debugging and want consistent output.
    min_num_dms : minimum number of differentially methylated sites a DMR
        needs to contain to be reported.
    collapse_samples : list of samples for collapsing blocks (must be a
        subset of ``samples``), or False. If the methylation status of one
        of these samples switches for a DMS, a new block is created.
        NOTE: it is best to run DMRfind with all the samples you will
        potentially want to consider, and then to run
        ``collapse_dmr_windows`` for the individual cases of interest.
    sample_category : list of categories that each respective sample belongs
        to, or False; requires ``collapse_samples``. Categories must begin
        at 0 and increase by 1 for each category added, e.g. samples
        [A,B,C] with categories [0,1,2] or [0,1,0].
    min_cluster : minimum number of each sample category that must be
        present in every block that is output.
    max_iterations : maximum number of iterations performed by
        ``histogram_correction_DMRfind``.
    convergence_diff : the correction terminates when the current m0
        estimate and the last m0 estimate differ by no more than this value.
    buffer_line_number : forwarded to ``get_methylation_levels_DMRfind``.
        NOTE(review): presumably a read-buffer size in lines; confirm.
    """
    import os
    import shutil

    def _as_int(value, name):
        # Coerce a numeric option to int or terminate with the usual message.
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            exit("In DMRfind, " + name + " must be an integer")

    # User input checks
    if not isinstance(allc_files, list):
        exit("allc_files must be a list of string(s)")
    if not isinstance(mc_type, list):
        if isinstance(mc_type, str):
            mc_type = [mc_type]
        else:
            exit("mc_type must be a list of string(s)")
    if not isinstance(samples, list):
        exit("samples must be a list of string(s)")
    # Files and samples are paired with zip(), which would otherwise
    # silently ignore the surplus entries of the longer list.
    if len(allc_files) != len(samples):
        exit("allc_files and samples must have the same length")

    num_procs = _as_int(num_procs, "num_procs")
    mc_max_dist = _as_int(mc_max_dist, "mc_max_dist")
    dmr_max_dist = _as_int(dmr_max_dist, "dmr_max_dist")
    min_cov = _as_int(min_cov, "min_cov")

    # With num_procs < 1 the merge loop below would cover no chunk at all
    # and the run would silently produce an empty result table.
    if num_procs < 1:
        exit("In DMRfind, num_procs must be greater than or equal to 1")

    if not isinstance(chroms, list):
        exit("chroms must be a list of string(s)")
    chroms = list(map(str, chroms))

    if collapse_samples != False:
        if not isinstance(collapse_samples, list):
            exit("collapse_samples must be a list of strings")
        if sample_category and not isinstance(sample_category, list):
            exit("sample_category must be a list of strings")
        for sample in collapse_samples:
            if sample not in samples:
                exit(
                    "There is a sample in collapse_samples that is not in "
                    "samples. collapse_samples MUST be a subset of samples.")
    elif sample_category != False:
        exit("In order to use sample_category, you must specify a corresponding"
             + " list of samples in collapse_samples!")

    if mc_max_dist < 0:
        exit("In DMRfind, mc_max_dist must be greater than or equal to 0")
    if dmr_max_dist < 0:
        exit("In DMRfind, dmr_max_dist must be greater than or equal to 0")

    # This code creates all variations of the shorthand C contexts
    # (e.g., CHG->CHG,CAG,CCG,CTG)
    mc_class = expand_nucleotide_code(mc_type)

    # Scan each allc file once to set up a table for fast look-up of the
    # lines belonging to different chromosomes.
    chrom_pointer = {}
    for allc_file, sample in zip(allc_files, samples):
        cp_dict = {}
        with open_allc_file(allc_file) as f:
            cur_chrom = ""
            cur_pointer = 0
            while True:
                line = f.readline()
                if not line:
                    break
                fields = line.split("\t")
                if fields[0] != cur_chrom:
                    # cur_pointer still holds the offset of the start of
                    # this line, i.e. the first line of the new chromosome.
                    cp_dict[fields[0]] = cur_pointer
                    cur_chrom = fields[0]
                cur_pointer = f.tell()
        chrom_pointer[sample] = cp_dict

    pool = Pool(num_procs) if num_procs > 1 else False
    rms_options = {
        "min_cov": min_cov,
        "num_sims": num_sims,
        "num_sig_tests": num_sig_tests,
        "seed": seed,
        "keep_temp_files": keep_temp_files
    }
    try:
        pending = []
        for chr_key in chroms:
            chrom = str(chr_key).replace("chr", "")
            chunk_prefix = (output_prefix + "_rms_results_for_" + chrom +
                            "_chunk_")
            print_checkpoint("Splitting allc files for chromosome " +
                             str(chrom))
            split_files_by_position(allc_files,
                                    samples,
                                    num_procs,
                                    mc_class,
                                    chrom_pointer=chrom_pointer,
                                    chrom=chrom,
                                    num_procs=num_procs,
                                    min_cov=min_cov,
                                    pool=pool,
                                    max_dist=mc_max_dist)
            print_checkpoint("Running rms tests for chromosome " + str(chrom))
            if num_procs > 1:
                for chunk in range(num_procs):
                    filenames = []
                    for allc_file in allc_files:
                        filenames.extend(
                            glob(allc_file + "_" + chrom + "_" + str(chunk)))
                    if not filenames:
                        print("Nothing to run for chunk " + str(chunk))
                        continue
                    pending.append(
                        pool.apply_async(
                            run_rms_tests,
                            (filenames, chunk_prefix + str(chunk) + ".tsv",
                             samples), rms_options))
            else:
                filenames = []
                for allc_file in allc_files:
                    filenames.extend(glob(allc_file + "_" + chrom + "_0"))
                if not filenames:
                    # The serial path only ever has chunk 0.
                    print("Nothing to run for chunk 0")
                    continue
                run_rms_tests(filenames, chunk_prefix + "0.tsv", samples,
                              **rms_options)
        if pool is not False:
            pool.close()
            pool.join()
            # Surface worker exceptions; otherwise a failed chunk would
            # simply be missing from the merged results.
            for job in pending:
                job.get()
    except Exception as e:
        exc_type, exc_obj, exc_tb = exc_info()
        print(exc_type, exc_tb.tb_lineno)
        print(e)
        if pool is not False:
            try:
                pool.terminate()
                pool.join()
            except Exception:
                pass
        exit("Running RMS tests failed.")

    print_checkpoint("Merging sorted " + output_prefix +
                     "_rms_results.tsv files.")
    header = "\t".join([
        "\t".join(["chr", "pos", "strand", "mc_class", "pvalue"]),
        "\t".join(["mc_" + sample for sample in samples]),
        "\t".join(["h_" + sample for sample in samples]),
        "\t".join(["frac_" + sample for sample in samples]),
        "\t".join(["mc_residual_" + sample for sample in samples]),
        "\t".join(["uc_residual_" + sample for sample in samples]),
        "num_simulations_sig\tnum_simulations_run" + "\n"
    ])
    # The header is written first because prepending it to an existing file
    # afterwards is a lot harder than you might think.
    with open(output_prefix + "_rms_results.tsv", 'w') as g:
        g.write(header)
        for chr_key in sorted(chroms):
            chrom = str(chr_key).replace("chr", "")
            for chunk in range(num_procs):
                chunk_path = (output_prefix + "_rms_results_for_" + chrom +
                              "_chunk_" + str(chunk) + ".tsv")
                try:
                    chunk_file = open(chunk_path, 'r')
                except IOError:
                    # Chunks that had nothing to run produce no file.
                    continue
                with chunk_file:
                    shutil.copyfileobj(chunk_file, g)

    if not keep_temp_files:
        # Best-effort cleanup of the per-chunk result files.
        for digits in ("[0-9]", "[0-9][0-9]", "[0-9][0-9][0-9]"):
            for path in glob(output_prefix + "_rms_results_for_*_chunk_" +
                             digits + ".tsv"):
                try:
                    os.remove(path)
                except OSError:
                    pass

    print_checkpoint("Begin FDR Correction")
    pvalue_cutoff = histogram_correction_DMRfind(
        output_prefix + "_rms_results.tsv",
        num_sims,
        num_sig_tests,
        target_fdr=sig_cutoff,
        max_iterations=max_iterations,
        convergence_diff=convergence_diff)

    print_checkpoint("Calculating Residual Cutoff")
    resid_cutoff = get_resid_cutoff(resid_cutoff, pvalue_cutoff,
                                    len(samples),
                                    output_prefix + "_rms_results.tsv")

    print_checkpoint("Begin Defining Windows")
    collapse_dmr_windows(output_prefix + "_rms_results.tsv",
                         output_prefix + "_rms_results_collapsed.tsv",
                         column=4,
                         sig_cutoff=pvalue_cutoff,
                         max_dist=dmr_max_dist,
                         resid_cutoff=resid_cutoff,
                         min_num_dms=min_num_dms,
                         collapse_samples=collapse_samples,
                         sample_category=sample_category,
                         min_cluster=min_cluster)

    get_methylation_levels_DMRfind(
        output_prefix + "_rms_results_collapsed.tsv",
        output_prefix + "_rms_results_collapsed_with_levels.tsv",
        allc_files,
        samples,
        mc_type=mc_type,
        num_procs=num_procs,
        buffer_line_number=buffer_line_number)
    # Replace the collapsed table with the version that carries the
    # methylation-level columns (no shell, so odd characters in the prefix
    # are safe).
    shutil.move(output_prefix + "_rms_results_collapsed_with_levels.tsv",
                output_prefix + "_rms_results_collapsed.tsv")
    print_checkpoint("Done")