Example #1
def get_methylation_levels_DMRfind(input_tsv_file,
                                   output,
                                   input_allc_files,
                                   samples,
                                   mc_type=["CGN"],
                                   num_procs=1,
                                   buffer_line_number=100000):
    """
    This function assumes that allc files are of the format allc_<sample>_<chr>.tsv
    input is the path to a file containing collapsed DMR results
    output is the path to a file where the methylation values should be stored
    samples is a list of samples you'd like to compute the methylation level for
    path_to_allc is the path to the directory containing the allc files for these samples
    num_procs is an integer indicating the number of processors you'd like to use for calculating
        methylation level. This function can be parallelized up to the number of samples
    """
    # need to check
    # 1. whether the tsv file is sorted by coordinate
    # 2. whether allc files are stored in a list
    # 3. whether samples are stored in a list
    # 4. whether samples and allc files have the same length

    mc_class = expand_nucleotide_code(mc_type)

    with open(output, 'w') as g:
        # header
        f = open(input_tsv_file, 'r')
        line = f.readline()
        line = line.rstrip("\n")
        fields = line.split("\t")
        g.write(
            "\t".join(fields) + "\t" +
            "\t".join(["methylation_level_" + sample
                       for sample in samples]) + "\n")
        #
        methylation_levels = {}
        if num_procs == 1:
            for allc_file, sample in zip(input_allc_files, samples):
                get_methylation_level_DMRfind_worker(input_tsv_file, allc_file,
                                                     sample, output, mc_class,
                                                     buffer_line_number)
        else:
            pool = Pool(min(num_procs, len(samples)))
            results = []
            for ind in range(len(samples)):
                results.append(
                    pool.apply_async(
                        get_methylation_level_DMRfind_worker,
                        (input_tsv_file, input_allc_files[ind], samples[ind],
                         output, mc_class, buffer_line_number)))
            pool.close()
            pool.join()
            # surface any exceptions raised inside the worker processes
            for result in results:
                result.get()
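        # each worker (serial or parallel) wrote per-sample methylation levels
        # to a temp file; merge them column-wise into the output below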
        temp_files = {}
        for sample in samples:
            temp_files[sample] = open(
                output.replace(".tsv", "") + "_" + sample +
                "_temp_methylation_levels.tsv", 'r')

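        # rewind the DMR input file and skip its header line before merging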
        f.seek(0)
        line = f.readline()
        for line in f:
            g.write(line.rstrip("\n"))
            for sample in samples:
                g.write("\t" + temp_files[sample].readline().rstrip("\n"))
            g.write("\n")
        f.close()
        for sample in samples:
            temp_files[sample].close()
            subprocess.check_call(
                shlex.split("rm " + output.replace(".tsv", "") + "_" + sample +
                            "_temp_methylation_levels.tsv"))
Example #2
def DMRfind(allc_files,
            samples,
            mc_type,
            chroms,
            output_prefix,
            num_procs=1,
            min_cov=0,
            keep_temp_files=False,
            mc_max_dist=0,
            dmr_max_dist=250,
            resid_cutoff=.01,
            sig_cutoff=.01,
            num_sims=3000,
            num_sig_tests=100,
            seed=-1,
            min_num_dms=0,
            collapse_samples=False,
            sample_category=False,
            min_cluster=0,
            max_iterations=1000,
            convergence_diff=1,
            buffer_line_number=100000):
    """
    This function will take a set of allc files, and look for differentially methylated regions. Note that in the output file, 
    -1 is used to represent missing data.
    
    mc_type is a list of the mc nucleotide contexts for which you want to look for DMRs. These classifications
        may use the wildcards "H" (indicating anything but a G) and "N" (indicating any nucleotide). For example,
        CAH would look for DMRs at methylated cytosines in the CAA, CAC, or CAT contexts.
    chroms is a list of the chromosomes you want to search for DMRs. Chromosome names should match those used
        in the allc files (i.e., exclude the string "chr").
    samples is a list of sample names corresponding, in order, to the entries in allc_files.
    allc_files is a list of paths to the tab separated files containing methylation information for all C
        nucleotides in the genome, one file per sample and in the same order as samples. Each file has the format: chromosome, position, strand, nucleotide context,
        reads with a methylated C at this position, total reads covering this position, and a binary value 0/1 indicating whether or not this
        site was statistically significantly methylated (e.g., as determined by a binomial test based on factors like sequencing
        error and non-conversion rate).
    num_procs is the number of processors you wish to use to parallelize this function
    output_prefix is a string indicating the prefix for the various result files produced by this function
    min_cov is the minimum number of reads that must cover a site for it to be considered in DMR finding
    keep_temp_files indicates that you'd like to keep all the intermediate files this function generates along
        the way. This can be useful for debugging.
    num_sims indicates the number of permutation tests you'd like to run to estimate the p-values of the differential methylation tests
    num_sig_tests is an integer indicating how many permutations can return a result more significant than the original statistic
        before permutation testing is abandoned for that particular site.
    mc_max_dist is an integer indicating the maximum distance two sites can be from one another for their methylation counts to be combined
        this option helps with low coverage experiments where you may want to leverage the correlation of methylation between sites to get 
        more statistical power.
    *Options that are passed through to collapse_dmr_windows*
    dmr_max_dist is the maximum distance two significant sites can be to be included in the same block
    resid_cutoff - If this option is specified, not only will a result have to be significant to be included in a window, but it will also
        have to show deviations in the contingency table in the same direction as the rest of the window. For example, if sample A is generally
        showing a degree of methylation higher than expected and sample B lower than expected, any new site will have to have these same properties.
        Furthermore, these deviations have to be at least as extreme as resid_cutoff (in the positive or negative direction). This value is determined
        by looking at the distribution of residuals at sites with non-significant p-values. The value specified here indicates the percentile (0.00-1.00) 
        in the distribution of non-significant p-values to look for to determine a residual cutoff.
    sig_cutoff - the cutoff for determining whether a row is significant or not
    seed - a seed to provide to the random number generator for permutation testing. Only change this if you are debugging and want to make sure
        the permutation output is consistent
    min_num_dms - the minimum number of differentially methylated sites that a differentially methylated region needs to contain to be reported
    collapse_samples - a list of samples for collapsing blocks. If the methylation status (hypermethylated/hypomethylated) of one of the samples switch
        for a DMS, a new block will be created. NOTE: it is best to run DMRfind with all the samples you will potentially want to consider, and then
        to run collapse_dmr_windows for the individual cases you are interested in. In other words, run DMRfind with the sample argument set to:
        ["A","B","C"] and then run collapse_dmr_windows 3 times with collapse_samples set to ["A","B"],["A","C"], and finally ["B","C"]
    sample_category - a list of categories that each respective sample belongs to; category labels must begin at 0 and increase by
        1 for each category added, e.g., samples [A,B,C] could have categories [0,1,2] or [0,1,0]
    min_cluster - The minimum number of each sample category that must be present in every block that is output.
    max_iterations is the maximum number of iterations performed by the histogram-based FDR correction
    convergence_diff determines when the FDR correction terminates: the algorithm stops once the current m0 estimate and the previous
        m0 estimate differ by no more than convergence_diff
    """

    #User input checks
    if not isinstance(allc_files, list):
        exit("allc_files must be a list of string(s)")
    if not isinstance(mc_type, list):
        if isinstance(mc_type, str):
            mc_type = [mc_type]
        else:
            exit("mc_type must be a list of string(s)")
    if not isinstance(samples, list):
        exit("samples must be a list of string(s)")
    try:
        num_procs = int(num_procs)
    except:
        exit("In DMRfind, num_procs must be an integer")

    try:
        mc_max_dist = int(mc_max_dist)
    except:
        exit("In DMRfind, mc_max_dist must be an integer")
    try:
        dmr_max_dist = int(dmr_max_dist)
    except:
        exit("In DMRfind, dmr_max_dist must be an integer")

    try:
        min_cov = int(min_cov)
    except:
        exit("In DMRfind, min_cov must be an integer")
    if isinstance(chroms, list) == False:
        exit("chroms must be a list of string(s)")
    chroms = list(map(str, chroms))

    if collapse_samples != False:
        if not isinstance(collapse_samples, list):
            exit("collapse_samples must be a list of strings")
        if sample_category and not isinstance(sample_category, list):
            exit("sample_category must be a list of strings")
        for sample in collapse_samples:
            if sample not in samples:
                exit(
                    "There is a sample in collapse_samples that is not in samples."
                    + "collapse_samples MUST be a subset of samples.")
    elif sample_category != False:
        exit(
            "In order to use sample_category, you must specify a corresponding"
            + " list of samples in collapse_samples!")

    if mc_max_dist < 0:
        exit("In DMRfind, mc_max_dist must be greater than or equal to 0")
    if dmr_max_dist < 0:
        exit("In DMRfind, dmr_max_dist must be greater than or equal to 0")

    #This code creates all variations of the shorthand C contexts (e.g., CHG->CHG,CAG,CCG,CTG)
    mc_class = expand_nucleotide_code(mc_type)
    # scan each allc file to set up a table for fast look-up of the file
    # offsets where each chromosome's lines begin
    chrom_pointer = {}
    for allc_file, sample in zip(allc_files, samples):
        cp_dict = {}
        with open_allc_file(allc_file) as f:
            cur_chrom = ""
            cur_pointer = 0
            while True:
                line = f.readline()
                if not line: break
                fields = line.split("\t")
                if fields[0] != cur_chrom:
                    cp_dict[fields[0]] = cur_pointer
                    cur_chrom = fields[0]
                cur_pointer = f.tell()
        chrom_pointer[sample] = cp_dict

    if num_procs > 1:
        pool = Pool(num_procs)
    else:
        pool = False

    try:
        for chr_key in chroms:
            chrom = str(chr_key).replace("chr", "")
            results = []
            print_checkpoint("Splitting allc files for chromosome " +
                             str(chrom))
            split_files_by_position(allc_files,
                                    samples,
                                    num_procs,
                                    mc_class,
                                    chrom_pointer=chrom_pointer,
                                    chrom=chrom,
                                    num_procs=num_procs,
                                    min_cov=min_cov,
                                    pool=pool,
                                    max_dist=mc_max_dist)
            print_checkpoint("Running rms tests for chromosome " + str(chrom))
            if num_procs > 1:
                for chunk in range(0, num_procs):
                    filenames = []
                    for allc_file in allc_files:
                        filenames.extend(
                            glob(allc_file + "_" + chrom + "_" + str(chunk)))
                    if len(filenames) == 0:
                        print("Nothing to run for chunk " + str(chunk))
                        continue
                    pool.apply_async(
                        run_rms_tests,
                        (filenames, output_prefix + "_rms_results_for_" +
                         str(chrom) + "_chunk_" + str(chunk) + ".tsv",
                         samples), {
                             "min_cov": min_cov,
                             "num_sims": num_sims,
                             "num_sig_tests": num_sig_tests,
                             "seed": seed,
                             "keep_temp_files": keep_temp_files
                         })
            else:
                filenames = []
                for allc_file in allc_files:
                    filenames.extend(glob(allc_file + "_" + chrom + "_0"))
                if len(filenames) == 0:
                    print("Nothing to run for chunk " + str(chunk))
                    continue
                run_rms_tests(filenames,
                              output_prefix + "_rms_results_for_" +
                              str(chrom) + "_chunk_0.tsv",
                              samples,
                              min_cov=min_cov,
                              num_sims=num_sims,
                              num_sig_tests=num_sig_tests,
                              seed=seed,
                              keep_temp_files=keep_temp_files)
        if pool != False:
            pool.close()
            pool.join()

    except Exception as e:
        exc_type, exc_obj, exc_tb = exc_info()
        print(exc_type, exc_tb.tb_lineno)
        print(e)
        try:
            pool.terminate()
            pool.join()
        except:
            pass
        exit("Running RMS tests failed.")

    print_checkpoint("Merging sorted " + output_prefix +
                     "_rms_results.tsv files.")
    header = "\t".join(
        ["chr", "pos", "strand", "mc_class", "pvalue"] +
        ["mc_" + sample for sample in samples] +
        ["h_" + sample for sample in samples] +
        ["frac_" + sample for sample in samples] +
        ["mc_residual_" + sample for sample in samples] +
        ["uc_residual_" + sample for sample in samples] +
        ["num_simulations_sig", "num_simulations_run"]) + "\n"
    #I put this up here because it's actually a lot harder to prepend a header to an existing file than you might think
    g = open(output_prefix + "_rms_results.tsv", 'w')
    g.write(header)
    for chr_key in sorted(chroms):
        chrom = str(chr_key).replace("chr", "")
        for chunk in range(0, num_procs):
            try:
                with open(
                        output_prefix + "_rms_results_for_" + chrom +
                        "_chunk_" + str(chunk) + ".tsv", 'r') as f:
                    for line in f:
                        g.write(line)
            except:
                pass
    g.close()
    if keep_temp_files == False:
        basecmd = ['rm']
        file_paths = glob(
            output_prefix + "_rms_results_for_*_chunk_[0-9].tsv") + glob(
                output_prefix + "_rms_results_for_*_chunk_[0-9][0-9].tsv"
            ) + glob(output_prefix +
                     "_rms_results_for_*_chunk_[0-9][0-9][0-9].tsv")
        if file_paths:
            try:
                check_call(basecmd + file_paths)
            except:
                pass
    print_checkpoint("Begin FDR Correction")
    pvalue_cutoff = histogram_correction_DMRfind(
        output_prefix + "_rms_results.tsv",
        num_sims,
        num_sig_tests,
        target_fdr=sig_cutoff,
        max_iterations=max_iterations,
        convergence_diff=convergence_diff)

    print_checkpoint("Calculating Residual Cutoff")
    resid_cutoff = get_resid_cutoff(resid_cutoff, pvalue_cutoff, len(samples),
                                    output_prefix + "_rms_results.tsv")

    print_checkpoint("Begin Defining Windows")
    collapse_dmr_windows(output_prefix + "_rms_results.tsv",
                         output_prefix + "_rms_results_collapsed.tsv",
                         column=4,
                         sig_cutoff=pvalue_cutoff,
                         max_dist=dmr_max_dist,
                         resid_cutoff=resid_cutoff,
                         min_num_dms=min_num_dms,
                         collapse_samples=collapse_samples,
                         sample_category=sample_category,
                         min_cluster=min_cluster)
    get_methylation_levels_DMRfind(
        output_prefix + "_rms_results_collapsed.tsv",
        output_prefix + "_rms_results_collapsed_with_levels.tsv",
        allc_files,
        samples,
        mc_type=mc_type,
        num_procs=num_procs,
        buffer_line_number=buffer_line_number)
    subprocess.check_call(
        shlex.split("mv " + output_prefix +
                    "_rms_results_collapsed_with_levels.tsv " + output_prefix +
                    "_rms_results_collapsed.tsv"))
    print_checkpoint("Done")