Example #1
def bed_intervals_gen(coordgen,
                      interval_bed_fn) -> Generator[Coord, None, None]:
    """Generate coordinate intervals corresponding to the provided bed
    file."""
    with FileParser(
            fn=interval_bed_fn,
            colnames=["chrom", "start", "end"],
            dtypes={
                "start": int,
                "end": int
            },
            force_col_len=False,
            comment="track",
            quiet=True,
    ) as bed:

        prev_ct = None
        for line in bed:
            ct = coordgen(line.chrom, line.start, line.end)
            if prev_ct and ct < prev_ct:
                raise ValueError(
                    "Unsorted coordinate {} found after {} in bed file. Chromosomes have to be ordered as in the fasta reference file"
                    .format(ct, prev_ct))
            prev_ct = ct
            yield ct
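
Example #2 below also calls a sliding_intervals_gen counterpart that is not shown in these excerpts. A minimal sketch of what such a generator could look like, assuming coordgen can be iterated as (chromosome, length) pairs (that iteration interface is an assumption, not part of the source):

def sliding_intervals_gen(coordgen,
                          interval_size: int = 1000) -> Generator[Coord, None, None]:
    """Tile every chromosome of the reference with non-overlapping
    fixed-size windows (illustrative sketch only)."""
    for chrom, chrom_len in coordgen:
        # Clamp the last window to the chromosome end
        for start in range(0, chrom_len, interval_size):
            yield coordgen(chrom, start, min(start + interval_size, chrom_len))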
Example #2
def Interval_Aggregate(cpg_aggregate_fn: str,
                       ref_fasta_fn: str,
                       interval_bed_fn: str = None,
                       output_bed_fn: str = None,
                       output_tsv_fn: str = None,
                       interval_size: int = 1000,
                       min_cpg_per_interval: int = 5,
                       sample_id: str = "",
                       min_llr: float = 2,
                       verbose: bool = False,
                       quiet: bool = False,
                       progress: bool = False,
                       **kwargs):
    """
    Bin the output of `pycoMeth CpG_Aggregate` into genomic intervals, using either an annotation file containing intervals or a sliding window.
    * cpg_aggregate_fn
        Output tsv file generated by CpG_Aggregate (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)
    * interval_bed_fn
        SORTED bed file containing **non-overlapping** intervals to bin CpG data into (Optional) (can be gzipped)
    * output_bed_fn
        Path to write a summary result file in BED format (At least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (At least 1 output file is required) (can be gzipped)
    * interval_size
        Size of the sliding window in which to aggregate CpG site data if no BED file is provided
    * min_cpg_per_interval
        Minimal number of CpG sites per interval.
    * sample_id
        Sample ID to be used for the BED track header
    * min_llr
        Minimal log likelihood ratio to consider a site significantly methylated or unmethylated in output BED file
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_Interval_Aggregate", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # Init collections
    counter = Counter()
    coordgen = CoordGen(ref_fasta_fn, verbose, quiet)
    log_list(coordgen, log.debug, "Coordinate reference summary")

    # At least one output file is required, otherwise it doesn't make any sense
    log.debug("Checking required output")
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Define type of window generator
    log.debug("Defining interval generator")
    if interval_bed_fn:
        log.debug("Bed annotation generator")
        intervals_gen = bed_intervals_gen(coordgen=coordgen,
                                          interval_bed_fn=interval_bed_fn)
    else:
        log.debug("Sliding window generator")
        intervals_gen = sliding_intervals_gen(coordgen=coordgen,
                                              interval_size=interval_size)

    # Open file parser, file writer and progress bar
    log.warning("Parsing CpG_aggregate file")
    fp_in = fp_out = None
    try:
        fp_in = FileParser(fn=cpg_aggregate_fn,
                           dtypes={
                               "start": int,
                               "end": int,
                               "median_llr": float,
                               "num_motifs": int
                           },
                           verbose=verbose,
                           quiet=quiet,
                           include_byte_len=True)

        if fp_in.input_type != "CpG_Aggregate":
            raise pycoMethError(
                "Invalid input file type passed (cpg_aggregate_fn). Expecting pycoMeth CpG_Aggregate output TSV file"
            )

        fp_out = Interval_Writer(bed_fn=output_bed_fn,
                                 tsv_fn=output_tsv_fn,
                                 sample_id=sample_id,
                                 min_llr=min_llr,
                                 min_cpg_per_interval=min_cpg_per_interval,
                                 verbose=verbose)
        try:
            with tqdm(total=len(fp_in),
                      unit=" bytes",
                      unit_scale=True,
                      desc="\tProgress",
                      disable=not progress) as pbar:
                # Get first line
                line = fp_in.next()
                line_coord = coordgen(line.chromosome, line.start, line.end)
                counter["Lines parsed"] += 1
                pbar.update(line.byte_len)

                # Get intervals from either the bed generator or the sliding window generator

                for win_coord in intervals_gen:
                    counter["Total number of intervals"] += 1
                    llr_list = []
                    pos_list = []
                    num_motifs = 0

                    while True:
                        # Check if window and center of CpG overlap
                        center = win_coord.center_comp(line_coord)

                        # Center of CpG is past the current window = write out current values and go to the next window
                        if center == "greater":
                            fp_out.write(coord=win_coord,
                                         num_motifs=num_motifs,
                                         llr_list=llr_list,
                                         pos_list=pos_list)
                            break

                        # Center of CpG falls inside the current window = save llr value
                        if center == "inside":
                            llr_list.append(line.median_llr)
                            pos_list.append(int(line_coord.center))
                            num_motifs += line.num_motifs

                        # Center of CpG is before or inside the current window = keep reading lines
                        line = fp_in.next()
                        line_coord = coordgen(line.chromosome, line.start,
                                              line.end)
                        counter["Lines parsed"] += 1
                        pbar.update(line.byte_len)

        # Stop when reaching end of input file
        except StopIteration:
            # Write the last interval, if at least one was started
            if counter["Total number of intervals"]:
                fp_out.write(coord=win_coord,
                             num_motifs=num_motifs,
                             llr_list=llr_list,
                             pos_list=pos_list)

    finally:
        # Print counters
        log_dict(counter, log.info, "Results summary")
        if fp_out:
            log_dict(fp_out.counter, log.info, "Writer summary")

        # Close input and output files
        for fp in (fp_in, fp_out):
            try:
                fp.close()
            except Exception:
                pass
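
A typical invocation, chaining on the output of CpG_Aggregate (the import path and file names below are assumptions for illustration):

from pycoMeth.Interval_Aggregate import Interval_Aggregate  # assumed module layout

Interval_Aggregate(
    cpg_aggregate_fn="cpg_aggregate.tsv.gz",  # placeholder: output of CpG_Aggregate
    ref_fasta_fn="reference.fa",              # placeholder reference
    interval_bed_fn="cpg_islands.bed",        # optional sorted, non-overlapping intervals
    output_tsv_fn="interval_aggregate.tsv.gz",
    min_cpg_per_interval=5,
    sample_id="sample_1",
    progress=True)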
Example #3
def Meth_Comp(aggregate_fn_list: [str],
              ref_fasta_fn: str,
              output_tsv_fn: str = None,
              output_bed_fn: str = None,
              max_missing: int = 0,
              min_diff_llr: float = 2,
              sample_id_list: [str] = None,
              pvalue_adj_method: str = "fdr_bh",
              pvalue_threshold: float = 0.01,
              verbose: bool = False,
              quiet: bool = False,
              progress: bool = False,
              **kwargs):
    """
    Compare methylation values for each CpG position or interval between n samples and perform a statistical test to evaluate if the positions are
    significantly different. For 2 samples a Mann-Whitney test is performed, otherwise multiple samples are compared with a Kruskal-Wallis test.
    p-values are adjusted for multiple tests using the Benjamini & Hochberg procedure for controlling the false discovery rate.
    * aggregate_fn_list
        A list of output tsv files corresponding to several samples to compare generated by either CpG_Aggregate or Interval_Aggregate. (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)
    * output_bed_fn
        Path to write a summary result file in BED format (At least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (At least 1 output file is required) (can be gzipped)
    * max_missing
        Max number of missing samples to perform the test
    * min_diff_llr
        Minimal llr boundary for negative and positive median llr.
        The test is only performed if at least one sample has a median llr above the boundary (methylated) and one sample has a median llr below it (unmethylated)
    * sample_id_list
        List of sample IDs to annotate results in the TSV file
    * pvalue_adj_method
        Method to use for pValue multiple test adjustment
    * pvalue_threshold
        Alpha parameter (family-wise error rate) for pValue adjustment
    """

    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_CpG_Comp", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # Init collections
    coordgen = CoordGen(ref_fasta_fn, verbose, quiet)
    log_list(coordgen, log.debug, "Coordinate reference summary")

    # At least one output file is required, otherwise it doesn't make any sense
    log.debug("Checking required output")
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Automatically define tests and maximal missing samples depending on number of files to compare
    all_samples = len(aggregate_fn_list)
    min_samples = all_samples - max_missing

    # 3 samples or more = Kruskal-Wallis test
    if all_samples >= 3:
        pvalue_method = "KW"
        log.debug("Multiple comparison mode (Kruskal-Wallis test)")
        if min_samples < 3:
            log.debug("Automatically raise number of minimal samples to 3")
            min_samples = 3
    # 2 samples = Mann-Whitney test
    elif all_samples == 2:
        pvalue_method = "MW"
        log.debug("Pairwise comparison mode (Mann-Whitney test)")
        if min_samples < 2:
            log.debug("No missing samples allowed for 2 samples comparison")
            min_samples = 2
    else:
        raise pycoMethError("Meth_Comp needs at least 2 input files")

    log.warning("Parsing files")
    fp_list = []
    stats_results = None
    try:
        log.info(
            "Reading input file headers and checking consistency between them"
        )
        colnames = set()
        input_type = ""
        all_fp_len = 0

        if not sample_id_list or len(sample_id_list) != len(aggregate_fn_list):
            sample_id_list = list(range(len(aggregate_fn_list)))

        for label, fn in zip(sample_id_list, aggregate_fn_list):
            fp = FileParser(fn=fn,
                            label=label,
                            dtypes={
                                "start": int,
                                "end": int,
                                "median_llr": float
                            },
                            verbose=verbose,
                            quiet=quiet,
                            include_byte_len=True)
            all_fp_len += len(fp)
            fp_list.append(fp)

            # Check colnames
            if not colnames:
                colnames = set(fp.colnames)
            elif colnames != set(fp.colnames):
                raise ValueError(f"Invalid fields {fp.colnames} in file {fn}")

            # Get input file type
            if not input_type:
                input_type = fp.input_type
            elif input_type != fp.input_type:
                raise ValueError(f"Inconsistent input type in file {fn}")

        # Check that aggregate_fn_list contains valid input types
        if input_type not in ["CpG_Aggregate", "Interval_Aggregate"]:
            raise pycoMethError(
                "Invalid input file type passed (aggregate_fn_list). Expecting pycoMeth CpG_Aggregate or Interval_Aggregate output TSV files"
            )

        # Define StatsResults to collect valid sites and perform stats
        stats_results = StatsResults(pvalue_method=pvalue_method,
                                     pvalue_adj_method=pvalue_adj_method,
                                     pvalue_threshold=pvalue_threshold,
                                     min_diff_llr=min_diff_llr,
                                     min_samples=min_samples,
                                     input_type=input_type)

        log.info("Starting asynchronous file parsing")
        with tqdm(total=all_fp_len,
                  unit=" bytes",
                  unit_scale=True,
                  desc="\tProgress",
                  disable=not progress) as pbar:

            # coord_d buffers, for each pending coordinate, the file parsers
            # whose current line sits at that coordinate (k-way merge buffer)
            coord_d = defaultdict(list)

            # Read first line from each file
            log.debug("Reading first lines")
            for fp in fp_list:
                # Move pointer up and index by coordinate
                try:
                    line = fp.next()
                    coord = coordgen(line.chromosome, line.start, line.end)
                    coord_d[coord].append(fp)
                    pbar.update(line.byte_len)
                except StopIteration:
                    raise pycoMethError("Empty file found")

            # Continue reading lines from all files
            log.debug("Starting deep parsing")
            fp_done = 0
            while True:
                # Get the lowest coordinate currently buffered across all files
                lower_coord = min(coord_d)
                coord_fp_list = sorted(coord_d[lower_coord],
                                       key=lambda x: x.label)

                # Deal with lower coordinates and compute result if needed
                stats_results.compute_pvalue(
                    coord=lower_coord,
                    line_list=[
                        coord_fp.current() for coord_fp in coord_fp_list
                    ],
                    label_list=[coord_fp.label for coord_fp in coord_fp_list])

                # Remove lower entry and move fp to next sequence
                del coord_d[lower_coord]
                for fp in coord_fp_list:

                    # Move pointer up and index by coordinate
                    try:
                        line = fp.next()
                        coord = coordgen(line.chromosome, line.start, line.end)
                        coord_d[coord].append(fp)
                        pbar.update(line.byte_len)
                    except StopIteration:
                        fp_done += 1

                # Exit condition = all files are finished
                if fp_done == len(fp_list):
                    break

        # Init file writer
        with Comp_Writer(bed_fn=output_bed_fn,
                         tsv_fn=output_tsv_fn,
                         input_type=input_type,
                         verbose=verbose) as writer:

            # Exit condition
            if not stats_results.res_list:
                log.info("No valid p-value could be computed")

            else:
                # Convert results to dataframe and correct pvalues for multiple tests
                log.info("Adjusting p-values")
                stats_results.multitest_adjust()

                # Write output file
                log.info("Writing output file")
                for res in tqdm(stats_results.res_list,
                                unit=" sites",
                                unit_scale=True,
                                desc="\tProgress",
                                disable=not progress):
                    writer.write(res)

    finally:
        # Print counters
        if stats_results:
            log_dict(stats_results.counter, log.info, "Results summary")

        # Close input and output files
        for fp in fp_list:
            try:
                fp.close()
            except Exception:
                pass
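
The deep-parsing loop above is effectively a k-way merge: every input file is assumed sorted by coordinate, coord_d buffers the current line of each parser keyed by coordinate, and the lowest buffered coordinate is processed across all samples before the corresponding parsers advance. A self-contained sketch of the same pattern on plain sorted streams (all names here are hypothetical, for illustration only):

import heapq
from itertools import groupby
from operator import itemgetter

def kway_merge_by_coord(sorted_streams):
    """Yield (coord, [payloads]) groups across several coordinate-sorted
    streams, mirroring the coord_d merge loop above."""
    # heapq.merge lazily merges inputs that are already individually sorted
    merged = heapq.merge(*sorted_streams, key=itemgetter(0))
    for coord, group in groupby(merged, key=itemgetter(0)):
        yield coord, [payload for _, payload in group]

# Usage: each stream yields (coord, payload) tuples sorted by coord
sample_a = [(1, "a1"), (3, "a3"), (7, "a7")]
sample_b = [(1, "b1"), (2, "b2"), (7, "b7")]
for coord, payloads in kway_merge_by_coord([sample_a, sample_b]):
    print(coord, payloads)  # 1 ['a1', 'b1'] / 2 ['b2'] / 3 ['a3'] / 7 ['a7', 'b7']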
Example #4
def CpG_Aggregate(nanopolish_fn: [str],
                  ref_fasta_fn: str,
                  output_bed_fn: str = "",
                  output_tsv_fn: str = "",
                  min_depth: int = 10,
                  sample_id: str = "",
                  min_llr: float = 2,
                  verbose: bool = False,
                  quiet: bool = False,
                  progress: bool = False,
                  **kwargs):
    """
    Calculate methylation frequency at genomic CpG sites from the output of `nanopolish call-methylation`
    * nanopolish_fn
        Path to a nanopolish call_methylation tsv output file or a list of files or a regex matching several files (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in Fasta format (ideally already indexed with samtools faidx)
    * output_bed_fn
        Path to write a summary result file in BED format (At least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (At least 1 output file is required) (can be gzipped)
    * min_depth
        Minimal number of reads covering a site to be reported
    * sample_id
        Sample ID to be used for the BED track header
    * min_llr
        Minimal log likelihood ratio to consider a site significantly methylated or unmethylated in output BED file
    """

    # Init package
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_CpG_Aggregate",
                     verbose=verbose,
                     quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # At least one output file is required, otherwise it doesn't make any sense
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Init SitesIndex object with ref_fasta_fn to aggregate data at genomic position level
    log.warning("Parsing methylation_calls file")
    sites_index = SitesIndex(ref_fasta_fn=ref_fasta_fn)

    # Open file parser
    # Possible fields: chromosome, strand, start, end, read_name, log_lik_ratio,
    # log_lik_methylated, log_lik_unmethylated, num_calling_strands, num_motifs, sequence
    dtypes = {
        "start": int,
        "end": int,
        "log_lik_ratio": float,
        "num_motifs": int
    }
    with FileParser(fn=nanopolish_fn,
                    dtypes=dtypes,
                    verbose=verbose,
                    quiet=quiet,
                    include_byte_len=progress) as fp_in:

        if fp_in.input_type != "call_methylation":
            raise pycoMethError(
                "Invalid input file type passed (nanopolish_fn). Expecting Nanopolish call_methylation output TSV file"
            )

        log.info("Starting to parse Nanopolish methylation call file")
        with tqdm(total=len(fp_in),
                  unit=" bytes",
                  unit_scale=True,
                  desc="\tProgress",
                  disable=not progress) as pbar:
            for lt in fp_in:
                sites_index.add(lt)
                # Update progress bar
                if progress:
                    pbar.update(lt.byte_len)

        log_dict(fp_in.counter, log.info, "Parsing summary")

        log.info("Filtering out low coverage sites")
        sites_index.filter_low_count(min_depth)

        log.info("Sorting each chromosome by coordinates")
        sites_index.sort()

        log_dict(sites_index.counter, log.info, "Sites summary")

    log.warning("Processing valid sites and writing to file")

    with CpG_Writer(bed_fn=output_bed_fn,
                    tsv_fn=output_tsv_fn,
                    sample_id=sample_id,
                    min_llr=min_llr,
                    verbose=verbose) as fp_out:
        for coord, val_dict in tqdm(sites_index,
                                    unit=" sites",
                                    unit_scale=True,
                                    desc="\tProgress",
                                    disable=not progress):
            fp_out.write(coord, val_dict)

        log_dict(fp_out.counter, log.info, "Results summary")
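
A typical invocation on nanopolish output (the import path and file names below are assumptions for illustration):

from pycoMeth.CpG_Aggregate import CpG_Aggregate  # assumed module layout

CpG_Aggregate(
    nanopolish_fn="methylation_calls.tsv.gz",  # placeholder: nanopolish call-methylation output
    ref_fasta_fn="reference.fa",               # placeholder reference
    output_bed_fn="cpg_aggregate.bed.gz",
    output_tsv_fn="cpg_aggregate.tsv.gz",
    min_depth=10,
    sample_id="sample_1",
    progress=True)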