def bed_intervals_gen(coordgen, interval_bed_fn) -> Generator[Coord, None, None]:
    """Generate coordinate intervals corresponding to the provided BED file."""
    with FileParser(
            fn=interval_bed_fn,
            colnames=["chrom", "start", "end"],
            dtypes={"start": int, "end": int},
            force_col_len=False,
            comment="track",
            quiet=True) as bed:
        prev_ct = None
        for line in bed:
            ct = coordgen(line.chrom, line.start, line.end)
            if prev_ct and ct < prev_ct:
                raise ValueError(
                    "Unsorted coordinate found in BED file: {} found after {}. Chromosomes have to be ordered as in the FASTA reference file".format(ct, prev_ct))
            prev_ct = ct
            yield ct
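
# A minimal standalone sketch of the same sorted-BED contract, shown for
# orientation only: it bypasses pycoMeth's FileParser/CoordGen helpers and
# compares coordinates as plain tuples. The function name and the chrom_order
# argument (the chromosome order of the reference FASTA) are illustrative,
# not part of pycoMeth.
def _simple_bed_intervals_sketch(bed_fn, chrom_order):
    """Yield (chrom, start, end) tuples from a sorted 3-column BED file."""
    rank = {chrom: i for i, chrom in enumerate(chrom_order)}
    prev = None
    with open(bed_fn) as bed:
        for raw in bed:
            # Skip track lines, comments and empty lines
            if raw.startswith(("track", "#")) or not raw.strip():
                continue
            chrom, start, end = raw.rstrip("\n").split("\t")[:3]
            ct = (rank[chrom], int(start), int(end))
            # Enforce the same ordering rule as bed_intervals_gen above
            if prev is not None and ct < prev:
                raise ValueError(f"Unsorted interval in {bed_fn}: {ct} after {prev}")
            prev = ct
            yield chrom, int(start), int(end)
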
def Interval_Aggregate(cpg_aggregate_fn: str,
                       ref_fasta_fn: str,
                       interval_bed_fn: str = None,
                       output_bed_fn: str = None,
                       output_tsv_fn: str = None,
                       interval_size: int = 1000,
                       min_cpg_per_interval: int = 5,
                       sample_id: str = "",
                       min_llr: float = 2,
                       verbose: bool = False,
                       quiet: bool = False,
                       progress: bool = False,
                       **kwargs):
    """
    Bin the output of `pycoMeth CpG_Aggregate` in genomic intervals, using either an annotation file containing intervals or a sliding window.
    * cpg_aggregate_fn
        Output TSV file generated by CpG_Aggregate (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in FASTA format (ideally already indexed with samtools faidx)
    * interval_bed_fn
        SORTED BED file containing **non-overlapping** intervals to bin CpG data into (optional) (can be gzipped)
    * output_bed_fn
        Path to write a summary result file in BED format (at least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (at least 1 output file is required) (can be gzipped)
    * interval_size
        Size of the sliding window in which to aggregate CpG site data if no BED file is provided
    * min_cpg_per_interval
        Minimal number of CpG sites per interval
    * sample_id
        Sample ID to be used for the BED track header
    * min_llr
        Minimal log likelihood ratio to consider a site significantly methylated or unmethylated in the output BED file
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_Interval_Aggregate", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # Init collections
    counter = Counter()
    coordgen = CoordGen(ref_fasta_fn, verbose, quiet)
    log_list(coordgen, log.debug, "Coordinate reference summary")

    # At least one output file is required, otherwise it doesn't make any sense
    log.debug("Checking required output")
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Define type of window generator
    log.debug("Defining interval generator")
    if interval_bed_fn:
        log.debug("Bed annotation generator")
        intervals_gen = bed_intervals_gen(coordgen=coordgen, interval_bed_fn=interval_bed_fn)
    else:
        log.debug("Sliding window generator")
        intervals_gen = sliding_intervals_gen(coordgen=coordgen, interval_size=interval_size)

    # Open file parser, file writer and progress bar
    log.warning("Parsing CpG_aggregate file")
    try:
        fp_in = FileParser(
            fn=cpg_aggregate_fn,
            dtypes={"start": int, "end": int, "median_llr": float, "num_motifs": int},
            verbose=verbose,
            quiet=quiet,
            include_byte_len=True)
        if not fp_in.input_type == "CpG_Aggregate":
            raise pycoMethError("Invalid input file type passed (cpg_aggregate_fn). Expecting pycoMeth CpG_Aggregate output TSV file")

        fp_out = Interval_Writer(
            bed_fn=output_bed_fn,
            tsv_fn=output_tsv_fn,
            sample_id=sample_id,
            min_llr=min_llr,
            min_cpg_per_interval=min_cpg_per_interval,
            verbose=verbose)

        try:
            with tqdm(total=len(fp_in), unit=" bytes", unit_scale=True, desc="\tProgress", disable=not progress) as pbar:
                # Get first line
                line = fp_in.next()
                line_coord = coordgen(line.chromosome, line.start, line.end)
                counter["Lines parsed"] += 1
                pbar.update(line.byte_len)

                # Get intervals from either the bed generator or the sliding window generator
                for win_coord in intervals_gen:
                    counter["Total number of intervals"] += 1
                    llr_list = []
                    pos_list = []
                    num_motifs = 0

                    while True:
                        # Check if window and center of CpG overlap
                        center = win_coord.center_comp(line_coord)

                        # Center of CpG is greater than current window = write out current values and go to the next window
                        if center == "greater":
                            fp_out.write(coord=win_coord, num_motifs=num_motifs, llr_list=llr_list, pos_list=pos_list)
                            break

                        # Center of CpG falls inside current window = save llr value
                        if center == "inside":
                            llr_list.append(line.median_llr)
                            pos_list.append(int(line_coord.center))
                            num_motifs += line.num_motifs

                        # Center of CpG is lower than or inside the current window = keep reading lines
                        line = fp_in.next()
                        line_coord = coordgen(line.chromosome, line.start, line.end)
                        counter["Lines parsed"] += 1
                        pbar.update(line.byte_len)

        # Stop when reaching the end of the input file
        except StopIteration:
            # Write last interval
            fp_out.write(coord=win_coord, num_motifs=num_motifs, llr_list=llr_list, pos_list=pos_list)

    finally:
        # Print counters
        log_dict(counter, log.info, "Results summary")
        log_dict(fp_out.counter, log.info, "Writer summary")
        # Close input and output files
        for fp in (fp_in, fp_out):
            try:
                fp.close()
            except Exception:
                pass
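
# Example invocation from Python, assuming CpG_Aggregate has already been run.
# All file paths and the sample ID below are hypothetical placeholders; the
# call is wrapped in a helper so it is not executed at import time.
def _example_interval_aggregate():
    Interval_Aggregate(
        cpg_aggregate_fn="sample_1_CpG_Aggregate.tsv.gz",
        ref_fasta_fn="reference.fa",
        interval_bed_fn="CGI_annotation.bed",  # omit to fall back to sliding windows of interval_size bases
        output_bed_fn="sample_1_Interval_Aggregate.bed.gz",
        output_tsv_fn="sample_1_Interval_Aggregate.tsv.gz",
        min_cpg_per_interval=5,
        sample_id="sample_1",
        progress=True)
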
def Meth_Comp(aggregate_fn_list: [str],
              ref_fasta_fn: str,
              output_tsv_fn: str = None,
              output_bed_fn: str = None,
              max_missing: int = 0,
              min_diff_llr: float = 2,
              sample_id_list: [str] = None,
              pvalue_adj_method: str = "fdr_bh",
              pvalue_threshold: float = 0.01,
              verbose: bool = False,
              quiet: bool = False,
              progress: bool = False,
              **kwargs):
    """
    Compare methylation values for each CpG position or interval between n samples and perform a statistical test to evaluate if the positions are significantly different.
    For 2 samples a Mann-Whitney test is performed, otherwise multiple samples are compared with a Kruskal-Wallis test.
    p-values are adjusted for multiple tests using the Benjamini & Hochberg procedure for controlling the false discovery rate.
    * aggregate_fn_list
        A list of output TSV files corresponding to several samples to compare, generated by either CpG_Aggregate or Interval_Aggregate (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in FASTA format (ideally already indexed with samtools faidx)
    * output_bed_fn
        Path to write a summary result file in BED format (at least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (at least 1 output file is required) (can be gzipped)
    * max_missing
        Max number of missing samples to perform the test
    * min_diff_llr
        Minimal llr boundary for negative and positive median llr. The test is only performed if at least one sample has a median llr above (methylated) and 1 sample has a median llr below (unmethylated)
    * sample_id_list
        List of sample IDs to annotate results in the TSV file
    * pvalue_adj_method
        Method to use for p-value multiple test adjustment
    * pvalue_threshold
        Alpha parameter (family-wise error rate) for p-value adjustment
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_CpG_Comp", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # Init collections
    coordgen = CoordGen(ref_fasta_fn, verbose, quiet)
    log_list(coordgen, log.debug, "Coordinate reference summary")

    # At least one output file is required, otherwise it doesn't make any sense
    log.debug("Checking required output")
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Automatically define tests and maximal missing samples depending on number of files to compare
    all_samples = len(aggregate_fn_list)
    min_samples = all_samples - max_missing

    # 3 values or more = Kruskal-Wallis test
    if all_samples >= 3:
        pvalue_method = "KW"
        log.debug("Multiple comparison mode (Kruskal-Wallis test)")
        if min_samples < 3:
            log.debug("Automatically raise number of minimal samples to 3")
            min_samples = 3
    # 2 values = Mann-Whitney test
    elif all_samples == 2:
        pvalue_method = "MW"
        log.debug("Pairwise comparison mode (Mann-Whitney test)")
        if min_samples < 2:
            log.debug("No missing samples allowed for 2 samples comparison")
            min_samples = 2
    else:
        raise pycoMethError("Meth_Comp needs at least 2 input files")

    log.warning("Parsing files")
    try:
        log.info("Reading input file headers and checking consistency between them")
        colnames = set()
        input_type = ""
        fp_list = []
        all_fp_len = 0
        if not sample_id_list or len(sample_id_list) != len(aggregate_fn_list):
            sample_id_list = list(range(len(aggregate_fn_list)))

        for label, fn in zip(sample_id_list, aggregate_fn_list):
            fp = FileParser(
                fn=fn,
                label=label,
                dtypes={"start": int, "end": int, "median_llr": float},
                verbose=verbose,
                quiet=quiet,
                include_byte_len=True)
            all_fp_len += len(fp)
            fp_list.append(fp)

            # Check colnames
            if not colnames:
                colnames = set(fp.colnames)
            elif colnames != set(fp.colnames):
                raise ValueError(f"Invalid field {fp.colnames} in file {fn}")

            # Get input file type
            if not input_type:
                input_type = fp.input_type
            elif input_type != fp.input_type:
                raise ValueError("Inconsistent input types")

        # Check that aggregate_fn_list contains valid input types
        if input_type not in ["CpG_Aggregate", "Interval_Aggregate"]:
            raise pycoMethError("Invalid input file type passed (aggregate_fn_list). Expecting pycoMeth CpG_Aggregate or Interval_Aggregate output TSV files")

        # Define StatsResults to collect valid sites and perform stats
        stats_results = StatsResults(
            pvalue_method=pvalue_method,
            pvalue_adj_method=pvalue_adj_method,
            pvalue_threshold=pvalue_threshold,
            min_diff_llr=min_diff_llr,
            min_samples=min_samples,
            input_type=input_type)

        log.info("Starting asynchronous file parsing")
        with tqdm(total=all_fp_len, unit=" bytes", unit_scale=True, desc="\tProgress", disable=not progress) as pbar:
            coord_d = defaultdict(list)

            # Read first line from each file
            log.debug("Reading first lines")
            for fp in fp_list:
                # Move pointer up and index by coordinate
                try:
                    line = fp.next()
                    coord = coordgen(line.chromosome, line.start, line.end)
                    coord_d[coord].append(fp)
                    pbar.update(line.byte_len)
                except StopIteration:
                    raise pycoMethError("Empty file found")

            # Continue reading lines from all files
            log.debug("Starting deep parsing")
            fp_done = 0
            while True:
                # Get the lowest coordinate currently indexed
                lower_coord = sorted(coord_d.keys())[0]
                coord_fp_list = sorted(coord_d[lower_coord], key=lambda x: x.label)

                # Deal with lower coordinates and compute result if needed
                stats_results.compute_pvalue(
                    coord=lower_coord,
                    line_list=[coord_fp.current() for coord_fp in coord_fp_list],
                    label_list=[coord_fp.label for coord_fp in coord_fp_list])

                # Remove lower entry and move each fp to its next sequence
                del coord_d[lower_coord]
                for fp in coord_fp_list:
                    # Move pointer up and index by coordinate
                    try:
                        line = fp.next()
                        coord = coordgen(line.chromosome, line.start, line.end)
                        coord_d[coord].append(fp)
                        pbar.update(line.byte_len)
                    except StopIteration:
                        fp_done += 1

                # Exit condition = all files are finished
                if fp_done == len(fp_list):
                    break

        # Init file writer
        with Comp_Writer(bed_fn=output_bed_fn, tsv_fn=output_tsv_fn, input_type=input_type, verbose=verbose) as writer:
            # Exit condition
            if not stats_results.res_list:
                log.info("No valid p-value could be computed")
            else:
                # Convert results to dataframe and correct p-values for multiple tests
                log.info("Adjusting p-values")
                stats_results.multitest_adjust()

                # Write output file
                log.info("Writing output file")
                for res in tqdm(stats_results.res_list, unit=" sites", unit_scale=True, desc="\tProgress", disable=not progress):
                    writer.write(res)

    finally:
        # Print counters
        log_dict(stats_results.counter, log.info, "Results summary")
        # Close input and output files
        for fp in fp_list:
            try:
                fp.close()
            except Exception:
                pass
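
# Sketch of the test selection and p-value adjustment described in the
# Meth_Comp docstring, wired up directly with scipy and statsmodels on
# made-up llr lists. This illustrates the statistical logic only; in
# pycoMeth the actual work is done by StatsResults.compute_pvalue and
# StatsResults.multitest_adjust, and this helper is not part of the API.
def _meth_comp_stats_sketch():
    from scipy.stats import kruskal, mannwhitneyu
    from statsmodels.stats.multitest import multipletests

    def site_pvalue(llr_lists):
        """Mann-Whitney for 2 samples, Kruskal-Wallis for 3 or more."""
        if len(llr_lists) == 2:
            return mannwhitneyu(llr_lists[0], llr_lists[1], alternative="two-sided").pvalue
        return kruskal(*llr_lists).pvalue

    # One p-value per CpG site/interval (llr values are illustrative)
    pvalues = [
        site_pvalue([[2.5, 3.1, 2.8], [-2.9, -3.3, -2.1]]),  # clearly divergent site
        site_pvalue([[0.1, -0.2, 0.3], [0.2, 0.0, -0.1]]),   # ambiguous site
    ]
    # Benjamini & Hochberg FDR control, as with pvalue_adj_method="fdr_bh"
    reject, adj_pvalues, _, _ = multipletests(pvalues, alpha=0.01, method="fdr_bh")
    return reject, adj_pvalues
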
def CpG_Aggregate(nanopolish_fn: [str],
                  ref_fasta_fn: str,
                  output_bed_fn: str = "",
                  output_tsv_fn: str = "",
                  min_depth: int = 10,
                  sample_id: str = "",
                  min_llr: float = 2,
                  verbose: bool = False,
                  quiet: bool = False,
                  progress: bool = False,
                  **kwargs):
    """
    Calculate methylation frequency at genomic CpG sites from the output of `nanopolish call-methylation`
    * nanopolish_fn
        Path to a nanopolish call_methylation tsv output file, a list of files, or a regex matching several files (can be gzipped)
    * ref_fasta_fn
        Reference file used for alignment in FASTA format (ideally already indexed with samtools faidx)
    * output_bed_fn
        Path to write a summary result file in BED format (at least 1 output file is required) (can be gzipped)
    * output_tsv_fn
        Path to write a more extensive result report in TSV format (at least 1 output file is required) (can be gzipped)
    * min_depth
        Minimal number of reads covering a site to be reported
    * sample_id
        Sample ID to be used for the BED track header
    * min_llr
        Minimal log likelihood ratio to consider a site significantly methylated or unmethylated in the output BED file
    """
    # Init method
    opt_summary_dict = opt_summary(local_opt=locals())
    log = get_logger(name="pycoMeth_CpG_Aggregate", verbose=verbose, quiet=quiet)

    log.warning("Checking options and input files")
    log_dict(opt_summary_dict, log.debug, "Options summary")

    # At least one output file is required, otherwise it doesn't make any sense
    if not output_bed_fn and not output_tsv_fn:
        raise pycoMethError("At least 1 output file is required (-t or -b)")

    # Init SitesIndex object with ref_fasta_fn to aggregate data at genomic position level
    log.warning("Parsing methylation_calls file")
    sites_index = SitesIndex(ref_fasta_fn=ref_fasta_fn)

    # Open file parser
    # Possible fields: chromosome strand start end read_name log_lik_ratio log_lik_methylated log_lik_unmethylated num_calling_strands num_motifs sequence
    dtypes = {"start": int, "end": int, "log_lik_ratio": float, "num_motifs": int}
    with FileParser(
            fn=nanopolish_fn,
            dtypes=dtypes,
            verbose=verbose,
            quiet=quiet,
            include_byte_len=progress) as fp_in:

        if not fp_in.input_type == "call_methylation":
            raise pycoMethError("Invalid input file type passed (nanopolish_fn). Expecting Nanopolish call_methylation output TSV file")

        log.info("Starting to parse Nanopolish methylation call file")
        with tqdm(total=len(fp_in), unit=" bytes", unit_scale=True, desc="\tProgress", disable=not progress) as pbar:
            for lt in fp_in:
                sites_index.add(lt)
                # Update progress bar
                if progress:
                    pbar.update(lt.byte_len)

        log_dict(fp_in.counter, log.info, "Parsing summary")

        log.info("Filtering out low coverage sites")
        sites_index.filter_low_count(min_depth)

        log.info("Sorting each chromosome by coordinates")
        sites_index.sort()

        log_dict(sites_index.counter, log.info, "Sites summary")

    log.warning("Processing valid sites found and writing to file")
    with CpG_Writer(bed_fn=output_bed_fn, tsv_fn=output_tsv_fn, sample_id=sample_id, min_llr=min_llr, verbose=verbose) as fp_out:
        for coord, val_dict in tqdm(sites_index, unit=" sites", unit_scale=True, desc="\tProgress", disable=not progress):
            fp_out.write(coord, val_dict)
        log_dict(fp_out.counter, log.info, "Results summary")
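
# Example invocation on a Nanopolish methylation call file. File paths and
# the sample ID are hypothetical placeholders; the call is wrapped in a
# helper so it is not executed at import time.
def _example_cpg_aggregate():
    CpG_Aggregate(
        nanopolish_fn="sample_1_methylation_calls.tsv.gz",
        ref_fasta_fn="reference.fa",
        output_bed_fn="sample_1_CpG_Aggregate.bed.gz",
        output_tsv_fn="sample_1_CpG_Aggregate.tsv.gz",
        min_depth=10,
        sample_id="sample_1",
        progress=True)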