# Imports for the functions below; the meth5/pycoMeth module paths are assumed
# from the package layout and may need adjusting to the local source tree.
import argparse
import logging
from multiprocessing import Process, Queue
from pathlib import Path
from tempfile import TemporaryFile
from typing import Dict, List

import numpy as np
import pandas as pd
import tqdm

from meth5.meth5 import MetH5File
from meth5.sparse_matrix import SparseMethylationMatrixContainer, create_sparse_matrix_from_samples
from pycoMeth.common import pycoMethError


def __init__(self, h5_read_groups_key: str, sample_id_list: List, h5_file_list: List):
    self.h5_read_groups_key = h5_read_groups_key
    self.sample_hf_files: Dict[str, MetH5File] = {}
    self.llr_threshold = 2.0  # TODO expose parameter
    self.min_diff_bs = 0.25  # TODO expose parameter
    if h5_read_groups_key is None:
        # One MetH5 file per sample: open each file under its own sample id
        for sample_id, h5_file in zip(sample_id_list, h5_file_list):
            hf = MetH5File(h5_file, "r")
            self.sample_hf_files[sample_id] = hf
    else:
        # Samples are encoded as read groups within a single MetH5 file:
        # all sample ids share the same file handle
        hf = MetH5File(h5_file_list[0], "r")
        for sample_id in sample_id_list:
            self.sample_hf_files[sample_id] = hf

def argtype_M5File(value):
    # argparse type validator: ensure the file can actually be opened as a MetH5 file
    try:
        MetH5File(value, "r").get_chromosomes()
    except Exception:
        raise argparse.ArgumentTypeError(f"Failed to read '{value}'. Is it a valid MetH5 file?")
    return Path(value)

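# Example (illustrative sketch, not part of the module): wiring the validator into
# an argparse CLI; the program name and argument name are hypothetical.
def _example_m5_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(prog="demo")
    # argparse calls argtype_M5File on the raw string and stores the Path it returns
    parser.add_argument("m5file", type=argtype_M5File, help="Input MetH5 file")
    return parser
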
def validate_chunk_selection(m5file: Path, chromosome: str, chunk_size: int, chunks: List[int]):
    with MetH5File(m5file, "r", chunk_size=chunk_size) as m5:
        num_chunks = m5[chromosome].get_number_of_chunks()
        if max(chunks) >= num_chunks:
            raise ValueError(f"Chunk {max(chunks)} not in chromosome. Must be in range 0-{num_chunks - 1}")

def compute_total_chrom_sizes(input_m5_files: List[str]) -> Dict[str, int]:
    chrom_size = {}
    for input_file in input_m5_files:
        with MetH5File(input_file, "r") as m5_in:
            for chrom in m5_in.get_chromosomes():
                if chrom not in chrom_size:
                    chrom_size[chrom] = 0
                chrom_size[chrom] += len(m5_in[chrom])
    return chrom_size

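# Example (illustrative sketch): summing per-chromosome call counts across two
# hypothetical input files, e.g. before sizing a merged output file.
def _example_total_sizes() -> Dict[str, int]:
    sizes = compute_total_chrom_sizes(["sample_a.m5", "sample_b.m5"])
    # e.g. {"8": 1048576, "X": 20480} - total methylation calls per chromosome
    return sizes
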
def read_sample_ids_from_read_groups(h5_file_list, read_group_key, labels=None):
    rg_dict = {}
    for fn in h5_file_list:
        with MetH5File(fn, "r") as f:
            f_rg_dict = f.get_all_read_groups(read_group_key)
            for k, v in f_rg_dict.items():
                if k in rg_dict and rg_dict[k] != v:
                    raise pycoMethError("Read groups in meth5 files must have the same encoding")
            rg_dict.update(f_rg_dict)
    if labels is not None:
        return [k for k, v in rg_dict.items() if v in labels]
    else:
        return list(rg_dict)

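# Example (illustrative sketch): resolving haplotype read-group ids across two
# hypothetical files; the "haplotype" key and the labels are assumptions.
def _example_sample_ids() -> List:
    return read_sample_ids_from_read_groups(
        ["sample_a.m5", "sample_b.m5"], "haplotype", labels=["H1", "H2"]
    )
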
def main(
    m5file: Path,
    read_groups_key: str,
    read_group_file: Path,
    chunk_size: int,
):
    read_annotation = read_readgroups(read_group_file)
    # Encode group labels as integer ids and keep the inverse mapping as labels
    all_groups = list(set(read_annotation["group"]))
    group_dict = {g: i for i, g in enumerate(all_groups)}
    read_annotation["group"] = read_annotation["group"].map(group_dict.get)
    group_dict = {v: k for k, v in group_dict.items()}
    read_annotation = read_annotation.set_index("read_name")["group"].to_dict()
    with MetH5File(m5file, mode="a", chunk_size=chunk_size) as m5:
        m5.annotate_read_groups(read_groups_key, read_annotation, labels=group_dict, exists_ok=True, overwrite=True)

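# Example (illustrative sketch): annotating read groups directly, bypassing the
# read-group TSV; read names, group ids, and labels here are all made up.
def _example_annotate() -> None:
    annotation = {"read_0001": 0, "read_0002": 1}  # read name -> group id
    labels = {0: "H1", 1: "H2"}  # group id -> human-readable label
    with MetH5File("sample.m5", mode="a") as m5:
        m5.annotate_read_groups("haplotype", annotation, labels=labels, exists_ok=True, overwrite=True)
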
def main(
    chunk_size: int,
    input_paths: List[Path],
    output_file: Path,
    compression: str,
    allowed_chromosomes: List[str],
    quiet: bool,
):
    if compression == "None":
        compression = None
    if allowed_chromosomes is not None and len(allowed_chromosomes) == 0:
        allowed_chromosomes = None
    # Collect input files, expanding directories one level deep
    input_files = []
    for input_path in input_paths:
        if not input_path.exists():
            raise ValueError(f"Cannot find path {input_path}")
        if input_path.is_file():
            input_files.append(input_path)
        elif input_path.is_dir():
            subfiles = list(input_path.iterdir())
            if len(subfiles) == 0:
                raise ValueError(f"Provided input path refers to a directory which is empty: {input_path}")
            input_files += [f for f in subfiles if f.is_file()]
    with MetH5File(output_file, chunk_size=chunk_size, mode="w", compression=compression) as m5_out:
        for input_file in tqdm.tqdm(input_files) if not quiet else input_files:
            m5_out.parse_and_add_nanopolish_file(
                input_file, postpone_sorting_until_close=True, include_chromosomes=allowed_chromosomes
            )
        m5_out.create_chunk_index()

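# Example (illustrative sketch): the core write path used above - parse nanopolish
# call TSVs into a fresh m5 file and index it; the file names are hypothetical.
def _example_create_m5() -> None:
    with MetH5File("merged.m5", chunk_size=50000, mode="w") as m5_out:
        for tsv in [Path("calls_a.tsv.gz"), Path("calls_b.tsv.gz")]:
            m5_out.parse_and_add_nanopolish_file(tsv, postpone_sorting_until_close=True)
        m5_out.create_chunk_index()
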
def worker_reader(
    m5files: List[Path],
    chunk_size: int,
    chromosome: str,
    window_size: int,
    input_queue: Queue,
    chunks: List[int],
    progress_per_chunk: float,
    read_groups_keys: List[str],
):
    firstfile = m5files[0]
    with MetH5File(firstfile, "r", chunk_size=chunk_size) as m5:
        chrom_container = m5[chromosome]
        for chunk in chunks:
            values_container = chrom_container.get_chunk(chunk)
            met_matrix: SparseMethylationMatrixContainer = values_container.to_sparse_methylation_matrix(
                read_read_names=False, read_groups_key=read_groups_keys
            )
            if len(m5files) > 0:
                # Label reads with their source file (and read group, if any)
                # so samples remain distinguishable after merging
                if read_groups_keys is None:
                    met_matrix.read_samples = np.array([f"{firstfile.name}" for _ in met_matrix.read_names])
                else:
                    met_matrix.read_samples = np.array(
                        [f"{firstfile.name}_{sn}" for sn in met_matrix.read_samples]
                    )
            for other_m5file in m5files[1:]:
                with MetH5File(other_m5file, "r", chunk_size=chunk_size) as other_m5:
                    # Pull the matching coordinate range from the other file and merge it in
                    other_ranges = values_container.get_ranges()
                    other_values_container = other_m5[chromosome].get_values_in_range(
                        other_ranges[0, 0], other_ranges[-1, 1]
                    )
                    other_met_matrix = other_values_container.to_sparse_methylation_matrix(
                        read_read_names=False, read_groups_key=read_groups_keys
                    )
                    if other_met_matrix.met_matrix.shape[0] <= 1:
                        continue
                    if read_groups_keys is None:
                        other_met_matrix.read_samples = np.array(
                            [f"{other_m5file.name}" for _ in other_met_matrix.read_names]
                        )
                    else:
                        other_met_matrix.read_samples = np.array(
                            [f"{other_m5file.name}_{sn}" for sn in other_met_matrix.read_samples]
                        )
                    met_matrix = met_matrix.merge(other_met_matrix, sample_names_mode="keep")
            if read_groups_keys is None and len(m5files) == 1:
                # Single file, no read groups: treat each read as its own sample
                met_matrix.read_samples = met_matrix.read_names
            # Split the chunk into fixed-size windows of calling sites and hand
            # them to the segmentation workers
            total_sites = len(met_matrix.genomic_coord)
            num_windows = (total_sites // window_size) + 1
            progress_per_window = progress_per_chunk / num_windows
            for window_start in range(0, total_sites + 1, window_size):
                window_end = window_start + window_size
                logging.debug(f"Submitting window {window_start}-{window_end}")
                sub_matrix = met_matrix.get_submatrix(window_start, window_end)
                input_queue.put((sub_matrix, progress_per_window))

def Meth_Seg(
    h5_file_list: List[Path],
    output_tsv_fn: str,
    chromosome: str,
    chunk_size: int = int(5e4),
    chunks: List[int] = None,
    workers: int = 1,
    reader_workers: int = 1,
    progress: bool = False,
    window_size: int = 300,
    max_segments_per_window: int = 10,
    read_groups_keys: List[str] = None,
    print_diff_met: bool = False,
    output_bedgraph_fn: str = None,
    **kwargs,
):
    """
    Methylation segmentation method implemented as a Bayesian changepoint detection algorithm
    * h5_file_list
        A list of MetH5 files containing methylation LLRs
    * chromosome
        The chromosome to segment
    * chunk_size
        Number of LLRs per chunk - for best performance, should be a multiple of the chunk size
        used when creating the h5 files. The default is the same as the default for creating
        meth5 files.
    * chunks
        List of chunk IDs, or None if all chunks of the chromosome are to be segmented
    * workers
        Number of worker processes
    * reader_workers
        Number of reader worker processes
    * progress
        True if a progress bar is desired
    * output_tsv_fn
        Output TSV file
    * window_size
        Window size for segmentation in number of CpG calling sites. Default: 300.
        Increasing this increases the memory requirement.
    * max_segments_per_window
        Maximum number of segments per window. Should probably be somewhere between 8 and 20.
        The larger the number, the more expensive the computation.
    * read_groups_keys
        If read groups should be considered (e.g. haplotype), pass the read group key.
        You can provide more than one.
    * print_diff_met
        Whether the output TSV file should contain the methylation rate difference between samples
    * output_bedgraph_fn
        Base name for the bedgraphs to be written. One bedgraph per sample/read group will be created.
    """
    input_queue = Queue(maxsize=workers * 5)
    output_queue = Queue(maxsize=workers * 100)

    for m5file in h5_file_list:
        validate_chromosome_selection(m5file, chromosome, chunk_size)

    firstm5 = h5_file_list[0]
    if chunks is None:
        # No chunks have been provided, take all
        with MetH5File(firstm5, mode="r", chunk_size=chunk_size) as f:
            chunks = list(range(f[chromosome].get_number_of_chunks()))
    else:
        # Flatten chunk list, since we allow a list of chunks or a list of chunk ranges
        # (which are converted to lists in parsing)
        chunks = [
            chunk for subchunks in chunks for chunk in ([subchunks] if isinstance(subchunks, int) else subchunks)
        ]
        validate_chunk_selection(firstm5, chromosome, chunk_size, chunks)

    # Sort and make unique
    chunks = sorted(set(chunks))
    progress_per_chunk = 100 / len(chunks)

    segmentation_processes = [
        Process(target=worker_segment, args=(input_queue, output_queue, max_segments_per_window))
        for _ in range(workers)
    ]
    for p in segmentation_processes:
        p.start()

    reader_workers = min(reader_workers, len(chunks))
    chunk_per_process = np.array_split(chunks, reader_workers)
    reader_processes = [
        Process(
            target=worker_reader,
            args=(
                h5_file_list,
                chunk_size,
                chromosome,
                window_size,
                input_queue,
                p_chunks,
                progress_per_chunk,
                read_groups_keys,
            ),
        )
        for p_chunks in chunk_per_process
    ]
    for p in reader_processes:
        p.start()

    output_process = Process(
        target=worker_output,
        args=(
            output_queue,
            output_tsv_fn,
            output_bedgraph_fn,
            chromosome,
            read_groups_keys,
            print_diff_met,
            not progress,
        ),
    )
    output_process.start()

    for p in reader_processes:
        p.join()

    # Deal poison pills to segmentation workers
    for p in segmentation_processes:
        input_queue.put(None)
    for p in segmentation_processes:
        p.join()

    # Deal poison pill to writer worker
    output_queue.put(None)
    output_process.join()

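# Example (illustrative sketch): segmenting chromosome 8 from two hypothetical
# haplotagged MetH5 files; the file names and the "haplotype" key are assumptions.
def _example_run_meth_seg() -> None:
    Meth_Seg(
        h5_file_list=[Path("sample_a.m5"), Path("sample_b.m5")],
        output_tsv_fn="segments_chr8.tsv",
        chromosome="8",
        workers=4,
        reader_workers=2,
        read_groups_keys=["haplotype"],
        progress=True,
    )
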
def validate_chromosome_selection(m5file: Path, chromosome: str, chunk_size: int):
    with MetH5File(m5file, "r", chunk_size=chunk_size) as m5:
        if chromosome not in m5.get_chromosomes():
            raise ValueError(f"Chromosome {chromosome} not found in m5 file.")

def main(m5files: List[Path], chunk_size: int):
    for m5file in m5files:
        print(f"{m5file.name}: ")
        with MetH5File(m5file, "r", chunk_size=chunk_size) as f:
            for chrom in f.get_chromosomes():
                print(f"{chrom}: {f[chrom].get_number_of_chunks()}")

def main(
    chunk_size: int,
    input_m5_files: List[Path],
    read_group_names: List[str],
    read_groups_key: str,
    output_file: Path,
    compression: str,
    compression_level: int,
    allowed_chromosomes: List[str],
    no_read_groups: bool,
    quiet: bool,
    no_read_names: bool,
):
    if compression == "None":
        compression = None
    if allowed_chromosomes is not None and len(allowed_chromosomes) == 0:
        allowed_chromosomes = None
    if read_groups_key is not None:
        if len(read_group_names) != len(input_m5_files):
            raise ValueError("List of read group prefixes must match number of input files")

    # Pre-compute chromosome sizes so the output file can be allocated up front
    total_chrom_sizes = compute_total_chrom_sizes(input_m5_files)

    with MetH5File(
        output_file,
        chunk_size=chunk_size,
        mode="w",
        compression=compression,
        compression_level=compression_level,
        max_calls=total_chrom_sizes,
    ) as m5_out:
        if read_groups_key is not None:
            rg_maps = {read_groups_key: {}}
        else:
            rg_maps = {}
        # Offset numeric read ids per input file so they stay unique in the merged output
        read_id_offset = 0
        for i, input_file in enumerate(input_m5_files):
            max_read_id_local = 0
            with MetH5File(input_file, "r") as m5_in, tqdm.tqdm(total=100, disable=quiet) as pbar:
                print("Reading ", input_file)
                read_read_groups(m5_in, no_read_groups, read_group_names[i], read_groups_key, no_read_names, rg_maps)
                if allowed_chromosomes is None:
                    chromosomes = set(m5_in.get_chromosomes())
                else:
                    chromosomes = set(allowed_chromosomes).intersection(set(m5_in.get_chromosomes()))
                print("Copying methylation calls")
                progress = 0
                percent_per_chrom = 100 / len(chromosomes)
                for chromosome in chromosomes:
                    chrom_container = m5_in[chromosome]
                    percent_per_chunk = percent_per_chrom / chrom_container.get_number_of_chunks()
                    for chunk in chrom_container.get_chunk_ids():
                        values_container = chrom_container.get_chunk(chunk, overlap=False)
                        ranges = values_container.get_ranges()
                        llrs = values_container.get_llrs()
                        if no_read_names:
                            read_names = values_container.get_read_ids()
                            read_names += read_id_offset
                            max_read_id_local = max(max_read_id_local, max(read_names))
                            read_names_key = "read_id"
                        else:
                            read_names = values_container.get_read_names()
                            read_names_key = "read_name"
                        df = pd.DataFrame(
                            {
                                "chromosome": chromosome,
                                "start": ranges[:, 0],
                                "end": ranges[:, 1],
                                read_names_key: read_names,
                                "log_lik_ratio": llrs,
                            }
                        )
                        m5_out.add_to_h5_file(df, postpone_sorting_until_close=True)
                        progress += percent_per_chunk
                        pbar.n = progress
                        pbar.refresh()
            # Next file continues numbering after the largest read id seen so far
            read_id_offset = max_read_id_local + 1
        print("Indexing")
        m5_out.create_chunk_index()
        write_read_groups(m5_out, no_read_groups, rg_maps)

def test_create_h5(self):
    with TemporaryFile() as tmp_f:
        # ==== Creating a new H5 file from a regular nanopolish output ====
        # Note that chunk size is only that small for the sake of the test
        with MetH5File(tmp_f, "w", chunk_size=10) as mf:
            mf.parse_and_add_nanopolish_file(self.datadir.joinpath("nanopolish_calls.tsv.gz"))
            # Creating this index will make random access MUCH faster for large H5 files
            mf.create_chunk_index()
            # Test adding read groups
            mf.annotate_read_groups("test_group", self.read_groups)

        # ==== Test if we can read the new H5 file we just created ====
        with MetH5File(tmp_f, "r", chunk_size=10) as mf:
            chroms = mf.get_chromosomes()
            # Our test data only contains a single chromosome (chr8)
            self.assertEqual(len(chroms), 1)
            self.assertEqual(chroms[0], "8")

            # Access values for that chromosome
            chrom_container = mf[chroms[0]]

            # ---- Access chunk-wise: ----
            value_container = chrom_container.get_chunk(0)
            self.assertEqual(len(value_container.get_read_groups("test_group")), 10)

            # Test conversion to sparse matrix
            matrix_container = value_container.to_sparse_methylation_matrix(read_groups_key="test_group")
            # Only 1 read in that chunk
            self.assertEqual(matrix_container.met_matrix.shape[0], 1)
            # 10 sites in that chunk
            self.assertEqual(matrix_container.met_matrix.shape[1], 10)

            # ---- Access region-wise: ----
            value_container = chrom_container.get_values_in_range(97732352, 97745195)
            # Test conversion to sparse matrix
            matrix_container = value_container.to_sparse_methylation_matrix(read_groups_key="test_group")
            # All 4 reads in that region
            self.assertEqual(matrix_container.met_matrix.shape[0], 4)
            # 184 sites in that region
            self.assertEqual(matrix_container.met_matrix.shape[1], 184)

            # Test subsetting by extracting reads from a sample:
            sub_matrix = matrix_container.get_submatrix_from_read_mask(matrix_container.read_samples == 1)
            # Only 2 of the reads passed the filter
            self.assertEqual(sub_matrix.met_matrix.shape[0], 2)
            # Subsetting reads also compacted the sites dimension:
            self.assertEqual(sub_matrix.met_matrix.shape[1], 146)

            # Creates a matrix from two value containers (the same one - just for testing)
            two_sample_matrix = create_sparse_matrix_from_samples(
                {"A": value_container, "B": value_container},
                sample_prefix_readnames=True,
            )
            # Matrix should have twice as many reads in two samples but the same number of sites
            self.assertEqual(len(two_sample_matrix.read_samples), 8)
            self.assertEqual(len(set(two_sample_matrix.read_samples)), 2)
            self.assertEqual(two_sample_matrix.met_matrix.shape[1], 184)

            # Try the LLR aggregation
            self.assertEqual(value_container.get_llr_site_median()[0].shape[0], 184)

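# Example (illustrative sketch): the region-wise access pattern exercised by the
# test above, distilled into a helper; the path, chromosome, coordinates, and
# read-group key are hypothetical.
def _example_region_matrix():
    with MetH5File("sample.m5", "r") as mf:
        value_container = mf["8"].get_values_in_range(97732352, 97745195)
        matrix = value_container.to_sparse_methylation_matrix(read_groups_key="test_group")
        # Rows are reads, columns are CpG calling sites; entries are methylation LLRs
        return matrix.met_matrix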