Example #1
    def __init__(self, h5_read_groups_key: str, sample_id_list: List,
                 h5_file_list: List):
        self.h5_read_groups_key = h5_read_groups_key
        self.sample_hf_files: Dict[str, MetH5File] = {}
        self.llr_threshold = 2.0  # TODO expose parameter
        self.min_diff_bs = 0.25  # TODO expose parameter

        if h5_read_groups_key is None:
            # One MetH5 file per sample: open a separate handle for each
            for sample_id, h5_file in zip(sample_id_list, h5_file_list):
                hf = MetH5File(h5_file, "r")
                self.sample_hf_files[sample_id] = hf
        else:
            # Samples are read groups within a single MetH5 file:
            # every sample id shares the same file handle
            hf = MetH5File(h5_file_list[0], "r")
            for sample_id in sample_id_list:
                self.sample_hf_files[sample_id] = hf
Example #2
import argparse
from pathlib import Path

from meth5.meth5 import MetH5File


def argtype_M5File(value):
    # Try opening the file as MetH5 to verify it is readable
    try:
        MetH5File(value, "r").get_chromosomes()
    except Exception:
        raise argparse.ArgumentTypeError(
            f"Failed to read '{value}'. Is it a valid MetH5 file?")
    return Path(value)
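A function with this signature is meant to be used as the type= callable of an argparse argument, so the file is validated while the command line is parsed. A minimal wiring sketch (the parser and argument name are illustrative, not from the source):

import argparse

parser = argparse.ArgumentParser(description="Operate on a MetH5 file")
# An ArgumentTypeError raised by argtype_M5File becomes a clean usage
# error instead of a traceback.
parser.add_argument("m5file", type=argtype_M5File,
                    help="Path to a MetH5 file (validated on parse)")
args = parser.parse_args()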
Example #3
def validate_chunk_selection(m5file: Path, chromosome: str, chunk_size: int,
                             chunks: List[int]):
    with MetH5File(m5file, "r", chunk_size=chunk_size) as m5:
        num_chunks = m5[chromosome].get_number_of_chunks()
        if max(chunks) >= num_chunks:
            raise ValueError(
                f"Chunk {max(chunks)} not in chromosome. Must be in range 0-{num_chunks - 1}"
            )
Example #4
def compute_total_chrom_sizes(input_m5_files: List[str]) -> Dict[str, int]:
    chrom_size = {}
    for input_file in input_m5_files:
        with MetH5File(input_file, "r") as m5_in:
            for chrom in m5_in.get_chromosomes():
                # Accumulate the number of calls stored for this chromosome
                chrom_size[chrom] = chrom_size.get(chrom, 0) + len(m5_in[chrom])
    return chrom_size
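A quick usage sketch (the file names are hypothetical). The result maps each chromosome to the total number of methylation calls across all inputs, which Example #12 below passes to MetH5File as max_calls:

sizes = compute_total_chrom_sizes(["sample1.m5", "sample2.m5"])
for chrom, n_calls in sorted(sizes.items()):
    print(f"{chrom}\t{n_calls}")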
Example #5
def read_sample_ids_from_read_groups(h5_file_list,
                                     read_group_key,
                                     labels=None):
    rg_dict = {}
    for fn in h5_file_list:
        with MetH5File(fn, "r") as f:
            f_rg_dict = f.get_all_read_groups(read_group_key)
            for k, v in f_rg_dict.items():
                if k in rg_dict and rg_dict[k] != v:
                    raise pycoMethError(
                        "Read groups in meth5 files must have the same encoding"
                    )
            rg_dict.update(f_rg_dict)
    if labels is not None:
        # Keep only read groups whose label is in the requested set
        return [k for k, v in rg_dict.items() if v in labels]
    return list(rg_dict)
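For intuition, get_all_read_groups presumably returns the id-to-label encoding stored under the given key; with a hypothetical encoding, the two return paths behave like this:

rg_dict = {0: "HP1", 1: "HP2"}  # hypothetical id -> label encoding
assert [k for k, v in rg_dict.items() if v in ["HP2"]] == [1]  # labels given: filter
assert list(rg_dict) == [0, 1]                                 # labels=None: all ids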
Example #6
def main(
    m5file: Path,
    read_groups_key: str,
    read_group_file: Path,
    chunk_size: int,
):
    read_annotation = read_readgroups(read_group_file)
    all_groups = list(set(read_annotation["group"]))
    group_dict = {g: i for i, g in enumerate(all_groups)}  # label -> integer code
    read_annotation["group"] = read_annotation["group"].map(group_dict.get)
    group_dict = {v: k for k, v in group_dict.items()}  # invert: integer code -> label
    read_annotation = read_annotation.set_index("read_name")["group"].to_dict()

    with MetH5File(m5file, mode="a", chunk_size=chunk_size) as m5:
        m5.annotate_read_groups(read_groups_key,
                                read_annotation,
                                labels=group_dict,
                                exists_ok=True,
                                overwrite=True)
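The two group_dict assignments first label-encode the group column and then invert the mapping so the integer codes can later be written out as labels. A self-contained sketch of that round trip (the data frame content is made up):

import pandas as pd

read_annotation = pd.DataFrame({
    "read_name": ["r1", "r2", "r3"],
    "group": ["HP1", "HP2", "HP1"],
})
all_groups = list(set(read_annotation["group"]))
group_dict = {g: i for i, g in enumerate(all_groups)}  # label -> integer code
read_annotation["group"] = read_annotation["group"].map(group_dict.get)
group_dict = {v: k for k, v in group_dict.items()}     # integer code -> label
print(read_annotation.set_index("read_name")["group"].to_dict())
# e.g. {'r1': 0, 'r2': 1, 'r3': 0}; codes depend on set iteration order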
Example #7
def main(
    chunk_size: int,
    input_paths: List[Path],
    output_file: Path,
    compression: str,
    allowed_chromosomes: List[str],
    quiet: bool,
):
    if compression == "None":
        compression = None

    if allowed_chromosomes is not None and len(allowed_chromosomes) == 0:
        allowed_chromosomes = None

    input_files = []
    for input_path in input_paths:
        if not input_path.exists():
            raise ValueError(f"Cannot find path {input_path}")
        if input_path.is_file():
            input_files.append(input_path)
        elif input_path.is_dir():
            subfiles = list(input_path.iterdir())
            if len(subfiles) == 0:
                raise ValueError(
                    f"Provided input path refers to a directory which is empty: {input_path}"
                )
            input_files += [f for f in subfiles if f.is_file()]

    with MetH5File(output_file,
                   chunk_size=chunk_size,
                   mode="w",
                   compression=compression) as m5_out:
        file_iter = input_files if quiet else tqdm.tqdm(input_files)
        for input_file in file_iter:
            m5_out.parse_and_add_nanopolish_file(
                input_file,
                postpone_sorting_until_close=True,
                include_chromosomes=allowed_chromosomes)
        m5_out.create_chunk_index()
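A hypothetical invocation of this entry point, merging every nanopolish call file found in a directory into one chunked MetH5 file (all argument values are made up; "gzip" is assumed to be an accepted HDF5 compression name):

from pathlib import Path

main(
    chunk_size=50_000,             # matches the default mentioned in Example #9
    input_paths=[Path("calls/")],  # hypothetical directory of nanopolish TSVs
    output_file=Path("merged.m5"),
    compression="gzip",
    allowed_chromosomes=None,      # keep all chromosomes
    quiet=False,
)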
Example #8
def worker_reader(
    m5files: List[Path],
    chunk_size: int,
    chromosome: str,
    window_size: int,
    input_queue: Queue,
    chunks: List[int],
    progress_per_chunk: float,
    read_groups_keys: List[str],
):
    firstfile = m5files[0]
    with MetH5File(firstfile, "r", chunk_size=chunk_size) as m5:
        chrom_container = m5[chromosome]

        for chunk in chunks:
            values_container = chrom_container.get_chunk(chunk)
            met_matrix: SparseMethylationMatrixContainer = values_container.to_sparse_methylation_matrix(
                read_read_names=False, read_groups_key=read_groups_keys)

            # Prefixing sample names with the source file name is only
            # needed when several files are merged
            if len(m5files) > 1:
                if read_groups_keys is None:
                    met_matrix.read_samples = np.array(
                        [f"{firstfile.name}" for _ in met_matrix.read_names])
                else:
                    met_matrix.read_samples = np.array([
                        f"{firstfile.name}_{sn}"
                        for sn in met_matrix.read_samples
                    ])

            for other_m5file in m5files[1:]:
                with MetH5File(other_m5file, "r",
                               chunk_size=chunk_size) as other_m5:
                    other_ranges = values_container.get_ranges()
                    other_values_container = other_m5[
                        chromosome].get_values_in_range(
                            other_ranges[0, 0], other_ranges[-1, 1])
                    other_met_matrix = other_values_container.to_sparse_methylation_matrix(
                        read_read_names=False,
                        read_groups_key=read_groups_keys)
                    if other_met_matrix.met_matrix.shape[0] <= 1:
                        continue

                    if read_groups_keys is None:
                        other_met_matrix.read_samples = np.array([
                            f"{other_m5file.name}"
                            for _ in other_met_matrix.read_names
                        ])
                    else:
                        other_met_matrix.read_samples = np.array([
                            f"{other_m5file.name}_{sn}"
                            for sn in other_met_matrix.read_samples
                        ])
                    met_matrix = met_matrix.merge(other_met_matrix,
                                                  sample_names_mode="keep")

            if read_groups_keys is None and len(m5files) == 1:
                met_matrix.read_samples = met_matrix.read_names
            total_sites = len(met_matrix.genomic_coord)
            num_windows = (total_sites // window_size) + 1
            progress_per_window = progress_per_chunk / num_windows
            for window_start in range(0, total_sites + 1, window_size):
                window_end = window_start + window_size
                logging.debug(f"Submitting window {window_start}-{window_end}")
                sub_matrix = met_matrix.get_submatrix(window_start, window_end)
                input_queue.put((sub_matrix, progress_per_window))
Example #9
def Meth_Seg(
    h5_file_list: List[Path],
    output_tsv_fn: str,
    chromosome: str,
    chunk_size: int = int(5e4),
    chunks: List[int] = None,
    workers: int = 1,
    reader_workers: int = 1,
    progress: bool = False,
    window_size: int = 300,
    max_segments_per_window: int = 10,
    read_groups_keys: List[str] = None,
    print_diff_met: bool = False,
    output_bedgraph_fn: str = None,
    **kwargs,
):
    """
    Methylation segmentation method implemented as a bayesian changepoint detection algorithm
    * h5_file_list
        A list of MetH5 files containing methylation llr
    * chromosome
        The chromosome to segment
    * chunk_size
        Number of llrs per chunk - for best performance, should be a multiple of the  chunksize used in creating of the h5 files
        Default is the same as the default for creating meth5 files.
    * chunks
        List of chunk IDs or None if all chunks of the chromsome are to be segmented
    * workers
        Number of worker processes
    * reader_workers
        Number of reader worker processes
    * progress
        True if  progress bar is desired
    * output_tsv_fn
        Output TSV file
    * window_size
        Window size for segmentation in number of CpG calling sites. Default: 300.
        Increasing this increases memory requirement
    * max_segments_per_window
        Maximum number of segments per window. Should probably be somewhere between 8 and 20.
        The larger the number, the more expensive the computation.
    * read_groups_keys
        If read groups should be considered (e.g. haplotype) pass the read group key. You can provide more than one.
    * print_diff_met
        Whether output TSV file should contain methylation rate difference between samples
    * output_bedgraph_fn
        Base name for bedgraphs to be written. One bedgraph per sample/read_group will be created.
    """

    input_queue = Queue(maxsize=workers * 5)
    output_queue = Queue(maxsize=workers * 100)

    for m5file in h5_file_list:
        validate_chromosome_selection(m5file, chromosome, chunk_size)

    firstm5 = h5_file_list[0]
    if chunks is None:
        # No chunks have been provided, take all
        with MetH5File(firstm5, mode="r", chunk_size=chunk_size) as f:
            chunks = list(range(f[chromosome].get_number_of_chunks()))
    else:
        # flatten chunk list, since we allow a list of chunks or a list of chunk ranges
        # (which are converted to lists in parsing)
        chunks = [
            chunk for subchunks in chunks for chunk in (
                [subchunks] if isinstance(subchunks, int) else subchunks)
        ]

    validate_chunk_selection(firstm5, chromosome, chunk_size, chunks)

    # sort and make unique
    chunks = sorted(set(chunks))
    progress_per_chunk = 100 / len(chunks)

    # One segmentation process per worker
    segmentation_processes = [
        Process(target=worker_segment,
                args=(input_queue, output_queue, max_segments_per_window))
        for _ in range(workers)
    ]
    for p in segmentation_processes:
        p.start()

    reader_workers = min(reader_workers, len(chunks))
    chunk_per_process = np.array_split(chunks, reader_workers)
    reader_processes = [
        Process(
            target=worker_reader,
            args=(
                h5_file_list,
                chunk_size,
                chromosome,
                window_size,
                input_queue,
                p_chunks,
                progress_per_chunk,
                read_groups_keys,
            ),
        ) for p_chunks in chunk_per_process
    ]
    for p in reader_processes:
        p.start()

    output_process = Process(
        target=worker_output,
        args=(
            output_queue,
            output_tsv_fn,
            output_bedgraph_fn,
            chromosome,
            read_groups_keys,
            print_diff_met,
            not progress,
        ),
    )
    output_process.start()

    for p in reader_processes:
        p.join()

    # Deal poison pills to segmentation workers
    for p in segmentation_processes:
        input_queue.put(None)

    for p in segmentation_processes:
        p.join()

    # Deal poison pill to writer worker
    output_queue.put(None)
    output_process.join()
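The function is a three-stage producer/consumer pipeline: reader processes push windows onto input_queue, segmentation workers consume them, and None values act as poison pills that shut each stage down in order. An illustrative call (paths and tuning values are made up):

from pathlib import Path

Meth_Seg(
    h5_file_list=[Path("sample.m5")],  # hypothetical input
    output_tsv_fn="chr8_segments.tsv",
    chromosome="chr8",
    workers=4,
    reader_workers=2,
    progress=True,
)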
Example #10
def validate_chromosome_selection(m5file: Path, chromosome: str,
                                  chunk_size: int):
    with MetH5File(m5file, "r", chunk_size=chunk_size) as m5:
        if chromosome not in m5.get_chromosomes():
            raise ValueError(f"Chromosome {chromosome} not found in m5 file.")
Example #11
def main(m5files: List[Path], chunk_size: int):
    for m5file in m5files:
        print(f"{m5file.basename()}: ")
        with MetH5File(m5file, "r", chunk_size=chunk_size) as f:
            for chrom in f.get_chromosomes():
                print(f"{chrom}: {f[chrom].get_number_of_chunks()}")
Example #12
def main(
    chunk_size: int,
    input_m5_files: List[Path],
    read_group_names: List[str],
    read_groups_key: str,
    output_file: Path,
    compression: str,
    compression_level: int,
    allowed_chromosomes: List[str],
    no_read_groups: bool,
    quiet: bool,
    no_read_names: bool,
):
    if compression == "None":
        compression = None

    if allowed_chromosomes is not None and len(allowed_chromosomes) == 0:
        allowed_chromosomes = None

    if read_groups_key is not None:
        if len(read_group_names) != len(input_m5_files):
            raise ValueError(
                "List of read group prefixes must match the number of input files"
            )

    total_chrom_sizes = compute_total_chrom_sizes(input_m5_files)

    with MetH5File(
            output_file,
            chunk_size=chunk_size,
            mode="w",
            compression=compression,
            compression_level=compression_level,
            max_calls=total_chrom_sizes,
    ) as m5_out:

        if read_groups_key is not None:
            rg_maps = {read_groups_key: {}}
        else:
            rg_maps = {}

        read_id_offset = 0
        for i, input_file in enumerate(input_m5_files):
            max_read_id_local = 0
            with MetH5File(input_file,
                           "r") as m5_in, tqdm.tqdm(total=100,
                                                    disable=quiet) as pbar:
                print("Reading ", input_file)
                read_read_groups(m5_in, no_read_groups, read_group_names[i],
                                 read_groups_key, no_read_names, rg_maps)

                if allowed_chromosomes is None:
                    chromosomes = set(m5_in.get_chromosomes())
                else:
                    chromosomes = set(allowed_chromosomes).intersection(
                        set(m5_in.get_chromosomes()))

                print("Copying methylation calls")

                progress = 0
                percent_per_chrom = 100 / len(chromosomes)
                for chromosome in chromosomes:
                    chrom_container = m5_in[chromosome]
                    percent_per_chunk = (percent_per_chrom /
                                         chrom_container.get_number_of_chunks())
                    for chunk in chrom_container.get_chunk_ids():
                        values_container = chrom_container.get_chunk(
                            chunk, overlap=False)
                        ranges = values_container.get_ranges()
                        llrs = values_container.get_llrs()

                        if no_read_names:
                            read_names = values_container.get_read_ids()
                            read_names += read_id_offset
                            max_read_id_local = max(max_read_id_local,
                                                    max(read_names))
                            read_names_key = "read_id"
                        else:
                            read_names = values_container.get_read_names()
                            read_names_key = "read_name"

                        df = pd.DataFrame({
                            "chromosome": chromosome,
                            "start": ranges[:, 0],
                            "end": ranges[:, 1],
                            read_names_key: read_names,
                            "log_lik_ratio": llrs,
                        })
                        m5_out.add_to_h5_file(
                            df, postpone_sorting_until_close=True)
                        progress += percent_per_chunk
                        pbar.n = progress
                        pbar.refresh()
            if no_read_names:
                # Advance the offset so numeric read ids stay unique
                # across the merged input files
                read_id_offset = max_read_id_local + 1
        print("Indexing")
        m5_out.create_chunk_index()
        write_read_groups(m5_out, no_read_groups, rg_maps)
Example #13
    def test_create_h5(self):
        with TemporaryFile() as tmp_f:
            #  ==== Creating a new H5 file from a regular nanopolish output ====
            # Note that chunk size is only that small for the sake of the test
            with MetH5File(tmp_f, "w", chunk_size=10) as mf:
                mf.parse_and_add_nanopolish_file(
                    self.datadir.joinpath("nanopolish_calls.tsv.gz"))
                # Creating this index will make random access MUCH faster for large
                # H5 files
                mf.create_chunk_index()
                # Test adding read groups
                mf.annotate_read_groups("test_group", self.read_groups)

            # ==== Test if we can read the new H5 file we just created ====
            with MetH5File(tmp_f, "r", chunk_size=10) as mf:
                chroms = mf.get_chromosomes()

                # Our test data only contains a single chromosome (chr8)
                self.assertEqual(len(chroms), 1)
                self.assertEqual(chroms[0], "8")

                # Access values for that chromosome
                chrom_container = mf[chroms[0]]

                # ---- Access chunk-wise: ----
                value_container = chrom_container.get_chunk(0)
                self.assertEqual(
                    len(value_container.get_read_groups("test_group")), 10)

                # Test conversion to sparse matrix
                matrix_container = value_container.to_sparse_methylation_matrix(
                    read_groups_key="test_group")
                # Only 1 read in that chunk
                self.assertEqual(matrix_container.met_matrix.shape[0], 1)
                # 10 sites in that chunk
                self.assertEqual(matrix_container.met_matrix.shape[1], 10)

                # ---- Access region-wise: ----
                value_container = chrom_container.get_values_in_range(
                    97732352, 97745195)

                # Test conversion to sparse matrix
                matrix_container = value_container.to_sparse_methylation_matrix(
                    read_groups_key="test_group")

                # All 4 reads in that region
                self.assertEqual(matrix_container.met_matrix.shape[0], 4)
                # 184 sites in that region
                self.assertEqual(matrix_container.met_matrix.shape[1], 184)

                # Test subsetting by extracting reads from a sample:
                sub_matrix = matrix_container.get_submatrix_from_read_mask(
                    matrix_container.read_samples == 1)

                # Only 2 of the reads passed the filter
                self.assertEqual(sub_matrix.met_matrix.shape[0], 2)
                # Subsetting reads also compacted the sites dimension:
                self.assertEqual(sub_matrix.met_matrix.shape[1], 146)

                # Create a matrix from two value containers (the same one
                # twice - just for testing)
                two_sample_matrix = create_sparse_matrix_from_samples(
                    {
                        "A": value_container,
                        "B": value_container
                    },
                    sample_prefix_readnames=True,
                )
                # Matrix should have twice as many reads in two samples but the same
                # number of sites
                self.assertEqual(len(two_sample_matrix.read_samples), 8)
                self.assertEqual(len(set(two_sample_matrix.read_samples)), 2)
                self.assertEqual(two_sample_matrix.met_matrix.shape[1], 184)

                # Try the llr aggregation
                self.assertEqual(
                    value_container.get_llr_site_median()[0].shape[0], 184)