def test_create_new_genome_object():
    """Exercise creating, re-creating and deleting the sacCer3 genome.

    The genome is built once, every ``*.json`` file found under its
    ``path`` is removed, and the next construction is then expected to
    emit a ``RuntimeWarning``. A further construction is followed by
    smoke calls to ``gaps``, ``filled`` and ``str`` before the genome is
    deleted.
    """
    genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)

    # Remove the JSON files stored alongside the genome so that the next
    # construction goes through the path that raises the warning.
    json_pattern = "{path}/*.json".format(path=genome.path)
    for json_file in glob(json_pattern):
        os.remove(json_file)

    with pytest.warns(RuntimeWarning):
        genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)

    # Build it once more outside of the warning check, then smoke-test
    # the main accessors.
    genome = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    genome.gaps()
    genome.filled()
    str(genome)
    genome.delete()
Example #2
0
def get_gaps_statistics(
        genome: Genome, max_gap_size: int,
        window_size: int) -> Tuple[int, np.ndarray, np.ndarray]:
    """Return number, mean and covariance of the small gaps of a genome.

    Gaps strictly shorter than ``max_gap_size`` are expanded to
    ``window_size`` (center alignment), the matching sequences are
    retrieved and turned into a boolean mask that is True wherever the
    lower-cased nucleotide is ``"n"``.

    Parameters
    --------------------------
    genome:Genome,
        The genome to use.
    max_gap_size:int,
        Only gaps whose length is strictly below this value are kept.
    window_size:int
        The target window size the kept gaps are expanded to.

    Returns
    --------------------------
    Tuple containing the number of rows of the mask (one per retrieved
    sequence), the mean of the mask along its first axis and the
    covariance computed by ``np.cov`` on the transposed mask.
    """
    all_gaps = genome.gaps()
    # Keep only the gaps whose length is below the given threshold.
    gap_lengths = all_gaps.chromEnd - all_gaps.chromStart
    small_gaps = all_gaps[gap_lengths < max_gap_size]
    # Bring every remaining gap to the requested window size.
    windows = expand_bed_regions(small_gaps, window_size, alignment="center")
    # Lower-case the sequences so a single comparison catches "N" and "n".
    lowered = genome.bed_to_sequence(windows).sequence.str.lower()
    nucleotides = np.array([list(sequence) for sequence in lowered])
    is_gap = nucleotides == "n"
    return len(is_gap), is_gap.mean(axis=0), np.cov(is_gap.T)
Example #3
0
def test_expand_bed_regions():
    """Check that expanded regions always have exactly the requested size.

    Gaps of chr2 and chr3 of hg19 shorter than 500 are expanded with
    several window sizes and alignments; every resulting region must
    span exactly the requested number of positions.
    """
    genome = Genome("hg19", chromosomes=["chr2", "chr3"])
    small_gaps = genome.gaps(chromosomes=["chr2", "chr3"])
    small_gaps = small_gaps[small_gaps.chromEnd - small_gaps.chromStart < 500]

    # (window size, alignment) pairs, covering even and odd sizes.
    cases = (
        (200, "left"),
        (201, "right"),
        (200, "center"),
        (201, "center"),
        (173, "center"),
    )
    for size, alignment in cases:
        expanded = expand_bed_regions(small_gaps, size, alignment)
        assert (expanded.chromEnd - expanded.chromStart == size).all()
Example #4
0
def test_gaps():
    """Check gap and filled regions of chr1 of hg19.

    Gap regions must have non-zero length and contain only ``n``
    nucleotides once tessellated; filled regions must have non-zero
    length and contain no ``n`` at all, both without and with a
    ``strand`` column set to ``"."``.
    """
    genome = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in genome
    assert "chr2" not in genome

    # No gap may have zero length.
    gap_regions = genome.gaps(["chr1"])
    assert (gap_regions.chromEnd - gap_regions.chromStart != 0).all()
    # Sequences of tessellated gaps should be made of Ns only.
    gap_tiles = tessellate_bed(gap_regions, 200, verbose=False)
    for gap_sequence in genome.bed_to_sequence(gap_tiles):
        assert set(gap_sequence.lower()) == {"n"}

    # No filled region may have zero length either.
    filled_regions = genome.filled(["chr1"])
    assert (filled_regions.chromEnd - filled_regions.chromStart != 0).all()
    # Sequences of tessellated filled regions should not contain any N.
    filled_tiles = tessellate_bed(filled_regions, 200, verbose=False)
    for filled_sequence in genome.bed_to_sequence(filled_tiles):
        assert "n" not in filled_sequence.lower()

    # Same check once an explicit strand column is present.
    filled_tiles["strand"] = "."
    for filled_sequence in genome.bed_to_sequence(filled_tiles):
        assert "n" not in filled_sequence.lower()

    genome.delete()
Example #5
0
class GenomeWindowsGenerator:
    """Build fixed-size windows from a genome and serve them in batches.

    On ``compile`` the filled regions of the selected chromosomes are
    split into windows, their sequences are retrieved and divided into
    train and validation sets, and a mask of the small gaps is rendered
    and passed to ``_model_gaps`` to obtain ``_mean`` and ``_cov``.
    Batches are produced by slicing ``_dataset_generator(dataset)`` and
    mapping ``one_hot_encoder`` over them with a process pool.

    The instance owns a ``multiprocessing``-style pool created in
    ``__init__``; call ``close`` when done to release it.
    """

    # Allowed values for the ``n_type`` constructor argument.
    n_types = ["uniform", "normal"]

    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        """Configure the generator and, optionally, compile it right away.

        Parameters
        --------------------------
        assembly:
            Genome assembly, forwarded to ``Genome`` and used in the
            cache directory path.
        window_size:
            Size of the windows the genome is split into and of the
            windows rendered around the gaps.
        batch_size:
            Number of items taken from the dataset for each batch.
        buffer_size:
            Number of batches grouped in each buffer. When falsy, the
            value of ``cpu_count()`` is used.
        max_gap_size:
            Gaps whose length is at most this value are used to render
            the gap mask.
        train_chromosomes:
            Chromosomes used for training. When falsy, every chromosome
            of the genome is used (those listed in ``val_chromosomes``
            are still routed to the validation set).
        val_chromosomes:
            Chromosomes used for validation. When falsy, an empty list.
        cache_dir:
            Base cache directory. Falls back to the ``CACHE_PATH``
            environment variable and then to ``/tmp``.
        lazy_load:
            Forwarded to ``Genome``.
        clear_cache:
            Whether to remove this instance's cache directory first.
        compile_on_start:
            Whether to call ``compile`` at the end of construction.
        n_type:
            One of ``n_types``. Stored and included in the instance hash.

        Raises
        --------------------------
        ValueError,
            If ``n_type`` is not one of ``n_types``.
        """
        self.assembly, self.window_size = assembly, window_size
        self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes

        # Buffersize default None == cpu count for optimal performance:
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N. The message lists the allowed values,
        # not the rejected one, so the caller can see what is valid.
        if n_type not in self.n_types:
            raise ValueError("n_type must be one of %s" % self.n_types)
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        # Per-assembly, per-window-size directory referenced by the
        # ``cache_method`` path templates below.
        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes passed then use all the genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        # Identifies this configuration in the cache file names.
        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()

    def compile(self):
        """Prepare train/validation windows and the gap statistics.

        Sets ``_windows_train``, ``_windows_val``, ``_mean`` and ``_cov``.
        """
        filled = self._filled()
        windows = self._tasselize_windows(filled, self.window_size)
        sequences = self._encode_sequences(windows)

        self._windows_train, self._windows_val = self._train_val_split(
            sequences)

        gap_mask = self._render_gaps()
        self._mean, self._cov = _model_gaps(gap_mask)

    def _train_val_split(self, sequences):
        """Split the per-chromosome sequences into train and validation.

        Parameters
        --------------------------
        sequences:
            Mapping from chromosome to an object exposing a ``sequence``
            column, as returned by ``_encode_sequences``.

        Returns
        --------------------------
        Tuple of two flat lists: sequences of the chromosomes that are
        not in ``val_chromosomes`` and sequences of those that are.
        """
        # TODO do we need a seed here?
        val_chromosomes = self.val_chromosomes
        # Flatten with chain.from_iterable: linear in the number of
        # windows, whereas sum(lists, []) copies the accumulator at
        # every step.
        windows_train = list(itertools.chain.from_iterable(
            sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Groupping Train windows", leave=False)
            if chrom not in val_chromosomes))
        windows_val = list(itertools.chain.from_iterable(
            sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Groupping val windows", leave=False)
            if chrom in val_chromosomes))
        return windows_train, windows_val

    def steps_per_epoch(self):
        """Return the number of full train batches."""
        return len(self._windows_train) // self.batch_size

    def validation_steps(self):
        """Return the number of full validation batches."""
        return len(self._windows_val) // self.batch_size

    # NOTE(review): cache_method is defined elsewhere; presumably it
    # stores/reloads the result at the formatted path -- confirm.
    @cache_method("{_cache_directory}/{instance_hash}_filled.pkl")
    def _filled(self):
        """Return the filled regions of the selected chromosomes."""
        return self.genome.filled(chromosomes=self.chromosomes)

    @cache_method("{_cache_directory}/{instance_hash}_gap_mask.pkl")
    def _render_gaps(self):
        """Return a boolean mask of the small gaps of the genome.

        Gaps no longer than ``max_gap_size`` are replaced by windows
        centred on their mid point and spanning ``window_size``; each
        row of the result is True where the window's nucleotide is
        ``n`` (case-insensitive).
        """
        # Compute
        gaps = self.genome.gaps(chromosomes=self.chromosomes)
        # Keeping only small gaps
        gaps = gaps[gaps.chromEnd - gaps.chromStart <= self.max_gap_size]
        # Expand windows
        mid_point = ((gaps.chromEnd + gaps.chromStart) / 2).astype(int)
        gaps.chromStart = (mid_point - self.window_size / 2).astype(int)
        gaps.chromEnd = (mid_point + self.window_size / 2).astype(int)
        # Rendering gap sequences
        gapped_sequences = self.genome.bed_to_sequence(gaps)
        # Rendering gap mask
        return np.array([
            np.array(list(sequence.lower())) == "n"
            for sequence in gapped_sequences.sequence
        ])

    @cache_method("{_cache_directory}/{instance_hash}_tasselized.pkl")
    def _tasselize_windows(self, bed: pd.DataFrame, window_size: int):
        """Split every region of ``bed`` into windows using the pool.

        One ``(chrom, chromStart, chromEnd, window_size)`` task is built
        per row and handed to ``tasselize_window``; the per-row results
        are concatenated into a single frame.
        """
        # Compute
        tasks = ((row.chrom, row.chromStart, row.chromEnd, window_size)
                 for _, row in bed.iterrows())
        return pd.concat(
            list(
                tqdm(self.pool.imap(tasselize_window, tasks),
                     total=bed.shape[0],
                     desc="Tasselizing windows",
                     leave=False)))

    @cache_method("{_cache_directory}/{instance_hash}_sequences.pkl")
    def _encode_sequences(self, windows):
        """Return the sequences of ``windows`` grouped by chromosome."""
        bed = self.genome.bed_to_sequence(windows)
        return {chrom: data for chrom, data in bed.groupby("chrom")}

    def batchsize_scheduler(self):
        """Yield the batch size to use for each buffer, indefinitely.

        This implementation always yields ``batch_size``.
        """
        while True:
            yield self.batch_size

    def _buffer_generator(self, dataset):
        """Yield buffers of ``buffer_size`` raw batches from ``dataset``.

        Every batch is the next ``batch_size`` items of
        ``_dataset_generator(dataset)``.
        """
        iterable = _dataset_generator(dataset)
        for batch_size in self.batchsize_scheduler():
            yield [
                list(itertools.islice(iterable, batch_size))
                for _ in range(self.buffer_size)
            ]

    def _buffer_encoder_generator(self, dataset):
        """Yield buffers whose batches went through ``one_hot_encoder``."""
        for buffer in self._buffer_generator(dataset):
            yield list(self.pool.imap(one_hot_encoder, buffer))

    def _generator(self, dataset):
        """Yield encoded batches of ``dataset`` one at a time."""
        for buffer in self._buffer_encoder_generator(dataset):
            for batch in buffer:
                yield batch

    def generator(self):
        """Return the generator of encoded train batches."""
        return self._generator(self._windows_train)

    def validation_data(self):
        """Return the generator of encoded validation batches.

        Raises
        --------------------------
        ValueError,
            If no validation chromosomes were specified.
        """
        if not self.val_chromosomes:
            raise ValueError("Can't return the val generator since "
                             "no val chromosomes were specified")
        return self._generator(self._windows_val)

    def clean_cache(self):
        """Remove this instance's cache directory, if it exists."""
        if os.path.exists(self._cache_directory):
            shutil.rmtree(self._cache_directory)

    def close(self):
        """Close and join the process pool, if it was created."""
        # The pool attribute is missing when __init__ failed before
        # creating it (e.g. on an invalid n_type).
        if "pool" in vars(self):
            self.pool.close()
            self.pool.join()