def test_create_new_genome_object():
    sacCer3 = Genome(
        "sacCer3",
        chromosomes=sacCer3_chromosomes,
    )
    for path in glob("{path}/*.json".format(path=sacCer3.path)):
        os.remove(path)
    with pytest.warns(RuntimeWarning):
        sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3 = Genome("sacCer3", chromosomes=sacCer3_chromosomes)
    sacCer3.gaps()
    sacCer3.filled()
    str(sacCer3)
    sacCer3.delete()
def test_gaps():
    hg19 = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in hg19
    assert "chr2" not in hg19
    filled = hg19.filled(chromosomes=["chr1"])
    hg19.bed_to_sequence(filled)
    hg19.delete()
def test_tessellate():
    hg19 = Genome("hg19", chromosomes=["chrM"])
    filled = hg19.filled(chromosomes=["chrM"])
    tessellate_bed(filled, window_size=200, alignment="left")
    tessellate_bed(filled, window_size=200, alignment="right")
    tessellate_bed(filled, window_size=200, alignment="center")
    hg19.delete()
Example #4
0
def test_wiggle():
    hg19 = Genome("hg19", chromosomes=["chr17"])
    filled = hg19.filled(chromosomes=["chr17"])
    wiggles = wiggle_bed_regions(
        filled,
        max_wiggle_size=100,
        wiggles=10,
        seed=42
    )
    path = "{pwd}/expected_wiggles.csv".format(
        pwd=os.path.dirname(os.path.abspath(__file__))
    )
    if not os.path.exists(path):
        wiggles.to_csv(path, index=False)
    pd.testing.assert_frame_equal(
        wiggles,
        pd.read_csv(path),
        check_dtype=False
    )
    hg19.delete()
Example #5
0
def test_gaps():
    hg19 = Genome("hg19", chromosomes=["chr1"])
    assert "chr1" in hg19
    assert "chr2" not in hg19
    # Check that no gap is with 0 length
    gaps = hg19.gaps(["chr1"])
    assert (gaps.chromEnd - gaps.chromStart != 0).all()
    # Converting gaps to sequences: should all be Nns
    gaps_tesselate = tessellate_bed(gaps, 200, verbose=False)
    gaps_sequences = hg19.bed_to_sequence(gaps_tesselate)
    for gap in gaps_sequences:
        assert set(gap.lower()) == set(["n"])
    filled = hg19.filled(["chr1"])
    assert (filled.chromEnd - filled.chromStart != 0).all()
    filled_tesselate = tessellate_bed(filled, 200, verbose=False)
    filled_sequences = hg19.bed_to_sequence(filled_tesselate)
    for fl in filled_sequences:
        assert "n" not in fl.lower()
    filled_tesselate["strand"] = "."
    filled_sequences = hg19.bed_to_sequence(filled_tesselate)
    for fl in filled_sequences:
        assert "n" not in fl.lower()
    hg19.delete()
Example #6
0
class GenomeWindowsGenerator:

    n_types = ["uniform", "normal"]

    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        self.assembly, self.window_size = assembly, window_size
        self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes

        # Buffersize default None == cpu count for optimal performance:
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N
        if n_type not in self.n_types:
            raise ValueError("n_type must be one of %s" % n_type)
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes passed then use all the genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()

    def compile(self):
        filled = self._filled()
        windows = self._tasselize_windows(filled, self.window_size)
        sequences = self._encode_sequences(windows)

        self._windows_train, self._windows_val = self._train_val_split(
            sequences)

        gap_mask = self._render_gaps()
        self._mean, self._cov = _model_gaps(gap_mask)

    def _train_val_split(self, sequences):
        # Get the set of chromosomes
        # TODO do we need a seed here?
        # Find the splitting index
        windows_train = sum([
            sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Groupping Train windows", leave=False)
            if chrom not in self.val_chromosomes
        ], [])
        windows_val = sum(
            (sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Groupping val windows", leave=False)
             if chrom in self.val_chromosomes), [])
        return windows_train, windows_val

    def steps_per_epoch(self):
        return len(self._windows_train) // self.batch_size

    def validation_steps(self):
        return len(self._windows_val) // self.batch_size

    @cache_method("{_cache_directory}/{instance_hash}_filled.pkl")
    def _filled(self):
        return self.genome.filled(chromosomes=self.chromosomes)

    @cache_method("{_cache_directory}/{instance_hash}_gap_mask.pkl")
    def _render_gaps(self):
        # Compute
        gaps = self.genome.gaps(chromosomes=self.chromosomes)
        # Keeping only small gaps
        gaps = gaps[gaps.chromEnd - gaps.chromStart <= self.max_gap_size]
        # Expand windows
        mid_point = ((gaps.chromEnd + gaps.chromStart) / 2).astype(int)
        gaps.chromStart = (mid_point - self.window_size / 2).astype(int)
        gaps.chromEnd = (mid_point + self.window_size / 2).astype(int)
        # Rendering gap sequences
        gapped_sequences = self.genome.bed_to_sequence(gaps)
        # Rendering gap mask
        return np.array([
            np.array(list(sequence.lower())) == "n"
            for sequence in gapped_sequences.sequence
        ])

    @cache_method("{_cache_directory}/{instance_hash}_tasselized.pkl")
    def _tasselize_windows(self, bed: pd.DataFrame, window_size: int):
        # Compute
        tasks = ((row.chrom, row.chromStart, row.chromEnd, window_size)
                 for _, row in bed.iterrows())
        return pd.concat(
            list(
                tqdm(self.pool.imap(tasselize_window, tasks),
                     total=bed.shape[0],
                     desc="Tasselizing windows",
                     leave=False)))

    @cache_method("{_cache_directory}/{instance_hash}_sequences.pkl")
    def _encode_sequences(self, windows):
        bed = self.genome.bed_to_sequence(windows)
        return {chrom: data for chrom, data in bed.groupby("chrom")}

    def batchsize_scheduler(self):
        while True:
            yield self.batch_size

    def _buffer_generator(self, dataset):
        iterable = _dataset_generator(dataset)
        for batch_size in self.batchsize_scheduler():
            yield [
                list(itertools.islice(iterable, batch_size))
                for _ in range(self.buffer_size)
            ]

    def _buffer_encoder_generator(self, dataset):
        for buffer in self._buffer_generator(dataset):
            yield list(self.pool.imap(one_hot_encoder, buffer))

    def _generator(self, dataset):
        for buffer in self._buffer_encoder_generator(dataset):
            for batch in buffer:
                yield batch

    def generator(self):
        return self._generator(self._windows_train)

    def validation_data(self):
        if not self.val_chromosomes:
            raise ValueError("Can't return the val generator since "
                             "no val chromosomes were specified")
        return self._generator(self._windows_val)

    def clean_cache(self):
        if os.path.exists(self._cache_directory):
            shutil.rmtree(self._cache_directory)

    def close(self):
        if "pool" in vars(self):
            self.pool.close()
            self.pool.join()