def test_gaps():
    """Check chromosome membership and filled-region extraction on hg19 chr1.

    The genome object is always deleted at the end, even when one of the
    checks fails, so a failing run does not skip the cleanup step.
    """
    hg19 = Genome("hg19", chromosomes=["chr1"])
    try:
        # Only the requested chromosome must be reported as available.
        assert "chr1" in hg19
        assert "chr2" not in hg19
        # Extracting the sequences of the filled regions must not raise.
        filled = hg19.filled(chromosomes=["chr1"])
        hg19.bed_to_sequence(filled)
    finally:
        # Cleanup must run regardless of the outcome of the checks above.
        hg19.delete()
Example #2
0
def get_gaps_statistics(
        genome: Genome, max_gap_size: int,
        window_size: int) -> Tuple[int, np.ndarray, np.ndarray]:
    """Return number, mean and covariance of the gap masks of a genome.

    Parameters
    --------------------------
    genome:Genome,
        The genome whose gaps are analysed.
    max_gap_size:int,
        Only gaps strictly shorter than this size are kept.
    window_size:int
        The size passed to `expand_bed_regions` to expand every
        kept gap, using center alignment.

    Returns
    --------------------------
    Tuple containing the number of gap windows, the mean of the gap mask
    along the first axis and the covariance of the transposed gap mask.
    """
    all_gaps = genome.gaps()
    # Filtering on the gap length, upper bound excluded.
    gap_lengths = all_gaps.chromEnd - all_gaps.chromStart
    small_gaps = all_gaps[gap_lengths < max_gap_size]
    # Bringing every remaining gap to the requested window size.
    windows = expand_bed_regions(small_gaps, window_size, alignment="center")
    # Lower-casing so that both "N" and "n" are matched below.
    lowered = genome.bed_to_sequence(windows).sequence.str.lower()
    # One row per window, one column per nucleotide position.
    characters = np.array([list(sequence) for sequence in lowered])
    gaps_mask = characters == "n"
    return len(gaps_mask), gaps_mask.mean(axis=0), np.cov(gaps_mask.T)
Example #3
0
    def __init__(self,
                 genome: Genome,
                 bed: pd.DataFrame,
                 batch_size: int,
                 nucleotides: str = "actg",
                 unknown_nucleotide_value: float = 0.25,
                 random_state: int = 42,
                 elapsed_epochs: int = 0,
                 shuffle: bool = True):
        """Create a new sequence object from the regions of a bed file.

        Parameters
        --------------------
        genome: Genome,
            Genomic assembly from which the sequences are extracted.
        bed: pd.DataFrame,
            Pandas DataFrame with at least the "chromStart" and "chromEnd"
            columns read here; it is passed as-is to
            `genome.bed_to_sequence`.
        batch_size: int,
            Batch size, forwarded to the parent constructor.
        nucleotides: str = "actg",
            Nucleotides to consider when encoding the sequences.
        unknown_nucleotide_value: float = 0.25,
            The value stored for encoding unknown nucleotides.
        random_state: int = 42,
            Random state, forwarded to the parent constructor.
        elapsed_epochs: int = 0,
            Number of elapsed epochs, forwarded to the parent constructor.
        shuffle: bool = True,
            Whether to shuffle or not, forwarded to the parent constructor.

        Raises
        --------------------
        ValueError:
            If the bed file regions do not all have the same length.
        """
        # All the windows of the bed file must share one single length.
        region_lengths = (bed.chromEnd - bed.chromStart).values
        if len(set(region_lengths)) != 1:
            raise ValueError("The bed file regions must have the same length!")

        self._window_length = region_lengths[0]
        self._nucleotides = nucleotides
        self._nucleotides_number = len(nucleotides)
        self._unknown_nucleotide_value = unknown_nucleotide_value

        # Sequences of the bed regions, extracted from the given genome.
        sequences = np.array(genome.bed_to_sequence(bed), dtype=str)
        # NOTE(review): `self.nucleotides` is not assigned in this method;
        # presumably a property exposing `self._nucleotides` - confirm.
        encoded_sequences = nucleotides_to_numbers(self.nucleotides, sequences)

        super().__init__(
            encoded_sequences,
            batch_size,
            random_state=random_state,
            elapsed_epochs=elapsed_epochs,
            shuffle=shuffle
        )
Example #4
0
def test_gaps():
    """Check gaps and filled regions of hg19 chr1 and their sequences.

    Gap regions must have non-zero length and contain only "n"/"N", while
    filled regions must have non-zero length and contain no "n"/"N".
    The genome object is always deleted at the end, even when one of the
    checks fails, so a failing run does not skip the cleanup step.
    """
    hg19 = Genome("hg19", chromosomes=["chr1"])
    try:
        assert "chr1" in hg19
        assert "chr2" not in hg19
        # Check that no gap has zero length
        gaps = hg19.gaps(["chr1"])
        assert (gaps.chromEnd - gaps.chromStart != 0).all()
        # Converting gaps to sequences: they should contain only N/n
        gaps_tesselate = tessellate_bed(gaps, 200, verbose=False)
        gaps_sequences = hg19.bed_to_sequence(gaps_tesselate)
        for gap in gaps_sequences:
            assert set(gap.lower()) == {"n"}
        # Check that no filled region has zero length
        filled = hg19.filled(["chr1"])
        assert (filled.chromEnd - filled.chromStart != 0).all()
        # Converting filled regions to sequences: they should contain no N/n
        filled_tesselate = tessellate_bed(filled, 200, verbose=False)
        filled_sequences = hg19.bed_to_sequence(filled_tesselate)
        for fl in filled_sequences:
            assert "n" not in fl.lower()
        # Same check once a "strand" column set to "." is added to the bed
        filled_tesselate["strand"] = "."
        filled_sequences = hg19.bed_to_sequence(filled_tesselate)
        for fl in filled_sequences:
            assert "n" not in fl.lower()
    finally:
        # Cleanup must run regardless of the outcome of the checks above.
        hg19.delete()
Example #5
0
def preprocess_mode_exec(c):
    """Run the preprocessing mode: export sequences, epigenetic data and labels.

    The function writes into the export path a `sequences.bed` file, the
    sequences extracted from hg19 (through `save_sequences`) and, for every
    cell line, a `<cell_line>_epigenetic.txt` and a `<cell_line>_labels.txt`
    file.

    Parameters
    --------------------
    c:
        Configuration mapping. The keys read here are 'import_path',
        'export_path', 'cell_lines', 'window_size', 'dataset', 'sample'
        and, only when 'sample' is truthy, 'sample_perc'.

    Raises
    --------------------
    FileNotFoundError:
        If the import path does not exist.
    AssertionError:
        If the epigenetic data are not aligned with the regions or with
        the sequences.
    """
    logging.basicConfig(format='[%(asctime)s] - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logging.debug("PREPROCESSING MODE")

    root_path = c['import_path']
    saving_path = c['export_path']
    cell_lines = c['cell_lines']
    window_size = c['window_size']
    dataset_type = c['dataset']

    if not os.path.exists(root_path):
        raise FileNotFoundError("Files path not found: {}".format(root_path))

    if not os.path.exists(saving_path):
        # The path was previously missing from the message, which logged
        # a literal "{}".
        logging.debug(
            "{} not found, folder will be created".format(saving_path))
        os.makedirs(saving_path)

    label_epi_path = get_full_path(root_path, window_size, dataset_type)

    # Importing regions for enhancers and promoters
    enhancers_regions, promoters_regions = get_regions(root_path)

    # Importing and converting labels of enhancers and promoters
    # and joining them in a single dataframe
    full_sequences = get_categorical_labels(label_epi_path)
    logging.debug("Saving the sequences bed file in {}".format(saving_path))

    # Positional indices of the sampled rows: only meaningful when
    # c['sample'] is truthy, and reused below to sample the epigenetic data
    # with the very same rows.
    rows = 0
    if c['sample']:
        sample_size = int(len(full_sequences) * c['sample_perc'])
        # NOTE(review): randint draws with replacement, hence the sampled
        # rows may contain duplicates - confirm this is intended.
        rows = np.random.randint(len(full_sequences), size=sample_size)
        full_sequences = full_sequences.iloc[rows]
    full_sequences.to_csv("{}/sequences.bed".format(saving_path),
                          sep="\t",
                          columns=['chrom', 'chromStart', 'chromEnd'],
                          header=False,
                          index=False)

    logging.debug("Downloading the hg19 genome")
    # Sorted distinct chromosome names. Iterating a groupby built with a
    # one-element list of keys yields tuples on recent pandas versions,
    # so the column values are read directly instead.
    chroms = sorted(full_sequences['chrom'].dropna().unique())
    hg19 = Genome(assembly="hg19", chromosomes=chroms)
    logging.debug("Extracting the sequences from the hg19 genome")
    sequences = hg19.bed_to_sequence(full_sequences)

    logging.debug("Saving sequences to file...")
    seqIO_seq = [
        creating_seqIO(
            "{}:{}-{}".format(row['chrom'],
                              row['chromStart'], row['chromEnd']),
            Seq(row['sequence'].upper())) for _, row in sequences.iterrows()
    ]
    save_sequences(saving_path, seqIO_seq)

    # Importing epigenetic data
    logging.debug("Importing epigenetic data for: {}".format(
        ", ".join(cell_lines)))
    logging.debug(
        "-------------------------------------------------------------")
    for cell_line in cell_lines:
        logging.debug("Importing {} data".format(cell_line))

        df_epi_enanchers, df_epi_promoters = get_epigenetic_data(
            label_epi_path, cell_line)

        # Building the type dictionary from the promoters columns: the very
        # same dictionary is applied to the enhancers too.
        converting_dictionary = {
            column: get_type(column)
            for column in df_epi_promoters.columns
        }
        df_epi_enanchers = df_epi_enanchers.astype(converting_dictionary)
        df_epi_promoters = df_epi_promoters.astype(converting_dictionary)

        assert len(df_epi_promoters.columns) == len(df_epi_enanchers.columns)
        # The first 4 columns are not counted as features: they are also
        # the ones skipped when saving the epigenetic data below.
        logging.debug("number features for {}: {}".format(
            cell_line,
            len(df_epi_promoters.columns) - 4))
        logging.debug("Number of missing values in enhancers: {}".format(
            df_epi_enanchers.isna().sum().sum()))
        logging.debug("Number of missing values in promoters: {}".format(
            df_epi_promoters.isna().sum().sum()))

        df_epi_enanchers = fill_missing(df_epi_enanchers, metric="median")
        df_epi_promoters = fill_missing(df_epi_promoters, metric="median")

        assert len(enhancers_regions) == len(df_epi_enanchers)
        logging.debug("Enhancers - regions: {}, epigenetics: {}".format(
            len(enhancers_regions), len(df_epi_enanchers)))

        assert len(promoters_regions) == len(df_epi_promoters)
        logging.debug("Promoters - regions: {}, epigenetics: {}".format(
            len(promoters_regions), len(df_epi_promoters)))

        full_epi = append_without_duplicates(df_epi_enanchers,
                                             df_epi_promoters)
        if c['sample']:
            # Same positional indices used for the sequences above.
            full_epi = full_epi.iloc[rows]
        # Check that the two dataframes are aligned before saving.
        assert len(full_sequences) == len(full_epi)
        assert_frame_equal(full_sequences[['chrom', 'chromStart', 'chromEnd']],
                           full_epi[['chrom', 'chromStart', 'chromEnd']])
        logging.debug("Number of total sequences: {}".format(
            len(full_sequences)))

        logging.debug("Saving results in {}".format(saving_path))
        np.savetxt("{}/{}_epigenetic.txt".format(saving_path, cell_line),
                   full_epi.iloc[:, 4:].values,
                   fmt='%f')
        np.savetxt("{}/{}_labels.txt".format(saving_path, cell_line),
                   full_sequences[cell_line].values,
                   fmt='%s')

        logging.debug(
            "-------------------------------------------------------------")
Example #6
0
class GenomeWindowsGenerator:
    """Generator of batches of encoded genome windows.

    The filled regions of the genome are split into windows, the windows
    are converted to sequences and grouped into a train and a validation
    set according to their chromosome. Batches are encoded by a pool of
    worker processes, which must be released calling `close`.
    """

    # Allowed values for the `n_type` constructor parameter.
    n_types = ["uniform", "normal"]

    def __init__(self,
                 assembly,
                 window_size,
                 batch_size,
                 buffer_size=None,
                 max_gap_size=100,
                 train_chromosomes=None,
                 val_chromosomes=None,
                 cache_dir=None,
                 lazy_load=True,
                 clear_cache=False,
                 compile_on_start=True,
                 n_type="uniform"):
        """Create a new generator and, optionally, compile it.

        Parameters
        --------------------
        assembly:
            Genomic assembly, forwarded to `Genome`.
        window_size:
            Size of the windows to generate.
        batch_size:
            Number of windows of every batch.
        buffer_size=None:
            Number of batches encoded at once by the pool of processes.
            When falsy, the number of CPUs is used.
        max_gap_size=100:
            Maximum size (included) of the gaps used to build the gap mask.
        train_chromosomes=None:
            Chromosomes used for training. When falsy, all the chromosomes
            of the genome are used.
        val_chromosomes=None:
            Chromosomes used for validation. When falsy, no validation
            chromosome is used.
        cache_dir=None:
            Cache directory. When falsy, the `CACHE_PATH` environment
            variable is used and, as last resort, "/tmp".
        lazy_load=True:
            Forwarded to `Genome`.
        clear_cache=False:
            Whether to delete the cache directory of this
            assembly / window size before starting.
        compile_on_start=True:
            Whether to call `compile` at the end of the constructor.
        n_type="uniform":
            One of the values in `n_types`.

        Raises
        --------------------
        ValueError:
            If the given `n_type` is not one of the allowed values.
        """
        self.assembly, self.window_size = assembly, window_size
        self.max_gap_size, self.batch_size, self.val_chromosomes = max_gap_size, batch_size, val_chromosomes

        # Buffersize default None == cpu count for optimal performance:
        if not buffer_size:
            buffer_size = cpu_count()
        self.buffer_size = buffer_size

        # Validate the type of N: the message lists the allowed values
        # (it previously echoed back the invalid value instead).
        if n_type not in self.n_types:
            raise ValueError("n_type must be one of %s" % self.n_types)
        self.n_type = n_type

        # Get the cache dir
        cache_dir = cache_dir or os.environ.get("CACHE_PATH", None) or "/tmp"

        self._cache_directory = "/".join(
            [cache_dir, assembly, str(window_size)])

        if clear_cache:
            self.clean_cache()

        # Generate a pool of processes to save the overhead
        self.workers = max(2, cpu_count())
        self.pool = Pool(self.workers)

        # Preprocess all the possible data
        self.genome = Genome(
            assembly=assembly,
            lazy_load=lazy_load,
            cache_directory=cache_dir,
        )

        if not val_chromosomes:
            self.val_chromosomes = []

        # If no chromosomes passed then use all the genome
        if not train_chromosomes:
            self.chromosomes = sorted(list(self.genome))
        else:
            self.chromosomes = train_chromosomes + self.val_chromosomes

        # Identifier of this configuration, used in the cache file names.
        self.instance_hash = sha256({
            "assembly": self.assembly,
            "chromosomes": self.chromosomes,
            "window_size": self.window_size,
            "max_gap_size": self.max_gap_size,
            "n_type": n_type,
        })

        if compile_on_start:
            self.compile()

    def compile(self):
        """Build the train / validation windows and the gaps statistics."""
        filled = self._filled()
        windows = self._tasselize_windows(filled, self.window_size)
        sequences = self._encode_sequences(windows)

        self._windows_train, self._windows_val = self._train_val_split(
            sequences)

        gap_mask = self._render_gaps()
        self._mean, self._cov = _model_gaps(gap_mask)

    def _train_val_split(self, sequences):
        """Return the train and validation lists of window sequences.

        Parameters
        --------------------
        sequences:
            Mapping from chromosome to an object exposing a `sequence`
            column, as returned by `_encode_sequences`.

        Returns
        --------------------
        Tuple with the list of train sequences and the list of validation
        sequences: a chromosome goes to validation if and only if it is
        in `self.val_chromosomes`.
        """
        # TODO do we need a seed here?
        # The per-chromosome lists are chained in a single pass:
        # sum(lists, []) would copy the accumulated list at every step.
        windows_train = list(itertools.chain.from_iterable(
            sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Groupping Train windows", leave=False)
            if chrom not in self.val_chromosomes
        ))
        windows_val = list(itertools.chain.from_iterable(
            sequences[chrom].sequence.tolist() for chrom in tqdm(
                self.chromosomes, desc="Groupping val windows", leave=False)
            if chrom in self.val_chromosomes
        ))
        return windows_train, windows_val

    def steps_per_epoch(self):
        """Return the number of complete train batches."""
        return len(self._windows_train) // self.batch_size

    def validation_steps(self):
        """Return the number of complete validation batches."""
        return len(self._windows_val) // self.batch_size

    @cache_method("{_cache_directory}/{instance_hash}_filled.pkl")
    def _filled(self):
        """Return the filled regions of the selected chromosomes."""
        return self.genome.filled(chromosomes=self.chromosomes)

    @cache_method("{_cache_directory}/{instance_hash}_gap_mask.pkl")
    def _render_gaps(self):
        """Return the boolean mask of the "n" nucleotides around small gaps.

        Every gap not longer than `self.max_gap_size` is replaced by a
        window of `self.window_size` centered on the gap mid point; the
        returned array has one row per window, whose entries are True
        where the (lower-cased) nucleotide is "n".
        """
        # Compute
        gaps = self.genome.gaps(chromosomes=self.chromosomes)
        # Keeping only small gaps. The copy makes the filtered frame
        # independent from the original one, so that the assignments below
        # do not write on a slice (pandas SettingWithCopyWarning).
        gaps = gaps[gaps.chromEnd - gaps.chromStart <= self.max_gap_size].copy()
        # Expand windows
        mid_point = ((gaps.chromEnd + gaps.chromStart) / 2).astype(int)
        gaps.chromStart = (mid_point - self.window_size / 2).astype(int)
        gaps.chromEnd = (mid_point + self.window_size / 2).astype(int)
        # Rendering gap sequences
        gapped_sequences = self.genome.bed_to_sequence(gaps)
        # Rendering gap mask
        return np.array([
            np.array(list(sequence.lower())) == "n"
            for sequence in gapped_sequences.sequence
        ])

    @cache_method("{_cache_directory}/{instance_hash}_tasselized.pkl")
    def _tasselize_windows(self, bed: pd.DataFrame, window_size: int):
        """Return the concatenation of the windows of every bed region.

        Every row of the bed is processed by `tasselize_window` in the
        pool of processes.
        """
        # Compute
        tasks = ((row.chrom, row.chromStart, row.chromEnd, window_size)
                 for _, row in bed.iterrows())
        return pd.concat(
            list(
                tqdm(self.pool.imap(tasselize_window, tasks),
                     total=bed.shape[0],
                     desc="Tasselizing windows",
                     leave=False)))

    @cache_method("{_cache_directory}/{instance_hash}_sequences.pkl")
    def _encode_sequences(self, windows):
        """Return the sequences of the given windows grouped by chromosome."""
        bed = self.genome.bed_to_sequence(windows)
        return {chrom: data for chrom, data in bed.groupby("chrom")}

    def batchsize_scheduler(self):
        """Yield forever the batch size to use for the next batches."""
        while True:
            yield self.batch_size

    def _buffer_generator(self, dataset):
        """Yield buffers of `self.buffer_size` raw batches of the dataset."""
        iterable = _dataset_generator(dataset)
        for batch_size in self.batchsize_scheduler():
            yield [
                list(itertools.islice(iterable, batch_size))
                for _ in range(self.buffer_size)
            ]

    def _buffer_encoder_generator(self, dataset):
        """Yield buffers whose batches are encoded by the pool of processes."""
        for buffer in self._buffer_generator(dataset):
            yield list(self.pool.imap(one_hot_encoder, buffer))

    def _generator(self, dataset):
        """Yield the encoded batches of the dataset, one at a time."""
        for buffer in self._buffer_encoder_generator(dataset):
            for batch in buffer:
                yield batch

    def generator(self):
        """Return the generator of the train batches."""
        return self._generator(self._windows_train)

    def validation_data(self):
        """Return the generator of the validation batches.

        Raises
        --------------------
        ValueError:
            If no validation chromosome was specified.
        """
        if not self.val_chromosomes:
            raise ValueError("Can't return the val generator since "
                             "no val chromosomes were specified")
        return self._generator(self._windows_val)

    def clean_cache(self):
        """Delete the cache directory of this generator, if it exists."""
        if os.path.exists(self._cache_directory):
            shutil.rmtree(self._cache_directory)

    def close(self):
        """Close the pool of processes and wait for the workers to exit."""
        # The pool may be missing when the constructor failed early.
        if "pool" in vars(self):
            self.pool.close()
            self.pool.join()