Ejemplo n.º 1
0
def get_holdout(train: np.ndarray, test: np.ndarray, bed: pd.DataFrame, labels: np.ndarray, genome, batch_size=1024) -> Tuple[Sequence, Sequence]:
    """Return the training and test sequences of one holdout.

    Parameters
    ----------------------------
    train: np.ndarray,
        Indexer selecting the training rows of `bed` and `labels`.
    test: np.ndarray,
        Indexer selecting the test rows of `bed` and `labels`.
    bed: pd.DataFrame,
        Regions, indexed positionally through `.iloc`.
    labels: np.ndarray,
        Labels, indexed with the same indexers as `bed`.
    genome,
        Genome object forwarded to BedSequence.
    batch_size=1024,
        Batch size given to both the BedSequence and the MixedSequence.

    Returns
    ----------------------------
    Tuple with the training sequence followed by the test sequence.
    """
    def _subset_sequence(indices: np.ndarray) -> Sequence:
        # Both halves of the holdout are built the same way and differ
        # only in the rows they select.
        return MixedSequence(
            x=BedSequence(genome, bed.iloc[indices], batch_size=batch_size),
            y=labels[indices],
            batch_size=batch_size,
        )

    return _subset_sequence(train), _subset_sequence(test)
Ejemplo n.º 2
0
def get_holdout(train: np.ndarray, test: np.ndarray, bed: pd.DataFrame,
                labels: pd.DataFrame) -> Tuple[Sequence, Sequence]:
    """Return the training and test sequences of one holdout.

    The genome comes from `get_genome()` and the batch size from
    `get_default('batch_size')`.

    Parameters
    ----------------------------
    train: np.ndarray,
        Indexer selecting the training rows.
    test: np.ndarray,
        Indexer selecting the test rows.
    bed: pd.DataFrame,
        Regions, indexed positionally through `.iloc`.
    labels: pd.DataFrame,
        Labels, indexed with `labels[train]` and `labels[test]`.

    Returns
    ----------------------------
    Tuple with the training sequence followed by the test sequence.
    """
    genome = get_genome()
    batch_size = get_default('batch_size')

    # NOTE(review): `labels` is annotated as pd.DataFrame, yet plain
    # `labels[indices]` on a DataFrame with integer positions selects
    # columns, not rows (unlike `bed.iloc[indices]`) -- confirm the type
    # that callers actually pass.
    train_regions = BedSequence(genome, bed.iloc[train], batch_size=batch_size)
    train_sequence = MixedSequence(x=train_regions,
                                   y=labels[train],
                                   batch_size=batch_size)

    test_regions = BedSequence(genome, bed.iloc[test], batch_size=batch_size)
    test_sequence = MixedSequence(x=test_regions,
                                  y=labels[test],
                                  batch_size=batch_size)

    return train_sequence, test_sequence
def get_sequence_holdout(train: np.ndarray, test: np.ndarray, bed: pd.DataFrame, labels: np.ndarray, genome,
                         batch_size=32) -> Tuple[Sequence, Sequence]:
    """Return the training and test sequences of one holdout, logging progress.

    Parameters
    ----------------------------
    train: np.ndarray,
        Indexer selecting the training rows of `bed` and `labels`.
    test: np.ndarray,
        Indexer selecting the test rows of `bed` and `labels`.
    bed: pd.DataFrame,
        Regions, indexed positionally through `.iloc`.
    labels: np.ndarray,
        Labels, indexed with the same indexers as `bed`.
    genome,
        Genome object forwarded to BedSequence.
    batch_size=32,
        Batch size given to both the BedSequence and the MixedSequence.

    Returns
    ----------------------------
    Tuple with the training sequence followed by the test sequence.
    """
    def _subset_sequence(indices: np.ndarray) -> Sequence:
        return MixedSequence(
            x=BedSequence(genome, bed.iloc[indices], batch_size=batch_size),
            y=labels[indices],
            batch_size=batch_size,
        )

    # Distinct local names keep the index arguments from being shadowed
    # by the sequences built out of them.
    logging.info("Computing train sequence data...")
    train_sequence = _subset_sequence(train)
    logging.info("Computing test sequence data...")
    test_sequence = _subset_sequence(test)
    return train_sequence, test_sequence
def one_hot_encode(genome, data: pd.DataFrame, nucleotides: str = "actg") -> np.ndarray:
    """Return the BedSequence built over `data` as a numpy array.

    Parameters
    ----------------------------
    genome,
        Genome object forwarded to BedSequence.
    data: pd.DataFrame,
        Regions; converted with `to_bed` before being given to BedSequence.
    nucleotides: str = "actg",
        Nucleotides forwarded to BedSequence.

    Returns
    ----------------------------
    Numpy array obtained from the BedSequence, which is built with a
    batch size of one (presumably the one-hot encoded sequences, going
    by the function name -- the encoding itself happens in BedSequence).
    """
    single_sample_batches = BedSequence(
        genome,
        bed=to_bed(data),
        nucleotides=nucleotides,
        batch_size=1,
    )
    return np.array(single_sample_batches)
Ejemplo n.º 5
0
def one_hot_encode(data: pd.DataFrame, genome: Genome) -> np.ndarray:
    """Return the regions in `data` as flat integer rows.

    A BedSequence with batch size one is built over `to_bed(data)` using
    the default nucleotides, converted to a numpy array, and reshaped so
    that every row has `window_size * len(nucleotides)` values.

    :param data: regions to encode; converted with `to_bed`.
    :param genome: genome forwarded to BedSequence.
    :return: two-dimensional integer array, one row per flattened window.
    """
    sequence = BedSequence(genome,
                           bed=to_bed(data),
                           nucleotides=get_default('nucleotides'),
                           batch_size=1)
    encoded = np.array(sequence)
    # Width of one flattened window: one slot per nucleotide per position.
    row_width = get_default('window_size') * len(get_default('nucleotides'))
    return encoded.reshape(-1, row_width).astype(int)
Ejemplo n.º 6
0
def test_genomic_sequence_determinism():
    """Check that x and y batches of a MixedSequence stay aligned over epochs.

    The labels are the row positions themselves, so each y batch tells
    which rows of the unshuffled, single-batch reference the matching
    x batch must be equal to.
    """
    batch_size = 32
    epochs = 5
    region_sets = (
        pd.read_csv("tests/enhancers.csv"),
        pd.read_csv("tests/promoters.csv"),
    )

    genome = Genome("hg19", chromosomes=["chr1"])
    for regions in tqdm(region_sets, desc="Region types"):
        row_ids = np.arange(0, len(regions), dtype=np.int64)
        batched = MixedSequence(
            x=BedSequence(genome, regions, batch_size),
            y=VectorSequence(row_ids, batch_size),
        )
        # Reference: every region in one batch, in the original order.
        full_batch = len(regions)
        reference = MixedSequence(
            x=BedSequence(genome,
                          regions,
                          batch_size=full_batch,
                          shuffle=False),
            y=VectorSequence(row_ids, batch_size=full_batch, shuffle=False),
        )
        expected, _ = reference[0]
        for _epoch in trange(epochs, desc="Epochs", leave=False):
            for step in range(batched.steps_per_epoch):
                batch_x, batch_y = batched[step]
                assert (expected[batch_y.astype(int)] == batch_x).all()
            batched.on_epoch_end()
Ejemplo n.º 7
0
def get_data(
    parameters: Tuple[Tuple[str, int, str], str]
) -> Tuple:
    """Load a dataset either in epigenomic or in sequence form.

    Parameters
    ----------------------------
    parameters: Tuple[Tuple[str, int, str], str],
        Pair `(load_parameters, data_type)`. `load_parameters` is passed
        unchanged to `load_dataset`; `data_type` must be either
        `'epigenomic'` or `'sequences'`.

    Returns
    ----------------------------
    For `'epigenomic'`, the tuple `(dataset, labels)` returned by
    `load_dataset`, with the index of the dataset reset in place.
    For `'sequences'`, the first batch of a MixedSequence whose batch
    size equals the number of labels, so that one batch holds all samples.

    Raises
    ----------------------------
    ValueError,
        If `data_type` is not one of the supported values.
    """
    load_parameters, data_type = parameters
    if data_type == 'epigenomic':
        dataset, labels = load_dataset(load_parameters)
        dataset.reset_index(drop=True, inplace=True)
        return dataset, labels
    if data_type == 'sequences':
        epigenomes, labels = load_dataset(load_parameters)
        genome = Genome('hg19')
        # The region coordinates live in the index of the epigenomic data.
        bed = epigenomes.reset_index()[epigenomes.index.names]
        batch_size = len(labels)
        every_row = np.arange(batch_size)
        sequence = MixedSequence(
            x=BedSequence(genome, bed.iloc[every_row], batch_size=batch_size),
            y=labels[every_row],
            batch_size=batch_size)
        # Index the only batch directly instead of iterating the whole
        # sequence into a list just to take its first element.
        return sequence[0]
    # Fail loudly instead of silently returning None, which callers
    # unpacking the result could not handle anyway.
    raise ValueError(
        "Unsupported data type '{data_type}': expected 'epigenomic' or 'sequences'."
        .format(data_type=data_type))
def create_sequence(bed: pd.DataFrame, assembly: Genome,
                    batch_size: int) -> MixedSequence:
    """Build the MixedSequence pairing the regions of a bed with its labels.

    Parameters
    ----------------------------
    bed: pd.DataFrame,
        Dataframe given to BedSequence as `bed`; its `labels` column
        provides the output values.
    assembly: Genome,
        Genomic assembly given to BedSequence.
    batch_size: int,
        Batch size used for both the input and the output sequence.

    Returns
    ----------------------------
    MixedSequence with a BedSequence as x and a VectorSequence of the
    labels, cast to float, as y.
    """
    regions = BedSequence(assembly=assembly,
                          bed=bed,
                          batch_size=batch_size)
    targets = VectorSequence(bed.labels.values.astype(float),
                             batch_size=batch_size)
    return MixedSequence(x=regions, y=targets)
class BiologicalGapsSequence(Sequence):
    """Sequence yielding paired windows from a source and a target assembly.

    Two BedSequence objects are kept, one per assembly, both created with
    the same batch size, seed and elapsed epochs. Every batch is the pair
    of their batches at the same index, with the fifth nucleotide channel
    ("n") removed and replaced by a uniform 0.25 over the other four.
    """

    def __init__(self,
                 source: str,
                 target: str,
                 source_window_size: int,
                 target_window_size: int,
                 batch_size: int,
                 verbose: bool = True,
                 seed: int = 42,
                 elapsed_epochs: int = 0):
        """Create new BiologicalGapsSequence.

        Parameters
        ----------------------------------
        source: str,
            Assembly from which to extract the input sequences.
            These sequences are centered upon the single nucleotide gaps.
        target: str,
            Assembly from which to extract the output sequences.
            These sequences are centered upon the nucleotides corresponding to the gaps.
        source_window_size: int,
            Window size to use for the input.
        target_window_size: int,
            Window size to use for the output.
        batch_size: int,
            Training batch size.
        verbose: bool = True,
            Whether to show or not the loading bars.
        seed: int = 42,
            The seed to use for shuffling the data on training epoch end.
        elapsed_epochs: int = 0,
            The number of elapsed epochs.

        Raises
        ---------------------------------
        ValueError,
            If the dataset with given combination of source and target
            is not currently available.

        Returns
        ----------------------------------
        New BiologicalGapsSequence.
        """
        # Datasets are stored next to this module, one tab-separated file
        # per (source, target) pair.
        path = "{pwd}/datasets/{source}_{target}.bed".format(
            pwd=os.path.dirname(os.path.abspath(__file__)),
            source=source,
            target=target)
        if not os.path.exists(path):
            raise ValueError(
                "Given combination of source '{source}' and target '{target}' is not currently available."
                .format(source=source, target=target))
        bed = pd.read_csv(path, sep="\t")
        # The columns named after the assemblies hold the positions; each
        # one becomes a single-nucleotide region that is then expanded to
        # the requested window size.
        source_bed = expand_bed_regions(
            pd.DataFrame({
                "chrom": bed.chrom,
                "chromStart": bed[source],
                "chromEnd": bed[source] + 1,
            }), source_window_size)
        target_bed = expand_bed_regions(
            pd.DataFrame({
                "chrom": bed.chrom,
                "chromStart": bed[target],
                "chromEnd": bed[target] + 1,
            }), target_window_size)
        # Both sequences receive the same seed and elapsed epochs.
        # "actgn" adds a fifth channel for "n", used in __getitem__.
        self._source_sequence = BedSequence(source,
                                            source_bed,
                                            batch_size=batch_size,
                                            verbose=verbose,
                                            seed=seed,
                                            nucleotides="actgn",
                                            elapsed_epochs=elapsed_epochs)
        self._target_sequence = BedSequence(target,
                                            target_bed,
                                            batch_size=batch_size,
                                            verbose=verbose,
                                            seed=seed,
                                            nucleotides="actgn",
                                            elapsed_epochs=elapsed_epochs)

    def on_epoch_end(self):
        """Forward the epoch end to both private bed sequences."""
        self._source_sequence.on_epoch_end()
        self._target_sequence.on_epoch_end()

    def __len__(self) -> int:
        """Return length of the source bed sequence."""
        return len(self._source_sequence)

    @property
    def steps_per_epoch(self) -> int:
        """Return the steps per epoch of the source bed sequence."""
        return self._source_sequence.steps_per_epoch

    @property
    def batch_size(self) -> int:
        """Return batch size to be rendered."""
        return self._source_sequence.batch_size

    @batch_size.setter
    def batch_size(self, batch_size: int):
        """Set batch size of both bed sequences to given value."""
        self._source_sequence.batch_size = batch_size
        self._target_sequence.batch_size = batch_size

    @property
    def samples_number(self) -> int:
        """Return number of available samples."""
        return self._source_sequence.samples_number

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return batch corresponding to given index.

        Parameters
        ---------------
        idx: int,
            Index corresponding to batch to be rendered.

        Returns
        ---------------
        Return Tuple containing X and Y numpy arrays corresponding to given batch index.
        """
        # Get the input sequence
        x = self._source_sequence[idx]
        # Get the output sequence
        y = self._target_sequence[idx]
        # Get the values corresponding to gaps (last channel, the "n")
        nx = x[:, :, -1].astype(bool)
        ny = y[:, :, -1].astype(bool)
        # Filter out the 5th nucleotide (the gaps)
        x = x[:, :, :4]
        y = y[:, :, :4]
        # Apply uniform value where the nucleotide is a gap
        # NOTE(review): the slices above are views, so these assignments
        # write into the arrays returned by the bed sequences, and with an
        # integer dtype 0.25 would be stored as 0 -- confirm that
        # BedSequence returns floating point batches.
        x[nx] = 0.25
        y[ny] = 0.25
        return x, y