Esempio n. 1
0
def test_simple_determinism():
    """Check that batch extraction is deterministic.

    Two mixed sequences are built from the same vectors and batch size.
    Only the first one receives ``on_epoch_end`` calls. The assertions
    require that both yield identical batches during the first epoch, that
    their batches differ in every later epoch, and that each batch of the
    first sequence always keeps samples and labels paired.
    """
    num_classes = 10
    num_samples = 100000
    num_epochs = 100
    batch_size = 10000

    # The samples are their own indices, so ``y[x_batch]`` yields the labels
    # that must accompany ``x_batch``.
    x = np.arange(0, num_samples, dtype=np.int64)
    y = np.random.randint(0, num_classes, size=num_samples)

    def build_sequence():
        # Both sequences are built from the very same arguments.
        return MixedSequence(
            VectorSequence(x, batch_size),
            VectorSequence(y, batch_size)
        )

    advanced = build_sequence()
    untouched = build_sequence()

    for epoch in range(num_epochs):
        is_first_epoch = epoch == 0
        for step in range(advanced.steps_per_epoch):
            x_advanced, y_advanced = advanced[step]
            x_untouched, y_untouched = untouched[step]
            if is_first_epoch:
                # During the first epoch the two sequences must be aligned.
                assert (x_advanced == x_untouched).all()
                assert (y_advanced == y_untouched).all()
            else:
                # Afterwards only ``advanced`` has gone through
                # ``on_epoch_end``, so identical batches are very unlikely.
                assert (x_advanced != x_untouched).any()
                assert (y_advanced != y_untouched).any()
            # Samples and labels must stay paired whatever the batch order.
            assert (y[x_advanced] == y_advanced).all()

        advanced.on_epoch_end()
Esempio n. 2
0
def get_holdout(train: np.ndarray, test: np.ndarray, bed: pd.DataFrame, labels: np.ndarray, genome, batch_size=1024) -> Tuple[Sequence, Sequence]:
    """Return the train and test sequences of a holdout.

    Parameters
    ----------------------------
    train: np.ndarray,
        Positional indices selecting the training rows of ``bed`` and the
        training entries of ``labels``.
    test: np.ndarray,
        Positional indices selecting the test rows of ``bed`` and the test
        entries of ``labels``.
    bed: pd.DataFrame,
        Dataframe handed, row-filtered, to ``BedSequence``.
    labels: np.ndarray,
        Labels indexed with the same indices used for ``bed``.
    genome,
        Genome object forwarded to ``BedSequence``.
    batch_size=1024,
        Batch size used for both the inner and the outer sequences.

    Returns
    ----------------------------
    Tuple with the training sequence followed by the test sequence.
    """
    def build_sequence(indices):
        # Pair the selected regions with the labels at the same positions.
        regions = BedSequence(genome, bed.iloc[indices], batch_size=batch_size)
        return MixedSequence(
            x=regions,
            y=labels[indices],
            batch_size=batch_size
        )

    train_sequence = build_sequence(train)
    test_sequence = build_sequence(test)
    return train_sequence, test_sequence
def get_sequence_holdout(train: np.ndarray, test: np.ndarray, bed: pd.DataFrame, labels: np.ndarray, genome,
                         batch_size=32) -> Tuple[Sequence, Sequence]:
    """Return the train and test sequences of a holdout, logging progress.

    Parameters
    ----------------------------
    train: np.ndarray,
        Positional indices selecting the training rows of ``bed`` and the
        training entries of ``labels``.
    test: np.ndarray,
        Positional indices selecting the test rows of ``bed`` and the test
        entries of ``labels``.
    bed: pd.DataFrame,
        Dataframe handed, row-filtered, to ``BedSequence``.
    labels: np.ndarray,
        Labels indexed with the same indices used for ``bed``.
    genome,
        Genome object forwarded to ``BedSequence``.
    batch_size=32,
        Batch size used for both the inner and the outer sequences.

    Returns
    ----------------------------
    Tuple with the training sequence followed by the test sequence.
    """
    logging.info("Computing train sequence data...")
    train_regions = BedSequence(genome, bed.iloc[train], batch_size=batch_size)
    train_sequence = MixedSequence(
        x=train_regions,
        y=labels[train],
        batch_size=batch_size
    )

    logging.info("Computing test sequence data...")
    test_regions = BedSequence(genome, bed.iloc[test], batch_size=batch_size)
    test_sequence = MixedSequence(
        x=test_regions,
        y=labels[test],
        batch_size=batch_size
    )

    return (train_sequence, test_sequence)
Esempio n. 4
0
def test_keras_mixed_sequence():
    """Fit a model on a MixedSequence and on its rasterized counterpart.

    The sequence has a single input and two named outputs. The model is
    first trained directly on the sequence and then on the ``X, y`` pair
    returned by ``sequence.rasterize()``.
    """
    model = build_model()
    batch_size = 32

    inputs = VectorSequence(np.random.randint(2, size=(100, 10)), batch_size)
    outputs = {
        "output1": VectorSequence(np.random.randint(2, size=(100, 10)), batch_size),
        "output2": VectorSequence(np.random.randint(2, size=(100, 20)), batch_size)
    }
    sequence = MixedSequence(inputs, outputs)

    # Options shared by both training runs.
    fit_options = dict(epochs=2, verbose=0, shuffle=True)

    # Train straight from the sequence object.
    model.fit(
        sequence,
        steps_per_epoch=sequence.steps_per_epoch,
        **fit_options
    )

    # Train again from the rasterized data.
    X, y = sequence.rasterize()
    model.fit(X, y, **fit_options)
def test_change_batch_size():
    """Check that the batch size of a MixedSequence can be reassigned."""
    initial_batch_size = 512
    updated_batch_size = 32
    num_samples = 4096

    sequence = MixedSequence(
        VectorSequence(np.empty(num_samples), batch_size=initial_batch_size),
        VectorSequence(np.empty(num_samples), batch_size=initial_batch_size),
    )
    print(sequence[0])
    assert sequence.batch_size == initial_batch_size

    sequence.batch_size = updated_batch_size
    assert sequence.batch_size == updated_batch_size

    # Fetching the last batch under the new batch size must not raise.
    last_step = sequence.steps_per_epoch - 1
    sequence[last_step]
Esempio n. 6
0
def test_illegal_parameters_keras_mixed_sequence():
    """Check that inconsistent sequences and bad indices raise ValueError.

    Every object is created inside its ``pytest.raises`` context, so the
    error is accepted whether it comes from the inner ``VectorSequence`` or
    from the enclosing ``MixedSequence``.
    """
    def random_matrix(rows):
        # Random binary matrix with ``rows`` rows and 10 columns.
        return np.random.randint(2, size=(rows, 10))

    # The two sequences use different batch sizes (20 and 50).
    with pytest.raises(ValueError):
        first = VectorSequence(random_matrix(100), 20)
        second = VectorSequence(random_matrix(100), 50)
        MixedSequence(first, second)

    # Only the first sequence is given ``elapsed_epochs=50``.
    with pytest.raises(ValueError):
        first = VectorSequence(random_matrix(100), 50, elapsed_epochs=50)
        second = VectorSequence(random_matrix(100), 50)
        MixedSequence(first, second)

    # The two sequences hold a different number of rows (60 and 100).
    with pytest.raises(ValueError):
        first = VectorSequence(random_matrix(60), 50)
        second = VectorSequence(random_matrix(100), 50)
        MixedSequence(first, second)

    # Step 10000 is requested from a 60-row sequence with batch size 50.
    with pytest.raises(ValueError):
        short_sequence = VectorSequence(random_matrix(60), 50)
        short_sequence[10000]
Esempio n. 7
0
def test_genomic_sequence_determinism():
    """Check that genomic batches stay paired with their labels.

    For each region table, the labels are the row positions themselves. A
    reference sequence built with ``shuffle=False`` and a batch as large as
    the table provides, in its first batch, the data every label should map
    to. Each batch of the tested sequence must match the reference rows
    addressed by its labels, across several epochs.
    """
    batch_size = 32
    epochs = 5
    enhancers = pd.read_csv("tests/enhancers.csv")
    promoters = pd.read_csv("tests/promoters.csv")

    genome = Genome("hg19", chromosomes=["chr1"])
    for region in tqdm((enhancers, promoters), desc="Region types"):
        total_rows = len(region)
        row_positions = np.arange(0, total_rows, dtype=np.int64)

        # Sequence under test, built with the default settings.
        tested_x = BedSequence(genome, region, batch_size)
        tested_y = VectorSequence(row_positions, batch_size)
        mixed_sequence = MixedSequence(x=tested_x, y=tested_y)

        # Reference: one batch spanning the whole table, shuffling disabled.
        reference_x = BedSequence(genome,
                                  region,
                                  batch_size=total_rows,
                                  shuffle=False)
        reference_y = VectorSequence(row_positions,
                                     batch_size=total_rows,
                                     shuffle=False)
        reference_mixed_sequence = MixedSequence(x=reference_x, y=reference_y)
        expected_rows, _ = reference_mixed_sequence[0]

        for _ in trange(epochs, desc="Epochs", leave=False):
            for step in range(mixed_sequence.steps_per_epoch):
                batch_x, batch_y = mixed_sequence[step]
                # The labels are row positions into the reference batch.
                assert (expected_rows[batch_y.astype(int)] == batch_x).all()
            mixed_sequence.on_epoch_end()
Esempio n. 8
0
def get_data(
    parameters: Tuple[Tuple[str, int, str], str]
) -> Tuple[pd.DataFrame, np.array] or List[np.array, np.array]:
    """Load a dataset in the representation selected by ``parameters``.

    Parameters
    ----------------------------
    parameters: Tuple[Tuple[str, int, str], str],
        Pair ``(load_parameters, data_type)``. ``load_parameters`` is
        forwarded untouched to ``load_dataset``, while ``data_type`` selects
        the representation and must be ``'epigenomic'`` or ``'sequences'``.

    Returns
    ----------------------------
    For ``'epigenomic'``, the pair ``(dataset, labels)`` produced by
    ``load_dataset``, with the dataset index reset in place.
    For ``'sequences'``, the first batch of a ``MixedSequence`` whose batch
    size equals the number of labels.

    Raises
    ----------------------------
    ValueError,
        If ``data_type`` is neither ``'epigenomic'`` nor ``'sequences'``.
    """
    # NOTE(review): the return annotation uses ``or``, which evaluates to its
    # first operand only. It should become a ``typing.Union`` once ``Union``
    # is imported at file level; it is left untouched here to avoid relying
    # on a name that may not be in scope.
    load_parameters, data_type = parameters

    if data_type == 'epigenomic':
        dataset, labels = load_dataset(load_parameters)
        dataset.reset_index(drop=True, inplace=True)
        return dataset, labels

    if data_type == 'sequences':
        epigenomes, labels = load_dataset(load_parameters)
        genome = Genome('hg19')
        # The index levels hold the region coordinates: turn them back into
        # columns and keep only those.
        bed = epigenomes.reset_index()[epigenomes.index.names]
        batch_size = len(labels)
        every_row = np.arange(batch_size)
        sequence = MixedSequence(
            x=BedSequence(genome, bed.iloc[every_row], batch_size=batch_size),
            y=labels[every_row],
            batch_size=batch_size
        )
        # Index the first batch directly instead of materialising every
        # batch into a list only to keep element 0.
        return sequence[0]

    # Fail loudly rather than silently returning None for an unknown type.
    raise ValueError(
        "Unsupported data type {!r}: expected 'epigenomic' or "
        "'sequences'.".format(data_type)
    )
def create_sequence(bed: pd.DataFrame, assembly: Genome,
                    batch_size: int) -> MixedSequence:
    """Build the training sequence for the model.

    Parameters
    ----------------------------
    bed: pd.DataFrame,
        Dataframe with bed file structure. Its ``labels`` column provides
        the target values, cast to float.
    assembly: Genome,
        Genomic assembly to use.
    batch_size: int,
        Batch size to use for both inputs and targets.

    Returns
    ----------------------------
    MixedSequence pairing the bed regions with their labels.
    """
    regions = BedSequence(assembly=assembly,
                          bed=bed,
                          batch_size=batch_size)
    targets = VectorSequence(bed.labels.values.astype(float),
                             batch_size=batch_size)
    return MixedSequence(x=regions, y=targets)