Example #1
# Shared imports assumed by all examples on this page; `progress`,
# `find_earthquakes`, `get_cycle`, `create_test_dataset`, `train_and_predict`
# and `create_feature_dataset` are helpers from the same project.
import gc
import os

import numpy as np
import pandas as pd
from scipy import signal
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from keras.preprocessing.sequence import TimeseriesGenerator


def save_earthquake_cycles(data,
                           xcol="acoustic_data",
                           ycol="time_to_failure",
                           data_dir="../data"):
    """Save the training data as chunks representing one cycle (earthquake to earthquake).

    Parameters
    ----------
    data: pd.DataFrame,
        The data with all observations. Must have two columns: one with the measurement
        of the signal and one with the target, i.e., time to the next earthquake.
    xcol: str, optional, default: "acoustic_data"
        The column referring to the signal data.
    ycol: str, optional, default: "time_to_failure"
        The column referring to the target value.
    data_dir: str
        Where to save the files.
    """
    progress("Finding earthquake locations..")
    indx = find_earthquakes(data, ycol=ycol)
    indx = np.insert(indx, 0, [0])
    indx = np.append(indx, len(data))
    for i in range(len(indx) - 1):
        cycle = data.iloc[indx[i]:indx[i + 1], :]
        cycle.to_pickle(os.path.join(data_dir, "earthquake_{}.pkl".format(i)))
        progress("Cycle {} saved.".format(i))
Example #2
def find_earthquakes(data, ycol="time_to_failure", chunks=3):
    """Find the indices at which an earthquake occurs.

    Note, the returned indices indicate the location of the first observation
    that is part of a new cycle. This means it can be used directly for indexing
    to obtain the exact cycles.

    Parameters
    ----------
    data: pd.DataFrame
        The earthquake training data.
    ycol: str, optional, default="time_to_failure"
        The column representing the time until the next failure.
    chunks: int, optional, default=3
        The number of chunks to split the data into while searching, to limit
        memory usage. If the default (3) still leads to memory issues, try increasing it.
    """
    chunk_size = len(data) // chunks
    chunk_indices = [chunk_size * i for i in range(chunks)] + [len(data)]

    all_indices = []
    for i in range(chunks):
        progress("Finding earthquakes in chunk {}/{}".format(i + 1, chunks))
        # start one observation early (except in the first chunk) so that a
        # jump exactly on a chunk boundary is not missed by diff()
        start = max(chunk_indices[i] - 1, 0)
        subset = data[ycol].iloc[start:chunk_indices[i + 1]]
        # time_to_failure decreases within a cycle, so a positive diff marks
        # the first observation of a new cycle
        indices = np.flatnonzero(subset.diff().values > 0) + start
        all_indices = np.append(all_indices, indices)
        del subset
        gc.collect()

    return np.sort(np.asarray(all_indices, dtype=int))
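The detection hinges on time_to_failure decreasing within a cycle and jumping upward at a quake, so a positive first difference marks a cycle boundary. A tiny synthetic check (hypothetical toy data, assuming the shared imports above):

# Toy series: time_to_failure falls within a cycle and jumps back up at an
# earthquake (between positions 2 and 3), so diff() > 0 flags position 3.
toy = pd.DataFrame({"time_to_failure": [0.3, 0.2, 0.1, 0.9, 0.8, 0.7]})
print(find_earthquakes(toy, chunks=1))  # expected output: [3]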
Example #3
def predict_on_test(model, feature_computer, ycol="time_to_failure", stft=True,
                    stft_feature_computer=None, data_dir="../data"):
    """Load the test data, compute features on every segment, and predict the target.

    Parameters
    ----------
    model: a fitted predictor,
        Must implement the 'predict' method to predict on the test sequences and
        must already be fitted/trained.
    feature_computer: FeatureComputer object or similar,
        A class that implements a method '.compute()' that takes an array and returns
        features. It must also have an attribute 'feature_names' that shows the corresponding
        names of the features. The same instance as was used during training.
    ycol: str, optional (default: "time_to_failure"),
        The column referring to the target value.
    stft: bool, optional (default: True),
        Whether to compute the Short Time Fourier Transform and derive additional
        features from it. Requires stft_feature_computer.
    stft_feature_computer: FeatureComputer object or None,
        The computer for the stft features. Must be provided when stft=True.
    data_dir: str, optional (default: "../data")
        The path to the main folder with the data for this competition. Note that the
        test data is split over several files, which are assumed to be in a subfolder
        called 'test' inside data_dir. A file 'sample_submission.csv' is assumed to be
        directly in data_dir.

    Returns
    -------
    submission: pd.DataFrame,
        The predictions in the right format for submission.
    """
    # take the segment ids from the sample submission file
    sample_submission = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"), index_col="seg_id")
    x_test = pd.DataFrame(columns=feature_computer.feature_names, dtype=np.float64, index=sample_submission.index)
    if stft:
        if stft_feature_computer is None:
            raise ValueError("stft_feature_computer must be provided when stft=True.")
        x_test_stft = pd.DataFrame(columns=[x + "_stft" for x in stft_feature_computer.feature_names],
                                   dtype=np.float64,
                                   index=sample_submission.index)

    # load and predict segments one by one
    for i, seg_id in enumerate(x_test.index):
        progress("Loading and computing features for segment {}/{}.".format(i + 1, len(x_test)),
                 same_line=True, newline_end=(i + 1 == len(x_test)))

        segment = pd.read_csv(os.path.join(data_dir, "test", seg_id + ".csv"))
        x_test.loc[seg_id, :] = feature_computer.compute(segment["acoustic_data"].values)
        if stft:
            _, _, zxx = signal.stft(segment["acoustic_data"].values)
            x_stft = np.sum(np.abs(zxx), axis=0)
            x_test_stft.loc[seg_id, :] = stft_feature_computer.compute(x_stft)

    if stft:
        x_test = pd.concat([x_test, x_test_stft], axis=1)

    sample_submission[ycol] = model.predict(x_test)
    progress("Predictions made.")
    return sample_submission.reset_index()
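The FeatureComputer interface these functions rely on is small: a '.compute()' method mapping a 1-D signal array to a feature vector, plus a matching 'feature_names' attribute. A minimal stand-in (a sketch under those assumptions, not the project's actual class):

class SimpleFeatureComputer:
    """Minimal example of the interface expected by predict_on_test."""

    feature_names = ["mean", "std", "min", "max"]

    def compute(self, arr):
        # one value per entry in feature_names, in the same order
        return np.array([arr.mean(), arr.std(), arr.min(), arr.max()])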
Example #4
def evaluate_on_cycles(model,
                       cycle_nrs=None,
                       scaler=None,
                       sequence_length=150000,
                       xcol="acoustic_data",
                       ycol="time_to_failure",
                       data_dir="../data"):
    """Evaluate a model on certain earthquake cycles.

    Parameters
    ----------
    model: a trained keras.Model
        Must implement the `evaluate_generator` method.
    cycle_nrs: list of ints
        The cycle numbers you want to evaluate the model on. Required, despite
        the None default.
    scaler: earthquakes.deep.Scaler object
        Scaler instance to use to scale every cycle. Must be initialized and fitted
        (if the scaling method requires so).
    sequence_length: int, optional, default=150000
        The length of a signal sequence. This should probably be left at its default.
    xcol, ycol: str, optional
        The column names of the signal (xcol) and target (ycol). Defaults to
        xcol="acoustic_data", ycol="time_to_failure".
    data_dir: str, optional, default="../data"
        The directory that holds the cycle data.

    Returns
    -------
    tuple of (weighted_loss, losses, weights):
        weighted_loss: float
            The total loss, weighted by cycle length.
        losses: list of float
            The loss per evaluated cycle.
        weights: list of int
            The number of observations in each cycle, used as weights.
    """
    losses, weights = [], []
    for nr in cycle_nrs:
        x, y = get_cycle(nr, xcol=xcol, ycol=ycol, data_dir=data_dir)
        x = x.reshape((len(x), 1))
        y = y.reshape((len(y), 1))
        if scaler is not None:
            x = scaler.scale(x)

        data_gen = TimeseriesGenerator(x,
                                       y,
                                       length=sequence_length,
                                       batch_size=128,
                                       shuffle=True)
        progress("Evaluating cycle {}..".format(nr))
        loss = model.evaluate_generator(data_gen,
                                        steps=len(x) // sequence_length)
        losses.append(loss)
        weights.append(len(x))

    weighted_loss = np.dot(losses, weights) / np.sum(weights)
    print("Weighted loss over cycles: {}".format(weighted_loss))
    return weighted_loss, losses, weights
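A usage sketch under stated assumptions: `model` and `scaler` are assumed to come from an earlier training run, and cycles 0 through 2 to have been saved by save_earthquake_cycles:

# Hypothetical evaluation of a trained Keras model on three held-out cycles.
weighted_loss, losses, weights = evaluate_on_cycles(model,
                                                    cycle_nrs=[0, 1, 2],
                                                    scaler=scaler)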
Example #5
def predict_on_test(model,
                    feature_computer=None,
                    test_data=None,
                    data_dir="../data",
                    ycol="time_to_failure",
                    **stft_kwargs):
    """Predict on the test data.

    Data can be provided as an argument or loaded from disk.

    Parameters
    ----------
    model: a fitted predictor
        Must implement the 'predict' method to predict on the test sequences and
        must already be fitted/trained.
    feature_computer: FeatureComputer object or similar,
        A class that implements a method '.compute()' that takes an array and returns
        features. It must also have an attribute 'feature_names' that shows the corresponding
        names of the features. The same instance as was used during training.
    test_data: array-like or None
        The test data. If None, a feature_computer must be provided so that a
        dataset can be created before predicting.
    ycol: str, optional (default: "time_to_failure"),
        The column referring to the target value.
    data_dir: str, optional (default: "../data")
        The path to the main folder with the data for this competition. Note that the
        test data is split over several files, which are assumed to be in a subfolder
        called 'test' inside data_dir. A file 'sample_submission.csv' is assumed to be
        directly in data_dir.
    **stft_kwargs: key-value pairs,
        Additional keyword arguments passed on to create_test_dataset (e.g. stft options).

    Returns
    -------
    submission: pd.DataFrame
        Predictions that can be used for submission (or for ensembling).
    """
    submission = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"),
                             index_col="seg_id")

    if test_data is None:
        assert feature_computer is not None, "Either test_data or feature_computer must be provided"
        test_data = create_test_dataset(feature_computer,
                                        data_dir=data_dir,
                                        ycol=ycol,
                                        **stft_kwargs)
        progress("Test dataset created.")

    submission[ycol] = model.predict(test_data)
    progress("Predictions made.")
    return submission.reset_index()
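Both call styles of this variant, sketched with hypothetical inputs (`model` is a fitted predictor, `x_test` a precomputed feature matrix, and SimpleFeatureComputer the stand-in from Example #3):

# With a precomputed test feature matrix:
submission = predict_on_test(model, test_data=x_test)
# Or let the function build the test set via create_test_dataset (a project
# helper not shown on this page):
submission = predict_on_test(model, feature_computer=SimpleFeatureComputer())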
Example #6
def cv_with_feature_computer(data, model_cls, feature_computer, ycol="time_to_failure",
                             n_splits=5, train_samples=1000, val_samples=500, predict_test=False,
                             stft=False, stft_feature_computer=None, data_dir="../data/test", **model_params):
    """Perform custom cross validation using randomly sampled sequences of observations.

    Parameters
    ----------
    data: pd.DataFrame,
        The data with all observations. Must have two columns: one with the measurement
        of the signal and one with the target, i.e., time to the next earthquake.
    model_cls: uninitialized predictor class,
        Must implement the '.predict' and '.fit' methods. Every Scikit-Learn
        predictor suffices.
    feature_computer: FeatureComputer object or similar,
        A class that implements a method '.compute()' that takes an array and returns
        features. It must also have an attribute 'feature_names' that shows the corresponding
        names of the features.
    ycol: str, optional (default: "time_to_failure"),
        The column referring to the target value.
    n_splits: int, optional (default: 5)
        The number of folds in cross validation.
    train_samples: int, optional (default: 1000),
        The number of sequences to sample for training.
    val_samples: int, optional (default: 500),
        The number of sequences to sample for validation.
    predict_test: boolean, optional (default: False),
        If True, predicts on the test data at every fold and returns (together with cv scores)
        a dataframe with predictions on the test data.
    stft: boolean, optional (default: False),
        If True, also computes the Short Time Fourier Transform of each sequence and
        includes features derived from it.
    stft_feature_computer: FeatureComputer object or None,
        The computer for stft features.
    data_dir: str, optional (default: "../data")
        The path to the main folder with the data for this competition. Note that the
        test data is split over several files, which are assumed to be in a subfolder
        called 'test' inside data_dir. A file 'sample_submission.csv' is assumed to be
        directly in data_dir. This parameter is ignored if predict_test=False.
    **model_params: key-value pairs,
        Any parameters to pass to the predictor model_cls upon initialization.

    Returns
    -------
    Either a list of validation scores (if predict_test=False) or a tuple of
    (list of validation scores, DataFrame with test predictions).
    """
    splitter = KFold(n_splits=n_splits, shuffle=False)

    scores = []
    for i, (train_index, val_index) in enumerate(splitter.split(data)):
        progress("Starting cross-validation fold {}.".format(i))

        # split the data according to the indices
        progress("Splitting data in train and validation sets.")
        cols = data.columns
        train = pd.DataFrame(data.values[train_index], columns=cols)
        val = pd.DataFrame(data.values[val_index], columns=cols)

        # sample random sequences for training
        progress("Sampling {} sequences from training data.".format(train_samples))
        train_features = create_feature_dataset(train, feature_computer, n_samples=train_samples,
                                                stft=stft, stft_feature_computer=stft_feature_computer)
        y_train = train_features[ycol]
        x_train = train_features.drop(ycol, axis=1)
        progress("Train set sampled.")

        # sample random sequences for validation
        progress("Sampling {} sequences from validation data.".format(val_samples))
        val_features = create_feature_dataset(val, feature_computer, n_samples=val_samples,
                                              stft=stft, stft_feature_computer=stft_feature_computer)
        y_val = val_features[ycol]
        x_val = val_features.drop(ycol, axis=1)
        progress("Validation set sampled.")

        # train and predict validation set
        progress("Start training and predicting.")
        y_val_hat, model = train_and_predict(x_train, y_train, x_val, model_cls, return_model=True, **model_params)
        progress("Predictions on validation set made.")

        # evaluate using mean absolute error for this competition
        score = mean_absolute_error(y_val, y_val_hat)
        scores.append(score)
        progress("Validation score: {}.".format(score))

        # predict on test set if specified
        if predict_test:
            if i == 0:
                test_predictions = predict_on_test(model, feature_computer, data_dir=data_dir, ycol=ycol,
                                                   stft=stft, stft_feature_computer=stft_feature_computer)
            else:
                new_predictions = predict_on_test(model, feature_computer, data_dir=data_dir, ycol=ycol,
                                                  stft=stft, stft_feature_computer=stft_feature_computer)
                test_predictions[ycol + "_{}".format(i)] = new_predictions[ycol].copy()
        progress("Predictions on test set made.")

        # clear up memory
        del train, val, train_features, y_train, x_train, val_features, x_val, y_val, model
        gc.collect()

    if predict_test:
        return scores, test_predictions
    else:
        return scores
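A sketch of a full run (hypothetical: `train` is the loaded training frame from Example #1 and SimpleFeatureComputer the stand-in from Example #3); the per-fold test predictions can then be averaged into a simple ensemble, as Example #5's docstring suggests:

from sklearn.ensemble import RandomForestRegressor

scores, test_preds = cv_with_feature_computer(train, RandomForestRegressor,
                                              SimpleFeatureComputer(),
                                              predict_test=True,
                                              n_estimators=100)
print("Mean MAE over folds: {:.3f}".format(np.mean(scores)))

# Average the per-fold prediction columns into one submission column.
pred_cols = [c for c in test_preds.columns if c.startswith("time_to_failure")]
test_preds["time_to_failure"] = test_preds[pred_cols].mean(axis=1)
test_preds[["seg_id", "time_to_failure"]].to_csv("submission.csv", index=False)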