def save_earthquake_cycles(data, xcol="acoustic_data", ycol="time_to_failure", data_dir="../data"):
    """Split the training data into earthquake cycles and pickle each cycle to disk.

    A cycle runs from one earthquake up to (but not including) the next one. The first
    cycle starts at row 0 and the last cycle ends at the last row of `data`. Cycle `i`
    is written to `<data_dir>/earthquake_<i>.pkl`.

    Parameters
    ----------
    data: pd.DataFrame,
        The data with all observations. Must have two columns: one with the measurement
        of the signal and one with the target, i.e., time to the next earthquake.
    xcol: str, optional, default: "acoustic_data"
        The column referring to the signal data. Accepted for a uniform signature;
        it is not used by this function.
    ycol: str, optional, default: "time_to_failure"
        The column referring to the target value. Passed on to `find_earthquakes`.
    data_dir: str
        Where to save the files.
    """
    progress("Finding earthquake locations..")
    quake_starts = find_earthquakes(data, ycol=ycol)

    # Add the very first row and the end of the data so that consecutive pairs of
    # boundaries delimit every cycle, including the first and the last one.
    boundaries = np.append(np.insert(quake_starts, 0, [0]), len(data))

    for cycle_nr, (start, stop) in enumerate(zip(boundaries[:-1], boundaries[1:])):
        target_path = os.path.join(data_dir, "earthquake_{}.pkl".format(cycle_nr))
        data.iloc[start:stop, :].to_pickle(target_path)
        progress("Cycle {} saved.".format(cycle_nr))
def find_earthquakes(data, ycol="time_to_failure", chunks=3):
    """Find the indices at which an earthquake occurs.

    An earthquake is detected at every row whose value in `ycol` is strictly larger
    than the value in the previous row. The returned indices indicate the location of
    the first observation that is part of a new cycle, so they can be used directly
    for positional indexing to obtain the exact cycles.

    Parameters
    ----------
    data: pd.DataFrame
        The earthquake training data.
    ycol: str, optional, default="time_to_failure"
        The column representing the time until the next failure.
    chunks: int, optional, default=3
        In how many chunks the earthquakes should be found in order to prevent
        memory issues. If the default (3) still leads to problems, try increasing this.

    Returns
    -------
    np.ndarray of int
        Sorted positional indices of the first row of every new cycle. Empty (but
        still of integer dtype) if no earthquake is found.

    Raises
    ------
    ValueError
        If `chunks` is smaller than 1.
    """
    if chunks < 1:
        raise ValueError("chunks must be a positive integer, got {}".format(chunks))

    n_rows = len(data)
    # Integer chunk boundaries, so offsets never go through float arithmetic.
    bounds = [(n_rows * i) // chunks for i in range(chunks)] + [n_rows]

    found = []
    for i in range(chunks):
        progress("Finding earthquakes in chunk {}/{}".format(i + 1, chunks))
        start, stop = bounds[i], bounds[i + 1]
        # Start one row early (except for the first chunk) so that the difference for
        # the first row of this chunk is computed against the last row of the previous
        # chunk. Without this overlap an earthquake that falls exactly on a chunk
        # boundary would go undetected.
        first = max(start - 1, 0)
        subset = data[ycol].iloc[first:stop].values
        # np.diff(subset)[j] compares row first + j + 1 with row first + j.
        jumps = np.flatnonzero(np.diff(subset) > 0)
        found.append(jumps + first + 1)
        del subset
        gc.collect()

    return np.sort(np.concatenate(found)).astype(int)
def predict_on_test(model, feature_computer, ycol="time_to_failure", stft=True, stft_feature_computer=None,
                    data_dir="../data"):
    """Load the test data, compute features on every segment, and predict the target.

    Parameters
    ----------
    model: a fitted predictor,
        Must implement the 'predict' method to predict on the test sequences and must
        already be fitted/trained.
    feature_computer: FeatureComputer object or similar,
        A class that implements a method '.compute()' that takes an array and returns
        features. It must also have an attribute 'feature_names' that shows the
        corresponding names of the features. The same instance as was used during training.
    ycol: str, optional (default: "time_to_failure"),
        The column referring to the target value.
    stft: bool, optional (default: True),
        Whether to additionally compute features on the Short Time Fourier Transform
        of every segment. Requires `stft_feature_computer`.
    stft_feature_computer: FeatureComputer object or None,
        The computer for stft features. Must not be None when `stft` is True. Its
        feature names get the suffix "_stft" in the feature table.
    data_dir: str, optional (default: "../data")
        The path to the main folder with the data for this competition. Note that test
        data is in several files, which are assumed to be in a subfolder called 'test'
        inside the data_dir. A file 'sample_submission.csv' is assumed to be directly
        in data_dir.

    Returns
    -------
    submission: pd.DataFrame,
        The predictions in the right format for submission.

    Raises
    ------
    ValueError
        If `stft` is True but no `stft_feature_computer` is given.
    """
    # Fail early with a clear message instead of an AttributeError on None further down.
    if stft and stft_feature_computer is None:
        raise ValueError("stft_feature_computer must be provided when stft=True")

    # take the segment ids from the sample submission file
    sample_submission = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"), index_col="seg_id")
    x_test = pd.DataFrame(columns=feature_computer.feature_names, dtype=np.float64,
                          index=sample_submission.index)
    if stft:
        stft_columns = [name + "_stft" for name in stft_feature_computer.feature_names]
        x_test_stft = pd.DataFrame(columns=stft_columns, dtype=np.float64, index=sample_submission.index)

    # load segments and compute their features one by one
    n_segments = len(x_test)
    for i, seg_id in enumerate(x_test.index):
        progress("Loading and computing features for segment {}/{}.".format(i + 1, n_segments),
                 same_line=True, newline_end=(i + 1 == n_segments))
        segment = pd.read_csv(os.path.join(data_dir, "test", seg_id + ".csv"))
        acoustic = segment["acoustic_data"].values
        x_test.loc[seg_id, :] = feature_computer.compute(acoustic)

        if stft:
            # Collapse the transform to one series by summing the magnitudes over axis 0.
            _, _, zxx = signal.stft(acoustic)
            x_stft = np.sum(np.abs(zxx), axis=0)
            x_test_stft.loc[seg_id, :] = stft_feature_computer.compute(x_stft)

    if stft:
        x_test = pd.concat([x_test, x_test_stft], axis=1)

    sample_submission[ycol] = model.predict(x_test)
    progress("Predictions made.")
    return sample_submission.reset_index()
def evaluate_on_cycles(model, cycle_nrs=None, scaler=None, sequence_length=150000, xcol="acoustic_data",
                       ycol="time_to_failure", data_dir="../data"):
    """Evaluate a model on certain earthquake cycles.

    Every cycle is loaded with `get_cycle`, optionally scaled, and evaluated through a
    shuffled `TimeseriesGenerator`. The per-cycle losses are combined into one average
    that is weighted by the number of observations in each cycle.

    Parameters
    ----------
    model: a trained Keras.Model
        Must implement `evaluate_generator`.
    cycle_nrs: list of ints
        The cycle numbers you want to evaluate the model on. Required and must not
        be empty, even though the signature has a default of None.
    scaler: earthquakes.deep.Scaler object or None
        Scaler instance to use to scale every cycle. Must be initialized and fitted
        (if the scaling method requires so). If None, the signal is used unscaled.
    sequence_length: int, optional, default=150000
        The length of a signal sequence. This should probably be left at its default.
    xcol, ycol: str, optional
        The column names of the signal (xcol) and target (ycol). Defaults to
        xcol="acoustic_data", ycol="time_to_failure".
    data_dir: str, optional, default="../data"
        The directory that holds the cycle data.

    Returns
    -------
    tuple of (weighted_loss, losses, weights):
        weighted_loss: float
            The total loss weighted over the cycles.
        losses: list
            The loss returned by the model for every cycle, in the order of `cycle_nrs`.
        weights: list of int
            The number of observations of every cycle.

    Raises
    ------
    ValueError
        If `cycle_nrs` is None or empty.
    """
    if cycle_nrs is None:
        raise ValueError("cycle_nrs must be a non-empty list of cycle numbers")
    cycle_nrs = list(cycle_nrs)
    if not cycle_nrs:
        # A weighted average over zero cycles is undefined (0 / 0).
        raise ValueError("cycle_nrs must be a non-empty list of cycle numbers")

    losses, weights = [], []
    for nr in cycle_nrs:
        x, y = get_cycle(nr, xcol=xcol, ycol=ycol, data_dir=data_dir)
        x = x.reshape((len(x), 1))
        y = y.reshape((len(y), 1))
        if scaler is not None:
            x = scaler.scale(x)
        data_gen = TimeseriesGenerator(x, y, length=sequence_length, batch_size=128, shuffle=True)
        progress("Evaluating cycle {}..".format(nr))
        # The number of steps must be a whole number; round the ratio up.
        steps = int(np.ceil(len(x) / sequence_length))
        # NOTE(review): assumes the model returns a single scalar loss (no extra metrics).
        loss = model.evaluate_generator(data_gen, steps=steps)
        losses.append(loss)
        weights.append(len(x))

    weighted_loss = np.dot(losses, weights) / np.sum(weights)
    print("Weighted loss over cycles: {}".format(weighted_loss))
    return weighted_loss, losses, weights
def predict_on_test(model, feature_computer=None, test_data=None, data_dir="../data", ycol="time_to_failure",
                    **stft_kwargs):
    """Predict on the test data. Data can be provided as an argument or loaded from disk.

    Parameters
    ----------
    model: a fitted predictor
        Must implement the 'predict' method to predict on the test sequences and must
        already be fitted/trained.
    feature_computer: FeatureComputer object or similar,
        A class that implements a method '.compute()' that takes an array and returns
        features. It must also have an attribute 'feature_names' that shows the
        corresponding names of the features. The same instance as was used during
        training. Only needed when `test_data` is None.
    test_data: array-like or None
        The test data. If None, a feature_computer must be provided, so that a dataset
        can be created (with `create_test_dataset`) before predicting.
    data_dir: str, optional (default: "../data")
        The path to the main folder with the data for this competition. Note that test
        data is in several files, which are assumed to be in a subfolder called 'test'
        inside the data_dir. A file 'sample_submission.csv' is assumed to be directly
        in data_dir.
    ycol: str, optional (default: "time_to_failure"),
        The column referring to the target value.
    **stft_kwargs: key-value pairs,
        Extra keyword arguments that are forwarded unchanged to `create_test_dataset`
        when the test dataset has to be created.

    Returns
    -------
    submission: pd.DataFrame
        Predictions that can be used for submission (or for ensembling).

    Raises
    ------
    ValueError
        If neither `test_data` nor `feature_computer` is provided.
    """
    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    if test_data is None and feature_computer is None:
        raise ValueError("Either test_data or feature_computer must be provided")

    submission = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"), index_col="seg_id")

    if test_data is None:
        test_data = create_test_dataset(feature_computer, data_dir=data_dir, ycol=ycol, **stft_kwargs)
        progress("Test dataset created.")

    submission[ycol] = model.predict(test_data)
    progress("Predictions made.")
    return submission.reset_index()
def cv_with_feature_computer(data, model_cls, feature_computer, ycol="time_to_failure", n_splits=5,
                             train_samples=1000, val_samples=500, predict_test=False, stft=False,
                             stft_feature_computer=None, data_dir="../data/test", **model_params):
    """Cross-validate a model on randomly sampled sequences of observations.

    The data is split into `n_splits` consecutive (unshuffled) folds. In every fold,
    feature datasets are sampled from the training and the validation part, a fresh
    model is trained, and the mean absolute error on the validation sample is recorded.

    Parameters
    ----------
    data: pd.DataFrame,
        The data with all observations. Must have two columns: one with the measurement
        of the signal and one with the target, i.e., time to the next earthquake.
    model_cls: uninitialized predictor class,
        Must implement the '.predict' and '.fit' methods. Every Scikit-Learn predictor
        suffices.
    feature_computer: FeatureComputer object or similar,
        A class that implements a method '.compute()' that takes an array and returns
        features. It must also have an attribute 'feature_names' that shows the
        corresponding names of the features.
    ycol: str, optional (default: "time_to_failure"),
        The column referring to the target value.
    n_splits: int, optional (default: 5)
        The number of folds in cross validation.
    train_samples: int, optional (default: 1000),
        The number of sequences to sample for training.
    val_samples: int, optional (default: 500),
        The number of sequences to sample for validation.
    predict_test: boolean, optional (default: False),
        If True, predicts on the test data at every fold and returns (together with cv
        scores) a dataframe with predictions on the test data.
    stft: boolean, optional (default: False),
        If True, features of the Short Time Fourier Transform are computed as well.
    stft_feature_computer: FeatureComputer object or None,
        The computer for stft features.
    data_dir: str, optional (default: "../data/test")
        Passed unchanged to `predict_on_test`; ignored if predict_test=False.
        NOTE(review): `predict_on_test` documents data_dir as the main data folder
        (holding 'sample_submission.csv' and a 'test' subfolder) -- confirm that this
        default points to the intended location.
    **model_params: key-value pairs,
        Any parameters to pass to the predictor model_cls upon initialization.

    Returns
    -------
    Either a list of validation scores (if predict_test=False) or a tuple of
    (list of validation scores, DataFrame with test predictions). In that DataFrame
    the predictions of the first fold are in column `ycol`; those of fold i > 0 are
    in column `ycol + "_<i>"`.
    """
    scores = []
    column_names = data.columns
    # both feature datasets are sampled with the same stft settings
    sampling_options = dict(stft=stft, stft_feature_computer=stft_feature_computer)
    folds = KFold(n_splits=n_splits, shuffle=False).split(data)

    for fold_nr, (train_rows, val_rows) in enumerate(folds):
        progress("Starting cross-validation fold {}.".format(fold_nr))

        # rebuild plain frames for the two parts of this fold
        progress("Splitting data in train and validation sets.")
        train_part = pd.DataFrame(data.values[train_rows], columns=column_names)
        val_part = pd.DataFrame(data.values[val_rows], columns=column_names)

        # draw the training sample and separate target from features
        progress("Sampling {} sequences from training data.".format(train_samples))
        train_set = create_feature_dataset(train_part, feature_computer, n_samples=train_samples,
                                           **sampling_options)
        train_target = train_set[ycol]
        train_inputs = train_set.drop(ycol, axis=1)
        progress("Train set sampled.")

        # draw the validation sample and separate target from features
        progress("Sampling {} sequences from validation data.".format(val_samples))
        val_set = create_feature_dataset(val_part, feature_computer, n_samples=val_samples,
                                         **sampling_options)
        val_target = val_set[ycol]
        val_inputs = val_set.drop(ycol, axis=1)
        progress("Validation set sampled.")

        # fit a fresh model and predict the validation sample
        progress("Start training and predicting.")
        val_predicted, fitted_model = train_and_predict(train_inputs, train_target, val_inputs, model_cls,
                                                        return_model=True, **model_params)
        progress("Predictions on validation set made.")

        # the competition metric is the mean absolute error
        fold_score = mean_absolute_error(val_target, val_predicted)
        scores.append(fold_score)
        progress("Validation score: {}.".format(fold_score))

        # optionally predict the test set with the model of this fold
        if predict_test:
            fold_predictions = predict_on_test(fitted_model, feature_computer, data_dir=data_dir, ycol=ycol,
                                               stft=stft, stft_feature_computer=stft_feature_computer)
            if fold_nr == 0:
                # the first fold provides the frame that later folds add columns to
                test_predictions = fold_predictions
            else:
                test_predictions[ycol + "_{}".format(fold_nr)] = fold_predictions[ycol].copy()
            progress("Predictions on test set made.")

        # free the large per-fold objects before the next fold starts
        del train_part, val_part, train_set, train_target, train_inputs
        del val_set, val_inputs, val_target, fitted_model
        gc.collect()

    if predict_test:
        return scores, test_predictions
    return scores