def test_instrument_streamer(slicer, tiny_feats):
    df = tiny_feats.to_df()
    t_len = 10
    batch_size = 12
    if not df.empty:
        streamer = streams.InstrumentStreamer(
            df, slicer, t_len=t_len, batch_size=batch_size)
        __test_streamer(streamer, t_len, batch_size)
def test_instrument_streamer_with_zmq(tiny_feats):
    df = tiny_feats.to_df()
    t_len = 10
    batch_size = 12
    if not df.empty:
        streamer = streams.InstrumentStreamer(
            df, streams.cqt_slices, t_len=t_len, batch_size=batch_size,
            use_zmq=True)
        __test_streamer(streamer, t_len, batch_size)
def test_overfit_two_samples_cqt(tiny_feats):
    """Prove that the network works by training it on two random files
    from the dataset, intentionally overfitting it.

    Warning: not deterministic, but it could be made so (see the seeding
    sketch below).
    """
    features_df = tiny_feats.to_df()

    # Get the list of instruments and pick the first two.
    instruments = sorted(features_df["instrument"].unique())
    selected_instruments = instruments[:2]

    # Build a dataframe containing one randomly sampled file per
    # selected instrument.
    test_df = pandas.concat([
        features_df[features_df["instrument"] ==
                    selected_instruments[0]].sample(),
        features_df[features_df["instrument"] ==
                    selected_instruments[1]].sample()])

    t_len = 8
    batch_size = 8
    n_targets = 2

    # Create a streamer that samples just those two files.
    streamer = streams.InstrumentStreamer(
        test_df, streams.cqt_slices, t_len=t_len, batch_size=batch_size)

    # Create a new model.
    network_def = models.cqt_iX_c1f1_oY(t_len, n_targets)
    model = models.NetworkManager(network_def)

    # Train for up to max_batches iterations, until the model overfits.
    max_batches = 10
    i = 0
    for batch in streamer:
        train_loss = model.train(batch)
        i += 1
        print("Batch:", i, "Loss:", train_loss)
        if i >= max_batches:
            break

    # Evaluate on the original files; the overfit model should do well.
    eval_batch = next(streamer)
    eval_probs = model.predict(eval_batch)
    eval_loss, accuracy = model.evaluate(eval_batch)
    print("Predictions:", eval_probs)
    print("Eval Loss:", eval_loss, "Accuracy:", accuracy)
    assert np.all(np.isfinite(eval_probs)) and np.isfinite(eval_loss) and \
        np.isfinite(accuracy)
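# The overfit test above is not deterministic. One way to make it repeatable
# (a sketch, not part of the original suite; the helper name and seed value
# are arbitrary) is to seed numpy's global RNG before building the dataframe
# and streamer. DataFrame.sample() falls back to that global RNG when no
# explicit random_state is given; whether the streamer's own draws do the
# same depends on its implementation.
def _seed_rngs(seed=42):
    """Seed numpy's global RNG so random sampling is repeatable."""
    np.random.seed(seed)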
def test_predict_dataframe(slicer_and_model, feats_df):
    # For these purposes we don't care much about what we train with.
    # TODO: random seeds for consistency / reproducibility.
    test_df = feats_df.sample(n=(12 * 12), replace=True)

    # Pick a model.
    t_len = 8
    n_classes = 12
    slicer = slicer_and_model[0]
    network_def = slicer_and_model[1](t_len, n_classes)
    model = models.NetworkManager(network_def)

    # Create the streamer.
    streamer = streams.InstrumentStreamer(
        test_df, record_slicer=slicer, t_len=t_len, batch_size=12)

    # Train for a little bit.
    iter_count = 0
    max_count = 100
    for batch in streamer:
        loss = model.train(batch)
        print("Batch", iter_count, "loss:", loss)
        iter_count += 1
        if iter_count >= max_count:
            break

    # Run prediction over the sampled datapoints and make sure a
    # dataframe comes back.
    eval_df = hcnn.evaluate.predict.predict_many(test_df, model, slicer, t_len)
    # TODO: why is this even necessary?
    eval_df = eval_df.dropna()
    assert isinstance(eval_df, pandas.DataFrame)
    assert len(eval_df) == len(test_df)

    analyzer = hcnn.evaluate.analyze.PredictionAnalyzer(eval_df, test_df)
    print(analyzer.classification_report)
    print(analyzer.pprint())
def train_model(self):
    """Train a model, writing intermediate params to disk.

    Trains for max_iterations or max_time, whichever is reached first.
    [Specified in the config.]
    """
    if self.skip_training:
        logger.info(utils.colored("--skip_training specified - skipping"))
        return True

    assert hasattr(self, 'train_set') and hasattr(self, 'valid_set')

    logger.info("Starting training for experiment: {}".format(
        self.experiment_name))

    # Save the config we used in the model directory, just in case.
    self.config.save(self._experiment_config_path)

    # Duration parameters
    max_iterations = self.config['training/max_iterations']
    max_time = self.config['training/max_time']  # in seconds

    # Collect various necessary parameters
    t_len = self.config['training/t_len']
    batch_size = self.config['training/batch_size']
    n_targets = self.config['training/n_targets']
    logger.debug(
        "Hyperparams:\nt_len: {}\nbatch_size: {}\n"
        "n_targets: {}\nmax_iterations: {}\nmax_time: {}s or {}h".format(
            t_len, batch_size, n_targets, max_iterations, max_time,
            (max_time / 60. / 60.)))

    slicer = get_slicer_from_feature(self.feature_mode)

    # Set up our streamer.
    logger.info("[{}] Setting up streamer".format(self.experiment_name))
    slice_logger = utils.SliceLogger()
    streamer = streams.InstrumentStreamer(
        self.train_set.to_df(), slicer,
        slicer_kwargs={'slice_logger': slice_logger},
        t_len=t_len,
        batch_size=batch_size)

    # Create our model.
    logger.info("[{}] Setting up model: {}".format(self.experiment_name,
                                                   self.model_definition))
    network_def = getattr(models, self.model_definition)(t_len, n_targets)
    model = models.NetworkManager(network_def)

    iter_print_freq = self.config.get(
        'training/iteration_print_frequency', None)
    iter_write_freq = self.config.get(
        'training/iteration_write_frequency', None)

    timers = utils.TimerHolder()
    iter_count = 0
    train_stats = pd.DataFrame(
        columns=['timestamp', 'batch_train_dur', 'iteration', 'loss'])
    min_train_loss = np.inf

    timers.start("train")
    logger.info("[{}] Beginning training loop at {}".format(
        self.experiment_name, timers.get("train")))
    try:
        timers.start(("stream", iter_count))
        for batch in streamer:
            timers.end(("stream", iter_count))
            timers.start(("batch_train", iter_count))
            loss = model.train(batch)
            timers.end(("batch_train", iter_count))
            row = dict(timestamp=timers.get_end(
                           ("batch_train", iter_count)),
                       batch_train_dur=timers.get(
                           ("batch_train", iter_count)),
                       iteration=iter_count,
                       loss=loss)
            train_stats.loc[len(train_stats)] = row

            # Time logging
            logger.debug("[Iter timing] iter: {} | loss: {} | "
                         "stream: {} | train: {}".format(
                             iter_count, loss,
                             timers.get(("stream", iter_count)),
                             timers.get(("batch_train", iter_count))))

            # Print status
            if iter_print_freq and (iter_count % iter_print_freq == 0):
                mean_train_loss = \
                    train_stats["loss"][-iter_print_freq:].mean()
                output_str = ("Iteration: {} | Mean_Train_loss: {}".format(
                    iter_count,
                    utils.conditional_colored(mean_train_loss,
                                              min_train_loss)))

                # With a small probability, run a randomly sampled
                # validation so we can see approximately how we're doing
                # on the validation set.
                if np.random.random() < .3:
                    timers.start(("sampled_validation", iter_count))
                    valid_loss = self.sampled_validation_loss(
                        model, slicer, t_len)
                    output_str += " | Sampled_Valid_loss: {:0.4f}".format(
                        valid_loss)
                    timers.end(("sampled_validation", iter_count))
                    output_str += " | Val_time: {:0.2f}s".format(
                        timers.get(("sampled_validation",
                                    iter_count)).total_seconds())

                logger.info(output_str)
                min_train_loss = min(mean_train_loss, min_train_loss)
                # Print the mean times for the last n iterations.
                logger.debug(
                    "Mean stream time: {}, Mean train time: {}".format(
                        timers.mean("stream",
                                    iter_count - iter_print_freq,
                                    iter_count),
                        timers.mean("batch_train",
                                    iter_count - iter_print_freq,
                                    iter_count)))

            # Save the model periodically.
            if iter_write_freq and (iter_count % iter_write_freq == 0):
                save_path = os.path.join(
                    self._params_dir,
                    self.param_format_str.format(iter_count))
                logger.debug("Writing params to {}".format(save_path))
                model.save(save_path)

                slice_log = os.path.join(self._cv_model_dir,
                                         "slice_log.csv")
                slice_logger.save(slice_log)

            if datetime.datetime.now() > \
                    (timers.get("train") + datetime.timedelta(
                        seconds=max_time)):
                raise EarlyStoppingException("Max Time reached")

            iter_count += 1
            timers.start(("stream", iter_count))

            # Stopping conditions
            if iter_count >= max_iterations:
                raise EarlyStoppingException("Max Iterations Reached")

    except KeyboardInterrupt:
        logger.warn(utils.colored("Training Cancelled", "red"))
        print("User cancelled training at iteration:", iter_count)
    except EarlyStoppingException as e:
        logger.warn(
            utils.colored("Training stopped: {}".format(e), "red"))
        print("Training halted for:", e)
    timers.end("train")

    # Print final training loss.
    logger.info("Total iterations: {}".format(iter_count))
    logger.info("Trained for {}".format(timers.get("train")))
    logger.info("Final training loss: {}".format(
        train_stats["loss"].iloc[-1]))

    # Make sure to save the final iteration's model.
    save_path = os.path.join(self._params_dir,
                             self.param_format_str.format(iter_count))
    model.save(save_path)
    logger.info("Completed training for experiment: {}".format(
        self.experiment_name))

    # Save training stats.
    logger.info("Writing training stats to {}".format(
        self._training_loss_path))
    train_stats.to_pickle(self._training_loss_path)

    # These files are needed for model selection, so make sure they exist.
    return os.path.exists(self._training_loss_path)