def frnn_model_loader(custom_path):
    """Build the PCS variant of the model and load stored weights into it.

    The configuration (``conf``) and the ``builder`` module are taken from
    module scope. The model summary is printed twice: once right after the
    model is built and once after the weights have been loaded.

    Args:
        custom_path: forwarded to ``ModelBuilder.load_model_weights`` as the
            location of the weights to load.

    Returns:
        The model returned by ``ModelBuilder.build_model_PCS``.
    """
    print(conf)
    model_builder = builder.ModelBuilder(conf)
    frnn_model = model_builder.build_model_PCS(False)

    # Summary of the freshly built model, before any weights are loaded.
    print(
        'Printing_out whole model summary..............********.......********'
    )
    frnn_model.summary()
    print('FRNN Model built....')

    model_builder.load_model_weights(frnn_model, custom_path)
    print('FRNN Model loaded....')
    frnn_model.summary()

    return frnn_model
def mpi_train(conf, shot_list_train, shot_list_validate, loader,
              callbacks_list=None):
    """Train the model with MPI data parallelism, validating every epoch.

    Every rank builds the model, resumes from the latest saved epoch and
    takes part in training and in the prediction passes. Rank 0 additionally
    drives the callbacks, saves/deletes per-epoch weights and feeds
    TensorBoard. TensorBoard is only created when the backend is not
    "theano", so every TensorBoard call is guarded on the object itself.

    Args:
        conf: configuration mapping; the 'training', 'model', 'callbacks',
            'paths' and 'data' sections are read, and conf['num_workers'] is
            written with the communicator size.
        shot_list_train: shots used to generate training batches.
        shot_list_validate: shots evaluated after every epoch.
        loader: data loader providing the batch generators.
        callbacks_list: forwarded to ``MPIModel.build_callbacks``.

    Returns:
        None.
    """
    loader.set_inference_mode(False)
    conf['num_workers'] = comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    train_model = specific_builder.build_model(False)

    # load the latest epoch we did. Returns -1 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
            or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    print('{} epochs left to go'.format(num_epochs - 1 - e))

    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)

    print("warmup {}".format(warmup_steps))
    mpi_model = MPIModel(train_model, optimizer, comm, batch_generator,
                         batch_size, lr=lr, warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)

    # TensorBoard only exists on rank 0 with a non-theano backend; it stays
    # None everywhere else.
    tensorboard = None
    if backend != "theano" and task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1, write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        mpi_model.model.summary()

    if task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()

    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs - 1:
        if task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        mpi_model.set_lr(lr * lr_decay**e)
        print_unique('\nEpoch {}/{}'.format(e, num_epochs))

        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs

        loader.verbose = False  # True during the first iteration
        if task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        epoch_logs = {}

        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)
        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            # Rebuild the training batch iterator after the pass over the
            # training shots.
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)

        stop_training = False
        if task_index == 0:
            print('=========Summary======== for epoch{}'.format(step))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))

            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                print("Not saving model weights")
                specific_builder.delete_model_weights(train_model,
                                                      int(round(e)))

            # tensorboard (absent with the theano backend)
            if tensorboard is not None:
                val_generator = partial(loader.training_batch_generator,
                                        shot_list=shot_list_validate)()
                val_steps = 1
                tensorboard.on_epoch_end(val_generator, val_steps,
                                         int(round(e)), epoch_logs)

        # Rank 0 decides on early stopping; all ranks must agree.
        stop_training = comm.bcast(stop_training, root=0)
        if stop_training:
            print("Stopping training due to early stopping")
            break

    if task_index == 0:
        callbacks.on_train_end()
        # Guarded: tensorboard is None when the backend is "theano".
        if tensorboard is not None:
            tensorboard.on_train_end()

    mpi_model.close()
def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
    """Run model predictions on ``shot_list``, split across MPI workers.

    Sub-lists of shots are assigned round-robin to the workers. After each
    full round (and after the last sub-list) the per-worker results are
    exchanged with ``allgather`` so every rank ends up with the complete
    result lists.

    Args:
        conf: configuration mapping; conf['model']['pred_batch_size'] is read.
        shot_list: shots to predict on; sorted in place.
        loader: data loader; switched to inference mode for the duration of
            the call and back to training mode before returning.
        custom_path: forwarded to ``ModelBuilder.load_model_weights``.

    Returns:
        Tuple ``(y_prime_global, y_gold_global, disruptive_global)`` of
        lists, each truncated to ``len(shot_list)`` entries.
    """
    loader.set_inference_mode(True)
    np.random.seed(task_index)
    # Sorting makes sure all replicas work on the same ordering.
    shot_list.sort()

    model_builder = builder.ModelBuilder(conf)
    model = model_builder.build_model(True)
    model_builder.load_model_weights(model, custom_path)

    # Broadcast rank 0's weights, then set them explicitly: fix for Py3.6
    if sys.version_info[0] > 2:
        root_weights = model.get_weights() if task_index == 0 else None
        model.set_weights(comm.bcast(root_weights, root=0))

    model.reset_states()
    if task_index == 0:
        pbar = Progbar(len(shot_list))
    shot_sublists = shot_list.sublists(conf['model']['pred_batch_size'],
                                       do_shuffle=False, equal_size=True)
    if task_index != 0:
        loader.verbose = False

    # Results accumulated by this worker since the last exchange.
    local_pred, local_gold, local_disr = [], [], []
    # Results gathered from all workers so far.
    gathered_pred, gathered_gold, gathered_disr = [], [], []

    for idx, shot_sublist in enumerate(shot_sublists):
        if idx % num_workers == task_index:
            X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)

            predicted = model.predict(
                X, batch_size=conf['model']['pred_batch_size'])
            model.reset_states()
            predicted = loader.batch_output_to_array(predicted)
            target = loader.batch_output_to_array(y)

            # Cut every array back to the length recorded for its shot.
            local_pred.extend(
                [arr[:shot_lengths[j]] for j, arr in enumerate(predicted)])
            local_gold.extend(
                [arr[:shot_lengths[j]] for j, arr in enumerate(target)])
            local_disr += disr

        round_complete = (idx + 1) % num_workers == 0
        is_last_sublist = idx + 1 == len(shot_sublists)
        if round_complete or is_last_sublist:
            comm.Barrier()
            gathered_pred += concatenate_sublists(comm.allgather(local_pred))
            gathered_gold += concatenate_sublists(comm.allgather(local_gold))
            gathered_disr += concatenate_sublists(comm.allgather(local_disr))
            comm.Barrier()
            local_pred, local_gold, local_disr = [], [], []

        if task_index == 0:
            pbar.add(1.0 * len(shot_sublist))

    # Keep at most one entry per shot in the input list.
    n_shots = len(shot_list)
    loader.set_inference_mode(False)
    return (gathered_pred[:n_shots], gathered_gold[:n_shots],
            gathered_disr[:n_shots])
def train(conf, shot_list_train, shot_list_validate, loader):
    """Train the model in a single process, evaluating after every epoch.

    Weights are saved after each epoch. When validation is enabled, they
    are deleted again if the monitored quantity did not improve. The base
    learning rate is read from conf['model']['lr'] (as in ``mpi_train``) and
    decayed by conf['model']['lr_decay'] every epoch.

    ``backend``, ``optimizer_class``, ``ProcessGenerator``, ``reset_states``,
    ``make_predictions_and_evaluate_gpu`` and ``plot_losses`` are resolved
    from module scope.

    Args:
        conf: configuration mapping; the 'training', 'model', 'callbacks'
            and 'data' sections are read.
        shot_list_train: shots used to generate training batches.
        shot_list_validate: shots evaluated after each epoch when
            conf['training']['validation_frac'] > 0.
        loader: data loader providing
            ``training_batch_generator_partial_reset``.

    Returns:
        None.
    """
    loader.set_inference_mode(False)
    np.random.seed(1)

    validation_losses = []
    validation_roc = []
    training_losses = []
    print('validate: {} shots, {} disruptive'.format(
        len(shot_list_validate), shot_list_validate.num_disruptive()))
    print('training: {} shots, {} disruptive'.format(
        len(shot_list_train), shot_list_train.num_disruptive()))

    if backend == 'tf' or backend == 'tensorflow':
        # Only configure the session the first time TensorFlow is imported.
        first_time = "tensorflow" not in sys.modules
        if first_time:
            import tensorflow as tf
            os.environ['KERAS_BACKEND'] = 'tensorflow'
            from keras.backend.tensorflow_backend import set_session
            config = tf.ConfigProto(device_count={"GPU": 1})
            set_session(tf.Session(config=config))
    else:
        os.environ['KERAS_BACKEND'] = 'theano'
        os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32'
        import theano

    from keras.utils.generic_utils import Progbar
    from keras import backend as K
    from plasma.models import builder

    print('Build model...', end='')
    specific_builder = builder.ModelBuilder(conf)
    train_model = specific_builder.build_model(False)
    print('Compile model', end='')
    train_model.compile(optimizer=optimizer_class(),
                        loss=conf['data']['target'].loss)
    print('...done')

    # load the latest epoch we did. Returns -1 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_start = e
    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)
    batch_iterator = ProcessGenerator(batch_generator())

    num_epochs = conf['training']['num_epochs']
    # Base learning rate, read the same way mpi_train does.
    lr = conf['model']['lr']
    lr_decay = conf['model']['lr_decay']
    print('{} epochs left to go'.format(num_epochs - 1 - e))
    num_so_far_accum = 0
    num_so_far = 0
    num_total = np.inf

    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs - 1:
        e += 1
        print('\nEpoch {}/{}'.format(e + 1, num_epochs))
        pbar = Progbar(len(shot_list_train))

        # decay learning rate each epoch:
        K.set_value(train_model.optimizer.lr, lr * lr_decay**(e))

        num_batches_minimum = 100
        num_batches_current = 0
        training_losses_tmp = []

        # Keep going until the requested number of examples has been seen
        # and at least num_batches_minimum batches were trained on.
        while (num_so_far < (e - e_start) * num_total
               or num_batches_current < num_batches_minimum):
            num_so_far_old = num_so_far
            try:
                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
                 num_total, is_warmup_period) = next(batch_iterator)
            except StopIteration:
                print("Resetting batch iterator.")
                num_so_far_accum = num_so_far
                batch_iterator = ProcessGenerator(batch_generator())
                (batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
                 num_total, is_warmup_period) = next(batch_iterator)
            if np.any(batches_to_reset):
                reset_states(train_model, batches_to_reset)
            if not is_warmup_period:
                num_so_far = num_so_far_accum + num_so_far_curr
                num_batches_current += 1

                loss = train_model.train_on_batch(batch_xs, batch_ys)
                training_losses_tmp.append(loss)
                pbar.add(num_so_far - num_so_far_old,
                         values=[("train loss", loss)])
                loader.verbose = False  # True during the first iteration
            else:
                # Warm-up batches are only run through predict(), not
                # trained on.
                _ = train_model.predict(
                    batch_xs, batch_size=conf['training']['batch_size'])

        e = e_start + 1.0 * num_so_far / num_total
        sys.stdout.flush()
        ave_loss = np.mean(training_losses_tmp)
        training_losses.append(ave_loss)
        specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['validation_frac'] > 0.0:
            print("prediction on GPU...")
            _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu(
                conf, shot_list_validate, loader)
            validation_losses.append(loss)
            validation_roc.append(roc_area)

            epoch_logs = {}
            epoch_logs['val_roc'] = roc_area
            epoch_logs['val_loss'] = loss
            epoch_logs['train_loss'] = ave_loss
            best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                                 best_so_far)
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                print("Not saving model weights")
                specific_builder.delete_model_weights(train_model,
                                                      int(round(e)))

            if conf['training']['ranking_difficulty_fac'] != 1.0:
                (_, _, _, roc_area_train,
                 loss_train) = make_predictions_and_evaluate_gpu(
                     conf, shot_list_train, loader)
                # Restart the batch iterator after the pass over the
                # training shots.
                batch_iterator.__exit__()
                batch_generator = partial(
                    loader.training_batch_generator_partial_reset,
                    shot_list=shot_list_train)
                batch_iterator = ProcessGenerator(batch_generator())
                num_so_far_accum = num_so_far

        print('=========Summary========')
        print('Training Loss Numpy: {:.3e}'.format(training_losses[-1]))
        if conf['training']['validation_frac'] > 0.0:
            print('Validation Loss: {:.3e}'.format(validation_losses[-1]))
            print('Validation ROC: {:.4f}'.format(validation_roc[-1]))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Train Loss: {:.3e}'.format(loss_train))
                print('Train ROC: {:.4f}'.format(roc_area_train))

    if conf['training']['validation_frac'] > 0.0:
        plot_losses(conf,
                    [training_losses, validation_losses, validation_roc],
                    specific_builder, name='training_validation_roc')
    batch_iterator.__exit__()
    print('...done')
def keras_fmin_fnct(self, space):
    """Objective function: train one model built from ``space``.

    Builds a model from the hyper-parameter sample, trains it for
    conf['training']['num_epochs'] epochs on the first sub-list of the
    shuffled training shots, evaluates it on the validation split after
    every epoch and reports the last validation loss.

    All configuration is read from ``self.conf`` (including the base
    learning rate, conf['model']['lr']), so the method does not depend on a
    module-level ``conf`` or ``lr``.

    Args:
        space: hyper-parameter sample forwarded to
            ``ModelBuilder.hyper_build_model``.

    Returns:
        dict with keys 'loss' (validation loss of the last epoch, or None if
        no epoch ran), 'status' (``STATUS_OK``) and 'model' (the trained
        model, or None if no epoch ran).
    """
    from plasma.models import builder

    conf = self.conf
    specific_builder = builder.ModelBuilder(conf)

    train_model = specific_builder.hyper_build_model(space, False)
    train_model.compile(optimizer=optimizer_class(),
                        loss=conf['data']['target'].loss)

    np.random.seed(1)
    validation_losses = []
    validation_roc = []
    training_losses = []
    shot_list_train, shot_list_validate = self.shot_list.split_direct(
        1.0 - conf['training']['validation_frac'], do_shuffle=True)

    from keras.utils.generic_utils import Progbar
    from keras import backend as K

    num_epochs = conf['training']['num_epochs']
    num_at_once = conf['training']['num_shots_at_once']
    lr = conf['model']['lr']
    lr_decay = conf['model']['lr_decay']

    resulting_dict = {'loss': None, 'status': STATUS_OK, 'model': None}

    e = -1
    while e < num_epochs - 1:
        e += 1
        pbar = Progbar(len(shot_list_train))

        shot_list_train.shuffle()
        # Only the first sub-list of the shuffled shots is used per epoch.
        shot_sublists = shot_list_train.sublists(num_at_once)[:1]
        training_losses_tmp = []

        # decay learning rate each epoch
        K.set_value(train_model.optimizer.lr, lr * lr_decay**(e))
        for shot_sublist in shot_sublists:
            X_list, y_list = self.loader.load_as_X_y_list(shot_sublist)
            for X, y in zip(X_list, y_list):
                history = builder.LossHistory()
                train_model.fit(
                    X, y,
                    batch_size=Loader.get_batch_size(
                        conf['training']['batch_size'],
                        prediction_mode=False),
                    epochs=1, shuffle=False, verbose=0,
                    validation_split=0.0, callbacks=[history])
                train_model.reset_states()
                train_loss = np.mean(history.losses)
                training_losses_tmp.append(train_loss)

                pbar.add(1.0 * len(shot_sublist) / len(X_list),
                         values=[("train loss", train_loss)])
                self.loader.verbose = False
        sys.stdout.flush()
        training_losses.append(np.mean(training_losses_tmp))
        specific_builder.save_model_weights(train_model, e)

        _, _, _, roc_area, loss = make_predictions_and_evaluate_gpu(
            conf, shot_list_validate, self.loader)
        print("Epoch: {}, loss: {}, validation_losses_size: {}".format(
            e, loss, len(validation_losses)))
        validation_losses.append(loss)
        validation_roc.append(roc_area)
        resulting_dict['loss'] = loss
        resulting_dict['model'] = train_model

    return resulting_dict
def mpi_train(conf, shot_list_train, shot_list_validate, loader,
              callbacks_list=None, shot_list_test=None):
    """Train the model with MPI data parallelism, evaluating every epoch.

    Every rank builds the model, resumes from the latest saved epoch and
    takes part in training and in the prediction passes. Rank 0 additionally
    drives the callbacks, saves/deletes per-epoch weights, feeds TensorBoard
    and appends the model summary to 'model_architecture.log'.

    Args:
        conf: configuration mapping; the 'training', 'model', 'callbacks',
            'paths' and 'data' sections are read, and conf['num_workers'] is
            written with the communicator size.
        shot_list_train: shots used to generate training batches.
        shot_list_validate: shots evaluated after every epoch.
        loader: data loader providing the batch generators.
        callbacks_list: forwarded to ``MPIModel.build_callbacks``.
        shot_list_test: optional shots that are additionally evaluated at
            conf['callbacks']['monitor_times'] when
            conf['callbacks']['monitor_test'] is set.

    Returns:
        None.
    """
    loader.set_inference_mode(False)

    # TODO(KGF): this is not defined in conf.yaml, but added to processed dict
    # for the first time here:
    conf['num_workers'] = g.comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    if g.tf_ver >= parse_version('1.14.0'):
        # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
        try:
            from tensorflow.python.util import module_wrapper as depr
        except ImportError:
            from tensorflow.python.util import deprecation_wrapper as depr
        # depr._PRINT_DEPRECATION_WARNINGS = False  # does nothing
        depr._PER_MODULE_WARNING_LIMIT = 0
        # Suppresses warnings from "keras/backend/tensorflow_backend.py"
        # except: "Rate should be set to `rate = 1 - keep_prob`"
        # Also suppresses warnings from "keras/optimizers.py
        # does NOT suppresses warn from "/tensorflow/python/ops/math_grad.py"
    else:
        # TODO(KGF): next line suppresses ALL info and warning messages,
        # not just deprecation warnings...
        tf.logging.set_verbosity(tf.logging.ERROR)
    # TODO(KGF): for TF>v1.13.0 (esp v1.14.0), this next line prompts a ton of
    # deprecation warnings with externally-packaged Keras, e.g.:
    # WARNING:tensorflow:From  .../keras/backend/tensorflow_backend.py:174:
    # The name tf.get_default_session is deprecated.
    # Please use tf.compat.v1.get_default_session instead.
    train_model = specific_builder.build_model(False)
    # Cannot fix these Keras internals via "import tensorflow.compat.v1 as tf"
    #
    # TODO(KGF): note, these are different than C-based info diagnostics e.g.:
    # 2019-11-06 18:27:31.698908: I ...  dynamic library libcublas.so.10
    # which are NOT suppressed by set_verbosity. See top level __init__.py

    # load the latest epoch we did. Returns 0 if none exist yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    # TODO(KGF): rename as "num_iter_minimum" or "min_steps_per_epoch"
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
            or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    g.print_unique('{} epoch(s) left to go'.format(num_epochs - e))

    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)

    g.print_unique("warmup steps = {}".format(warmup_steps))
    mpi_model = MPIModel(train_model, optimizer, g.comm, batch_generator,
                         batch_size, lr=lr, warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum, conf=conf)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)
    tensorboard = None
    if g.task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1, write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        # TODO(KGF): check addition of TF model summary write added from fork
        # Append the summary to the log file by temporarily pointing stdout
        # at it. The file is closed and stdout restored even if summary()
        # raises.
        with open('model_architecture.log', 'a') as fr:
            ori = sys.stdout
            sys.stdout = fr
            try:
                mpi_model.model.summary()
            finally:
                sys.stdout = ori
        mpi_model.model.summary()

    if g.task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()
    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs:
        g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
            e, num_epochs))
        if g.task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        mpi_model.set_lr(lr * lr_decay**e)

        # KGF: core work of loop performed in next line
        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs
        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
            e, num_epochs))

        # TODO(KGF): add diagnostic about "saving to epoch X"?
        loader.verbose = False  # True during the first iteration
        if g.task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        # With validation disabled, training stops after the first pass
        # through this loop.
        if conf['training']['no_validation']:
            break

        epoch_logs = {}
        g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
            e, num_epochs))
        # TODO(KGF): flush output/ MPI barrier?
        # g.flush_all_inorder()

        # TODO(KGF): is there a way to avoid Keras.Models.load_weights()
        # repeated calls throughout mpi_make_pred*() fn calls?
        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)

        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            # Rebuild the training batch iterator after the pass over the
            # training shots.
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        # Looked up once per epoch; used for the extra evaluations below and
        # for the weight-retention decision on rank 0.
        monitor_test = ('monitor_test' in conf['callbacks']
                        and conf['callbacks']['monitor_test'])
        if monitor_test:
            times = conf['callbacks']['monitor_times']
            areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                conf, shot_list_validate, loader, times)
            epoch_str = 'epoch {}, '.format(int(round(e)))
            g.write_unique(epoch_str + ' '.join([
                'val_roc_{} = {}'.format(t, roc)
                for t, roc in zip(times, areas)
            ]) + '\n')
            if shot_list_test is not None:
                areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                    conf, shot_list_test, loader, times)
                g.write_unique(epoch_str + ' '.join([
                    'test_roc_{} = {}'.format(t, roc)
                    for t, roc in zip(times, areas)
                ]) + '\n')

        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)
        stop_training = False
        g.flush_all_inorder()
        if g.task_index == 0:
            print('=========Summary======== for epoch {:.2f}'.format(e))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))
            print('======================== ')
            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                if monitor_test:
                    print("No improvement, saving model weights anyways")
                else:
                    print("Not saving model weights")
                    specific_builder.delete_model_weights(
                        train_model, int(round(e)))

            # tensorboard
            val_generator = partial(loader.training_batch_generator,
                                    shot_list=shot_list_validate)()
            val_steps = 1
            tensorboard.on_epoch_end(val_generator, val_steps,
                                     int(round(e)), epoch_logs)

        # Rank 0 decides on early stopping; all ranks must agree.
        stop_training = g.comm.bcast(stop_training, root=0)
        g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
            e, num_epochs))
        # TODO(KGF): compare to old diagnostic:
        # g.write_unique("end epoch {}".format(e_old))
        if stop_training:
            g.write_unique("Stopping training due to early stopping")
            break

    if g.task_index == 0:
        callbacks.on_train_end()
        tensorboard.on_train_end()

    mpi_model.close()
def mpi_make_predictions(conf, shot_list, loader, custom_path=None):
    """Run model predictions on ``shot_list``, split across MPI workers.

    Sub-lists of shots are assigned round-robin to the workers. After each
    full round (and after the last sub-list) the per-worker results are
    exchanged with ``allgather`` so every rank ends up with the complete
    result lists.

    Args:
        conf: configuration mapping; conf['model']['pred_batch_size'] is read.
        shot_list: shots to predict on; sorted in place.
        loader: data loader; switched to inference mode for the duration of
            the call and back to training mode before returning.
        custom_path: forwarded to ``ModelBuilder.load_model_weights``.

    Returns:
        Tuple ``(y_prime_global, y_gold_global, disruptive_global)`` of
        lists, each truncated to ``len(shot_list)`` entries.
    """
    loader.set_inference_mode(True)
    np.random.seed(g.task_index)
    # Sorting makes sure all replicas work on the same ordering.
    shot_list.sort()

    model_builder = builder.ModelBuilder(conf)
    model = model_builder.build_model(True)
    model_builder.load_model_weights(model, custom_path)

    # Broadcast rank 0's weights, then set them explicitly: fix for Py3.6
    # TODO(KGF): remove if we no longer support Py2
    if sys.version_info[0] > 2:
        root_weights = model.get_weights() if g.task_index == 0 else None
        model.set_weights(g.comm.bcast(root_weights, root=0))

    model.reset_states()
    if g.task_index == 0:
        # TODO(KGF): this appears to prepend a \n, resulting in:
        # [2] loading from epoch 7
        #
        # 128/862 [===>..........................] - ETA: 2:20
        pbar = Progbar(len(shot_list))
    shot_sublists = shot_list.sublists(conf['model']['pred_batch_size'],
                                       do_shuffle=False, equal_size=True)
    if g.task_index != 0:
        loader.verbose = False

    # Results accumulated by this worker since the last exchange.
    local_pred, local_gold, local_disr = [], [], []
    # Results gathered from all workers so far.
    gathered_pred, gathered_gold, gathered_disr = [], [], []

    for idx, shot_sublist in enumerate(shot_sublists):
        if idx % g.num_workers == g.task_index:
            X, y, shot_lengths, disr = loader.load_as_X_y_pred(shot_sublist)

            predicted = model.predict(
                X, batch_size=conf['model']['pred_batch_size'])
            model.reset_states()
            predicted = loader.batch_output_to_array(predicted)
            target = loader.batch_output_to_array(y)

            # Cut every array back to the length recorded for its shot.
            local_pred.extend(
                [arr[:shot_lengths[j]] for j, arr in enumerate(predicted)])
            local_gold.extend(
                [arr[:shot_lengths[j]] for j, arr in enumerate(target)])
            local_disr += disr

        round_complete = (idx + 1) % g.num_workers == 0
        is_last_sublist = idx + 1 == len(shot_sublists)
        if round_complete or is_last_sublist:
            g.comm.Barrier()
            gathered_pred += concatenate_sublists(
                g.comm.allgather(local_pred))
            gathered_gold += concatenate_sublists(
                g.comm.allgather(local_gold))
            gathered_disr += concatenate_sublists(
                g.comm.allgather(local_disr))
            g.comm.Barrier()
            local_pred, local_gold, local_disr = [], [], []

        if g.task_index == 0:
            pbar.add(1.0 * len(shot_sublist))

    # Keep at most one entry per shot in the input list.
    n_shots = len(shot_list)
    loader.set_inference_mode(False)
    return (gathered_pred[:n_shots], gathered_gold[:n_shots],
            gathered_disr[:n_shots])