def test_qm9():
    dataset = datasets.QM9(amount=100)
    dl = DisjointLoader(dataset, batch_size=batch_size)
    dl.__next__()
    bl = BatchLoader(dataset, batch_size=batch_size)
    bl.__next__()
def train_test_val_data(dataset, epochs=400, batch_size=1, path="../data/"):
    # dataset = CircuitDataset(path=path, transforms=transforms)
    # # Parameters
    # F = dataset.n_node_features  # Dimension of node features
    # n_out = dataset.n_labels  # Dimension of the target

    # Train/valid/test split
    idxs = np.random.permutation(len(dataset))
    split_va, split_te = int(0.6 * len(dataset)), int(0.8 * len(dataset))
    idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
    print(idx_tr, idx_va, idx_te)
    dataset_tr = dataset[idx_tr]
    dataset_va = dataset[idx_va]
    dataset_te = dataset[idx_te]

    loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs, node_level=True)
    loader_va = DisjointLoader(dataset_va, batch_size=batch_size, node_level=True)
    loader_te = DisjointLoader(dataset_te, batch_size=batch_size, node_level=True)
    return loader_tr, loader_va, loader_te
def test_qm7():
    dataset = datasets.QM7()
    dl = DisjointLoader(dataset, batch_size=batch_size)
    dl.__next__()
    bl = BatchLoader(dataset, batch_size=batch_size)
    bl.__next__()
def test_disjoint():
    data = TestDataset()
    loader = DisjointLoader(data, batch_size=batch_size, epochs=1, shuffle=False)
    batches = list(loader)

    (x, a, e, i), y = batches[-1]
    n = sum(ns[-graphs_in_batch:])
    assert x.shape == (n, f)
    assert a.shape == (n, n)
    assert len(e.shape) == 2 and e.shape[1] == s  # Avoid counting edges
    assert i.shape == (n,)
    assert y.shape == (graphs_in_batch, 2)
    assert loader.steps_per_epoch == np.ceil(len(data) / batch_size)
    signature = loader.tf_signature()
    assert len(signature[0]) == 4
def test_tud():
    # Edge labels + edge attributes
    dataset = datasets.TUDataset('BZR_MD', clean=False)
    dl = DisjointLoader(dataset, batch_size=batch_size)
    dl.__next__()
    bl = BatchLoader(dataset, batch_size=batch_size)
    bl.__next__()

    # Node labels + node attributes + clean version
    dataset = datasets.TUDataset('ENZYMES', clean=True)
    dl = DisjointLoader(dataset, batch_size=batch_size)
    dl.__next__()
    bl = BatchLoader(dataset, batch_size=batch_size)
    bl.__next__()
def test_disjoint_node():
    data = TestDatasetDsjNode()
    loader = DisjointLoader(
        data, node_level=True, batch_size=batch_size, epochs=1, shuffle=False
    )
    batches = [b for b in loader]

    (x, a, e, i), y = batches[-1]
    n = sum(ns[-graphs_in_batch:])
    assert x.shape == (n, f)
    assert a.shape == (n, n)
    assert len(e.shape) == 2 and e.shape[1] == s  # Avoid counting edges
    assert i.shape == (n,)
    assert y.shape == (n, 2)
    assert loader.steps_per_epoch == np.ceil(len(data) / batch_size)
def test_disjoint():
    data = TestDataset()
    loader = DisjointLoader(data, batch_size=batch_size, epochs=1, shuffle=False)
    batches = [b for b in loader]

    (x, a, e, i), y = batches[-1]
    n = sum(ns[-graphs_in_batch:])
    assert x.shape == (n, f)
    assert a.shape == (n, n)
    assert len(e.shape) == 2 and e.shape[1] == s  # Avoid counting edges
    assert i.shape == (n,)
    assert y.shape == (graphs_in_batch, 2)
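# --- Illustration (not part of the original tests) ---
# A minimal sketch of the kind of fixture the tests above assume: `ns`, `f`, `s`,
# `batch_size`, and `graphs_in_batch` are module-level globals, and `TestDataset`
# is a spektral.data.Dataset of small random graphs with those sizes. The concrete
# numbers and the fully connected adjacency below are assumptions chosen only to
# make the sketch self-contained and runnable.
import numpy as np
import scipy.sparse as sp
from spektral.data import Dataset, Graph

ns = [5, 4, 6, 3, 7, 5, 4, 6, 3, 5]        # number of nodes per graph
f, s = 4, 3                                 # node / edge feature dimensions
batch_size = 6
graphs_in_batch = len(ns) % batch_size or batch_size  # graphs in the last batch


class TestDataset(Dataset):
    def read(self):
        graphs = []
        for n in ns:
            a = sp.csr_matrix(np.ones((n, n)))   # fully connected graph
            x = np.random.rand(n, f)             # node features
            e = np.random.rand(a.nnz, s)         # one edge-feature row per edge
            y = np.random.rand(2)                # graph-level target of length 2
            graphs.append(Graph(x=x, a=a, e=e, y=y))
        return graphs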
def save_checkpoint(name, model):
    os.makedirs(f'{logdir}/{name}', exist_ok=True)
    loader = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=1)
    all_pred_types = []
    all_actual_types = []
    print('>>> saving checkpoint <<<')
    for batch in loader:
        nodes, adj, edges = batch[0]
        actions, targets, mask = forward(model, *batch, training=False)
        pred_types, actual_types = log_prediction(nodes, targets, actions, mask)
        print('pred_types:', pred_types)
        print('actual_types:', actual_types)
        all_pred_types.extend(pred_types)
        all_actual_types.extend(actual_types)

    unique, counts = np.unique(all_actual_types, return_counts=True)
    label_dist = dict(zip(unique, counts))

    # Confusion matrix
    import pandas as pd
    import seaborn as sn
    from matplotlib import pyplot as plt

    all_possible_types = [i + 1 for i in range(max(*all_actual_types, *all_pred_types))]
    actual_df = pd.Categorical(all_actual_types, categories=all_possible_types)
    predicted_df = pd.Categorical(all_pred_types, categories=[*all_possible_types, 'Totals'])
    cm = pd.crosstab(actual_df, predicted_df, rownames=['Actual'], colnames=['Predicted'])
    for idx in all_actual_types:
        if idx not in all_pred_types:
            cm[idx] = 0
    totals = [sum(row) for (_, row) in cm.iterrows()]
    cm['Totals'] = totals
    sorted_cols = sorted([c for c in cm.columns if type(c) is int])
    sorted_cols.append('Totals')
    cm = cm.reindex(sorted_cols, axis=1)
    sn.heatmap(cm, annot=True)
    plt.title(f'confusion matrix ({name})')
    plt.savefig(f'{logdir}/{name}/confusion_matrix.png')
    plt.clf()

    # Save the model(s)
    model.save(f'{logdir}/{name}/model')
# Best config
batch_size = 32
learning_rate = 0.01
epochs = 400

# Read data
data = TUDataset('PROTEINS')

# Train/test split
np.random.shuffle(data)
split = int(0.8 * len(data))
data_tr, data_te = data[:split], data[split:]

# Data loader
loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_te = DisjointLoader(data_te, batch_size=batch_size)

# Create model
model = GeneralGNN(data.n_labels, activation='softmax')
optimizer = Adam(learning_rate)
# Pass the Adam optimizer defined above; compiling with the string 'adam' would
# silently ignore the configured learning_rate.
model.compile(optimizer, 'categorical_crossentropy', metrics=['categorical_accuracy'])

# Evaluation function
def evaluate(loader):
    step = 0
    results = []
    for batch in loader:
        step += 1
        loss, acc = model.test_on_batch(*batch)
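# --- Sketch (assumption, not the original script): the excerpt above is cut off inside
# evaluate(). One plausible completion, plus the standard Keras-style training call that
# Spektral loaders support via load() / steps_per_epoch, is shown below; the batch-size
# weighting is an assumption. ---
def evaluate_sketch(loader):
    step = 0
    results = []
    for batch in loader:
        step += 1
        loss, acc = model.test_on_batch(*batch)
        results.append((loss, acc, len(batch[1])))  # track batch size for weighting
        if step == loader.steps_per_epoch:
            results = np.array(results)
            return np.average(results[:, :-1], 0, weights=results[:, -1])


model.fit(
    loader_tr.load(),
    steps_per_epoch=loader_tr.steps_per_epoch,
    epochs=epochs,
)
loss_te, acc_te = evaluate_sketch(loader_te)
print(f'Test loss: {loss_te:.4f} - Test accuracy: {acc_te:.4f}')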
def test_data(construct_dict):
    """
    Train a model given a construction dictionary
    """
    # Setup Log
    wandblog = construct_dict["wandblog"]
    if wandblog:
        import wandb
        run = wandb.init(project='datagen',
                         entity="chri862z",
                         group=construct_dict["group"],
                         config=construct_dict,
                         reinit=True,
                         settings=wandb.Settings(start_method="fork"))
        wandb.run.name = construct_dict['model_name'] + '_' + construct_dict[
            'experiment_name'] + '_' + str(wandb.run.id)

    import dev.datawhere as dl
    graph_data = dl.graph_data
    dataset_train = graph_data(**construct_dict['data_params'],
                               traintest='train',
                               i_train=construct_dict['data_params']['n_steps'] - 1)
    dataset_test = graph_data(**construct_dict['data_params'],
                              traintest='test',
                              i_test=construct_dict['data_params']['n_steps'] - 1)
    dataset_val = dataset_test
    batch_size = 512
    print('Loaded datasets')

    loader_train = DisjointLoader(dataset_train, epochs=1, batch_size=batch_size)
    loader_test = DisjointLoader(dataset_test, batch_size=batch_size, epochs=1)

    # Define training function
    @tf.function(input_signature=loader_train.tf_signature(),
                 experimental_relax_shapes=True)
    def train_step(inputs, targets):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            targets = tf.cast(targets, tf.float32)
            loss = loss_func(predictions, targets)
            loss += sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        return loss

    @tf.function(input_signature=loader_test.tf_signature(),
                 experimental_relax_shapes=True)
    def test_step(inputs, targets):
        predictions = model(inputs, training=False)
        targets = tf.cast(targets, tf.float32)
        out = loss_func(predictions, targets)
        return predictions, targets, out

    print("Data generated, everything looks good!")
    return 1
################################################################################
dataset = QM9(amount=1000)  # Set amount=None to train on whole dataset

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/test split
idxs = np.random.permutation(len(dataset))
split = int(0.9 * len(dataset))
idx_tr, idx_te = np.split(idxs, [split])
dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te]

loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs, node_level=False)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1, node_level=False)
# load() output: X, A, E, I

################################################################################
# BUILD MODEL
################################################################################
X_in = Input(shape=(F, ), name='X_in')
A_in = Input(shape=(None, ), sparse=True, name='A_in')
E_in = Input(shape=(S, ), name='E_in')
I_in = Input(shape=(), name='segment_ids_in', dtype=tf.int32)

X_1 = ECCConv(32, activation='relu')([X_in, A_in, E_in])
dataset = MyDataset(1000, transforms=NormalizeAdj())

# Parameters
F = dataset.n_node_features  # Dimension of node features
n_out = dataset.n_labels  # Dimension of the target

# Train/valid/test split
idxs = np.random.permutation(len(dataset))
split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_va = DisjointLoader(dataset_va, batch_size=batch_size)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size)

################################################################################
# BUILD (unnecessarily big) MODEL
################################################################################
X_in = Input(shape=(F, ), name="X_in")
A_in = Input(shape=(None, ), sparse=True)
I_in = Input(shape=(), name="segment_ids_in", dtype=tf.int32)

X_1 = GCSConv(32, activation="relu")([X_in, A_in])
X_1, A_1, I_1 = TopKPool(ratio=0.5)([X_1, A_in, I_in])
X_2 = GCSConv(32, activation="relu")([X_1, A_1])
X_2, A_2, I_2 = TopKPool(ratio=0.5)([X_2, A_1, I_1])
X_3 = GCSConv(32, activation="relu")([X_2, A_2])
def analyze_train(construct_dict):
    """
    Train a model given a construction dictionary
    """
    # Setup Log
    wandblog = construct_dict["wandblog"]
    if wandblog:
        import wandb
        run = wandb.init(project=construct_dict["experiment"],
                         entity="chri862z",
                         group=construct_dict["group"],
                         config=construct_dict,
                         reinit=True,
                         settings=wandb.Settings(start_method="fork"))
        wandb.run.name = construct_dict['model_name'] + '_' + construct_dict[
            'experiment_name'] + '_' + str(wandb.run.id)

    ################################################
    #   Load dataset                               #
    ################################################
    from dev.data_load import graph_data  # load dataset
    epochs = int(construct_dict['run_params']['epochs'])
    batch_size = int(construct_dict['run_params']['batch_size'])

    dataset = graph_data(**construct_dict['data_params'])
    idx_lists = dataset.index_lists
    # Split data
    dataset_train = dataset[idx_lists[0]]
    dataset_val = dataset[idx_lists[1]]
    dataset_test = dataset[idx_lists[2]]

    loader_train = DisjointLoader(dataset_train, epochs=epochs, batch_size=batch_size)
    loader_test = DisjointLoader(dataset_test, batch_size=batch_size, epochs=1)

    ################################################
    #   Setup other run params                     #
    ################################################
    early_stop = construct_dict['run_params']['early_stop']
    patience = construct_dict['run_params']['patience']
    val_epoch = construct_dict['run_params']['val_epoch']
    print('check')

    ################################################
    #   Setup model, loss, lr schedule and metrics #
    ################################################
    # Get model, metrics, lr_schedule and loss function
    model, model_path = setup_model(construct_dict)
    loss_func = get_loss_func(construct_dict['run_params']['loss_func'])
    metrics = get_metrics(construct_dict['run_params']['metrics'])
    performance_plot = get_performance(construct_dict['run_params']['performance_plot'])
    lr_schedule = get_lr_schedule(construct_dict)

    save_path = osp.join(model_path, wandb.run.name)
    if not osp.isdir(save_path):
        os.makedirs(save_path)
        print('New folder for saving run made')

    # Learning rate and optimizer
    learning_rate = next(lr_schedule)
    opt = Adam(learning_rate)

    ################################################
    #   Set up TF functions and validation step    #
    ################################################
    # Define training function
    @tf.function(input_signature=loader_train.tf_signature(),
                 experimental_relax_shapes=True)
    def train_step(inputs, targets):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            targets = tf.cast(targets, tf.float32)
            loss = loss_func(predictions, targets)
            loss += sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        return loss

    @tf.function(input_signature=loader_test.tf_signature(),
                 experimental_relax_shapes=True)
    def test_step(inputs, targets):
        predictions = model(inputs, training=False)
        targets = tf.cast(targets, tf.float32)
        out = loss_func(predictions, targets)
        return predictions, targets, out

    def validation(loader):
        loss = 0
        prediction_list, target_list = [], []
        for batch in loader:
            inputs, targets = batch
            predictions, targets, out = test_step(inputs, targets)
            loss += out
            prediction_list.append(predictions)
            target_list.append(targets)
        y_reco = tf.concat(prediction_list, axis=0)
        y_true = tf.concat(target_list, axis=0)
        y_true = tf.cast(y_true, tf.float32)
        loss, loss_from = loss_func(y_reco, y_true, re=True)
        energy, e_old, alpha, zeni, azi = metrics(y_reco, y_true)
        return loss, loss_from, [energy, e_old, alpha, zeni, azi]

    @tf.function(experimental_relax_shapes=True)
    def gradient_importance(inputs, targets, j):
        with tf.GradientTape() as tape:
            tape.watch(inputs[0])
            # needs to be under the gradient tape to be tracked
            predictions = model(inputs, training=False)[:, j]
        grads = tape.gradient(predictions, inputs[0])
        grads = tf.where(tf.math.is_nan(grads), tf.zeros_like(grads), grads)
        grads = tf.math.segment_mean(tf.math.abs(grads), inputs[2], name=None)
        return grads

    ################################################
    #   Train Model                                #
    ################################################
    options = tf.profiler.experimental.ProfilerOptions(host_tracer_level=3,
                                                       python_tracer_level=1,
                                                       device_tracer_level=1)
    log_dir = 'tmp/board/' + wandb.run.name
    tf.profiler.experimental.start(log_dir, options=options)

    tot_time = 0
    current_batch = 0
    current_epoch = 1
    loss = 0
    lowest_loss = np.inf
    early_stop = 1
    early_stop_counter = 0
    pbar = tqdm(total=loader_train.steps_per_epoch, position=0, leave=True)
    start_time = time.time()
    summarylist = []
    for batch in loader_train:
        inputs, targets = batch
        out = train_step(inputs, targets)
        loss += out
        if current_epoch == 1 and current_batch == 0:
            model.summary()
            if wandblog:
                summary = model.summary(print_fn=summarylist.append)
                table = wandb.Table(columns=["Layers"])
                for s in summarylist:
                    table.add_data(s)
                wandb.log({'Model summary': table})
        current_batch += 1
        pbar.update(1)
        pbar.set_description(
            f"Epoch {current_epoch} / {epochs}; Avg_loss: {loss / current_batch:.6f}")

        if current_batch == loader_train.steps_per_epoch:
            t = time.time() - start_time
            tot_time += t
            print(
                f"Epoch {current_epoch} of {epochs} done in {t:.2f} seconds using learning rate: {learning_rate:.2E}"
            )
            print(f"Avg loss of train: {loss / loader_train.steps_per_epoch:.6f}")

            loader_val = DisjointLoader(dataset_val, epochs=1, batch_size=batch_size)
            val_loss, val_loss_from, val_metric = validation(loader_val)

            ##################
            ## TensorBoard ###
            ##################
            # tb_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir,
            #                                                      histogram_freq=1,
            #                                                      profile_batch='500,520')
            # consider starting your looping after a few steps of training, so that the
            # profiling does not include initialization overhead
            # -------------------------#
            # tb_callback.set_model(model)

            if wandblog:
                wandb.log({
                    "Train Loss": loss / loader_train.steps_per_epoch,
                    "Validation Loss": val_loss,
                    "w(log(E))": val_metric[1],
                    "Energy bias": val_metric[0][1],
                    "Energy sig-1": val_metric[0][0],
                    "Energy sig+1": val_metric[0][2],
                    "Solid angle 68th": val_metric[2][3],
                    "Angle bias": val_metric[2][1],
                    "Angle sig-1": val_metric[2][0],
                    "Angle sig+1": val_metric[2][2],
                    "zenith 68th": val_metric[3][3],
                    "zenith bias": val_metric[3][1],
                    "zenith sig-1": val_metric[3][0],
                    "zenith sig+1": val_metric[3][2],
                    "azimuth 68th": val_metric[4][3],
                    "azimuth bias": val_metric[4][1],
                    "azimuth sig-1": val_metric[4][0],
                    "azimuth sig+1": val_metric[4][2],
                    "Learning rate": learning_rate
                })

            # Gradient tracker; could probably be done in a less clunky way
            grad_dict = {
                'energy':   {'dom_x': 1, 'dom_y': 1, 'dom_z': 1, 'time': 1, 'logcharge': 1, 'SRT': 1},
                'zenith':   {'dom_x': 1, 'dom_y': 1, 'dom_z': 1, 'time': 1, 'logcharge': 1, 'SRT': 1},
                'azimuth':  {'dom_x': 1, 'dom_y': 1, 'dom_z': 1, 'time': 1, 'logcharge': 1, 'SRT': 1},
                'sig_zeni': {'dom_x': 1, 'dom_y': 1, 'dom_z': 1, 'time': 1, 'logcharge': 1, 'SRT': 1},
                'sig_azi':  {'dom_x': 1, 'dom_y': 1, 'dom_z': 1, 'time': 1, 'logcharge': 1, 'SRT': 1}
            }
            keys = list(grad_dict.keys())
            feats = list(grad_dict[keys[0]].keys())
            for j in range(len(keys)):
                grads = gradient_importance(inputs, targets, j)
                grads_av = tf.reduce_mean(grads, axis=0)
                grads_av = grads_av / tf.reduce_sum(grads_av)  # softmax
                for i, feat in enumerate(feats):
                    grad_dict[keys[j]][feat] = grads_av[i]
            if wandblog:
                wandb.log(grad_dict)

            print("\n")
            if not construct_dict['run_params']['zeniazi_metric']:
                print(f"Avg loss of validation: {val_loss:.6f}")
                print(
                    f"Loss from: Energy: {val_loss_from[0]:.6f} \t Angle: {val_loss_from[1]:.6f} "
                )
                print(
                    f"Energy: bias = {val_metric[0][1]:.6f} sig_range = {val_metric[0][0]:.6f}<->{val_metric[0][2]:.6f}, old metric {val_metric[1]:.6f}\
                    \n Angle: bias = {val_metric[2][1]:.6f} sig_range = {val_metric[2][0]:.6f}<->{val_metric[2][2]:.6f}, old metric {val_metric[2][3]:.6f}"
                )
            else:
                print(f"Avg loss of validation: {val_loss:.6f}")
                print(
                    f"Loss from: Energy: {val_loss_from[0]:.6f} \t Angle: {val_loss_from[1]:.6f} "
                )
                print(
                    f"Energy: bias = {val_metric[0][1]:.6f} sig_range = {val_metric[0][0]:.6f}<->{val_metric[0][2]:.6f}, old metric {val_metric[1]:.6f}\
                    \n Angle: bias = {val_metric[2][1]:.6f} sig_range = {val_metric[2][0]:.6f}<->{val_metric[2][2]:.6f}, old metric {val_metric[2][3]:.6f}\
                    \n Zenith: bias = {val_metric[3][1]:.6f} sig_range = {val_metric[3][0]:.6f}<->{val_metric[3][2]:.6f}, old metric {val_metric[3][3]:.6f}\
                    \n Azimuth: bias = {val_metric[4][1]:.6f} sig_range = {val_metric[4][0]:.6f}<->{val_metric[4][2]:.6f}, old metric {val_metric[4][3]:.6f}"
                )

            if val_loss < lowest_loss:
                early_stop_counter = 0
                lowest_loss = val_loss
            else:
                early_stop_counter += 1
            print(
                f'Early stop counter: {early_stop_counter}/{patience}, lowest val loss was {lowest_loss:.6f}'
            )
            if early_stop and (early_stop_counter >= patience):
                model.save(save_path)
                print(f"Stopped training. No improvement was seen in {patience} epochs")
                return current_epoch

            if current_epoch != epochs:
                pbar = tqdm(total=loader_train.steps_per_epoch, position=0, leave=True)
            learning_rate = next(lr_schedule)
            opt.learning_rate.assign(learning_rate)
            time_avg = tot_time / current_epoch
            if current_epoch % val_epoch == 0:
                model.save(save_path)
                print("Model saved")
                if wandblog:
                    loader_test = DisjointLoader(dataset_test,
                                                 batch_size=batch_size,
                                                 epochs=1)
                    fig, _ = performance_plot(loader_test,
                                              test_step,
                                              metrics,
                                              save=True,
                                              save_path=save_path)
                    title = "performanceplot_" + str(current_epoch)
                    wandb.log({title: [wandb.Image(fig, caption=title)]})

            loss = 0
            start_time = time.time()
            current_epoch += 1
            current_batch = 0

    tf.profiler.experimental.stop()
    return current_epoch
    run.finish()  # NOTE: unreachable after the return above
model = tf.keras.models.load_model(
    f'../from_config/trained_models/IceCube/{args.run}')
model.compile()
batch_size = 512

# just give the same database as you would normally run it on
dataset = graph_data(n_data=100000, skip=0, restart=0, transform=True,
                     transform_path='../db_files/muongun/transformers.pkl',
                     db_path='../db_files/muongun/rasmus_classification_muon_3neutrino_3mio.db')
# ../../../../pcs557/databases/dev_lvl7_mu_nu_e_classification_v003----IC8611_oscNext_003_final/data/meta/transformers.pkl
# ../../../../pcs557/databases/dev_lvl7_mu_nu_e_classification_v003---IC8611_oscNext_003_final/data/IC8611_oscNext_003_final.db

## get out relevant stuff
train, val, test = dataset.index_lists
dataset_test = dataset[test]
loader = DisjointLoader(dataset_test, batch_size=batch_size, epochs=1)
df_event = dataset.df_event

### define func
@tf.function(input_signature=loader.tf_signature(),
             experimental_relax_shapes=True)
def test_step(inputs, targets):
    predictions = model(inputs, training=False)
    targets = tf.cast(targets, tf.float32)
    return predictions, targets

## def predict func
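# --- Sketch (assumption, not the original code): the excerpt stops at "## def predict func".
# One plausible shape for that function, mirroring the validation loops used elsewhere in
# this file, is: ---
def predict(loader):
    prediction_list, target_list = [], []
    for batch in loader:
        inputs, targets = batch
        predictions, targets = test_step(inputs, targets)
        prediction_list.append(predictions)
        target_list.append(targets)
    # Concatenate per-batch results into flat arrays over the whole test set
    y_reco = tf.concat(prediction_list, axis=0).numpy()
    y_true = tf.concat(target_list, axis=0).numpy()
    return y_reco, y_true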
def train_model(construct_dict):
    """
    Train a model given a construction dictionary
    """
    # Setup Log
    wandblog = construct_dict["wandblog"]
    if wandblog:
        import wandb
        run = wandb.init(project=construct_dict["experiment"],
                         entity="chri862z",
                         group=construct_dict["group"],
                         config=construct_dict)
        wandb.run.name = construct_dict['model_name'] + wandb.run.id

    ################################################
    #   Load dataset                               #
    ################################################
    from dev.data_load import graph_data  # load dataset
    epochs = construct_dict['run_params']['epochs']
    batch_size = construct_dict['run_params']['batch_size']

    dataset = graph_data(**construct_dict['data_params'])
    idx_lists = dataset.index_lists
    # Split data
    dataset_train = dataset[idx_lists[0]]
    dataset_val = dataset[idx_lists[1]]
    dataset_test = dataset[idx_lists[2]]

    loader_train = DisjointLoader(dataset_train, epochs=epochs, batch_size=batch_size)
    loader_test = DisjointLoader(dataset_test, batch_size=batch_size, epochs=1)

    ################################################
    #   Setup other run params                     #
    ################################################
    early_stop = construct_dict['run_params']['early_stop']
    patience = construct_dict['run_params']['patience']
    val_epoch = construct_dict['run_params']['val_epoch']
    print('check')

    ################################################
    #   Setup model, loss, lr schedule and metrics #
    ################################################
    # Get model, metrics, lr_schedule and loss function
    model, model_path = setup_model(construct_dict)
    loss_func = get_loss_func(construct_dict['run_params']['loss_func'])
    metrics = get_metrics(construct_dict['run_params']['metrics'])
    performance_plot = get_performance(construct_dict['run_params']['performance_plot'])
    lr_schedule = get_lr_schedule(construct_dict)

    save_path = model_path + wandb.run.name
    if not osp.isdir(save_path):
        os.makedirs(save_path)
        print('New folder for saving run made')

    # Learning rate and optimizer
    learning_rate = next(lr_schedule)
    opt = Adam(learning_rate)

    ################################################
    #   Set up TF functions and validation step    #
    ################################################
    # Define training function
    @tf.function(input_signature=loader_train.tf_signature(),
                 experimental_relax_shapes=True)
    def train_step(inputs, targets):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            targets = tf.cast(targets, tf.float32)
            loss = loss_func(predictions, targets)
            loss += sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        return loss

    @tf.function(input_signature=loader_test.tf_signature(),
                 experimental_relax_shapes=True)
    def test_step(inputs, targets):
        predictions = model(inputs, training=False)
        targets = tf.cast(targets, tf.float32)
        out = loss_func(predictions, targets)
        return predictions, targets, out

    def validation(loader):
        loss = 0
        prediction_list, target_list = [], []
        for batch in loader:
            inputs, targets = batch
            predictions, targets, out = test_step(inputs, targets)
            loss += out
            prediction_list.append(predictions)
            target_list.append(targets)
        y_reco = tf.concat(prediction_list, axis=0)
        y_true = tf.concat(target_list, axis=0)
        y_true = tf.cast(y_true, tf.float32)
        loss, loss_from = loss_func(y_reco, y_true, re=True)
        energy, angle, old = metrics(y_reco, y_true)
        return loss, loss_from, [energy, angle, old]

    ################################################
    #   Train Model                                #
    ################################################
    tot_time = 0
    current_batch = 0
    current_epoch = 1
    loss = 0
    lowest_loss = np.inf
    early_stop = 1
    early_stop_counter = 0
    pbar = tqdm(total=loader_train.steps_per_epoch, position=0, leave=True)
    start_time = time.time()
    summarylist = []
    for batch in loader_train:
        inputs, targets = batch
        out = train_step(inputs, targets)
        loss += out
        if current_epoch == 1 and current_batch == 0:
            model.summary()
            if wandblog:
                summary = model.summary(print_fn=summarylist.append)
                table = wandb.Table(columns=["Layers"])
                for s in summarylist:
                    table.add_data(s)
                wandb.log({'Model summary': table})
        current_batch += 1
        pbar.update(1)
        pbar.set_description(
            f"Epoch {current_epoch} / {epochs}; Avg_loss: {loss / current_batch:.6f}")

        if current_batch == loader_train.steps_per_epoch:
            t = time.time() - start_time
            tot_time += t
            print(
                f"Epoch {current_epoch} of {epochs} done in {t:.2f} seconds using learning rate: {learning_rate:.2E}"
            )
            print(f"Avg loss of train: {loss / loader_train.steps_per_epoch:.6f}")

            loader_val = DisjointLoader(dataset_val, epochs=1, batch_size=batch_size)
            val_loss, val_loss_from, val_metric = validation(loader_val)
            if wandblog:
                wandb.log({
                    "Train Loss": loss / loader_train.steps_per_epoch,
                    "Validation Loss": val_loss,
                    "Energy metric": val_metric[2][0],
                    "Energy bias": val_metric[0][1],
                    "Energy sig-1": val_metric[0][0],
                    "Energy sig+1": val_metric[0][2],
                    "Angle metric": val_metric[2][1],
                    "Angle bias": val_metric[1][1],
                    "Angle sig-1": val_metric[1][0],
                    "Angle sig+1": val_metric[1][2],
                    "Learning rate": learning_rate
                })

            print(f"Avg loss of validation: {val_loss:.6f}")
            print(
                f"Loss from: Energy: {val_loss_from[0]:.6f} \t Angle: {val_loss_from[1]:.6f} "
            )
            print(
                f"Energy: bias = {val_metric[0][1]:.6f} sig_range = {val_metric[0][0]:.6f}<->{val_metric[0][2]:.6f}, old metric {val_metric[2][0]:.6f}\
                \n Angle: bias = {val_metric[1][1]:.6f} sig_range = {val_metric[1][0]:.6f}<->{val_metric[1][2]:.6f}, old metric {val_metric[2][1]:.6f}"
            )

            if val_loss < lowest_loss:
                early_stop_counter = 0
                lowest_loss = val_loss
            else:
                early_stop_counter += 1
            print(
                f'Early stop counter: {early_stop_counter}/{patience}, lowest val loss was {lowest_loss:.6f}'
            )
            if early_stop and (early_stop_counter >= patience):
                model.save(save_path)
                print(f"Stopped training. No improvement was seen in {patience} epochs")
                return current_epoch

            if current_epoch != epochs:
                pbar = tqdm(total=loader_train.steps_per_epoch, position=0, leave=True)
            learning_rate = next(lr_schedule)
            opt.learning_rate.assign(learning_rate)
            time_avg = tot_time / current_epoch
            if current_epoch % val_epoch == 0:
                model.save(save_path)
                print("Model saved")
                if wandblog:
                    loader_test = DisjointLoader(dataset_test,
                                                 batch_size=batch_size,
                                                 epochs=1)
                    fig, ax = performance_plot(loader_test,
                                               test_step,
                                               metrics,
                                               save=True,
                                               save_path=save_path)
                    title = "performanceplot_" + str(current_epoch)
                    wandb.log({title: [wandb.Image(fig, caption=title)]})

            loss = 0
            start_time = time.time()
            current_epoch += 1
            current_batch = 0

    return current_epoch
    run.finish()  # NOTE: unreachable after the return above
def train_model(construct_dict):
    """
    Train a model given a construction dictionary
    """
    # Setup Log
    wandblog = construct_dict["wandblog"]
    if wandblog:
        print('Logging to wandb')
        import wandb
        run = wandb.init(project=construct_dict["experiment"],
                         entity="chri862z",
                         group=construct_dict["group"],
                         config=construct_dict,
                         reinit=True,
                         settings=wandb.Settings(start_method="fork"))
        wandb.run.name = construct_dict['model_name'] + '_' + construct_dict[
            'experiment_name'] + '_' + str(wandb.run.id)

    ################################################
    #   Load dataset                               #
    ################################################
    # import dev.submit_traindata as dl
    # # reload(dl)
    # dataset_train = dl.graph_data(**construct_dict['data_params'])
    import dev.datawhere as dl
    graph_data = dl.graph_data
    dataset_test = graph_data(**construct_dict['data_params'], traintest='test')
    graph_data.traintest = 'train'

    epochs = int(construct_dict['run_params']['epochs'])
    batch_size = int(construct_dict['run_params']['batch_size'])
    print('Loaded datasets')

    loader_test = DisjointLoader(dataset_test, batch_size=batch_size, epochs=1)
    dataset_val = dataset_test

    ################################################
    #   Setup other run params                     #
    ################################################
    early_stop = construct_dict['run_params']['early_stop']
    patience = construct_dict['run_params']['patience']
    val_epoch = construct_dict['run_params']['val_epoch']
    print('check')

    ################################################
    #   Setup model, loss, lr schedule and metrics #
    ################################################
    # Get model, metrics, lr_schedule and loss function
    if construct_dict['run_params']['retrain_model'] == False:
        model, model_path = setup_model(construct_dict)
    else:
        model_path = osp.join(cwd, "trained_models/IceCube_neutrino",
                              construct_dict['run_params']['retrain_model'])
        model = tf.keras.models.load_model(model_path)
        model.compile()
    loss_func = get_loss_func(construct_dict['run_params']['loss_func'])
    metrics = get_metrics(construct_dict['run_params']['metrics'])
    performance_plot = get_performance(construct_dict['run_params']['performance_plot'])
    lr_schedule = get_lr_schedule(construct_dict)

    save_path = osp.join(model_path, wandb.run.name)
    if not osp.isdir(save_path):
        os.makedirs(save_path)
        print('New folder for saving run made')

    # Learning rate and optimizer
    learning_rate = next(lr_schedule)
    opt = Adam(learning_rate)

    ################################################
    #   Set up TF functions and validation step    #
    ################################################
    # Define training function
    @tf.function(input_signature=loader_test.tf_signature(),
                 experimental_relax_shapes=True)
    def train_step(inputs, targets):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            targets = tf.cast(targets, tf.float32)
            loss = loss_func(predictions, targets)
            loss += sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        return loss

    @tf.function(input_signature=loader_test.tf_signature(),
                 experimental_relax_shapes=True)
    def test_step(inputs, targets):
        predictions = model(inputs, training=False)
        targets = tf.cast(targets, tf.float32)
        out = loss_func(predictions, targets)
        return predictions, targets, out

    def validation(loader):
        loss = 0
        prediction_list, target_list = [], []
        for batch in loader:
            inputs, targets = batch
            predictions, targets, out = test_step(inputs, targets)
            loss += out
            prediction_list.append(predictions)
            target_list.append(targets)
        y_reco = tf.concat(prediction_list, axis=0)
        y_true = tf.concat(target_list, axis=0)
        y_true = tf.cast(y_true, tf.float32)
        loss, loss_from = loss_func(y_reco, y_true, re=True)
        energy, e_old, alpha, zeni, azi = metrics(y_reco, y_true)
        return loss, loss_from, [energy, e_old, alpha, zeni, azi]

    ################################################
    #   Train Model                                #
    ################################################
    n_steps = construct_dict['data_params']['n_steps']
    dataset_train = graph_data(**construct_dict['data_params'], traintest='train')
    loader_train = DisjointLoader(dataset_train, epochs=1, batch_size=batch_size)
    steps_per_epoch = loader_train.steps_per_epoch

    tot_time = 0
    current_batch = 0
    current_epoch = 1
    loss = 0
    lowest_loss = np.inf
    early_stop = 1
    early_stop_counter = 0
    pbar = tqdm(total=steps_per_epoch * n_steps, position=0, leave=True)
    start_time = time.time()
    summarylist = []
    for j in range(epochs):
        for i in range(n_steps):
            dataset_train = graph_data(**construct_dict['data_params'],
                                       traintest='train',
                                       i_train=i)
            loader_train = DisjointLoader(dataset_train, epochs=1, batch_size=batch_size)
            for batch in loader_train:
                inputs, targets = batch
                out = train_step(inputs, targets)
                loss += out
                if current_epoch == 1 and current_batch == 0:
                    model.summary()
                    if wandblog:
                        summary = model.summary(print_fn=summarylist.append)
                        table = wandb.Table(columns=["Layers"])
                        for s in summarylist:
                            table.add_data(s)
                        wandb.log({'Model summary': table})
                current_batch += 1
                pbar.update(1)
                pbar.set_description(
                    f"Epoch {current_epoch} / {epochs}; Avg_loss: {loss / current_batch:.6f}")

                if current_batch == steps_per_epoch * n_steps:
                    # if current_batch == :
                    t = time.time() - start_time
                    tot_time += t
                    print(
                        f"Epoch {current_epoch} of {epochs} done in {t:.2f} seconds using learning rate: {learning_rate:.2E}"
                    )
                    print(f"Avg loss of train: {loss / (steps_per_epoch * n_steps):.6f}")

                    loader_val = DisjointLoader(dataset_val, epochs=1, batch_size=batch_size)
                    val_loss, val_loss_from, val_metric = validation(loader_val)
                    if wandblog:
                        wandb.log({
                            "Train Loss": loss / (steps_per_epoch * n_steps),
                            "Validation Loss": val_loss,
                            "w(log(E))": val_metric[1],
                            "Energy bias": val_metric[0][1],
                            "Energy sig-1": val_metric[0][0],
                            "Energy sig+1": val_metric[0][2],
                            "Solid angle 68th": val_metric[2][3],
                            "Angle bias": val_metric[2][1],
                            "Angle sig-1": val_metric[2][0],
                            "Angle sig+1": val_metric[2][2],
                            "zenith 68th": val_metric[3][3],
                            "zenith bias": val_metric[3][1],
                            "zenith sig-1": val_metric[3][0],
                            "zenith sig+1": val_metric[3][2],
                            "azimuth 68th": val_metric[4][3],
                            "azimuth bias": val_metric[4][1],
                            "azimuth sig-1": val_metric[4][0],
                            "azimuth sig+1": val_metric[4][2],
                            "Learning rate": learning_rate
                        })

                    print("\n")
                    print(f"Avg loss of validation: {val_loss:.6f}")
                    print(
                        f"Loss from: Energy: {val_loss_from[0]:.6f} \t Zenith: {val_loss_from[1]:.6f} \t Azimuth {val_loss_from[2]:.6f}"
                    )
                    print(
                        f"Energy: bias = {val_metric[0][1]:.6f} sig_range = {val_metric[0][0]:.6f}<->{val_metric[0][2]:.6f}, old metric {val_metric[1]:.6f}\
                        \n Angle: bias = {val_metric[2][1]:.6f} sig_range = {val_metric[2][0]:.6f}<->{val_metric[2][2]:.6f}, old metric {val_metric[2][3]:.6f}\
                        \n Zenith: bias = {val_metric[3][1]:.6f} sig_range = {val_metric[3][0]:.6f}<->{val_metric[3][2]:.6f}, old metric {val_metric[3][3]:.6f}\
                        \n Azimuth: bias = {val_metric[4][1]:.6f} sig_range = {val_metric[4][0]:.6f}<->{val_metric[4][2]:.6f}, old metric {val_metric[4][3]:.6f}"
                    )

                    if val_loss < lowest_loss:
                        early_stop_counter = 0
                        lowest_loss = val_loss
                    else:
                        early_stop_counter += 1
                    print(
                        f'Early stop counter: {early_stop_counter}/{patience}, lowest val loss was {lowest_loss:.6f}'
                    )
                    if early_stop and (early_stop_counter >= patience):
                        model.save(save_path)
                        print(f"Stopped training. No improvement was seen in {patience} epochs")
                        return current_epoch

                    if current_epoch != epochs:
                        pbar = tqdm(total=steps_per_epoch * n_steps, position=0, leave=True)
                    learning_rate = next(lr_schedule)
                    opt.learning_rate.assign(learning_rate)
                    time_avg = tot_time / current_epoch
                    if current_epoch % val_epoch == 0:
                        model.save(save_path)
                        print("Model saved")
                        if wandblog:
                            loader_test = DisjointLoader(dataset_test,
                                                         batch_size=batch_size,
                                                         epochs=1)
                            fig, _ = performance_plot(loader_test,
                                                      test_step,
                                                      metrics,
                                                      bins=20,
                                                      save=True,
                                                      save_path=save_path)
                            title = "performanceplot_" + str(current_epoch)
                            wandb.log({title: [wandb.Image(fig, caption=title)]})

                    loss = 0
                    start_time = time.time()
                    current_epoch += 1
                    current_batch = 0

    return current_epoch
    run.finish()  # NOTE: unreachable after the return above
def pipeline():
    featurearr, simarr, labelarr = load_data()
    xarr, yarr, aarr, edge_attrarr = graphdatageneration(featurearr, simarr, labelarr)
    dataset = MyDataset(xarr, yarr, aarr, edge_attrarr)
    np.random.seed(10)

    # Train/test split
    idxs = np.random.permutation(len(dataset))
    split = int(0.8 * len(dataset))
    idx_tr, idx_te = np.split(idxs, [split])
    dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te]
    loader_tr = DisjointLoader(dataset_tr, batch_size=32, epochs=30, shuffle=True)
    loader_te = DisjointLoader(dataset_te, batch_size=32, epochs=1, shuffle=True)

    model = buildmodel(dataset)
    # Use the non-deprecated keyword (`lr` is deprecated in TF2 optimizers)
    opt = optimizers.Adam(learning_rate=learning_rate)
    loss_fn = losses.MeanSquaredError()

    @tf.function(input_signature=loader_tr.tf_signature(),
                 experimental_relax_shapes=True)
    def train_step(inputs, target):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)
            loss = loss_fn(target, predictions)
            mae = losses.MeanAbsoluteError()(target, predictions)
            mape = losses.MeanAbsolutePercentageError()(target, predictions)
            loss += sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        return loss, mae, mape

    print("training")
    current_batch = 0
    model_loss = 0
    total_mape = 0
    total_mae = 0
    for batch in loader_tr:
        outs, mae, mape = train_step(*batch)
        model_loss += outs
        total_mae += mae
        total_mape += mape
        current_batch += 1
        if current_batch == loader_tr.steps_per_epoch:
            print("MSE: {}".format(model_loss / loader_tr.steps_per_epoch),
                  "MAE: {}".format(total_mae / loader_tr.steps_per_epoch),
                  "MAPE: {}".format(total_mape / loader_tr.steps_per_epoch))
            model_loss = 0
            total_mae = 0
            total_mape = 0
            current_batch = 0

    print("testing")
    model_loss = 0
    model_mae = 0
    model_mape = 0
    for batch in loader_te:
        inputs, target = batch
        predictions = model(inputs, training=False)
        model_loss += loss_fn(target, predictions)
        model_mae += losses.MeanAbsoluteError()(target, predictions)
        model_mape += losses.MeanAbsolutePercentageError()(target, predictions)
    model_loss /= loader_te.steps_per_epoch
    model_mae /= loader_te.steps_per_epoch
    model_mape /= loader_te.steps_per_epoch
    print("Done. Test MSE: {}".format(model_loss),
          "Test MAE: {}".format(model_mae),
          "Test MAPE: {}".format(model_mape))

    model.save('/home/som/lab/seed-yzj/newpaper4/laboratory/model/fusion.hdf5')
# Load data
dataset = datasets.omitted_with_actions(exp_config.files, shuffle=False)
#dataset = dataset[0:2]
#np.set_printoptions(threshold=100000)

# Train/valid/test split
idxs = np.random.permutation(len(dataset))
split_va, split_te = int(0.8 * len(dataset)), int(0.9 * len(dataset))
idx_tr, idx_va, idx_te = np.split(idxs, [split_va, split_te])
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]
print('dataset size:', len(dataset))

dataset_tr = dataset  # FIXME: Using "entire" dataset for now
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_va = DisjointLoader(dataset_va, batch_size=batch_size)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size)

# Parameters
channels = 8  # Number of channels in each head of the first GAT layer
n_attn_heads = 8  # Number of attention heads in first GAT layer
F = dataset.n_node_features
dropout = 0.6  # Dropout rate for the features and adjacency matrix
dropout = 0.  # FIXME: remove
l2_reg = 5e-6  # L2 regularization rate
learning_rate = exp_config.lr
epochs = exp_config.epochs
es_patience = 100  # Patience for early stopping

# Model definition
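# --- Sketch (assumption, not the original model): the excerpt stops at "# Model definition".
# One plausible disjoint-mode GAT built from the parameters above; the layer count,
# pooling layer, and output activation are guesses, and `dataset.n_labels` is assumed
# to give the output size. ---
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from spektral.layers import GATConv, GlobalSumPool

X_in = Input(shape=(F,))
A_in = Input(shape=(None,), sparse=True)
I_in = Input(shape=(), dtype=tf.int64)

x = Dropout(dropout)(X_in)
x = GATConv(channels,
            attn_heads=n_attn_heads,
            concat_heads=True,
            dropout_rate=dropout,
            activation='elu',
            kernel_regularizer=l2(l2_reg),
            attn_kernel_regularizer=l2(l2_reg))([x, A_in])
x = GlobalSumPool()([x, I_in])          # pool node embeddings per graph (disjoint mode)
output = Dense(dataset.n_labels, activation='softmax')(x)

model = Model(inputs=[X_in, A_in, I_in], outputs=output)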
ogb_dataset = GraphPropPredDataset(name=dataset_name)
dataset = OGB(ogb_dataset)

# Parameters
F = dataset.n_node_features  # Dimension of node features
S = dataset.n_edge_features  # Dimension of edge features
n_out = dataset.n_labels  # Dimension of the target

# Train/test split
idx = ogb_dataset.get_idx_split()
idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"]
dataset_tr = dataset[idx_tr]
dataset_va = dataset[idx_va]
dataset_te = dataset[idx_te]

loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1)

################################################################################
# Build model
################################################################################
X_in = Input(shape=(F,))
A_in = Input(shape=(None,), sparse=True)
E_in = Input(shape=(S,))
I_in = Input(shape=(), dtype=tf.int64)

X_1 = ECCConv(32, activation="relu")([X_in, A_in, E_in])
X_2 = ECCConv(32, activation="relu")([X_1, A_in, E_in])
X_3 = GlobalSumPool()([X_2, I_in])
output = Dense(n_out, activation="sigmoid")(X_3)
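# --- Sketch (assumption, not the original script): the excerpt stops after `output`.
# A plausible continuation wraps the four inputs into a Keras Model and trains it with
# a custom step like the other scripts in this section; `learning_rate` is assumed to be
# defined alongside `batch_size` and `epochs`. ---
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

model = Model(inputs=[X_in, A_in, E_in, I_in], outputs=output)
opt = Adam(learning_rate=learning_rate)
loss_fn = BinaryCrossentropy()


@tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        loss = loss_fn(target, predictions) + sum(model.losses)
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# loader_tr was built with epochs=epochs, so this loop runs for the full training budget
for batch in loader_tr:
    loss = train_step(*batch)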
"restart": False, "transform_path": "../db_files/dev_lvl7/transformers.pkl", "db_path": "../db_files/dev_lvl7/dev_lvl7_mu_nu_e_classification_v003.db", "features": ["dom_x", "dom_y", "dom_z", "dom_time", "charge_log10", "width", "rqe"], "targets": ["energy_log10", "zenith","azimuth","event_no"], "database": "submit" } import dev.testtraindata as dl reload(dl) graph_data=dl.graph_data dataset_train=graph_data(**data_params, traintest='train') dataset_test=graph_data(**data_params, traintest='mix') dataset_val=dataset_test loader_train = DisjointLoader(dataset_train, epochs=epochs, batch_size=batch_size) # the different loaders work very very differently, beware loader_test = DisjointLoader(dataset_test, batch_size=batch_size, epochs=1) loss_func = get_loss_func(loss_method) metrics = get_metrics('energy_angle_zeniazi') performance_plot = get_performance("performance_vM2D") import dev.lr_schedules as lr_module lr_generator = getattr(lr_module, 'classic') lr_schedule = lr_generator(1e-5, 0, 0.95)() if wandblog: import wandb run = wandb.init(project = 'IceCube_neutrino', entity = "chri862z", group='new_loss', reinit=True, settings=wandb.Settings(start_method="fork"))