def debug(args):
    """Run the training loop on a small sample of the 2011 split.

    Loads the 2011 CSV under ``args.dir`` with ``read_case_num=10``, builds a
    BERT-backed ``GMN`` and trains it for ``EPOCH`` epochs with Adam and
    binary cross-entropy, printing the mean batch loss after every epoch.
    No evaluation is performed.

    Args:
        args: object exposing ``dir``, the path prefix of the data directory.
    """
    contexts, repsonses, contexts_graph_adjs, response_graph_adjs, labels = load_data_from_file(
        args.dir + 'data/2011_split_by_idname.csv', tokenize=True, read_case_num=10)

    net = GMN(emdedding_dim=768, use_bert=True).to(DEVICE)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
    loss_function = torch.nn.BCELoss()

    print('start training...')
    # `'%s' % labels.shape` unpacks the shape tuple as the argument list and
    # raises TypeError for multi-dimensional labels; len() prints the same
    # value in the 1-D case and works for any rank.
    print('dataset size: %s' % len(labels))

    for epoch in range(EPOCH):
        net.train()
        losses = []
        # Only full batches are used; a trailing partial batch is skipped.
        for k in range(len(labels) // BATCH_SIZE):
            batch = slice(k * BATCH_SIZE, (k + 1) * BATCH_SIZE)
            output_p = net(contexts[batch],
                           repsonses[batch],
                           contexts_graph_adjs[batch],
                           response_graph_adjs[batch])
            loss = loss_function(output_p, labels[batch])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print('loss: %.2f' % np.mean(losses))
def spam_filter(train_mode: bool):
    """Train the spam classifier or run it as an interactive predictor.

    The dataset and word embeddings are loaded, the texts are tokenized and
    padded to ``config['dataset']['max_seq_len']``, the labels are encoded and
    the data is split into train/test parts. A compiled model is then built
    from the embedding matrix.

    Args:
        train_mode: when true, train the model; otherwise load the stored
            checkpoint weights and answer prompts typed on stdin until the
            user interrupts the program.
    """
    labels, texts = dl.load_data_from_file(dataset_path)
    word_embeddings = dl.load_word_embeddings_from_file(word_emb_path)

    tokenizer = prepare.get_prepared_tokenizer(texts)
    # The token sequences have different lengths; wrapping them in np.array
    # first raises ValueError on recent NumPy (ragged input). pad_sequences
    # takes the list of sequences directly and returns a rectangular array.
    sequences = tokenizer.texts_to_sequences(texts)
    texts = pad_sequences(sequences, maxlen=config['dataset']['max_seq_len'])

    labels = np.array(prepare.encode_labels(labels))

    texts_train, texts_test, labels_train, labels_test = train_test_split(
        texts, labels,
        random_state=config['dataset']['random_state'],
        test_size=config['dataset']['test_size'])

    embeddings_matrix = prepare.map_embeddings_to_word_index(
        word_embeddings, tokenizer.word_index)
    seq_model = model.get_compiled_model(embeddings_matrix, config)
    seq_model.summary()

    if train_mode:
        print("Entering train mode")
        train.train_model(seq_model, config,
                          texts_train, labels_train, texts_test, labels_test)
        return

    print("Loading weights file and entering prediction mode")
    load_weights_from_file(
        '../checkpoints/weights-improvement-12-0.98.hdf5',
        seq_model, texts_test, labels_test)

    while True:
        try:
            text = str(input('>> '))
            prediction = get_prediction(seq_model, tokenizer, text)
            prediction_index = utils.probability_to_index(prediction)
            print("Prediction:", utils.decode_index(prediction_index))
        except (KeyboardInterrupt, EOFError, KeyError):
            # Ctrl-C, end of input (Ctrl-D / closed pipe) or an unknown key
            # ends the session quietly. SystemExit is raised directly because
            # the exit() helper only exists when the site module is loaded.
            print()
            raise SystemExit
def train(args):
    """Train a BERT-backed GMN on the 2011-2019 split, evaluating on 2020.

    Both CSV files are read from ``args.dir``. All loaded tensors are moved to
    ``DEVICE``, then the network is trained for ``EPOCH`` epochs with Adam and
    binary cross-entropy. After each epoch the mean batch loss is printed and
    ``evaluate`` is run on the 2020 data.

    Args:
        args: object exposing ``dir``, the path prefix of the data directory.
    """
    eval_contexts, eval_repsonses, eval_contexts_graph_adjs, eval_response_graph_adjs, eval_labels = load_data_from_file(
        args.dir + 'data/2020_split_by_idname.csv', tokenize=True)
    contexts, repsonses, contexts_graph_adjs, response_graph_adjs, labels = load_data_from_file(
        args.dir + 'data/from_2011_to_2019_split_by_idname.csv', tokenize=True)

    # Move tensors to the specified device. Tensor.cuda()/.cpu()/.to() return
    # a new tensor instead of modifying the receiver, so the results have to
    # be rebound; calling them in a loop and dropping the result moves nothing.
    (eval_contexts, eval_repsonses, eval_contexts_graph_adjs,
     eval_response_graph_adjs, eval_labels,
     contexts, repsonses, contexts_graph_adjs,
     response_graph_adjs, labels) = [
        tensor.to(DEVICE)
        for tensor in (eval_contexts, eval_repsonses, eval_contexts_graph_adjs,
                       eval_response_graph_adjs, eval_labels,
                       contexts, repsonses, contexts_graph_adjs,
                       response_graph_adjs, labels)
    ]

    net = GMN(emdedding_dim=768, use_bert=True).to(DEVICE)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
    loss_function = torch.nn.BCELoss()

    print('start training...')
    # `'%s' % labels.shape` unpacks the shape tuple as the argument list and
    # raises TypeError for multi-dimensional labels; len() prints the same
    # value in the 1-D case and works for any rank.
    print('dataset size: %s' % len(labels))

    for epoch in range(EPOCH):
        net.train()
        losses = []
        # Only full batches are used; a trailing partial batch is skipped.
        for k in range(len(labels) // BATCH_SIZE):
            batch = slice(k * BATCH_SIZE, (k + 1) * BATCH_SIZE)
            output_p = net(contexts[batch],
                           repsonses[batch],
                           contexts_graph_adjs[batch],
                           response_graph_adjs[batch])
            loss = loss_function(output_p, labels[batch])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print('loss: %.2f' % np.mean(losses))
        evaluate(net, eval_contexts, eval_repsonses, eval_contexts_graph_adjs,
                 eval_response_graph_adjs, eval_labels)
def __getitem__(self, index):
    """Return one whole batch drawn from a randomly chosen data file.

    ``index`` is not used: every call picks a random entry of
    ``self.file_list`` and builds a complete batch from that file.

    Returns:
        ``(X, y, death_times)`` when ``who_dies_next_mode`` or
        ``is_validation`` is set, otherwise ``(X, y, death_times, player_i)``
        where ``player_i`` is the randomly drawn player the batch was
        balanced for.
    """
    chosen_file = np.random.choice(np.array(self.file_list))
    data = data_loader.load_data_from_file(chosen_file)

    balance_over_all_players = (self.who_dies_next_mode == True
                                or self.is_validation == True)
    if not balance_over_all_players:
        # Balance the batch for a single, randomly selected player (0..9).
        player_i = np.random.randint(10)
        features, targets, times = data_loader.getBalancedBatchForPlayer(
            data,
            player_i,
            self.batch_size,
            self.feature_indicies,
            self.label_indicies,
            get_death_times=True)
        return features, targets, times, player_i

    features, targets, times = data_loader.getBatchBalanced(
        data,
        self.batch_size,
        self.feature_indicies,
        self.label_indicies,
        get_death_times=True)
    return features, targets, times
def make_predictions(file,modelPath):
    """Run the stored model(s) over one game file and save the results.

    Loads ``file``, de-normalizes the per-player health columns, predicts row
    by row with every loaded model (averaging their outputs) and writes
    ``hero.npy``, ``label.npy``, ``xLims.npy`` and ``health.npy`` to the
    current directory.

    NOTE(review): ``modelPath`` is never read here; the model location is
    built from the hard-coded ``'ModelData/<i>/model.model'`` pattern.
    NOTE(review): several names used below (``normalization_stats``,
    ``model``, ``true_pos``, ``false_pos``, ``heroStuff``, ``labelStuff``)
    are not defined inside this function - confirm they exist at module
    level, otherwise this function raises NameError.
    """
    trainingDataFiles = [file]#glob.glob("/scratch/staff/ak1774/shared_folder/data/train/*.h5")
    data = data_loader.load_data_from_file(trainingDataFiles[0])
    models = []
    # range(1,2) yields only i == 1, so exactly one model is loaded.
    for i in range(1,2):
        print(i)
        models.append( load_pytorch_model('ModelData/' +str(i) +'/' +'model.model', get_config('/' +str(i) +'/config.json'), data) )
    #fullGameData,fullGameLabels = data_loader.getSequencialNaive(data,hero_feature_indicies,label_indicies)
    xLims = data['time'].values
    # health = data['player_4_m_iHealth'].values
    #######################
    # get original health
    ######################
    norm_stats = None
    with open("norm_stats.pickle", 'rb') as f:
        norm_stats = pickle.load(f)
    # NOTE(review): iterates `normalization_stats`, while the pickle loaded
    # just above is bound to `norm_stats` and never used - looks like a name
    # mismatch; confirm.
    for label,min_value,max_value in normalization_stats:
        # The last matching entry wins for each pair of bounds.
        if "_m_iHealth" in label:
            health_min = min_value
            health_max = max_value
        if "m_iMaxHealth" in label:
            maxhealth_min = min_value
            maxhealth_max = max_value
    healthes = []
    max_healthes = []
    relative_healthes = []
    for i in range(0,10):
        health_vals = data['player_' + str(i) + '_m_iHealth'].values
        maxhealth_vals = data['player_' + str(i) + '_m_iMaxHealth'].values
        # Undo min/max scaling: value * (max - min) + min.
        health_vals = health_vals * (health_max - health_min) + health_min
        maxhealth_vals = maxhealth_vals * (maxhealth_max - maxhealth_min) + maxhealth_min
        relative_health_vals = health_vals / maxhealth_vals # hopefully maxhealth is never 0
        healthes.append(health_vals)
        max_healthes.append(maxhealth_vals)
        relative_healthes.append(relative_health_vals)
    #######################
    # get death times
    ######################
    labels = [(i,label) for i,label in enumerate(list(data))]
    death_time_indicies = preprocess.labels_to_indicies(preprocess.select_features_by_name("time_until_next_death",labels))
    death_times = data.values[:,death_time_indicies].astype(np.float32)
    # NOTE(review): `pred` and `y` computed in this loop are not read again
    # before being overwritten below, and `model` is not the loop variable
    # `m` - confirm whether `m.model` was intended.
    for m in models:
        X = [torch.from_numpy(hero_X) for hero_X in m.fullGameData]
        pred = model(X)
        pred = torch.sigmoid(pred)
        pred = pred.cpu().detach().numpy()
        y = m.fullGameLabels
    currentMeanTrueAccuracy = 0
    currentMeanFalseAccuracy=0
    numTruePos = 0
    numFalsePos = 0
    numTrueNeg = 0
    numFalseNeg = 0
    # Predict one row at a time, averaging the outputs of all models.
    for i in range(0,data.shape[0]):
        predX = 0
        for m in models:
            y = m.fullGameLabels[i]
            y = np.array(y)
            y = np.expand_dims(y,0)
            X = [torch.from_numpy(hero_X[i:(i+1),:]) for hero_X in m.fullGameData]
            print(i)
            #predX = averagePred(models,X)
            predX = modelPred(m.model,X) +predX
        predX = predX/len(models)
        # The string literal below is disabled code kept as a no-op
        # expression; it is the only place `true_pos` / `false_pos` and the
        # num* counters would have been updated.
        ''' true_pos = ((predX > 0.5) == (y > 0.5)).reshape(-1).astype(np.float32) true_neg = ((predX < 0.5) == (y <0.5)).reshape(-1).astype(np.float32) false_pos = ((predX > 0.5) == (y < 0.5)).reshape(-1).astype(np.float32) false_neg = ((predX < 0.5) == (y > 0.5)).reshape(-1).astype(np.float32) for pos in true_neg: if pos ==1: numTrueNeg +=1 for pos in false_neg: if pos ==1: numFalseNeg +=1 for pos in true_pos: if pos ==1: numTruePos +=1 for pos in false_pos: if pos ==1: numFalsePos +=1 '''
        prediction = predX
        # NOTE(review): `true_pos` / `false_pos` are only assigned inside the
        # disabled block above - confirm where they come from.
        currentMeanTrueAccuracy += np.mean(true_pos)
        currentMeanFalseAccuracy += np.mean(false_pos)
        prediction = np.squeeze(prediction,0)
        if i %3000 ==0:
            print('Current true pos ' +str(currentMeanTrueAccuracy/(i+1)))
            print('Current false pos ' +str(currentMeanFalseAccuracy/(i+1)))
        heroStuff.append(prediction)
        labelStuff.append(np.squeeze(y,0))
    print()
    # These counters are never incremented in live code, so they print 0.
    print(numTruePos)
    print(numTrueNeg)
    print()
    print(numFalsePos)
    print(numFalseNeg)
    print()
    # NOTE(review): divides by the constant 19326 although the loop above
    # runs data.shape[0] times - confirm the two always agree.
    print('True Pos = ' + str(currentMeanTrueAccuracy/19326))
    print('False pos = ' + str(currentMeanFalseAccuracy/19326))
    # Swap the first two axes of the collected predictions and labels.
    heroStuff1 = np.swapaxes(heroStuff,0,1)
    labelStuff1= np.swapaxes(labelStuff,0,1)
    # Make time relative to the first row and shift it by a further 90.
    xLims = xLims - xLims[0] - 90
    np.save('hero.npy', np.array(heroStuff1))
    np.save('label.npy', np.array(labelStuff1))
    np.save('xLims.npy', np.array(xLims))
    np.save('health.npy',np.array(healthes))
def train_pytorch():
    """Train a death-prediction model as described by a JSON config file.

    The config is read from ``config.json`` in the working directory, or from
    ``config/default.json`` next to this module when that file is missing.
    It selects the model class, the feature and label sets, the optimizer and
    the batch/epoch sizes. Training and validation ``.h5`` files are globbed
    from ``../randomized_data/{train,validation}`` relative to the working
    directory.

    Two modes exist, chosen by ``config["predict_who_dies_next"]``:

    * who-dies-next: multi-class prediction trained with cross-entropy,
      validated after every epoch;
    * per-player: one sigmoid output per player trained with binary
      cross-entropy on a single randomly selected player per batch, validated
      every third epoch (ROC AUC and average precision are recorded).

    Statistics are plotted after every epoch and the model weights are saved
    every 100 epochs. The loop runs for 50000 epochs.

    Raises:
        ValueError: if the configured optimizer is not "Adam" or "SGD", or if
            the number of labels does not match the prediction mode.
    """
    # is there a config in the current directory?
    config_path = "config.json"
    if not os.path.isfile("config.json"):
        # use default config
        config_path = os.path.dirname(
            os.path.realpath(__file__)) + "/config/default.json"

    with open(config_path) as f:
        config = commentjson.load(f)

    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(config)
    sys.stdout.flush()

    WHO_DIES_NEXT_MODE = config["predict_who_dies_next"]

    use_cuda = config["use_gpu"] == True and torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    print("using device: ", device)

    model_type = locate(config["model"])
    get_feature_indicies_fn = locate(config["feature_set"])
    get_label_indicies_fn = locate(config["lable_set"])

    batch_size = config["batch_size"]
    print(type(batch_size))
    print(type(config["log_at_every_x_sample"]))
    epoch_size = int(config["log_at_every_x_sample"] / batch_size)
    print("epoch_size: ", epoch_size)

    # Computed from the config but not consulted below: checkpoints are
    # written every 100 epochs regardless.
    checkpoint_frequency = int(config["chekpoint_at_every_x_sample"] /
                               (epoch_size * batch_size))
    validation_epoch_sice = config["validation_epoch_size"]

    if config["optimizer"] == "Adam":
        OptimizerType = torch.optim.Adam
    elif config["optimizer"] == "SGD":
        OptimizerType = torch.optim.SGD
    else:
        # Fail here with a clear message instead of a NameError on
        # OptimizerType after the datasets have been set up.
        raise ValueError(
            "unsupported optimizer in config: %r" % (config["optimizer"],))

    # Data location (paths are relative to the current working directory).
    trainingDataFiles = glob.glob(
        str(Path.cwd().parent / 'randomized_data' / 'train') + '/*.h5')
    validationDataFiles = glob.glob(
        str(Path.cwd().parent / 'randomized_data' / 'validation') + '/*.h5')

    example_data = data_loader.load_data_from_file(trainingDataFiles[0])
    hero_feature_indicies = get_feature_indicies_fn(example_data)
    label_indicies = get_label_indicies_fn(example_data)

    inputFeatureSize = len(hero_feature_indicies[0])
    outputFeatureSize = len(label_indicies)

    # Raising a plain string is itself a TypeError in Python 3, so raise a
    # real exception carrying the same message.
    if WHO_DIES_NEXT_MODE == True and outputFeatureSize != 11:
        print("error, bad config, label set and prediction mode mismatch")
        raise ValueError(
            "error, bad config, label set and prediction mode mismatch")
    elif WHO_DIES_NEXT_MODE == False and outputFeatureSize != 10:
        print("error, bad config, label set and prediction mode mismatch")
        raise ValueError(
            "error, bad config, label set and prediction mode mismatch")

    # the dataset returns a batch when called (because we get the whole batch
    # from one file), the batch size of the data loader thus is set to 1
    # (default)
    # epoch size is how many elements the iterator of the generator will
    # provide, NOTE should not be too small, because it have a significant
    # overhead
    training_set = DotaDataset(
        file_list=trainingDataFiles,
        batch_size=batch_size,
        epoch_size=epoch_size,
        feature_indicies=hero_feature_indicies,
        label_indicies=label_indicies,
        who_dies_next_mode=WHO_DIES_NEXT_MODE,
        is_validation=False)  # set is validation to get death times...
    training_generator = torch.utils.data.DataLoader(
        training_set, num_workers=20, worker_init_fn=worker_init_fn)

    # actually we want the same distribution, so we can compare loss, so dont
    # do anything differently in case of validation
    validation_set = DotaDataset(
        file_list=validationDataFiles,
        batch_size=batch_size,
        epoch_size=validation_epoch_sice,
        feature_indicies=hero_feature_indicies,
        label_indicies=label_indicies,
        who_dies_next_mode=WHO_DIES_NEXT_MODE,
        is_validation=False)
    validation_generator = torch.utils.data.DataLoader(
        validation_set, num_workers=20, worker_init_fn=worker_init_fn)

    model = model_type(inputFeatureSize, outputFeatureSize,
                       **config["model_params"])
    model.to(device)
    print(model.final_layers)

    criterion = nn.CrossEntropyLoss()
    binary_classification_loss = torch.nn.BCELoss()
    optimizer = OptimizerType(model.parameters(), **config["optimizer_params"])

    if WHO_DIES_NEXT_MODE == True:
        all_train_losses = []
        all_train_accuracies = []
        all_train_kill_nokill_accuracies = []
        all_train_per_second_accuracies = []

        all_validation_losses = []
        all_validation_accuracies = []
        all_validation_kill_nokill_accuracies = []
        all_validation_per_second_accuracies = []

        for epoch_i in range(50000):
            now = time.time()
            # reset seed https://github.com/pytorch/pytorch/issues/5059
            # data loader returns the same values
            np.random.seed()

            epoch_losses = []
            epoch_overall_accuracies = []
            epoch_kill_accuracies = []
            epoch_no_kill_accuracies = []
            epoch_one_sec_accuracies = []
            epoch_two_sec_accuracies = []
            epoch_three_sec_accuracies = []
            epoch_four_sec_accuracies = []
            epoch_five_sec_accuracies = []

            for sub_epoch_i, (X, y, death_times) in enumerate(training_generator):
                # since we get a batch of size 1 of batch of real batch size,
                # we take the 0th element
                X = [(hero_X[0, :]).to(device) for hero_X in X]
                y = torch.argmax(y[0, :], dim=1).to(device)
                death_times = death_times[0]

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                output = model(X)
                loss = criterion(output, y)
                accuracy_vec = (torch.argmax(output, 1) == y
                                ).cpu().numpy().reshape(-1).astype(np.float32)
                loss.backward()
                optimizer.step()

                epoch_losses.append(loss.cpu().detach().numpy().reshape(-1)[0])

                (overall_accuracy,
                 (kill_accuracy, no_kill_accuracy),
                 (one_sec_accuracy, two_sec_accuracy, three_sec_accuracy,
                  four_sec_accuracy, five_sec_accuracy)
                 ) = calculate_detailed_accuracies(accuracy_vec, death_times, y)

                epoch_overall_accuracies.append(overall_accuracy)
                epoch_kill_accuracies.extend(kill_accuracy)
                epoch_no_kill_accuracies.extend(no_kill_accuracy)
                epoch_one_sec_accuracies.extend(one_sec_accuracy)
                epoch_two_sec_accuracies.extend(two_sec_accuracy)
                epoch_three_sec_accuracies.extend(three_sec_accuracy)
                epoch_four_sec_accuracies.extend(four_sec_accuracy)
                epoch_five_sec_accuracies.extend(five_sec_accuracy)

                if sub_epoch_i > 0 and (sub_epoch_i % 50) == 0:
                    print(epoch_i, " ", sub_epoch_i, " loss: ",
                          np.array(epoch_losses[-49:]).mean(), " accuracy: ",
                          np.array(epoch_overall_accuracies[
                              (-49 * y.shape[0]):]).mean())
                    sys.stdout.flush()

            all_train_losses.append(np.array(epoch_losses).mean())
            all_train_accuracies.append(
                np.array(epoch_overall_accuracies).mean())
            all_train_kill_nokill_accuracies.append(
                (np.array(epoch_kill_accuracies).mean(),
                 np.array(epoch_no_kill_accuracies).mean()))
            all_train_per_second_accuracies.append(
                (np.array(epoch_one_sec_accuracies).mean(),
                 np.array(epoch_two_sec_accuracies).mean(),
                 np.array(epoch_three_sec_accuracies).mean(),
                 np.array(epoch_four_sec_accuracies).mean(),
                 np.array(epoch_five_sec_accuracies).mean()))

            # reset logs for validation
            epoch_losses = []
            epoch_overall_accuracies = []
            epoch_kill_accuracies = []
            epoch_no_kill_accuracies = []
            epoch_one_sec_accuracies = []
            epoch_two_sec_accuracies = []
            epoch_three_sec_accuracies = []
            epoch_four_sec_accuracies = []
            epoch_five_sec_accuracies = []

            with torch.no_grad():
                for X, y, death_times in validation_generator:
                    X = [(hero_X[0, :]).to(device) for hero_X in X]
                    y = torch.argmax(y[0, :], dim=1).to(device)
                    death_times = death_times[0]

                    output = model(X)
                    loss = criterion(output, y)
                    accuracy_vec = (torch.argmax(output, 1) == y
                                    ).cpu().numpy().reshape(-1).astype(np.float32)

                    epoch_losses.append(
                        loss.cpu().detach().numpy().reshape(-1)[0])

                    (overall_accuracy,
                     (kill_accuracy, no_kill_accuracy),
                     (one_sec_accuracy, two_sec_accuracy, three_sec_accuracy,
                      four_sec_accuracy, five_sec_accuracy)
                     ) = calculate_detailed_accuracies(
                        accuracy_vec, death_times, y)

                    epoch_overall_accuracies.append(overall_accuracy)
                    epoch_kill_accuracies.extend(kill_accuracy)
                    epoch_no_kill_accuracies.extend(no_kill_accuracy)
                    epoch_one_sec_accuracies.extend(one_sec_accuracy)
                    epoch_two_sec_accuracies.extend(two_sec_accuracy)
                    epoch_three_sec_accuracies.extend(three_sec_accuracy)
                    epoch_four_sec_accuracies.extend(four_sec_accuracy)
                    epoch_five_sec_accuracies.extend(five_sec_accuracy)

            all_validation_losses.append(np.array(epoch_losses).mean())
            all_validation_accuracies.append(
                np.array(epoch_overall_accuracies).mean())
            all_validation_kill_nokill_accuracies.append(
                (np.array(epoch_kill_accuracies).mean(),
                 np.array(epoch_no_kill_accuracies).mean()))
            all_validation_per_second_accuracies.append(
                (np.array(epoch_one_sec_accuracies).mean(),
                 np.array(epoch_two_sec_accuracies).mean(),
                 np.array(epoch_three_sec_accuracies).mean(),
                 np.array(epoch_four_sec_accuracies).mean(),
                 np.array(epoch_five_sec_accuracies).mean()))

            # epoch over, checkpoint, report, check validation error
            # (the values printed here are the validation statistics, since
            # the epoch_* lists were reset before the validation pass)
            print("Epoch done ", epoch_i, " loss: ",
                  np.array(epoch_losses).mean(), " accuracy: ",
                  np.array(epoch_overall_accuracies).mean())

            PlotValues((all_train_losses, all_validation_losses), "loss",
                       ["train", "validation"])
            PlotValues((all_train_accuracies, all_validation_accuracies),
                       "accuracy", ["train", "validation"])
            PlotValues((*zip(*all_train_kill_nokill_accuracies),
                        *zip(*all_validation_kill_nokill_accuracies)),
                       "accuracy_kill", [
                           "train_kill", "train_no_kill", "validation_kill",
                           "validation_no_kill"
                       ])

            sec_labels = ["1_sec", "2_sec", "3_sec", "4_sec", "5_sec"]
            PlotValues(
                (*zip(*all_train_per_second_accuracies),
                 *zip(*all_validation_per_second_accuracies)),
                "accuracy_sec", [
                    *["accuracy_train" + label for label in sec_labels],
                    *["accuracy_validation" + label for label in sec_labels]
                ])

            print("Epoch took: ", time.time() - now)
            sys.stdout.flush()

            if (epoch_i % 100) == 99:
                torch.save(model.state_dict(),
                           "model" + str(epoch_i) + ".model")

    else:
        # Per player probability prediction
        all_train_losses = []
        all_train_accuracies = []
        all_train_target_accuracies = []
        all_train_die_notdie_accuracies = []
        all_train_per_sec_accuracies = [[] for _ in range(20)]
        all_train_per_sec_predictions = [[] for _ in range(20)]
        all_train_per_sec_predictions_std = [[] for _ in range(20)]

        all_validation_losses = []
        all_validation_accuracies = []
        all_validation_roc_scores = []
        all_validation_pr_scores = []

        for epoch_i in range(50000):
            now = time.time()
            # reset seed https://github.com/pytorch/pytorch/issues/5059
            # data loader returns the same values
            np.random.seed()

            epoch_overall_loss = []
            epoch_overall_accuracy = []
            epoch_target_accuracy = []
            epoch_die_accuracy = []
            epoch_not_die_accuracy = []
            epoch_per_sec_accuracies = [[] for _ in range(20)]
            epoch_per_sec_predictions = [[] for _ in range(20)]

            for sub_epoch_i, (X, y, death_times, player_i) in enumerate(training_generator):
                # since we get a batch of size 1 of batch of real batch size,
                # we take the 0th element
                X = [(hero_X[0, :]).to(device) for hero_X in X]
                y = (y[0, :]).to(device)
                death_times = death_times[0]
                player_i = player_i[0].to(device)

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                output = model(X)
                output = torch.sigmoid(output)
                output_np = output.cpu().detach().numpy()

                # only backpropagate the loss for player_i (so the training
                # data is balanced)
                loss = binary_classification_loss(output[:, player_i],
                                                  y[:, player_i])
                loss.backward()
                optimizer.step()

                overall_loss = binary_classification_loss(
                    output, y).cpu().detach().numpy()
                epoch_overall_loss.append(overall_loss.reshape(-1).mean())

                accuracy_values = ((output > 0.5) == (y > 0.5)
                                   ).cpu().numpy().astype(np.float32)
                target_accuracy = (
                    (output[:, player_i] > 0.5) == (y[:, player_i] > 0.5)
                ).cpu().numpy().reshape(-1).astype(np.float32)
                die_accuracy_vec = ((output > 0.5) == (y > 0.5)).view(-1)[
                    y.view(-1) > 0.5].cpu().numpy().reshape(-1).astype(
                        np.float32)
                not_die_accuracy_vec = ((output > 0.5) == (y > 0.5)).view(-1)[
                    y.view(-1) < 0.5].cpu().numpy().reshape(-1).astype(
                        np.float32)

                epoch_overall_accuracy.append(
                    accuracy_values.reshape(-1).mean())
                epoch_target_accuracy.append(target_accuracy.mean())

                # these have varying size, so calculating the proper mean
                # across batches takes more work
                epoch_die_accuracy.extend(die_accuracy_vec)
                epoch_not_die_accuracy.extend(not_die_accuracy_vec)

                death_times = death_times.cpu().numpy()
                # Bucket accuracies/predictions by time until death; bucket k
                # holds death times strictly between k and k + 1.
                for timeslot_i in range(19):
                    mask_die_in_timeslot = np.logical_and(
                        (death_times > timeslot_i),
                        (death_times < (timeslot_i + 1)))
                    epoch_per_sec_accuracies[timeslot_i].extend(
                        accuracy_values[mask_die_in_timeslot].reshape(-1))
                    epoch_per_sec_predictions[timeslot_i].extend(
                        output_np[mask_die_in_timeslot].reshape(-1))

                # and the rest
                mask_die_in_timeslot = (death_times > 19)
                epoch_per_sec_accuracies[19].extend(
                    accuracy_values[mask_die_in_timeslot].reshape(-1))
                epoch_per_sec_predictions[19].extend(
                    output_np[mask_die_in_timeslot].reshape(-1))

                if sub_epoch_i > 0 and (sub_epoch_i % 50) == 0:
                    print(epoch_i, " ", sub_epoch_i, " loss: ",
                          np.array(epoch_overall_loss[-49:]).mean(),
                          " accuracy: ",
                          np.array(epoch_target_accuracy[-49:]).mean())
                    sys.stdout.flush()

            if (epoch_i % 10) == 9:
                np.save('epoch_per_sec_predictions.npy',
                        np.array(epoch_per_sec_predictions))

            all_train_losses.append(np.array(epoch_overall_loss).mean())
            all_train_accuracies.append(
                np.array(epoch_overall_accuracy).mean())
            all_train_target_accuracies.append(
                np.array(epoch_target_accuracy).mean())
            # Use the values accumulated over the whole epoch; the per-batch
            # vectors would only reflect the last batch.
            all_train_die_notdie_accuracies.append(
                (np.array(epoch_die_accuracy).mean(),
                 np.array(epoch_not_die_accuracy).mean()))

            for timeslot_i in range(20):
                all_train_per_sec_accuracies[timeslot_i].append(
                    np.array(epoch_per_sec_accuracies[timeslot_i]).mean())
                all_train_per_sec_predictions[timeslot_i].append(
                    np.array(epoch_per_sec_predictions[timeslot_i]).mean())
                all_train_per_sec_predictions_std[timeslot_i].append(
                    np.array(epoch_per_sec_predictions[timeslot_i]).std())

            # VALIDATION EPOCH
            if (epoch_i % 3) == 0:
                epoch_overall_loss = []
                epoch_overall_accuracy = []
                epoch_all_pred = []
                epoch_all_y = []

                with torch.no_grad():
                    for X, y, death_times, player_i in validation_generator:
                        X = [(hero_X[0, :]).to(device) for hero_X in X]
                        y = (y[0, :]).to(device)
                        death_times = death_times[0]

                        output = model(X)
                        output = torch.sigmoid(output)
                        output_np = output.cpu().detach().numpy()

                        epoch_overall_loss.append(
                            binary_classification_loss(output, y)
                            .cpu().detach().numpy().reshape(-1).mean())
                        accuracy_vec = ((output > 0.5) == (y > 0.5)
                                        ).cpu().numpy().reshape(-1).astype(
                                            np.float32)
                        epoch_overall_accuracy.append(accuracy_vec.mean())

                        epoch_all_pred.extend(output_np.reshape(-1))
                        epoch_all_y.extend(y.cpu().numpy().reshape(-1))

                all_validation_roc_scores.append(
                    roc_auc_score(epoch_all_y, epoch_all_pred))
                all_validation_pr_scores.append(
                    average_precision_score(epoch_all_y, epoch_all_pred))
                all_validation_losses.append(
                    np.array(epoch_overall_loss).mean())
                all_validation_accuracies.append(
                    np.array(epoch_overall_accuracy).mean())
            else:
                # just copy the previous validation statistics, so we can plot
                # it together with training statistics
                all_validation_losses.append(all_validation_losses[-1])
                all_validation_accuracies.append(all_validation_accuracies[-1])
                all_validation_roc_scores.append(all_validation_roc_scores[-1])
                all_validation_pr_scores.append(all_validation_pr_scores[-1])

            PlotValues((all_train_losses, all_validation_losses), "loss",
                       ["train", "validation"])
            PlotValues((all_train_accuracies, all_validation_accuracies),
                       "accuracy", ["train", "validation"])
            PlotValues((all_validation_roc_scores, ), "roc_score", ["roc"])
            PlotValues((all_validation_pr_scores, ), "pr_score", ["pr"])
            PlotValues((all_train_target_accuracies, ), "target_accuracy",
                       ["train"])
            PlotValues(
                ([vals[0] for vals in all_train_die_notdie_accuracies],
                 [vals[1] for vals in all_train_die_notdie_accuracies]),
                "all_train_die_notdie_accuracies", ["die", "not_die"])
            PlotValues(all_train_per_sec_accuracies,
                       "all_train_per_sec_accuracies",
                       [str(time_i + 1) + "_sec" for time_i in range(20)])
            PlotWithStd(
                values=[vec[-1] for vec in all_train_per_sec_predictions],
                stds=[vec[-1] for vec in all_train_per_sec_predictions_std],
                legends=["per_sec predictions"],
                name="per_sec predictions")

            # On validation epochs the loss printed here is the validation
            # loss (epoch_overall_loss was reset above); the accuracy is the
            # training target accuracy.
            print("Epoch done ", epoch_i, " loss: ",
                  np.array(epoch_overall_loss).mean(), " accuracy: ",
                  np.array(epoch_target_accuracy).mean())
            print("Epoch took: ", time.time() - now)
            sys.stdout.flush()

            if (epoch_i % 10) == 9:
                np.save('all_train_per_sec_predictions.npy',
                        np.array(all_train_per_sec_predictions))
                np.save('all_train_per_sec_predictions_std.npy',
                        np.array(all_train_per_sec_predictions_std))

            if (epoch_i % 100) == 99:
                torch.save(model.state_dict(),
                           "model" + str(epoch_i) + ".model")
def make_predictions(file, modelPath):
    """Predict per-player outcomes for one game file and plot them.

    The feature and label index sets are derived from ``data.h5`` using the
    functions named in the config, the model is loaded from ``modelPath`` and
    applied to ``file`` one row at a time. A sliding window of the latest
    predictions is handed to ``get_average`` after each row. Finally the
    module-level ``heroStuff`` / ``labelStuff`` sequences are plotted against
    game time for ten players and written to ``smooth_plot.eps``.

    Args:
        file: path of the game data file to predict on.
        modelPath: path of the stored model weights.
    """
    trainingDataFiles = ['data.h5']

    config = get_config()
    get_feature_indicies_fn = locate(config["feature_set"])
    get_label_indicies_fn = locate(config["lable_set"])

    example_data = data_loader.load_data_from_file(trainingDataFiles[0])
    hero_feature_indicies = get_feature_indicies_fn(example_data)
    label_indicies = get_label_indicies_fn(example_data)
    print(len(hero_feature_indicies))
    print(len(label_indicies))

    model = load_pytorch_model(modelPath, hero_feature_indicies,
                               label_indicies, config)
    print(model)

    data = data_loader.load_data_from_file(file)
    xLims = data['time'].values
    fullGameData, fullGameLabels = data_loader.getSequencialNaive(
        data, hero_feature_indicies, label_indicies)
    print(np.array(fullGameData).shape)

    windowData = []
    maxWindow = 20
    currentMeanAccuracy = 0

    # Walk over every row of the loaded game. A fixed row count only fits one
    # particular file: shorter games run off the end of the labels and longer
    # ones leave the collected series shorter than the time axis.
    for i in range(len(fullGameLabels)):
        y = fullGameLabels[i]
        y = np.array(y)
        y = np.expand_dims(y, 0)

        X = [torch.from_numpy(hero_X[i:(i + 1), :]) for hero_X in fullGameData]
        predX = model(X)
        predX = torch.sigmoid(predX)
        predX = predX.cpu().detach().numpy()

        accuracy_vec = ((predX > 0.5) == (y > 0.5)).reshape(-1).astype(
            np.float32)
        currentMeanAccuracy += np.mean(accuracy_vec)

        prediction = np.squeeze(predX, 0)
        if i % 3000 == 0:
            print('Current mean ' + str(currentMeanAccuracy / (i + 1)))

        # Keep at most maxWindow of the most recent predictions.
        windowData.append(prediction)
        print(np.array(windowData).shape)
        if len(windowData) > maxWindow:
            windowData.pop(0)

        # NOTE(review): the return value is ignored; presumably get_average
        # fills the module-level heroStuff / labelStuff used below - confirm.
        get_average(windowData, maxWindow, 15, i, y)

    heroStuff1 = np.swapaxes(heroStuff, 0, 1)
    labelStuff1 = np.swapaxes(labelStuff, 0, 1)

    # Make time relative to the first row and shift by 90 (1:30 game start).
    xLims = xLims - xLims[0] - 90
    print(np.array(heroStuff1).shape)
    print(np.array(labelStuff1).shape)
    print(np.array(xLims).shape)

    heroStuff1 = (heroStuff1 - 1)
    labelStuff1 = (labelStuff1 - 1) * -1
    print(np.array(heroStuff1).shape)
    print(np.array(labelStuff1).shape)
    print(np.array(xLims).shape)

    # One stacked subplot per player: first series in red, second in blue.
    for i in range(0, 10):
        plt.subplot(10, 1, (i + 1))
        plt.plot(xLims, heroStuff1[i], color='red', linewidth=0.5)
        plt.plot(xLims, labelStuff1[i], color='blue', linewidth=0.5)
    plt.savefig('smooth_plot.eps')
" num matches is ", num_matches) sys.stdout.flush() all_y = [[] for model_path in modelPathList] all_pred = [[] for model_path in modelPathList] per_sec_pred = [[[] for _ in range(20)] for model_path in modelPathList] for i in range(match_per_worker): match_index = first_match_index_for_this_task + i if match_index >= num_matches: continue print("Loading match ", match_index) data = data_loader.load_data_from_file(dataPathList[match_index]) # get death times labels = [(i, label) for i, label in enumerate(list(data))] death_time_indicies = preprocess.labels_to_indicies( preprocess.select_features_by_name("time_until_next_death", labels)) death_times = data.values[:, death_time_indicies].astype(np.float32) for model_i, (model_path, config_path) in enumerate(zip(modelPathList, configPathList)): with open(config_path) as f: config = commentjson.load(f) modeldata = test_model.load_pytorch_model(model_path, config, data)
def get_predictions(self, prediction_type="regression"): if self.model == None: print 'no evaluation possible since no model was provided.' return try: model = serial.load(self.model) except Exception as e: print("error loading {}:".format(self.model)) print(e) raise Exception("error loading {}:".format(self.model)) X = model.get_input_space().make_theano_batch() Y = model.fprop(X) if prediction_type == "classification": Y = theanoTensor.argmax(Y, axis=1) else: assert prediction_type == "regression" f = theanoFunction([X], Y, allow_input_downcast=True) print("loading data and predicting...") data = data_loader.Data('test') if self.direction == 'reverse': input_data = data.response_data output_path = self.stimulus_data_path m, r, c = input_data.shape input_data = input_data.reshape((m, r, c, 1)) elif self.direction == 'forward': input_data = data.stimulus_data output_path = self.response_data_path else: raise Exception( 'Specify either "reverse" or "forward" as direction.') prediction = f(input_data) print("writing predictions...") if output_path[-4:] == '.mat': scipy.io.savemat(output_path, {'data': prediction}) elif output_path[-4:] == '.txt': np.savetxt(output_path, prediction) elif output_path[-4:] == '.npy': np.save(output_path, prediction) else: raise Exception( 'Only ".mat", ".txt" and ".npy" files are supported as data formats.' 
) #------------------------------------------------------------------------------ # for testing #------------------------------------------------------------------------------ if self.debug: try: test_output = np.squeeze( data_loader.load_data_from_file( 'testUnattendedAudioOrg.mat')) test_predictions = np.squeeze( data_loader.load_data_from_file(self.stimulus_data_path)) print 'DEBUG: test_predictions.shape', test_predictions.shape, 'test_output.shape', test_output.shape print 'DEBUG: test_predictions.max', np.max( test_predictions), np.argmax(test_predictions) print 'DEBUG: ', np.corrcoef(test_predictions, test_output) except: print 'Detailed correlation analysis only possible for the hello world example.' pass
def main():
    """Command-line entry point: parse options, publish settings, train or predict.

    Steps, in order:

    1. Parse ``sys.argv`` with ``getopt``.  ``--help`` prints the usage text
       and exits; an unknown option or a missing required setting
       (train/predict mode, model, stimulus, response, weights) prints a
       message plus the usage text and exits with status 2.
    2. Export the settings as ``EEGTOOLS_*`` environment variables.
    3. Load the input data (stimulus for ``forward``, response otherwise) and
       take the number of channels from ``shape[1]``.
    4. Pickle ``context_length``, ``num_training_epochs``, ``num_neurons``
       and ``num_channels`` into ``<name>.pkl`` files in the current
       directory.
    5. Build a ``DNNregression`` object and either train it and save the
       weights, or write predictions; optionally show the weights.
    """
    # ------------------------------------------------------------------
    # set parameters
    # ------------------------------------------------------------------
    train_network = None
    yaml = None
    stimulus_data = None
    response_data = None
    weight_path = None
    visualize = False
    context_length = 25
    direction = 'reverse'
    valid_data_entries = -1  # this encodes the default to use all entries
    forking = False
    verbosity = 1
    debug = 0
    num_training_epochs = 100
    num_neurons = 5

    # ------------------------------------------------------------------
    # parse arguments
    # ------------------------------------------------------------------
    def usage():
        """Print the command-line help text."""
        # A single string keeps the output identical under Python 2 and 3.
        # The --numEpochs / --numNeurons entries now end in a newline like
        # every other entry (they used to run together on one line).
        print(
            'Call python DNNRegression.py with the following arguments: \n'
            '{--train, --predict} \n'
            '-m model.yaml \n'
            '-s stimulus_data [i/o, default text, or .mat file] \n'
            '-r response_data [i/o, default text, or .mat file] \n'
            '-w weights [i/o, default text, or .pkl] \n'
            '[--visual visualize.png] \n'
            '[--context N for N>=1 *25] \n'
            '[--numEpochs N default is 100] \n'
            '[--numNeurons N default is 5] \n'
            '[--dir forward/reverse*] \n'
            '[--valid which parts are valid, in case of concatenating trials, default all valid.] \n'
            '[--forking for matlab sake] \n'
        )

    try:
        opts, _ = getopt.getopt(sys.argv[1:], "htpm:s:r:w:", [
            "help", "train", "predict", "model=", "stimulus=", "response=",
            "weights=", "visual=", "context=", "dir=", "valid=", "forking=",
            "verbosity=", "debug", "numEpochs=", "numNeurons=", "numChannels="
        ])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-p", "--predict"):
            train_network = 0
        elif opt in ("-t", "--train"):
            train_network = 1
        elif opt in ("-m", "--model"):
            yaml = arg
        elif opt in ("-s", "--stimulus"):
            stimulus_data = arg
        elif opt in ("-r", "--response"):
            response_data = arg
        elif opt in ("-w", "--weights"):
            weight_path = arg
        # The long-only options are compared with ``==``: the previous
        # ``opt in ("--visual")`` was a substring test on a plain string,
        # not a tuple membership test.
        elif opt == "--visual":
            visualize = arg
        elif opt == "--context":
            context_length = int(arg)
        elif opt == "--numEpochs":
            num_training_epochs = int(arg)
        elif opt == "--numNeurons":
            num_neurons = int(arg)
        elif opt == "--dir":
            direction = arg
        elif opt == "--valid":
            valid_data_entries = arg
        elif opt == "--forking":
            print("forking is not supported for now")
        elif opt == "--verbosity":
            # Keep the same type as the integer default.
            verbosity = int(arg)
        elif opt == "--debug":
            debug = 1
        elif opt == "--numChannels":
            # Accepted by getopt but previously unhandled, which crashed with
            # "unhandled option".  The channel count is always derived from
            # the input data further down, so the value is not used.
            print("--numChannels is ignored: the number of channels is "
                  "determined from the data")
        else:
            # Explicit raise so the check is not stripped under -O.
            raise AssertionError("unhandled option")

    # Every one of these must have been supplied on the command line.
    required = (
        (train_network,
         'You need to define wheter to predict outputs or to train the network.'),
        (yaml, 'You need to define a valid model path / yaml file.'),
        (stimulus_data, 'You need to define a valid stimulus data path.'),
        (response_data, 'You need to define a valid response data path.'),
        (weight_path,
         'You need to define a valid path containing the weights of the mode.'),
    )
    for value, complaint in required:
        if value is None:
            print(complaint)
            usage()
            sys.exit(2)

    # ------------------------------------------------------------------
    # Save environment for other parts of the DNN
    # ------------------------------------------------------------------
    os.environ["EEGTOOLS_TRAIN_NETWORK"] = str(train_network)
    os.environ["EEGTOOLS_STIMULUS_DATA_PATH"] = stimulus_data
    os.environ["EEGTOOLS_RESPONSE_DATA_PATH"] = response_data
    os.environ["EEGTOOLS_VALID_DATA_PATH"] = str(valid_data_entries)
    os.environ["EEGTOOLS_CONTEXT_LENGTH"] = str(context_length)
    os.environ["EEGTOOLS_DIRECTION"] = direction
    os.environ["EEGTOOLS_DEBUG"] = str(debug)

    # determine number of channels from data
    input_path = stimulus_data if direction == 'forward' else response_data
    input_data = data_loader.load_data_from_file(input_path)
    num_channels = input_data.shape[1]
    os.environ["EEGTOOLS_NUM_CHANNELS"] = str(num_channels)

    # ------------------------------------------------------------------
    # Save imports for the YAML file
    # ------------------------------------------------------------------
    pickled_settings = (
        ('context_length', context_length),
        ('num_training_epochs', num_training_epochs),
        ('num_neurons', num_neurons),
        ('num_channels', num_channels),
    )
    for setting_name, setting_value in pickled_settings:
        # ``with`` closes (and flushes) each file; the handles used to leak.
        with open(setting_name + '.pkl', 'wb') as handle:
            pkl.dump(setting_value, handle)

    # ------------------------------------------------------------------
    # train / run DNN
    # ------------------------------------------------------------------
    net = DNNregression(train_network, yaml, stimulus_data, response_data,
                        weight_path, visualize, context_length, direction,
                        valid_data_entries, forking, verbosity, debug)
    if net.train_network:
        net.train()
        net.save_weights()
    else:
        net.get_predictions()
    if net.visualize:
        net.show_weights()
def get_predictions(self, prediction_type="regression"):
    """Run the stored model on the test data and write the predictions to disk.

    The model is loaded from ``self.model`` via ``serial.load``, compiled into
    a Theano function and applied to the test set returned by
    ``data_loader.Data('test')``.  Which array is the input and where the
    result is written depends on ``self.direction``:

    * ``'reverse'``: input is ``data.response_data`` (reshaped from
      ``(m, r, c)`` to ``(m, r, c, 1)``); output goes to
      ``self.stimulus_data_path``.
    * ``'forward'``: input is ``data.stimulus_data``; output goes to
      ``self.response_data_path``.

    The output format is chosen from the last four characters of the output
    path (``.mat``, ``.txt`` or ``.npy``).

    Args:
        prediction_type: ``"regression"`` writes the raw network output,
            ``"classification"`` writes the argmax over axis 1.

    Returns:
        None.  If ``self.model`` is ``None`` a message is printed and nothing
        else happens.

    Raises:
        Exception: if the model cannot be loaded, if ``self.direction`` is
            neither ``'reverse'`` nor ``'forward'``, or if the output path has
            an unsupported extension.
        AssertionError: if ``prediction_type`` is not one of the two
            supported values.
    """
    if self.model is None:
        print('no evaluation possible since no model was provided.')
        return

    try:
        model = serial.load(self.model)
    except Exception as e:
        print("error loading {}:".format(self.model))
        print(e)
        # Same exception type as before, but the underlying cause is now part
        # of the message instead of being dropped.
        raise Exception("error loading {}: {}".format(self.model, e))

    # Build the symbolic forward pass and compile it.
    X = model.get_input_space().make_theano_batch()
    Y = model.fprop(X)
    if prediction_type == "classification":
        Y = theanoTensor.argmax(Y, axis=1)
    elif prediction_type != "regression":
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(
            'prediction_type must be "classification" or "regression", '
            'got %r' % (prediction_type,))
    f = theanoFunction([X], Y, allow_input_downcast=True)

    print("loading data and predicting...")
    data = data_loader.Data('test')
    if self.direction == 'reverse':
        input_data = data.response_data
        output_path = self.stimulus_data_path
        # Append a trailing singleton axis to the 3-D response array.
        m, r, c = input_data.shape
        input_data = input_data.reshape((m, r, c, 1))
    elif self.direction == 'forward':
        input_data = data.stimulus_data
        output_path = self.response_data_path
    else:
        raise Exception('Specify either "reverse" or "forward" as direction.')

    # Reject unsupported output formats before running the prediction, so a
    # bad path does not cost a full forward pass first.
    extension = output_path[-4:]
    if extension not in ('.mat', '.txt', '.npy'):
        raise Exception('Only ".mat", ".txt" and ".npy" files are supported as data formats.')

    prediction = f(input_data)

    print("writing predictions...")
    if extension == '.mat':
        scipy.io.savemat(output_path, {'data': prediction})
    elif extension == '.txt':
        np.savetxt(output_path, prediction)
    else:
        np.save(output_path, prediction)

    # ------------------------------------------------------------------
    # for testing
    # ------------------------------------------------------------------
    if self.debug:
        # Best-effort diagnostics: compares what is stored at
        # self.stimulus_data_path with a fixed reference file.
        # NOTE(review): this always reads self.stimulus_data_path, even when
        # direction is 'forward' and the predictions were written to
        # self.response_data_path -- confirm that this is intended.
        try:
            test_output = np.squeeze(data_loader.load_data_from_file('testUnattendedAudioOrg.mat'))
            test_predictions = np.squeeze(data_loader.load_data_from_file(self.stimulus_data_path))
            print('DEBUG: test_predictions.shape %s test_output.shape %s'
                  % (test_predictions.shape, test_output.shape))
            print('DEBUG: test_predictions.max %s %s'
                  % (np.max(test_predictions), np.argmax(test_predictions)))
            print('DEBUG:  %s' % (np.corrcoef(test_predictions, test_output),))
        except Exception as e:
            # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit still
            # propagate; the reason is reported instead of hidden.
            print('Detailed correlation analysis only possible for the hello world example.')
            print('DEBUG: correlation analysis skipped: %s' % (e,))
# setup jupiter kernel with requirements # import sys # !{sys.executable} -m pip install numpy scipy eikon matplotlib cvxopt """Una volta predisposto l'ambiente, carichiamo i dati precedentemente scaricati da Eikon ed organizzati in file .json. Sono presenti quattro diversi scenari: 1. Scenario 'assets-test.json': file utilizzato ai fini del testing del codice, 3 soli simboli, 6 soli periodi mensili, date corrette ma fittizie. Questo file è utile solo ai fini del testing del codice. 2. Scenario 'assets-small.json': file di piccole dimensioni (periodo semestrale, simboli provenienti da DAX30, CAC40 ed IBEX 30, frequenza mensile) 3. Scenario 'assets-large.json': file di medie dimensioni (periodo decennale, simboli provenienti da DAX30, CAC40 ed IBEX 30, frequenza mensile) 4. Scenario 'assets-production.json': file di grandi dimensioni (periodo ventennale, simboli provenienti da DAX30, CAC40 ed IBEX 30, frequenza mensile) Nel codice sono presenti funzionalità dedicate alla creazione e manipolazione di ulteriori insiemi di simboli e relativi performance temporali. """ # load data from file ek_data_processed = dl.load_data_from_file('assets-test.json') target_assets = ek_data_processed['target_assets'] print("Assets list: ") print(target_assets) print("Date from: " + ek_data_processed["start_date"]) print("Date to: " + ek_data_processed["end_date"]) print("Number of assets: " + str(len(target_assets))) print("Number of time samples: " + str(len(ek_data_processed[target_assets[0]]))) """## Assets Vengono qui caricati il RIC dell'asset e la relativa serie temporale di ritorni menili %. ``` assets.add_or_modify_asset(asset_name, ek_data_processed[asset_name]) ``` Una volta caricati tutti gli asset viene calcolata la matrice di covarianza. ```
def main():
    """Command-line entry point: parse options, publish settings, train or predict.

    Steps, in order:

    1. Parse ``sys.argv`` with ``getopt``.  ``--help`` prints the usage text
       and exits; an unknown option or a missing required setting
       (train/predict mode, model, stimulus, response, weights) prints a
       message plus the usage text and exits with status 2.
    2. Export the settings as ``EEGTOOLS_*`` environment variables.
    3. Load the input data (stimulus for ``forward``, response otherwise) and
       take the number of channels from ``shape[1]``.
    4. Pickle ``context_length``, ``num_training_epochs``, ``num_neurons``
       and ``num_channels`` into ``<name>.pkl`` files in the current
       directory.
    5. Build a ``DNNregression`` object and either train it and save the
       weights, or write predictions; optionally show the weights.
    """
    # ------------------------------------------------------------------
    # set parameters
    # ------------------------------------------------------------------
    train_network = None
    yaml = None
    stimulus_data = None
    response_data = None
    weight_path = None
    visualize = False
    context_length = 25
    direction = 'reverse'
    valid_data_entries = -1  # this encodes the default to use all entries
    forking = False
    verbosity = 1
    debug = 0
    num_training_epochs = 100
    num_neurons = 5

    # ------------------------------------------------------------------
    # parse arguments
    # ------------------------------------------------------------------
    def usage():
        """Print the command-line help text."""
        # A single string keeps the output identical under Python 2 and 3.
        # The --numEpochs / --numNeurons entries now end in a newline like
        # every other entry (they used to run together on one line).
        print(
            'Call python DNNRegression.py with the following arguments: \n'
            '{--train, --predict} \n'
            '-m model.yaml \n'
            '-s stimulus_data [i/o, default text, or .mat file] \n'
            '-r response_data [i/o, default text, or .mat file] \n'
            '-w weights [i/o, default text, or .pkl] \n'
            '[--visual visualize.png] \n'
            '[--context N for N>=1 *25] \n'
            '[--numEpochs N default is 100] \n'
            '[--numNeurons N default is 5] \n'
            '[--dir forward/reverse*] \n'
            '[--valid which parts are valid, in case of concatenating trials, default all valid.] \n'
            '[--forking for matlab sake] \n'
        )

    try:
        opts, _ = getopt.getopt(sys.argv[1:], "htpm:s:r:w:",
                                ["help", "train", "predict", "model=",
                                 "stimulus=", "response=", "weights=",
                                 "visual=", "context=", "dir=", "valid=",
                                 "forking=", "verbosity=", "debug",
                                 "numEpochs=", "numNeurons=", "numChannels="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-p", "--predict"):
            train_network = 0
        elif opt in ("-t", "--train"):
            train_network = 1
        elif opt in ("-m", "--model"):
            yaml = arg
        elif opt in ("-s", "--stimulus"):
            stimulus_data = arg
        elif opt in ("-r", "--response"):
            response_data = arg
        elif opt in ("-w", "--weights"):
            weight_path = arg
        # The long-only options are compared with ``==``: the previous
        # ``opt in ("--visual")`` was a substring test on a plain string,
        # not a tuple membership test.
        elif opt == "--visual":
            visualize = arg
        elif opt == "--context":
            context_length = int(arg)
        elif opt == "--numEpochs":
            num_training_epochs = int(arg)
        elif opt == "--numNeurons":
            num_neurons = int(arg)
        elif opt == "--dir":
            direction = arg
        elif opt == "--valid":
            valid_data_entries = arg
        elif opt == "--forking":
            print("forking is not supported for now")
        elif opt == "--verbosity":
            # Keep the same type as the integer default.
            verbosity = int(arg)
        elif opt == "--debug":
            debug = 1
        elif opt == "--numChannels":
            # Accepted by getopt but previously unhandled, which crashed with
            # "unhandled option".  The channel count is always derived from
            # the input data further down, so the value is not used.
            print("--numChannels is ignored: the number of channels is "
                  "determined from the data")
        else:
            # Explicit raise so the check is not stripped under -O.
            raise AssertionError("unhandled option")

    # Every one of these must have been supplied on the command line.
    required = (
        (train_network,
         'You need to define wheter to predict outputs or to train the network.'),
        (yaml, 'You need to define a valid model path / yaml file.'),
        (stimulus_data, 'You need to define a valid stimulus data path.'),
        (response_data, 'You need to define a valid response data path.'),
        (weight_path,
         'You need to define a valid path containing the weights of the mode.'),
    )
    for value, complaint in required:
        if value is None:
            print(complaint)
            usage()
            sys.exit(2)

    # ------------------------------------------------------------------
    # Save environment for other parts of the DNN
    # ------------------------------------------------------------------
    os.environ["EEGTOOLS_TRAIN_NETWORK"] = str(train_network)
    os.environ["EEGTOOLS_STIMULUS_DATA_PATH"] = stimulus_data
    os.environ["EEGTOOLS_RESPONSE_DATA_PATH"] = response_data
    os.environ["EEGTOOLS_VALID_DATA_PATH"] = str(valid_data_entries)
    os.environ["EEGTOOLS_CONTEXT_LENGTH"] = str(context_length)
    os.environ["EEGTOOLS_DIRECTION"] = direction
    os.environ["EEGTOOLS_DEBUG"] = str(debug)

    # determine number of channels from data
    input_path = stimulus_data if direction == 'forward' else response_data
    input_data = data_loader.load_data_from_file(input_path)
    num_channels = input_data.shape[1]
    os.environ["EEGTOOLS_NUM_CHANNELS"] = str(num_channels)

    # ------------------------------------------------------------------
    # Save imports for the YAML file
    # ------------------------------------------------------------------
    pickled_settings = (
        ('context_length', context_length),
        ('num_training_epochs', num_training_epochs),
        ('num_neurons', num_neurons),
        ('num_channels', num_channels),
    )
    for setting_name, setting_value in pickled_settings:
        # ``with`` closes (and flushes) each file; the handles used to leak.
        with open(setting_name + '.pkl', 'wb') as handle:
            pkl.dump(setting_value, handle)

    # ------------------------------------------------------------------
    # train / run DNN
    # ------------------------------------------------------------------
    net = DNNregression(train_network, yaml, stimulus_data, response_data,
                        weight_path, visualize, context_length, direction,
                        valid_data_entries, forking, verbosity, debug)
    if net.train_network:
        net.train()
        net.save_weights()
    else:
        net.get_predictions()
    if net.visualize:
        net.show_weights()