def run_train():
    features, labels = torch.load("../calculated_features/features.tr"), \
        torch.load("../calculated_features/labels.tr")
    permutation = torch.randperm(labels.shape[0])
    features, labels = features[permutation], labels[permutation]
    train_test_split_id = int(labels.shape[0] * 0.75)
    train_features, train_labels = features[:train_test_split_id], labels[:train_test_split_id]
    test_features, test_labels = features[train_test_split_id:], labels[train_test_split_id:]
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=BATCH_SIZE, shuffle=True, num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=BATCH_SIZE, shuffle=False, num_workers=2
    )
    model = Model(features.shape[1])
    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    for epoch in range(EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % 10 == 9:  # print every 10 mini-batches
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
                running_loss = 0.0
    print('Finished Training')
    correct = 0
    total = 0
    labels_correct = torch.zeros(NUM_OF_AUTHORS)
    with torch.no_grad():
        for data in testloader:
            features, labels = data
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_correct[label] += predicted[i] == labels[i]
    print('Accuracy of the network: %d / %d = %d %%' % (
        correct, total, 100 * correct / total))
    print(labels_correct)
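
# The training routines in this module wrap their tensors in a PsobDataset,
# which is defined elsewhere in the repository. The sketch below is only a
# minimal, plausible implementation inferred from the call sites (features,
# labels, and an optional `metrics` list selecting feature columns); the real
# class may differ.
class PsobDataset(torch.utils.data.Dataset):
    def __init__(self, features, labels, metrics=None):
        # Keep only the requested metric columns when a subset is given.
        self.features = features if metrics is None else features[:, metrics]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Return one (feature vector, author label) pair for the DataLoader.
        return self.features[index], self.labels[index]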
def get_test_accuracy_by_epoch() -> Tuple[List[int], List[float], List[int]]:
    logger = logging.getLogger('early_stopping')
    configure_logger_by_default(logger)
    logger.info("START get_test_accuracy_by_epoch")
    # Single train/test split taken from the configured cross-validator.
    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES, INPUT_LABELS))
    model = Model(INPUT_FEATURES.shape[1])
    criterion = CONFIG['criterion']()
    optimizer = CONFIG['optimizer'](model.parameters(), lr=CONFIG['lr'])
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    # Standardize features using statistics of the training fold only.
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    accuracies = []
    best_accuracy = -1
    durations = []
    current_duration = 0
    for epoch in tqdm(range(CONFIG['epochs'])):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluate on the held-out fold after every epoch.
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        # Track how many consecutive epochs passed without improvement.
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            if current_duration != 0:
                durations.append(current_duration)
            current_duration = 0
        best_accuracy = max(best_accuracy, accuracy)
        accuracies.append(accuracy)
        logger.info(str(epoch) + ": " + str(accuracy))
    if current_duration != 0:
        durations.append(current_duration)
    logger.info("END get_test_accuracy_by_epoch")
    return [i for i in range(CONFIG['epochs'])], accuracies, durations
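
# `Model` is defined elsewhere in the repository. The sketch below is only one
# plausible shape, assumed here to be a small fully connected classifier; the
# hidden size is an arbitrary placeholder. Because run_train pairs the model
# with nn.NLLLoss, this sketch emits log-probabilities; the functions that use
# nn.CrossEntropyLoss would instead expect raw logits.
class Model(nn.Module):
    def __init__(self, input_dim, number_of_authors=NUM_OF_AUTHORS, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, number_of_authors),
            nn.LogSoftmax(dim=1)  # log-probabilities over authors
        )

    def forward(self, x):
        return self.net(x)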
def get_accuracies_for_lr() -> Dict[float, float]:
    logger = logging.getLogger('learning_rate')
    configure_logger_by_default(logger)
    logger.info("START get_accuracies_for_lr")
    accuracies_by_lr = defaultdict(lambda: -1.0)
    for lr in CONFIG['params']['lr']:
        logger.info("lr = " + str(lr))
        skf = CONFIG['cv']
        train_index, test_index = next(skf.split(INPUT_FEATURES, INPUT_LABELS))
        model = Model(INPUT_FEATURES.shape[1])
        criterion = CONFIG['criterion']()
        optimizer = CONFIG['optimizer'](model.parameters(), lr=lr, momentum=CONFIG['momentum'])
        train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
        test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels),
            batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels),
            batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
        )
        current_duration = 0
        for epoch in tqdm(range(CONFIG['epochs'])):
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            # Evaluate on the held-out fold after every epoch.
            correct = 0
            total = 0
            with torch.no_grad():
                for data in testloader:
                    features, labels = data
                    outputs = model(features)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            accuracy = correct / total
            # Early stopping: count epochs without improvement for this lr.
            if accuracies_by_lr[lr] >= accuracy:
                current_duration += 1
            else:
                current_duration = 0
            accuracies_by_lr[lr] = max(accuracies_by_lr[lr], accuracy)
            if current_duration > CONFIG['early_stopping_rounds']:
                break
            if epoch % 10 == 0:
                logger.info("CHECKPOINT EACH 10th EPOCH " + str(epoch) + ": " + str(accuracy))
            if epoch % 100 == 0:
                logger.info("CHECKPOINT EACH 100th EPOCH " + str(epoch) + ": " + str(accuracy))
            logger.info(str(epoch) + ": " + str(accuracy))
    logger.info("END get_accuracies_for_lr")
    return accuracies_by_lr
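
# Hypothetical usage sketch (not part of the original module): pick the learning
# rate with the highest held-out accuracy from the dict returned above.
def print_best_lr():
    accuracies_by_lr = get_accuracies_for_lr()
    best_lr = max(accuracies_by_lr, key=accuracies_by_lr.get)
    print("best lr:", best_lr, "accuracy:", accuracies_by_lr[best_lr])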
def get_best_metrics_and_accuracy_from_metrics_set(metrics_sets) -> Tuple[List[int], float]:
    logger = logging.getLogger('finding_best_metrics_and_accuracy')
    configure_logger_by_default(logger)
    logger.info("STARTED FINDING BEST METRICS SET")
    loaded_features, loaded_labels = \
        torch.load("../calculated_features/split_each_file_features.tr"), \
        torch.load("../calculated_features/split_each_file_labels.tr")
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    train_index, test_index = next(skf.split(loaded_features, loaded_labels))
    best_metrics = None
    best_accuracy = -1
    # Train and evaluate one model per candidate subset of metrics.
    for metrics in tqdm(metrics_sets):
        metrics = list(metrics)
        if len(metrics) == 0:
            continue
        model = Model(len(metrics))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        train_features, train_labels = loaded_features[train_index], loaded_labels[train_index]
        test_features, test_labels = loaded_features[test_index], loaded_labels[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=True, num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=2
        )
        for _ in range(EPOCHS):
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        log_info = str(metrics) + ": " + str(accuracy)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_metrics = metrics
            log_info += " NEW BEST"
        logger.info(log_info)
    logger.info("END FINDING BEST METRICS SET")
    return best_metrics, best_accuracy
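
# configure_logger_by_default is a repo helper not shown in this file. The
# sketch below is only an assumption about what it might do (attach a stream
# handler with a simple format at INFO level); the real implementation may
# write to log files or use a different format.
def configure_logger_by_default(logger):
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)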
def train_bp(model, train_features, train_labels, test_features, test_labels, config):
    print_info = config['pso_options']['print_info']
    criterion = config['criterion']
    optimizer = config['optimizer'](model.parameters(), lr=config['lr'])
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=config['batch_size'], shuffle=config['shuffle'], num_workers=2
    )
    best_accuracy = -1.0
    train_accuracy = -1.0
    train_loss = -1.0
    test_loss = -1.0
    current_duration = 0
    print_evaluation_before_train(model, criterion, train_features, train_labels,
                                  test_features, test_labels, print_info)
    for epoch in range(config['epochs']):
        for inputs, labels in trainloader:
            inputs = inputs.to(config['device'])
            labels = labels.to(config['device'])
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluate test accuracy after every epoch.
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in testloader:
                inputs = inputs.to(config['device'])
                labels = labels.to(config['device'])
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            current_duration = 0
            # Record train accuracy and losses whenever test accuracy improves.
            with torch.no_grad():
                train_correct = 0
                train_total = 0
                for inputs, labels in trainloader:
                    inputs = inputs.to(config['device'])
                    labels = labels.to(config['device'])
                    outputs = model(inputs)
                    _, predicted = torch.max(outputs.data, 1)
                    train_total += labels.size(0)
                    train_correct += (predicted == labels).sum().item()
                train_accuracy = train_correct / train_total
                train_loss = criterion(model(train_features), train_labels).item()
                test_loss = criterion(model(test_features), test_labels).item()
        best_accuracy = max(best_accuracy, accuracy)
        if current_duration > config['early_stopping_rounds']:
            print_info("On epoch " + str(epoch) + " training was early stopped")
            break
        if epoch % 100 == 0:
            with torch.no_grad():
                print_100th_checkpoint_evaluation(epoch, model, criterion,
                                                  train_features, train_labels,
                                                  test_features, test_labels, print_info)
    # Final validation pass with per-author statistics.
    correct = 0
    total = 0
    labels_dist = torch.zeros(config['number_of_authors'])
    labels_correct = torch.zeros(config['number_of_authors'])
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(config['device'])
            labels = labels.to(config['device'])
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_dist[label] += 1
                labels_correct[label] += predicted[i] == labels[i]
    print_info('Finished training')
    best_accuracy = max(best_accuracy, correct / total)
    print_info('Best accuracy: ' + str(best_accuracy))
    print_info('Accuracy of the last validation of the network: %d / %d = %d %%' %
               (correct, total, 100 * correct / total))
    print_info("Correct labels / labels for each author of last validation:\n" +
               str(torch.stack((labels_correct, labels_dist), dim=1)))
    return best_accuracy, train_accuracy, test_loss, train_loss
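
# print_evaluation_before_train and print_100th_checkpoint_evaluation are repo
# helpers not shown here. The sketches below are assumptions inferred only from
# their call sites in train_bp: they report train/test loss and accuracy via
# the supplied print_info callback. The _accuracy helper is hypothetical.
def _accuracy(model, features, labels):
    # Fraction of correctly predicted labels over a full feature tensor.
    _, predicted = torch.max(model(features).data, 1)
    return (predicted == labels).sum().item() / labels.size(0)


def print_evaluation_before_train(model, criterion, train_features, train_labels,
                                  test_features, test_labels, print_info):
    with torch.no_grad():
        print_info("before training: train loss %.4f, test loss %.4f, "
                   "train acc %.4f, test acc %.4f" % (
                       criterion(model(train_features), train_labels).item(),
                       criterion(model(test_features), test_labels).item(),
                       _accuracy(model, train_features, train_labels),
                       _accuracy(model, test_features, test_labels)))


def print_100th_checkpoint_evaluation(epoch, model, criterion, train_features, train_labels,
                                      test_features, test_labels, print_info):
    print_info("epoch %d: train loss %.4f, test loss %.4f, train acc %.4f, test acc %.4f" % (
        epoch,
        criterion(model(train_features), train_labels).item(),
        criterion(model(test_features), test_labels).item(),
        _accuracy(model, train_features, train_labels),
        _accuracy(model, test_features, test_labels)))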
def fit_model(file_to_print):
    logger = logging.getLogger('one_split_fit')
    configure_logger_by_default(logger)
    logger.info("START fit_model")

    def print_info(info):
        # Report to the logger, stdout, and the provided output file.
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES, INPUT_LABELS))
    model = Model(len(CONFIG['metrics']))
    criterion = CONFIG['criterion']()
    optimizer = CONFIG['optimizer'](model.parameters(), lr=CONFIG['lr'], momentum=CONFIG['momentum'])
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    # Standardize features using statistics of the training fold only.
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels, CONFIG['metrics']),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels, CONFIG['metrics']),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    best_accuracy = -1.0
    current_duration = 0
    for epoch in range(CONFIG['epochs']):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        # Evaluate on the held-out fold after every epoch.
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            current_duration = 0
        best_accuracy = max(best_accuracy, accuracy)
        if current_duration > CONFIG['early_stopping_rounds']:
            print_info("On epoch " + str(epoch) + " training was early stopped")
            break
        if epoch % 10 == 0:
            logger.info("CHECKPOINT EACH 10th EPOCH " + str(epoch) + ": " + str(accuracy))
        if epoch % 100 == 0:
            print_info("CHECKPOINT EACH 100th EPOCH " + str(epoch) + ": current accuracy " +
                       str(accuracy) + ", best " + str(best_accuracy))
        logger.info(str(epoch) + ": " + str(accuracy))
    logger.info('Finished Training')
    # Final validation pass with per-author statistics.
    correct = 0
    total = 0
    labels_dist = torch.zeros(CONFIG['number_of_authors'])
    labels_correct = torch.zeros(CONFIG['number_of_authors'])
    with torch.no_grad():
        for data in testloader:
            features, labels = data
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_dist[label] += 1
                labels_correct[label] += predicted[i] == labels[i]
    print_info('Best accuracy: ' + str(max(best_accuracy, correct / total)))
    print_info('Final accuracy of the network: %d / %d = %d %%' %
               (correct, total, 100 * correct / total))
    print_info("Correct labels / labels for each author:\n" +
               str(torch.stack((labels_correct, labels_dist), dim=1)))
    logger.info("END fit_model")
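
# The CONFIG dict is built elsewhere in the repository. The example below only
# illustrates the keys that the functions above actually read; every value is
# an assumption and should be adjusted to the real experiment setup. (train_bp
# receives a separate config that additionally provides 'device' and
# 'pso_options'.)
EXAMPLE_CONFIG = {
    'cv': StratifiedKFold(n_splits=10, shuffle=True),
    'criterion': nn.CrossEntropyLoss,   # stored as a class, instantiated via CONFIG['criterion']()
    'optimizer': optim.SGD,
    'lr': 0.01,
    'momentum': 0.9,
    'batch_size': 32,
    'shuffle': True,
    'epochs': 5000,
    'early_stopping_rounds': 500,
    'metrics': [0, 1, 2, 3, 4, 6, 7, 9],
    'number_of_authors': NUM_OF_AUTHORS,
    'params': {'lr': [0.001, 0.01, 0.1]},
}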
def run_cross_validation():
    k_fold = 10
    loaded_features, loaded_labels = \
        torch.load("../calculated_features/split_each_file_features.tr"), \
        torch.load("../calculated_features/split_each_file_labels.tr")
    skf = RepeatedStratifiedKFold(n_splits=k_fold, n_repeats=10)
    # metrics = [i for i in range(19)]
    metrics = [0, 1, 2, 3, 4, 6, 7, 9]
    accuracies = torch.zeros((10, 10))
    loop = 0
    for train_index, test_index in skf.split(loaded_features, loaded_labels):
        model = Model(len(metrics))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        train_features, train_labels = loaded_features[train_index], loaded_labels[train_index]
        test_features, test_labels = loaded_features[test_index], loaded_labels[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=True, num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=2
        )
        for epoch in range(EPOCHS):
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                # get the inputs
                inputs, labels = data
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # print statistics
                running_loss += loss.item()
                if i % 10 == 9:
                    # print('[%d, %5d] loss: %.3f' %
                    #       (epoch + 1, i + 1, running_loss / 10))
                    running_loss = 0.0
        print('Finished Training')
        correct = 0
        total = 0
        labels_correct = torch.zeros(NUM_OF_AUTHORS)
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                print(predicted)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                for i, label in enumerate(labels):
                    labels_correct[label] += predicted[i] == labels[i]
        print('Accuracy of the network: %d / %d = %d %%' % (
            correct, total, 100 * correct / total))
        print(labels_correct)
        accuracies[loop % 10][int(loop / 10)] = correct / total
        loop += 1
    # Summarize accuracy across folds and repeats.
    print(torch.mean(accuracies, 1))
    print(torch.std(accuracies, 1))
    print(accuracies)
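
# Hypothetical entry point (not in the original module) showing how the
# training routines above might be invoked.
if __name__ == '__main__':
    run_train()
    run_cross_validation()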