def file_output(self, loss_avg, accuracy, epoch_number, time_passed):
    # Append one separator-delimited result row per call.
    file_path = os.path.join(self.results_path, self.RESULTS_FILENAME)
    append_to_file(file_path,
                   str(self.outputs_counter) + self.SEPARATOR +
                   str(epoch_number) + self.SEPARATOR +
                   str(loss_avg) + self.SEPARATOR +
                   str(accuracy) + self.SEPARATOR +
                   str(time_passed) + '\n')
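# append_to_file is used throughout these snippets but defined elsewhere
# (library.helpers.files.files_operations). A minimal sketch of the behavior
# the callers rely on; the real helper may differ:
def append_to_file(path, text):
    # Open in append mode so repeated calls accumulate lines.
    with open(path, 'a') as file:
        file.write(text)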
def add_network_info(self):
    # Dump the hyperparameters of this run next to the results file.
    path = os.path.join(self.results_path, self.NETWORK_INFO_FILENAME)
    append_to_file(path,
                   'num_layers: ' + str(self.num_layers) + '\n' +
                   'hidden_size: ' + str(self.hidden_size) + '\n' +
                   'batch_size: ' + str(self.batch_size) + '\n' +
                   'timesteps: ' + str(self.timesteps) + '\n' +
                   'learning_rate: ' + str(self.learning_rate) + '\n' +
                   'num_epochs: ' + str(self.num_epochs) + '\n' +
                   'vocab_size: ' + str(self.vocab_size) + '\n' +
                   'authors_size: ' + str(self.authors_size) + '\n')
def get_average_cross_entropies(self):
    # Collect the average cross entropy on the training data for every author.
    average_cross_entropies_batch_processor = OptimizedBatchProcessor(
        tensors_dir=self.training_tensors_path,
        batch_size=self.batch_size,
        authors_size=self.authors_size,
        timesteps=self.timesteps,
        language=self.language,
        vocab_size=self.vocab_size)
    average_cross_entropies_batch_processor.new_epoch()
    states = (torch.zeros(self.num_layers, self.batch_size, self.hidden_size),
              torch.zeros(self.num_layers, self.batch_size, self.hidden_size))
    authors_with_average_loss = self.initialize_average_training_loss_struct()
    append_to_file('output.txt', 'after init\n')
    while average_cross_entropies_batch_processor.next_batch():
        batches, target, authors_order = average_cross_entropies_batch_processor.get_results()
        batches = batches.type(torch.FloatTensor)
        outputs, _ = self.model(batches, states)
        for head in range(self.authors_size):
            softmax = self.softmax(outputs[head])
            vector = self.loss(softmax, target)
            # Accumulate each sample's loss under the author it belongs to.
            for counter, author in enumerate(authors_order):
                authors_with_average_loss[author - 1]['sum'] += vector[counter].item()
                authors_with_average_loss[author - 1]['counter'] += 1
    append_to_file('output.txt', 'after while\n')
    # Turn the accumulated sums into averages.
    for author in authors_with_average_loss:
        author['sum'] /= author['counter']
    return authors_with_average_loss
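# initialize_average_training_loss_struct is called above but not shown in
# this file. A minimal sketch of what it plausibly looks like, inferred from
# the accesses above (one {'sum', 'counter'} accumulator per author, indexed
# by author - 1); treat the exact shape as an assumption:
def initialize_average_training_loss_struct(self):
    return [{'sum': 0.0, 'counter': 0} for _ in range(self.authors_size)]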
import os
import re

from library.helpers.files.files_operations import (check_if_file,
                                                    TextFileLoader,
                                                    append_to_file)
from library.helpers.files.name_convention import (check_name_convention,
                                                   TEXT_NAME_CONVENTIONS,
                                                   KNOWN_AUTHOR)

# For every author, sum the size of their known texts and collect the first
# 100 characters of each, then write the authors out sorted by corpus size.
output = []
path = '../data/old/english/authors'
output_file_path = 'en_train.txt'

for author in os.listdir(path):
    directory_path = os.path.join(path, author)
    sum_known = 0
    first_chars = ''
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if check_if_file(file_path) and check_name_convention(filename, TEXT_NAME_CONVENTIONS):
            text_file_loader = TextFileLoader(file_path)
            if re.match(KNOWN_AUTHOR, filename):
                first_chars = first_chars + text_file_loader.text[:100]
                sum_known += len(text_file_loader.text)
    output.append((author, sum_known,
                   first_chars.replace("\n", "").replace(" ", "").replace("\t", "")))

# Largest corpora first.
output = sorted(output, key=lambda tup: tup[1], reverse=True)
for tup in output:
    append_to_file(output_file_path, str(tup[0]) + " " + str(tup[1]) + " " + str(tup[2]) + "\n")
def train(self):
    append_to_file('output.txt', '\nstart\n')
    self.time_start = time.time()
    counter = 0
    while True:
        batch_processor = BatchProcessor(tensors_dir=self.training_tensors_path,
                                         batch_size=self.batch_size,
                                         authors_size=self.authors_size,
                                         timesteps=self.timesteps,
                                         language=self.language,
                                         vocab_size=self.vocab_size)
        states = (torch.zeros(self.num_layers, self.batch_size, self.hidden_size),
                  torch.zeros(self.num_layers, self.batch_size, self.hidden_size))
        counter += 1
        batch_processor.new_epoch()
        while batch_processor.next_batch():
            batches, target, authors_order = batch_processor.get_results()
            batches = batches.type(torch.FloatTensor)
            outputs, _ = self.model(batches, states)
            # Only the heads whose author appears in this batch get trained.
            heads_to_train = self.get_heads_for_training(authors_order)
            loss = 0
            for head in heads_to_train:
                # Mask selects the batch positions belonging to this head's author.
                mask = (torch.tensor(authors_order) == head + 1).float()
                softmax = self.softmax(outputs[head])
                # Per-sample loss vector of the same length as outputs[head].
                vector = self.loss(softmax, target)
                # Zero out the samples that belong to other authors ...
                vector = vector * mask
                # ... and add this head's average loss to the total.
                loss += torch.sum(vector) / torch.sum(mask)
            self.model.zero_grad()
            loss.backward()
            clip_grad_norm_(self.model.parameters(), 0.5)
            self.optimizer.step()
        self.get_accuracy(i=counter)
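# get_heads_for_training is called above but not defined here. A minimal
# sketch, assuming authors are numbered from 1 and heads from 0 (which the
# mask comparison `authors_order == head + 1` above implies):
def get_heads_for_training(self, authors_order):
    # Unique zero-based head indices for the authors present in this batch.
    return sorted({author - 1 for author in authors_order})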
def get_accuracy(self, i):
    append_to_file('output.txt', 'get accuracy \n')
    batch_processor = BatchProcessor(tensors_dir=self.testing_tensors_path,
                                     batch_size=self.batch_size,
                                     authors_size=self.authors_size,
                                     timesteps=self.timesteps,
                                     language=self.language,
                                     vocab_size=self.vocab_size)
    batch_processor.new_epoch()
    states = (torch.zeros(self.num_layers, self.batch_size, self.hidden_size),
              torch.zeros(self.num_layers, self.batch_size, self.hidden_size))
    testing_data_losses = self.initialize_testing_loss_struct()
    # Average loss collected using training data.
    average_cross_entropies = self.get_average_cross_entropies()
    append_to_file('output.txt', 'average_cross_entropies\n')
    append_to_file('output.txt', str(average_cross_entropies) + '\n\n\n')
    while batch_processor.next_batch():
        # From here on we use evaluation data.
        batches, target, authors_order = batch_processor.get_results()
        batches = batches.type(torch.FloatTensor)
        outputs, _ = self.model(batches, states)
        # Iterate through all heads.
        for head in range(self.authors_size):
            # Cross-entropy vector holding the loss for each unknown author
            # in the batch (for this iteration).
            softmax = self.softmax(outputs[head])
            entropies_vector = self.loss(softmax, target)
            # Collect the losses separately for each unknown author under
            # the head we are currently at.
            for counter, author in enumerate(authors_order):
                testing_data_losses[head][author]['sum'] += entropies_vector[counter].item()
                testing_data_losses[head][author]['counter'] += 1
    # Average the loss for each unknown author in each head, then min-max
    # normalize the averages so they are comparable across heads.
    max_ = float('-inf')
    min_ = float('inf')
    append_to_file('output.txt', 'min max\n')
    for head in range(self.authors_size):
        for author in range(self.authors_size):
            average = (testing_data_losses[head][author + 1]['sum'] /
                       testing_data_losses[head][author + 1]['counter'])
            # Alternative: subtract the training-data average collected earlier:
            # testing_data_losses[head][author + 1]['sum'] = average - average_cross_entropies[head]['sum']
            testing_data_losses[head][author + 1]['sum'] = average
            if testing_data_losses[head][author + 1]['sum'] < min_:
                min_ = testing_data_losses[head][author + 1]['sum']
            if testing_data_losses[head][author + 1]['sum'] > max_:
                max_ = testing_data_losses[head][author + 1]['sum']
    diff = max_ - min_
    for head in range(self.authors_size):
        for author in range(self.authors_size):
            testing_data_losses[head][author + 1]['sum'] = (
                testing_data_losses[head][author + 1]['sum'] - min_) / diff
    # For every unknown author, pick the head with the smallest normalized loss.
    results = []
    for author in range(self.authors_size):
        min_value = float('inf')
        min_head = -1
        for head in range(self.authors_size):
            if testing_data_losses[head][author + 1]['sum'] < min_value:
                min_head = head
                min_value = testing_data_losses[head][author + 1]['sum']
        results.append({'head': min_head,
                        'unknown_author_number': author + 1,
                        'loss_diff': min_value})
    append_to_file('output.txt', str(i) + '\n')
    append_to_file('output.txt', str(results))
    # A prediction is correct when the chosen head matches the author number.
    count = 0
    for elem in results:
        if elem['head'] + 1 == elem['unknown_author_number']:
            count += 1
    append_to_file('output.txt', '\n\nhits:' + str(count))
    # 79 is the hard-coded number of unknown authors in the test set.
    append_to_file('output.txt', '\n\naccuracy:' + str(count / 79))
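# initialize_testing_loss_struct is likewise not shown in this file. A minimal
# sketch inferred from the accesses above: per head, a dict keyed by 1-based
# author number with a {'sum', 'counter'} accumulator. Treat the exact shape
# as an assumption:
def initialize_testing_loss_struct(self):
    return [{author + 1: {'sum': 0.0, 'counter': 0}
             for author in range(self.authors_size)}
            for _ in range(self.authors_size)]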
test_path = 'test'
train_path = 'train'
create_directory(test_path)
create_directory(train_path)

for author in os.listdir(path):
    directory_path = os.path.join(path, author)
    create_directory(os.path.join(test_path, author))
    create_directory(os.path.join(train_path, author))
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if check_if_file(file_path) and check_name_convention(filename, TEXT_NAME_CONVENTIONS):
            text_file_loader = TextFileLoader(file_path)
            text = text_file_loader.text
            if re.match(KNOWN_AUTHOR, filename):
                # Split each known text in half: the first half goes to the
                # training set, the second half to the test set.
                length = len(text)
                middle = length / 2
                test_save_path = os.path.join(test_path, author)
                train_save_path = os.path.join(train_path, author)
                create_file('known01.txt', test_save_path)
                create_file('unknown.txt', test_save_path)
                create_file('known01.txt', train_save_path)
                create_file('unknown.txt', train_save_path)
                append_to_file(os.path.join(test_save_path, 'known01.txt'), text[int(middle):])
                append_to_file(os.path.join(train_save_path, 'known01.txt'), text[:int(middle)])
                append_to_file(os.path.join(test_save_path, 'unknown.txt'), text[int(middle):])
                append_to_file(os.path.join(train_save_path, 'unknown.txt'), text[:int(middle)])
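# create_directory and create_file come from the project's file helpers and
# are not shown here. A minimal sketch of the behavior the script above
# relies on (idempotent directory creation, fresh empty files); the real
# helpers may well differ, e.g. in how they treat existing files:
import os

def create_directory(path):
    # Create the directory if it does not exist yet.
    os.makedirs(path, exist_ok=True)

def create_file(filename, directory):
    # Create an empty file in directory, truncating any previous content so
    # the append calls that follow start from a clean file.
    open(os.path.join(directory, filename), 'w').close()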
def add_results_headline(self):
    path = os.path.join(self.results_path, self.RESULTS_FILENAME)
    append_to_file(path, self.HEADLINE)