def create_model(self):
    """Build the attention-module model and, when resuming, restore its weights.

    :return: the constructed (and possibly restored) model
    """
    self.build_model_with_am()
    P.write_to_log(self.model)
    if self.execute_from_model:
        # Resuming an interrupted run: load the persisted parameters.
        self.model.load_state_dict(self.model_state_dict)
        P.write_to_log("recovery model:", self.model,
                       "current epoch = {}".format(self.current_epoch))
    return self.model
def __init__(self, input_path: str = None, target_path: str = None,
             cache_input_path: str = None, cache_target_path: str = None,
             image_suffixes: list = None):
    """Remember dataset locations and ensure cache directories exist.

    :param input_path: directory containing the input images
    :param target_path: directory containing the target images
    :param cache_input_path: cache dir for inputs; defaults to <input_path>/cached
    :param cache_target_path: cache dir for targets; defaults to <target_path>/cached
    :param image_suffixes: accepted file suffixes; defaults to
        ['.jpg', '.png', P.cached_extension]
    """
    # BUGFIX: the suffix list was a mutable default argument, so one shared
    # list instance leaked across every construction; default via None instead.
    if image_suffixes is None:
        image_suffixes = ['.jpg', '.png', P.cached_extension]
    self.input_path = input_path
    self.target_path = target_path
    self.image_suffixes = image_suffixes
    # create cached path if not exists
    if cache_input_path is None:
        cache_input_path = os.path.join(input_path, "cached")
        os.makedirs(cache_input_path, exist_ok=True)
        print("created cache input dir: {}".format(cache_input_path))
        P.write_to_log("created cache input dir: {}".format(cache_input_path))
    if cache_target_path is None:
        cache_target_path = os.path.join(target_path, "cached")
        os.makedirs(cache_target_path, exist_ok=True)
        print("created cache target dir: {}".format(cache_target_path))
        P.write_to_log("created cache target dir: {}".format(cache_target_path))
    self.cache_input_path = cache_input_path
    self.cache_target_path = cache_target_path
    # lazily populated by the loading helpers
    self.data = None
def test(self, model, test_set, l_loss, m_loss):
    """Evaluate `model` on `test_set`; returns (mean classification loss, mean accuracy).

    Switches the model to eval mode for the pass and back to train mode before
    returning. Per-batch metrics are accumulated into the strategy's test
    answer buffers via save_test_data().

    NOTE(review): the `m_loss` parameter is accepted but the body uses
    `self.m_loss` instead — looks unintentional; callers pass `self.m_loss`
    anyway, so behavior matches. Confirm before relying on the parameter.
    """
    model.train(mode=False)
    loss_classification_sum = 0
    loss_segmentation_sum = 0
    accuracy_classification_sum = 0
    batch_count = 0
    for images, segments, labels in test_set:
        # Restrict labels/segments to the configured class window.
        labels, segments = model_utils.reduce_to_class_number(self.left_class_number, self.right_class_number, labels, segments)
        images, labels, segments = self.convert_data_and_label(images, labels, segments)
        # One pulled (downsampled) segment tensor per attention puller.
        segments_list = []
        for puller in self.puller:
            segments_list.append(puller(segments))
        model_classification, model_segmentation = model_utils.wait_while_can_execute(model, images)
        classification_loss = l_loss(model_classification, labels)
        if self.use_mloss:
            # Sum the segmentation loss over every attention-module output.
            sum_segm_loss = None
            for ms, sl in zip(model_segmentation, segments_list):
                segmentation_loss = self.m_loss(ms, sl)
                if sum_segm_loss is None:
                    sum_segm_loss = segmentation_loss
                else:
                    sum_segm_loss += segmentation_loss
        output_probability, output_cl, cl_acc = self.calculate_accuracy(labels, model_classification, labels.size(0))
        self.save_test_data(labels, output_cl, output_probability)
        # accumulate information
        accuracy_classification_sum += model_utils.scalar(cl_acc.sum())
        loss_classification_sum += model_utils.scalar(classification_loss.sum())
        if self.use_mloss:
            loss_segmentation_sum += model_utils.scalar(sum_segm_loss.sum())
        batch_count += 1
        # self.de_convert_data_and_label(images, labels)
        # torch.cuda.empty_cache()
    f_1_score_text, recall_score_text, precision_score_text = metrics_processor.calculate_metric(self.classes, self.test_trust_answers, self.test_model_answers)
    # EPS guards against division by zero when the test set is empty.
    loss_classification_sum /= batch_count + p.EPS
    accuracy_classification_sum /= batch_count + p.EPS
    loss_segmentation_sum /= batch_count + p.EPS
    text = 'TEST={} Loss_CL={:.5f} Loss_M={:.5f} Accuracy_CL={:.5f} {} {} {} '.format(self.current_epoch, loss_classification_sum, loss_segmentation_sum, accuracy_classification_sum, f_1_score_text, recall_score_text, precision_score_text)
    p.write_to_log(text)
    model.train(mode=True)
    return loss_classification_sum, accuracy_classification_sum
def save_model(self, weights):
    """Intentional no-op: this strategy does not persist weights itself.

    An earlier implementation wrote ``weights`` to
    ``p.base_data_dir/model_weights`` under a timestamped file name; it was
    disabled by wrapping it in a string literal. The dead code has been
    removed — only the log marker remains so runs still record the decision.

    :param weights: model state dict (ignored)
    """
    p.write_to_log("not needed save weights here")
def initialize_logs(self):
    """Open the run's log file and record the run configuration header."""
    P.initialize_log_name(self.run_name, self.algorithm_name,
                          self.description, self.model_identifier)
    # Header lines, written in a fixed order.
    for line in ("description={}".format(self.description),
                 "classes={}".format(self.classes),
                 "run=" + self.run_name,
                 "algorithm_name=" + self.algorithm_name):
        P.write_to_log(line)
def test(self, model, test_set, l_loss, m_loss=None):
    """Evaluate a plain classifier on `test_set`.

    :param model: the network to evaluate (logits output)
    :param test_set: iterable of (images, segments, labels) batches
    :param l_loss: classification loss applied to sigmoid probabilities
    :param m_loss: unused; kept for signature compatibility with the
        segmented variant of test()
    :return: (mean classification loss, mean classification accuracy)
    """
    loss_classification_sum = 0
    accuracy_classification_sum = 0
    batch_count = 0
    # BUGFIX: eval mode was toggled on self.model while the forward pass used
    # the `model` argument; toggle the model actually being evaluated.
    model.train(mode=False)
    # Hoisted out of the batch loop; maps logits to probabilities.
    sigmoid = nn.Sigmoid()
    for images, segments, labels in test_set:
        labels, segments = model_utils.reduce_to_class_number(self.left_class_number, self.right_class_number, labels, segments)
        images, labels, segments = self.convert_data_and_label(images, labels, segments)
        model_classification = model_utils.wait_while_can_execute_single(model, images)
        model_classification = sigmoid(model_classification)
        classification_loss = l_loss(model_classification, labels)
        output_probability, output_cl, cl_acc = self.calculate_accuracy(labels, model_classification, labels.size(0))
        self.save_test_data(labels, output_cl, output_probability)
        # accumulate information
        accuracy_classification_sum += model_utils.scalar(cl_acc.sum())
        loss_classification_sum += model_utils.scalar(classification_loss.sum())
        batch_count += 1
        # self.de_convert_data_and_label(images, labels)
        # torch.cuda.empty_cache()
    f_1_score_text, recall_score_text, precision_score_text = metrics_processor.calculate_metric(self.classes, self.test_trust_answers, self.test_model_answers)
    # EPS guards against an empty test set.
    loss_classification_sum /= batch_count + p.EPS
    accuracy_classification_sum /= batch_count + p.EPS
    text = 'TEST={} Loss_CL={:.5f} Accuracy_CL={:.5f} {} {} {} '.format(self.current_epoch, loss_classification_sum, accuracy_classification_sum, f_1_score_text, recall_score_text, precision_score_text)
    p.write_to_log(text)
    # Restore training mode, consistent with the segmented test() variant.
    model.train(mode=True)
    return loss_classification_sum, accuracy_classification_sum
def calculate_metric(classes, trust_answers, model_answer):
    """Calculate f1 score, precision, recall

    :param classes: class count
    :param trust_answers: list of list with trust answers
    :param model_answer: list of list with model answers
    :return: tuple with f1 score, recall score, precision score
    """
    # BUGFIX: this docstring used to sit inside the try-block, where it was a
    # no-op string expression rather than the function's docstring.
    try:
        # 'binary' only makes sense for a single class; otherwise macro-average.
        class_metric = 'binary' if classes == 1 else 'macro'
        class_metric_for_one_class = 'binary'
        f_1_score_text = ""
        for i in range(classes):
            f_1_score_text += "f_1_{}={:.5f} ".format(i, metrics.f1_score(trust_answers[i], model_answer[i], average=class_metric_for_one_class))
        recall_score_text = ""
        for i in range(classes):
            recall_score_text += "recall_{}={:.5f} ".format(i, metrics.recall_score(trust_answers[i], model_answer[i], average=class_metric_for_one_class))
        precision_score_text = ""
        for i in range(classes):
            precision_score_text += "precision_{}={:.5f} ".format(i, metrics.precision_score(trust_answers[i], model_answer[i], average=class_metric_for_one_class))
        # Flatten the per-class answer lists into one global pair for the
        # aggregate scores.
        trust_answer_1, model_answer_1 = __to_global(trust_answers, model_answer, classes)
        # assert trust_answer_1 == trust_answers[0]
        f_1_score_text += "f_1_global={:.5f}".format(
            metrics.f1_score(trust_answer_1, model_answer_1, average=class_metric))
        recall_score_text += "recall_global={:.5f}".format(
            metrics.recall_score(trust_answer_1, model_answer_1, average=class_metric))
        precision_score_text += "precision_global={:.5f}".format(
            metrics.precision_score(trust_answer_1, model_answer_1, average=class_metric))
        return f_1_score_text, recall_score_text, precision_score_text
    except ValueError as e:
        P.write_to_log("trust_answers: ", trust_answers)
        P.write_to_log("model_answers: ", model_answer)
        # Also record WHY the metric computation failed, not just the inputs.
        P.write_to_log("metric calculation failed: ", e)
        # NOTE(review): exits with status 0 on an error path; kept as-is
        # because the supervisor treats exit code 0 as a clean stop.
        exit(0)
def infinity_server(q: list):
    """Scheduler main loop: dispatch queued strategies onto free GPUs forever.

    Seeds the global strategy queue with `q`, then repeatedly: re-reads the
    property file, (re)registers commands when it changed, pops one strategy,
    finds a GPU with enough free memory, and launches it on a worker thread.
    Strategies that cannot be placed are pushed back to the queue front.
    All shared state is mutated under `strategy_lock`.
    """
    strategy_queue.extend(q)
    p.initialize_log_name("NO_NUMBER", "NO_ALGORITHM", "FOR_EXEC_PURPOSE")
    global actual_property_index, alive_process
    actual_property_context = None
    while True:
        property_context, actual_property_index = pp.process_property_file(
            PROPERTY_FILE, actual_property_index)
        strategy_lock.acquire()
        try:
            if property_context != actual_property_context:
                # update: configuration changed on disk — re-register commands.
                actual_property_context = property_context
                register_commands(actual_property_context)
                p.write_to_log("=" * 20)
                print_status_info()
                p.write_to_log("=" * 20)
            # NOTE(review): `continue` skips the time.sleep() at the bottom
            # (the finally still releases the lock), so an empty queue spins
            # without delay — confirm this busy-wait is intended.
            if len(strategy_queue) == 0:
                continue
            strategy_name, strategy_memory, strategy_arguments = strategy_queue.popleft(
            )
            gpu = ru.found_gpu(nsmi.NVLog(), int(strategy_memory),
                               actual_property_context.banned_gpu,
                               actual_property_context.max_thread_on_gpu)
            # No GPU with enough free memory: requeue at the front and retry.
            if gpu == -1:
                strategy_queue.appendleft(
                    (strategy_name, strategy_memory, strategy_arguments))
                continue
            # Global concurrency cap reached: requeue and retry later.
            if alive_process >= actual_property_context.max_alive_threads:
                strategy_queue.appendleft(
                    (strategy_name, strategy_memory, strategy_arguments))
                continue
            thread = Thread(target=start_strategy,
                            args=(strategy_name, strategy_memory, gpu,
                                  strategy_arguments))
            thread.start()
            thread_list.append(thread)
            mapper_list.append(
                (strategy_name, strategy_memory, gpu, strategy_arguments))
            alive_process += 1
            p.write_to_log("-" * 20)
            print_status_info()
            p.write_to_log("-" * 20)
        finally:
            strategy_lock.release()
        time.sleep(SLEEP_SECONDS)
def wait_while_can_execute_single(model, images):
    """
    Execute the same images on model, while execute won't fail with error,
    if executed limit reached then break
    :param model: model with attention module
    :param images: passed to model images
    :return: tuple with result of model(images)
    """
    result = None
    attempts = 0
    succeeded = False
    # Retry the forward pass until it succeeds or the attempt limit is hit;
    # returns None if every attempt raised.
    while attempts != p.TRY_CALCULATE_MODEL and not succeeded:
        attempts += 1
        try:
            result = model(images)
            succeeded = True
            #torch.cuda.empty_cache()
        except RuntimeError as e:
            # Give the GPU a moment to free memory before retrying.
            time.sleep(5)
            p.write_to_log("Can't execute model, CUDA out of memory", e)
    return result
def start_strategy(executor_name: str, memory_usage: int, gpu: int,
                   algorithms_params: dict):
    """Run one executor as a subprocess and handle its completion.

    Builds the command line from `algorithms_params`, blocks on os.system(),
    and on a non-zero exit requeues the strategy (front of queue) with
    `--execute_from_model True` so it resumes from its last saved state.
    Decrements the global `alive_process` counter under `strategy_lock`.
    """
    global alive_process
    executor_name = os.path.join(DIPLOMA_DIR, "executors", executor_name)
    args = [PYTHON_EXECUTOR_NAME, executor_name, "--gpu", str(gpu)]
    for k, v in algorithms_params.items():
        args.append(k)
        args.append(str(v))
    # NOTE(review): command is a shell string passed to os.system(); safe only
    # because all parts come from trusted config, not external input.
    cmd = " ".join(args)
    current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
    p.write_to_log("time = {} BEGIN execute: {}".format(current_time, cmd))
    status = os.system(cmd)
    current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
    p.write_to_log("time = {} END execute: {}, status = {}".format(
        current_time, cmd, status))
    strategy_lock.acquire()
    try:
        if status != 0:
            p.write_to_log("Failed algorithm execution: {}, status={}".format(
                cmd, status))
            # Requeue a copy of the params with resume-from-checkpoint set.
            copy_alg_params = {}
            for k, v in algorithms_params.items():
                copy_alg_params[k] = v
            copy_alg_params['--execute_from_model'] = 'True'
            strategy_queue.appendleft(
                (executor_name, memory_usage, copy_alg_params))
        # The subprocess ended either way: free one concurrency slot.
        alive_process -= 1
    finally:
        strategy_lock.release()
def safe_train(self):
    """Run the training strategy, saving the model if anything goes wrong.

    A clean finish is signalled via SystemExit(0). Any other exception is
    logged with its traceback, the current model is persisted through
    P.save_raised_model, and the process exits with status 1.
    """
    try:
        self.train_strategy()
        exit(0)
    except BaseException as e:
        # BUGFIX: previously tested e.args[0], which raises IndexError when
        # SystemExit is raised without arguments; SystemExit.code is the
        # canonical exit status (None for a bare exit()).
        if isinstance(e, SystemExit) and e.code == 0:
            P.write_to_log("Exit with 0")
            return
        print("EXCEPTION", e)
        print(type(e))
        P.write_to_log("EXCEPTION", e, type(e))
        P.write_to_log(traceback.extract_tb(e.__traceback__))
        # Persist the partially trained model so the run can be resumed.
        P.save_raised_model(self.model, self.strategy.current_epoch,
                            self.model_identifier, self.run_name,
                            self.algorithm_name)
        P.write_to_log("saved model, exception raised")
        exit(1)
def load_balanced_dataset(train_size: int, seed: int, image_size: int):
    """Load the dataset and rebalance the training split.

    :param train_size: number of training samples to load
    :param seed: shuffle seed for a reproducible split
    :param image_size: image side length passed to the loader
    :return: (balanced train set, test set, per-class train counts)
    """
    train_set, test_set, train_count = il.load_data(train_size, seed, image_size)
    # Only the training split is rebalanced; the test split stays untouched.
    train_set = balance_dataset(train_set, train_size, train_size // 2)
    # test_set = balance_dataset(test_set, len(test_set), len(test_set) // 2)
    P.write_to_log("========")
    P.write_to_log("balanced TRAIN size: ", calculate_stat(train_set),
                   " full size: ", len(train_set))
    P.write_to_log("balanced TEST size: ", calculate_stat(test_set),
                   " full size: ", len(test_set))
    return train_set, test_set, train_count
def save_images_to_tensors(self):
    """Serialize every merged sample to the cached tensor directories.

    For each sample, saves one tensor per label attribute into the target
    cache and the input tensor into the input cache, logging progress.
    """
    # print(self.input_path, self.target_path)
    data = self.__merge_data(self.input_path, self.target_path)
    data_len = len(data)
    # BUGFIX: progress used the 0-based index ("save: 0 of N");
    # enumerate(..., start=1) reports a human-readable 1-based count.
    for idx, dct in enumerate(data, start=1):
        for item in P.labels_attributes:
            self.__save_torch(dct, item, self.cache_target_path)
        self.__save_torch(dct, P.input_attribute, self.cache_input_path)
        print("=" * 10)
        print("save: {} of {} elements".format(idx, data_len))
        P.write_to_log("=" * 10)
        P.write_to_log("save: {} of {} elements".format(idx, data_len))
    print("all saved successfully")
    P.write_to_log("all saved successfully")
def load_data(train_size: int, seed: int, image_size: int):
    """Load cached tensors, shuffle deterministically, and split train/test.

    :param train_size: size of the training slice (test gets the remainder)
    :param seed: seed for the deterministic shuffle
    :param image_size: image side length passed to the tensor loader
    :return: (train set, test set, per-class train counts)
    """
    loader = DatasetLoader.initial()
    # Load twice the training size so the remainder forms the test slice.
    samples = prepare_data(
        loader.load_tensors(0, train_size * 2, 10**20, image_size))
    # all_data = prepare_data(loader.load_tensors(None, None))
    P.write_to_log("set size: {}, set by classes: {}".format(
        len(samples), count_size(samples)))
    # Seeded Random instance keeps the split reproducible across runs.
    random.Random(seed).shuffle(samples)
    train_set, test_set = samples[:train_size], samples[train_size:]
    P.write_to_log("TEST set size: {}, test set by classes: {}".format(
        len(test_set), count_size(test_set)))
    train_count = count_size(train_set)
    P.write_to_log("TRAIN set size: {}, train set by classes: {}".format(
        len(train_set), train_count))
    return train_set, test_set, train_count
def train(self):
    """Alternating training loop: attention-module pre-training, then classifier.

    Two Adam optimizers address disjoint parameter groups (registered via
    gr/rgr depending on the backbone). Epochs up to `pre_train_epochs` train
    the attention module; later epochs train the classifier. Tracks the best
    train-loss and best test-loss weight snapshots and saves both at the end.
    """
    if self.is_vgg_model:
        classifier_optimizer = torch.optim.Adam(gr.register_weights("classifier", self.am_model), lr=self.classifier_learning_rate)
        attention_module_optimizer = torch.optim.Adam(gr.register_weights("attention", self.am_model), lr=self.attention_module_learning_rate)
    else:
        classifier_optimizer = torch.optim.Adam(rgr.register_weights("classifier", self.am_model), lr=self.classifier_learning_rate)
        attention_module_optimizer = torch.optim.Adam(rgr.register_weights("attention", self.am_model), lr=self.attention_module_learning_rate)
    self.best_weights = copy.deepcopy(self.am_model.state_dict())
    best_loss = None
    best_test_loss = None
    while self.current_epoch <= self.train_epochs:
        accuracy_classification_sum_classifier = 0
        accuracy_classification_sum_segments = 0
        loss_l1_sum = 0
        # classifier_optimizer = self.apply_adaptive_learning(classifier_optimizer, learning_rate,
        #                                                     self.current_epoch)
        if self.current_epoch <= self.pre_train_epochs:
            # Pre-training phase: optimize only the attention module.
            accuracy_classification_sum_segments, loss_m_sum, loss_l1_sum, loss_classification_sum_classifier = \
                self.train_segments(self.am_model, self.l_loss, self.m_loss, attention_module_optimizer, self.train_segments_set)
            attention_module_optimizer.zero_grad()
        else:
            # Main phase: optimize only the classifier head.
            loss_classification_sum_classifier, accuracy_classification_sum_classifier, loss_m_sum = \
                self.train_classifier(self.am_model, self.l_loss, self.m_loss, classifier_optimizer, self.train_segments_set)
            classifier_optimizer.zero_grad()
        # Exactly one of the two phase accuracies is non-zero per epoch.
        accuracy_total = accuracy_classification_sum_segments + accuracy_classification_sum_classifier
        loss_total = loss_classification_sum_classifier + loss_m_sum
        prefix = "PRETRAIN" if self.current_epoch <= self.pre_train_epochs else "TRAIN"
        f_1_score_text, recall_score_text, precision_score_text = metrics_processor.calculate_metric(self.classes, self.train_trust_answers, self.train_model_answers)
        text = "{}={} Loss_CL={:.5f} Loss_M={:.5f} Loss_L1={:.5f} Loss_Total={:.5f} Accuracy_CL={:.5f} " \
               "{} {} {} ".format(prefix, self.current_epoch, loss_classification_sum_classifier, loss_m_sum, loss_l1_sum, loss_total, accuracy_total, f_1_score_text, recall_score_text, precision_score_text)
        p.write_to_log(text)
        if self.current_epoch % self.test_each_epoch == 0:
            test_loss, _ = self.test(self.am_model, self.test_set, self.l_loss, self.m_loss)
            # Keep the weights that achieved the lowest test loss so far.
            if best_test_loss is None or test_loss < best_test_loss:
                best_test_loss = test_loss
                self.best_test_weights = copy.deepcopy(self.am_model.state_dict())
        if self.current_epoch % 200 == 0:
            self.take_snapshot(self.train_segments_set, self.am_model, "TRAIN_{}".format(self.current_epoch))
            self.take_snapshot(self.test_set, self.am_model, "TEST_{}".format(self.current_epoch))
        # Keep the weights that achieved the lowest training loss so far.
        if best_loss is None or loss_total < best_loss:
            best_loss = loss_total
            self.best_weights = copy.deepcopy(self.am_model.state_dict())
        self.clear_temp_metrics()
        self.current_epoch += 1
    self.save_model(self.best_test_weights)
    self.save_model(self.best_weights)
def found_gpu(smi, max_algorithm_memory: int, banned_gpu: int,
              max_thread_on_gpu: int) -> int:
    """Pick the first GPU with enough free memory and a free process slot.

    :param smi: parsed nvidia-smi output (nsmi.NVLog()-style nested dict)
    :param max_algorithm_memory: free MiB the algorithm needs
    :param banned_gpu: minor number of a GPU that must never be used
    :param max_thread_on_gpu: max processes allowed per GPU
    :return: MAPPER-translated GPU id, or -1 when nothing qualifies
    """
    p.write_to_log("list of gpu:")
    p.write_to_log([
        str(idx) + " " + str(smi['Attached GPUs'][gpu]['Minor Number']) + " " +
        smi['Attached GPUs'][gpu]['FB Memory Usage']['Free'].split()[0] + "| "
        for idx, gpu in enumerate(smi['Attached GPUs'])
    ])
    p.write_to_log("Mapper=", MAPPER)
    p.write_to_log("need memory", max_algorithm_memory)
    for idx, k in enumerate(smi['Attached GPUs']):
        gpu = int(smi['Attached GPUs'][k]['Minor Number'])
        # Free memory comes as a "NNNN MiB" string; keep the number only.
        free_memory = int(
            smi['Attached GPUs'][k]['FB Memory Usage']['Free'].split()[0])
        if banned_gpu == gpu:
            current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
            p.write_to_log("time = {}, gpu = {} is banned".format(
                current_time, gpu))
            continue
        # Skip GPUs already running the maximum number of processes.
        if smi['Attached GPUs'][k]['Processes'] is not None and len(
                smi['Attached GPUs'][k]['Processes']) >= max_thread_on_gpu:
            current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
            p.write_to_log(
                "time = {}, gpu = {} has processes = {} but max processes = {}"
                .format(current_time, gpu,
                        len(smi['Attached GPUs'][k]['Processes']),
                        max_thread_on_gpu))
            continue
        if free_memory < max_algorithm_memory:
            current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
            p.write_to_log(
                "time = {}, gpu = {} has free memory = {}, but required = {}".
                format(current_time, gpu, free_memory, max_algorithm_memory))
            continue
        current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
        # MAPPER translates the physical minor number to the scheduler's id.
        p.write_to_log("time = {} found gpu = {}".format(
            current_time, MAPPER[gpu]))
        return MAPPER[gpu]
    current_time = datetime.today().strftime('%Y-%m-%d-_-%H_%M_%S')
    p.write_to_log("time = {} not found gpu".format(current_time))
    return -1
def train(self):
    """Joint training loop: classifier and attention module updated each batch.

    Both losses are back-propagated per batch (classification with
    retain_graph=True so the shared graph survives for the segmentation
    backward), then both optimizers step. Per-epoch averages are logged and a
    test pass / snapshot runs on the configured schedule.

    NOTE(review): the classifier optimizers pass the learning rate
    positionally while the attention optimizers use lr= — same effect for
    torch.optim.Adam (lr is the second parameter), but inconsistent style.
    `loss_m_sum` and `loss_l1_sum` are never accumulated and always log 0.
    """
    if self.is_vgg_model:
        classifier_optimizer = torch.optim.Adam(gr.register_weights("classifier", self.am_model), self.classifier_learning_rate)
        attention_module_optimizer = torch.optim.Adam(gr.register_weights("attention", self.am_model), lr=self.attention_module_learning_rate)
    else:
        classifier_optimizer = torch.optim.Adam(rgr.register_weights("classifier", self.am_model), self.classifier_learning_rate)
        attention_module_optimizer = torch.optim.Adam(rgr.register_weights("attention", self.am_model), lr=self.attention_module_learning_rate)
    while self.current_epoch <= self.train_epochs:
        loss_m_sum = 0
        loss_l1_sum = 0
        loss_classification_sum = 0
        loss_segmentation_sum = 0
        accuracy_sum = 0
        batch_count = 0
        self.am_model.train(mode=True)
        for images, segments, labels in self.train_segments_set:
            labels, segments = model_utils.reduce_to_class_number(self.left_class_number, self.right_class_number, labels, segments)
            images, labels, segments = self.convert_data_and_label(images, labels, segments)
            segments = self.puller(segments)
            # calculate and optimize model
            classifier_optimizer.zero_grad()
            attention_module_optimizer.zero_grad()
            model_classification, model_segmentation = model_utils.wait_while_can_execute(self.am_model, images)
            segmentation_loss = self.m_loss(model_segmentation, segments)
            classification_loss = self.l_loss(model_classification, labels)
            # torch.cuda.empty_cache()
            # retain_graph keeps the shared forward graph alive for the
            # second backward pass below.
            classification_loss.backward(retain_graph=True)
            segmentation_loss.backward()
            classifier_optimizer.step()
            attention_module_optimizer.step()
            output_probability, output_cl, cl_acc = self.calculate_accuracy(labels, model_classification, labels.size(0))
            classifier_optimizer.zero_grad()
            attention_module_optimizer.zero_grad()
            self.save_train_data(labels, output_cl, output_probability)
            # accumulate information
            accuracy_sum += model_utils.scalar(cl_acc.sum())
            loss_classification_sum += model_utils.scalar(classification_loss.sum())
            loss_segmentation_sum += model_utils.scalar(segmentation_loss.sum())
            batch_count += 1
            # self.de_convert_data_and_label(images, segments, labels)
            # torch.cuda.empty_cache()
        # EPS guards against an empty training set.
        loss_classification_sum = loss_classification_sum / (batch_count + p.EPS)
        accuracy_sum = accuracy_sum / (batch_count + p.EPS)
        loss_segmentation_sum = loss_segmentation_sum / (batch_count + p.EPS)
        loss_total = loss_classification_sum + loss_m_sum + loss_segmentation_sum
        prefix = "PRETRAIN" if self.current_epoch <= self.pre_train_epochs else "TRAIN"
        f_1_score_text, recall_score_text, precision_score_text = metrics_processor.calculate_metric(self.classes, self.train_trust_answers, self.train_model_answers)
        text = "{}={} Loss_CL={:.5f} Loss_M={:.5f} Loss_L1={:.5f} Loss_Total={:.5f} Accuracy_CL={:.5f} " \
               "{} {} {} ".format(prefix, self.current_epoch, loss_classification_sum, loss_m_sum, loss_l1_sum, loss_total, accuracy_sum, f_1_score_text, recall_score_text, precision_score_text)
        P.write_to_log(text)
        self.am_model.train(mode=False)
        if self.current_epoch % self.test_each_epoch == 0:
            test_loss, _ = self.test(self.am_model, self.test_set, self.l_loss, self.m_loss)
        if self.current_epoch % 200 == 0:
            self.take_snapshot(self.train_segments_set, self.am_model, "TRAIN_{}".format(self.current_epoch))
            self.take_snapshot(self.test_set, self.am_model, "TEST_{}".format(self.current_epoch))
        self.clear_temp_metrics()
        self.current_epoch += 1
def print_status_info():
    """Dump scheduler state (counters, queue contents, threads) to the log."""
    global strategy_queue, thread_list, mapper_list
    p.write_to_log("actual_property_index={}".format(actual_property_index))
    p.write_to_log("alive_process={}".format(alive_process))
    p.write_to_log("queue:")
    for position, entry in enumerate(strategy_queue):
        p.write_to_log("idx = {}, values={}".format(position, entry))
    p.write_to_log("thread:")
    # Threads and their (strategy, memory, gpu, args) records are parallel lists.
    for position, (worker, record) in enumerate(zip(thread_list, mapper_list)):
        p.write_to_log("idx = {}, i={}, j={}".format(position, worker, record))
    p.write_to_log("end")
def train(self):
    """Single-optimizer training loop with optional multi-head attention loss.

    One Adam over all parameters. When `use_mloss` is set, the segmentation
    loss of every attention head (one pulled target per entry in
    `self.puller`) is added to the classification loss before the backward
    pass. Logs per-epoch averages and runs test() on the configured schedule.
    """
    optimizer = torch.optim.Adam(self.am_model.parameters(), self.classifier_learning_rate)
    while self.current_epoch <= self.train_epochs:
        # loss_m_sum / loss_l1_sum are never accumulated; they log as 0.
        loss_m_sum = 0
        loss_l1_sum = 0
        loss_classification_sum = 0
        loss_segmentation_sum = 0
        accuracy_sum = 0
        batch_count = 0
        self.am_model.train(mode=True)
        for images, segments, labels in self.train_segments_set:
            labels, segments = model_utils.reduce_to_class_number(self.left_class_number, self.right_class_number, labels, segments)
            images, labels, segments = self.convert_data_and_label(images, labels, segments)
            # One pulled (downsampled) segmentation target per attention head.
            segments_list = []
            for puller in self.puller:
                segments_list.append(puller(segments))
            # calculate and optimize model
            optimizer.zero_grad()
            model_classification, model_segmentation = model_utils.wait_while_can_execute(self.am_model, images)
            classification_loss = self.l_loss(model_classification, labels)
            total_loss = classification_loss
            if self.use_mloss:
                # Add every head's segmentation loss to the optimized total;
                # sum_segm_loss is tracked separately for logging.
                sum_segm_loss = None
                for ms, sl in zip(model_segmentation, segments_list):
                    segmentation_loss = self.m_loss(ms, sl)
                    total_loss += segmentation_loss
                    if sum_segm_loss is None:
                        sum_segm_loss = segmentation_loss
                    else:
                        sum_segm_loss += segmentation_loss
            total_loss.backward()
            optimizer.step()
            output_probability, output_cl, cl_acc = self.calculate_accuracy(labels, model_classification, labels.size(0))
            optimizer.zero_grad()
            self.save_train_data(labels, output_cl, output_probability)
            accuracy_sum += model_utils.scalar(cl_acc.sum())
            loss_classification_sum += model_utils.scalar(classification_loss.sum())
            if self.use_mloss:
                loss_segmentation_sum += model_utils.scalar(sum_segm_loss.sum())
            batch_count += 1
        # EPS guards against an empty training set.
        loss_classification_sum = loss_classification_sum / (batch_count + p.EPS)
        accuracy_sum = accuracy_sum / (batch_count + p.EPS)
        loss_segmentation_sum = loss_segmentation_sum / (batch_count + p.EPS)
        loss_total = loss_classification_sum + loss_m_sum + loss_segmentation_sum
        prefix = "TRAIN"
        f_1_score_text, recall_score_text, precision_score_text = metrics_processor.calculate_metric(self.classes, self.train_trust_answers, self.train_model_answers)
        text = "{}={} Loss_CL={:.5f} Loss_M={:.5f} Loss_L1={:.5f} Loss_Total={:.5f} Accuracy_CL={:.5f} " \
               "{} {} {} ".format(prefix, self.current_epoch, loss_classification_sum, loss_m_sum, loss_l1_sum, loss_total, accuracy_sum, f_1_score_text, recall_score_text, precision_score_text)
        P.write_to_log(text)
        if self.current_epoch % self.test_each_epoch == 0:
            test_loss, _ = self.test(self.am_model, self.test_set, self.l_loss, self.m_loss)
        self.clear_temp_metrics()
        self.current_epoch += 1
def train(self):
    """Plain classifier training loop with best-weight tracking.

    Adam with weight decay over all model parameters; loss is computed on
    sigmoid probabilities. Keeps the weights with the lowest training loss
    and the lowest test loss, saving both when training finishes.
    """
    params = self.model.parameters()
    optimizer = torch.optim.Adam(params, lr=self.classifier_learning_rate, weight_decay=self.weight_decay)
    best_loss = None
    best_test_loss = None
    while self.current_epoch <= self.train_epochs:
        loss_classification_sum = 0
        accuracy_classification_sum = 0
        batch_count = 0
        self.model.train(mode=True)
        for images, segments, labels in self.train_segments_set:
            labels, segments = model_utils.reduce_to_class_number(self.left_class_number, self.right_class_number, labels, segments)
            images, labels, segments = self.convert_data_and_label(images, labels, segments)
            segments = self.puller(segments)
            # calculate and optimize model
            optimizer.zero_grad()
            model_classification = model_utils.wait_while_can_execute_single(self.model, images)
            sigmoid = nn.Sigmoid()  # used to calculate accuracy
            model_classification = sigmoid(model_classification)  # possibly this is all because of the Inception backbone
            classification_loss = self.l_loss(model_classification, labels)
            # torch.cuda.empty_cache()
            classification_loss.backward()
            optimizer.step()
            output_probability, output_cl, cl_acc = self.calculate_accuracy(labels, model_classification, labels.size(0))
            self.save_train_data(labels, output_cl, output_probability)
            # accumulate information
            accuracy_classification_sum += model_utils.scalar(cl_acc.sum())
            loss_classification_sum += model_utils.scalar(classification_loss.sum())
            batch_count += 1
            # self.de_convert_data_and_label(images, segments, labels)
            # torch.cuda.empty_cache()
        # Track the lowest (un-normalized) training loss seen so far.
        if best_loss is None or loss_classification_sum < best_loss:
            best_loss = loss_classification_sum
            self.best_weights = copy.deepcopy(self.model.state_dict())
        f_1_score_text, recall_score_text, precision_score_text = metrics_processor.calculate_metric(self.classes, self.train_trust_answers, self.train_model_answers)
        text = "TRAIN={} Loss_CL={:.10f} Accuracy_CL={:.5f} {} {} {} ".format(self.current_epoch, loss_classification_sum / batch_count, accuracy_classification_sum / batch_count, f_1_score_text, recall_score_text, precision_score_text)
        p.write_to_log(text)
        if self.current_epoch % self.test_each_epoch == 0:
            test_loss, _ = self.test(self.model, self.test_set, self.l_loss)
            if best_test_loss is None or test_loss < best_test_loss:
                best_test_loss = test_loss
                self.best_test_weights = copy.deepcopy(self.model.state_dict())
        # NOTE(review): this second best-loss check repeats the one above in
        # the same epoch and can never fire — apparently redundant.
        if best_loss is None or loss_classification_sum < best_loss:
            best_loss = loss_classification_sum
            self.best_weights = copy.deepcopy(self.model.state_dict())
        self.clear_temp_metrics()
        self.current_epoch += 1
    self.save_model(self.best_test_weights)
    self.save_model(self.best_weights)
def __init__(self, parsed):
    """Parse command-line arguments and assemble the full training pipeline.

    :param parsed: argparse namespace holding every run parameter
    :raises Exception: on an unknown classifier loss, attention model type,
        or attention-module loss name
    """
    self.gpu = int(parsed.gpu)
    # Pin the process to one physical GPU; inside the process it is device 0.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(self.gpu)
    self.gpu = 0
    self.parsed_description = parsed.description
    self.pre_train = int(parsed.pre_train)
    self.train_set_size = int(parsed.train_set)
    self.epochs = int(parsed.epochs)
    self.run_name = parsed.run_name
    self.algorithm_name = parsed.algorithm_name
    self.left_class_number = int(parsed.left_class_number)
    self.right_class_number = int(parsed.right_class_number)
    self.freeze_list = parsed.freeze_list
    self.classifier_learning_rate = float(parsed.classifier_learning_rate)
    self.attention_module_learning_rate = float(
        parsed.attention_module_learning_rate)
    # String flags: anything except "false" (case-insensitive) means True.
    self.is_freezen = False if str(
        parsed.is_freezen).lower() == "false" else True
    self.cbam_use_mloss = False if str(
        parsed.cbam_use_mloss).lower() == "false" else True
    self.model_type = str(parsed.model_type).lower()
    self.is_vgg16_model = True if "vgg" in self.model_type else False
    self.image_size = int(parsed.image_size)
    self.alpha = None
    self.gamma = None
    classifier_loss_name = str(parsed.classifier_loss_function).lower()
    if classifier_loss_name == "bceloss":
        self.classifier_loss_function = nn.BCELoss()
    elif classifier_loss_name == "softf1":
        self.classifier_loss_function = f1loss.SoftF1Loss()
    elif classifier_loss_name == "focal":
        # Focal loss is the only variant that consumes alpha/gamma.
        self.alpha = float(parsed.alpha)
        self.gamma = float(parsed.gamma)
        self.classifier_loss_function = focal_loss.FocalLoss(
            self.alpha, self.gamma)
    else:
        raise Exception("classifier loss {} not found".format(
            parsed.classifier_loss_function))
    # Every accepted attention-model name stored parsed.am_model unchanged,
    # so the seven-branch elif chain collapses to one membership test.
    if str(parsed.am_model).lower() in ("sum", "product", "sum_shift",
                                        "product_shift", "cbam",
                                        "conv_product", "conv_sum"):
        self.am_model_type = parsed.am_model
    else:
        raise Exception("model {} not found".format(parsed.am_model))
    am_loss_name = str(parsed.am_loss_function).lower()
    if am_loss_name == "bceloss":
        self.am_loss_function = nn.BCELoss()
    elif am_loss_name == "softf1":
        self.am_loss_function = f1loss.SoftF1Loss()
    elif am_loss_name == "focal":
        self.alpha = float(parsed.alpha)
        self.gamma = float(parsed.gamma)
        self.am_loss_function = focal_loss_am.FocalLoss(
            self.alpha, self.gamma)
    else:
        raise Exception("am loss {} not found".format(
            parsed.am_loss_function))
    if str(parsed.dataset_type).lower() == "balanced":
        self.dataset_type = "balanced"
    else:
        self.dataset_type = "imbalanced"
    self.model_identifier = parsed.model_identifier
    self.execute_from_model = False if str(
        parsed.execute_from_model).lower() == "false" else True
    self.train_batch_size = int(parsed.train_batch_size)
    self.test_batch_size = int(parsed.test_batch_size)
    self.classes = self.right_class_number - self.left_class_number
    self.description = "mi-{}".format(self.model_identifier)
    # Filled in by the initialization helpers below.
    self.snapshots_path = None
    self.train_segments_set = None
    self.test_set = None
    self.model = None
    self.puller = None
    self.strategy = None
    self.train_count = None
    self.initialize_logs()
    self.initialize_snapshots_dir()
    self.load_dataset()
    self.current_epoch = self.get_current_epoch()
    self.model_state_dict = self.load_model_from_saves()
    self.model = self.create_model()
    self.strategy = self.create_strategy()
    P.write_to_log("incoming args = {}".format(parsed))
def take_snapshot(self, data_set, model, snapshot_name: str = None):
    """Render a grid comparing model attention maps against ground truth.

    Collects up to `snapshot_elements_count` samples from `data_set`, then
    draws one row per image: the input, and for each class a triplet of
    (raw model map, min-max normalized model map, trust map). The figure is
    saved under `snapshot_dir` as `snapshot_name`.
    """
    cnt = 0
    model_segments_list = []
    trust_segments_list = []
    images_list = []
    for images, segments, labels in data_set:
        # Keep only the configured class window of the segment channels.
        segments = segments[:, self.left_class_number:self.
                            right_class_number, :, :]
        images, labels, segments = self.convert_data_and_label(
            images, labels, segments)
        segments = self.puller(segments)
        _, model_segmentation = model_utils.wait_while_can_execute(
            model, images)
        cnt += segments.size(0)
        images, _, segments = self.de_convert_data_and_label(
            images, labels, segments)
        model_segmentation = model_segmentation.cpu()
        for idx in range(segments.size(0)):
            images_list.append(images[idx])
            model_segments_list.append(model_segmentation[idx])
            trust_segments_list.append(segments[idx])
        if cnt >= self.snapshot_elements_count:
            break
    # One column for the image plus 3 columns (raw/normed/trust) per class.
    fig, axes = plt.subplots(len(images_list),
                             model_segments_list[0].size(0) * 3 + 1,
                             figsize=(50, 100))
    fig.tight_layout()
    for idx, img in enumerate(images_list):
        # CHW -> HWC for imshow.
        axes[idx][0].imshow(np.transpose(img.numpy(), (1, 2, 0)))
    for idx, (trist_answer, model_answer) in enumerate(
            zip(trust_segments_list, model_segments_list)):
        for class_number in range(trist_answer.size(0)):
            a = model_answer[class_number].detach().numpy()
            # Replicate the single-channel map to 3 channels for display.
            a = np.array([a] * 3)
            axes[idx][1 + class_number * 3].imshow(
                np.transpose(a, (1, 2, 0)))
            p.write_to_log(
                "model idx={}, class={}, sum={}, max={}, min={}".
                format(idx, class_number, np.sum(a), np.max(a), np.min(a)))
            # Min-max normalize for the middle column of the triplet.
            a = (a - np.min(a)) / (np.max(a) - np.min(a))
            axes[idx][1 + class_number * 3 + 1].imshow(
                np.transpose(a, (1, 2, 0)))
            p.write_to_log(
                "model normed idx={}, class={}, sum={}, max={}, min={}".
                format(idx, class_number, np.sum(a), np.max(a), np.min(a)))
            a = trist_answer[class_number].detach().numpy()
            a = np.array([a] * 3)
            axes[idx][1 + class_number * 3 + 2].imshow(
                np.transpose(a, (1, 2, 0)))
            p.write_to_log(
                "trust idx={}, class={}, sum={}, max={}, min={}".
                format(idx, class_number, np.sum(a), np.max(a), np.min(a)))
            p.write_to_log("=" * 50)
            axes[idx][1 + class_number * 3].set(
                xlabel='model answer class: {}'.format(class_number))
            axes[idx][1 + class_number * 3 +
                      1].set(xlabel='model normed answer class: {}'.format(
                          class_number))
            axes[idx][1 + class_number * 3 + 2].set(
                xlabel='trust answer class: {}'.format(class_number))
    print("=" * 50)
    print("=" * 50)
    print("=" * 50)
    print("=" * 50)
    print("=" * 50)
    plt.savefig(os.path.join(self.snapshot_dir, snapshot_name))
    plt.close(fig)
def balance_dataset(data_set, data_size, marked_size):
    """Rebalance a 5-label multi-label dataset via linear programming.

    Each sample's label vector `k` (assumed 5 binary entries — TODO confirm)
    is encoded as a bitmask; samples are grouped by mask. A linprog instance
    then chooses a per-group keep fraction that maximizes the kept count
    subject to per-label caps (`marked_size`) and a total cap (`data_size`).
    Groups are resampled (with wrap-around repetition) to the chosen sizes.

    :param data_set: iterable of (input, segment, label-vector) samples
    :param data_size: upper bound on the resulting set size
    :param marked_size: upper bound on samples carrying each single label
    :return: the rebalanced list of samples
    """
    # Powers of two used to turn the 5 binary labels into one mask integer.
    pows = torch.zeros(5)
    for i in range(5):
        pows[i] = 2**i
    train_split = {}
    mask_dict = {}
    for a, b, k in data_set:
        idx = int((k * pows).sum().data)
        mask_dict.setdefault(idx, 0)
        mask_dict[idx] += 1
        train_split.setdefault(idx, [])
        train_split[idx].append((a, b, k))
    mask_dict_keys = sorted(mask_dict.keys())
    for i in mask_dict_keys:
        # Log each mask as a 5-char bit string (LSB first) with its count.
        P.write_to_log("".join(reversed(format(i, 'b').zfill(5))),
                       mask_dict[i])
    # Per-label totals before balancing (diagnostic only).
    sm_list = [0, 0, 0, 0, 0]
    for k, v in mask_dict.items():
        key_ = "".join(reversed(format(k, 'b').zfill(5)))
        for ill in range(5):
            exists = 1 if key_[ill] == '1' else 0
            sm_list[ill] += v * exists
    print(sm_list)
    # Constraint matrix: rows 0-4 cap each label's count, row 5 caps the total.
    A = [[], [], [], [], [], []]
    for key_idx, key in enumerate(mask_dict_keys):
        key_ = "".join(reversed(format(key, 'b').zfill(5)))
        for ill in range(5):
            exists = 1 if key_[ill] == '1' else 0
            A[ill].append(mask_dict[key] * exists)
        A[5].append(mask_dict[key])
    b = []
    for i in range(5):
        b.append(marked_size)
    b.append(data_size)
    # Lower bounds keep every group represented at least proportionally.
    bounds = [(mask_dict[mdk] / data_size, None) for mdk in mask_dict_keys]
    # Negated counts: linprog minimizes, so this maximizes the kept total.
    c = [-mask_dict[i] for i in mask_dict_keys]
    A = np.array(A)
    b = np.array(b)
    c = np.array(c)
    P.write_to_log(A)
    P.write_to_log(b)
    P.write_to_log(c)
    res = opt.linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='simplex')
    P.write_to_log(res)
    # Target sample count per mask group (+1 so no group rounds down to zero).
    accumulate_dict = {}
    for idx, i in enumerate(res.x):
        P.write_to_log("{} {:.5f} {} {:.5f} ".format(
            "".join(reversed(format(mask_dict_keys[idx], 'b').zfill(5))), i,
            mask_dict[mask_dict_keys[idx]],
            mask_dict[mask_dict_keys[idx]] * i))
        accumulate_dict[
            mask_dict_keys[idx]] = mask_dict[mask_dict_keys[idx]] * i + 1
    # Post-solution per-label totals (diagnostic only).
    sm = 0
    sm_list = [0, 0, 0, 0, 0]
    for x, k in zip(res.x, mask_dict_keys):
        sm += x * mask_dict[k]
        key_ = "".join(reversed(format(k, 'b').zfill(5)))
        for ill in range(5):
            exists = 1 if key_[ill] == '1' else 0
            sm_list[ill] += x * mask_dict[k] * exists
    P.write_to_log(sm_list)
    P.write_to_log(sm)
    # Materialize the balanced set; small groups wrap around (oversampling).
    result_list = []
    for idx, lst in train_split.items():
        needed_cnt = int(accumulate_dict[idx])
        for i in range(needed_cnt):
            result_list.append(lst[i % len(lst)])
    return result_list