(batch_i + 1)) log_str += f"\n---- ETA {time_left}" print(log_str) model.seen += imgs.size(0) if epoch % opt.evaluation_interval == 0: print("\n---- Evaluating Model ----") # Evaluate the model on the validation set precision, recall, AP, f1, ap_class = evaluate( model, path=valid_path, iou_thres=0.5, conf_thres=0.001, nms_thres=0.5, img_size=opt.input_len, batch_size=8, dim=opt.dim, ) evaluation_metrics = [ ("val_precision", precision.mean()), ("val_recall", recall.mean()), ("val_mAP", AP.mean()), ("val_f1", f1.mean()), ] logger.list_of_scalars_summary(evaluation_metrics, epoch) # Print class APs and mAP ap_table = [["Index", "Class name", "AP"]]
log_str += f"\n---- ETA {time_left}" print(log_str) model.seen += imgs.size(0) print('Epoch', epoch, opt.evaluation_interval) if epoch % opt.evaluation_interval == 0: print("\n---- Evaluating Model ----") # Evaluate the model on the validation set result = evaluate( model, path=valid_path, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5, img_size=opt.img_size, batch_size=8, ) if result: print('result', result) precision, recall, AP, f1, ap_class = result evaluation_metrics = [ ("val_precision", precision.mean()), ("val_recall", recall.mean()), ("val_mAP", AP.mean()), ("val_f1", f1.mean()), ] logger.list_of_scalars_summary(evaluation_metrics, epoch)
writer.add_scalar('loss', errD.data.cpu().numpy(), iters) train_info += ' loss: {:.4f}'.format(errD.data.cpu().numpy()) print(train_info) # Output training stats if i % 50 == 0: print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f' % (epoch, num_epochs, i, len(dataloader), errD.item(), errG.item(), D_x, D_G_z1, D_G_z2)) # Save Losses for plotting later G_losses.append(errG.item()) D_losses.append(errD.item()) print(epoch%args.val_epoch) if epoch%args.val_epoch == 0: #print('evaluate') ''' evaluate the model ''' acc = test.evaluate(model, dataloader_target, alpha) writer.add_scalar('val_acc', acc, iters) #print('Epoch: [{}] ACC:{}'.format(epoch, acc)) ''' save best model ''' if acc > best_acc: save_model(model, os.path.join(args.save_dir, 'model_best.pth.tar')) best_acc = acc ''' save model ''' save_model(model, os.path.join(args.save_dir, 'model_{}.pth.tar'.format(epoch)))
if step != 0: vis.add_scalar(loss_dict, epoch, epoch * len(train_loader) + step) # Visualization res_dict = model.test(*data, epoch=epoch, save_every=save_every) # vis.add_images(model.get_visuals(), epoch, epoch * len(train_loader) + step, prefix='train') # Random sample test data idx = np.random.randint(len(val_loader.dataset)) inputs = val_loader.dataset[idx:idx + 1] res_dict = model.test(*inputs, epoch=1, save_every=save_every) if step != 0: vis.add_scalar(res_dict, epoch, epoch * len(train_loader) + step) # vis.add_images(model.get_visuals(), epoch, epoch * len(train_loader) + step, prefix='test') logger.print('Epoch {}/{}:{}'.format(epoch, opt.n_epochs - 1, mode)) if epoch > 400: break # Evaluate on val set if opt.evaluate_every > 0 and (epoch + 1) % opt.evaluate_every == 0 and \ opt.n_frames_output > 0: results = evaluate(val_opt, val_loader, model) vis.add_scalar(results, epoch) for metric in results.keys(): logger.print('{}: {}'.format(metric, results[metric])) # Save model checkpoints if (epoch + 1 ) % opt.save_every == 0 and epoch > 0 or epoch == opt.n_epochs - 1: model.save(opt.ckpt_path, epoch + 1)
def train():
    """Train a Darknet/YOLOv3 detector end-to-end.

    Reads all configuration from the module-level ``args`` / ``parameters`` /
    ``device`` / ``mixed_precision`` / ``tb_writer`` globals; writes progress to
    ``results.txt`` and checkpoints under ``weights/``.

    Returns:
        tuple: the last evaluation results
        (P, R, mAP, F1, val GIoU, val Objectness, val Classification).
    """
    cfg = args.cfg
    data = args.data
    # One size given -> use it for both train and validation.
    if len(args.image_size) == 2:
        image_size, image_size_val = args.image_size[0], args.image_size[1]
    else:
        image_size, image_size_val = args.image_size[0], args.image_size[0]
    epochs = args.epochs
    batch_size = args.batch_size
    accumulate = args.accumulate
    weights = args.weights

    # Initialize
    gs = 32  # (pixels) grid size
    assert math.fmod(image_size, gs) == 0, f"--image-size must be a {gs}-multiple"
    init_seeds()
    image_size_min = 6.6  # 320 / 32 / 1.5
    image_size_max = 28.5  # 320 / 32 / 28.5
    if args.multi_scale:
        image_size_min = round(image_size / gs / 1.5) + 1
        image_size_max = round(image_size / gs * 1.5)
        image_size = image_size_max * gs  # initiate with maximum multi_scale size
        print(f"Using multi-scale {image_size_min * gs} - {image_size}")

    # Configure run
    dataset_dict = parse_data_config(data)
    train_path = dataset_dict["train"]
    valid_path = dataset_dict["valid"]
    num_classes = 1 if args.single_cls else int(dataset_dict["classes"])

    # Remove previous results
    for files in glob.glob("results.txt"):
        os.remove(files)

    # Initialize model
    model = Darknet(cfg).to(device)

    # Optimizer: split parameters so weight decay only hits conv weights.
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for model_key, model_value in dict(model.named_parameters()).items():
        if ".bias" in model_key:
            pg2 += [model_value]  # biases
        elif "Conv2d.weight" in model_key:
            pg1 += [model_value]  # apply weight_decay
        else:
            pg0 += [model_value]  # all else

    optimizer = torch.optim.SGD(pg0,
                                lr=parameters["lr0"],
                                momentum=parameters["momentum"],
                                nesterov=True)
    optimizer.add_param_group({
        "params": pg1,  # add pg1 with weight_decay
        "weight_decay": parameters["weight_decay"]
    })
    optimizer.add_param_group({"params": pg2})  # add pg2 with biases
    del pg0, pg1, pg2

    epoch = 0
    start_epoch = 0
    best_fitness = 0.0
    context = None
    if weights.endswith(".pth"):
        state = torch.load(weights, map_location=device)
        # load model: keep only tensors whose element count matches this cfg.
        try:
            state["state_dict"] = {
                k: v
                for k, v in state["state_dict"].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(state["state_dict"], strict=False)
        except KeyError as e:
            error_msg = f"{args.weights} is not compatible with {args.cfg}. "
            error_msg += f"Specify --weights `` or specify a --cfg "
            error_msg += f"compatible with {args.weights}. "
            raise KeyError(error_msg) from e
        # load optimizer (best_fitness only restored together with it)
        if state["optimizer"] is not None:
            optimizer.load_state_dict(state["optimizer"])
            best_fitness = state["best_fitness"]
        # load results
        if state.get("training_results") is not None:
            with open("results.txt", "w") as file:
                file.write(state["training_results"])  # write results.txt
        start_epoch = state["epoch"] + 1
        del state
    elif len(weights) > 0:
        # possible weights are "*.weights", "yolov3-tiny.conv.15", "darknet53.conv.74" etc.
        load_darknet_weights(model, weights)
    else:
        print("Pre training model weight not loaded.")

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        # skip print amp info
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)

    # Cosine LR schedule; source https://arxiv.org/pdf/1812.01187.pdf
    lr_lambda = lambda x: ((
        (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.95 + 0.05
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=lr_lambda,
                                                  last_epoch=start_epoch - 1)

    # Initialize distributed training (single-node, single-rank setup)
    if device.type != "cpu" and torch.cuda.device_count(
    ) > 1 and torch.distributed.is_available():
        dist.init_process_group(
            backend="nccl",  # "distributed backend"
            # distributed training init method
            init_method="tcp://127.0.0.1:8888",
            # number of nodes for distributed training
            world_size=1,
            # distributed training node rank
            rank=0)
        model = torch.nn.parallel.DistributedDataParallel(model)
        model.yolo_layers = model.module.yolo_layers

    # Dataset
    # Apply augmentation hyperparameters (option: rectangular training)
    train_dataset = LoadImagesAndLabels(train_path,
                                        image_size,
                                        batch_size,
                                        augment=True,
                                        hyp=parameters,
                                        rect=args.rect,
                                        cache_images=args.cache_images,
                                        single_cls=args.single_cls)
    # No augmentation hyperparameters, and rectangular inference
    valid_dataset = LoadImagesAndLabels(valid_path,
                                        image_size_val,
                                        batch_size,
                                        augment=False,
                                        hyp=parameters,
                                        rect=True,
                                        cache_images=args.cache_images,
                                        single_cls=args.single_cls)
    collate_fn = train_dataset.collate_fn
    # Dataloader
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   num_workers=args.workers,
                                                   shuffle=not args.rect,
                                                   pin_memory=True,
                                                   collate_fn=collate_fn)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=batch_size,
                                                   num_workers=args.workers,
                                                   shuffle=False,
                                                   pin_memory=True,
                                                   collate_fn=collate_fn)

    # Model parameters
    model.nc = num_classes  # attach number of classes to model
    model.hyp = parameters  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    # attach class weights
    model.class_weights = labels_to_class_weights(train_dataset.labels,
                                                  num_classes).to(device)

    # Model EMA
    ema = ModelEMA(model, decay=0.9998)

    # Start training
    batches_num = len(train_dataloader)  # number of batches
    burns = max(3 * batches_num,
                500)  # burn-in iterations, max(3 epochs, 500 iterations)
    maps = np.zeros(num_classes)  # mAP per class
    # "P", "R", "mAP", "F1", "val GIoU", "val Objectness", "val Classification"
    results = (0, 0, 0, 0, 0, 0, 0)
    print(f"Using {args.workers} dataloader workers.")
    print(f"Starting training for {args.epochs} epochs...")
    start_time = time.time()
    for epoch in range(start_epoch, args.epochs):
        model.train()

        # Update image weights (optional): sample images that contain
        # poorly-scoring classes more often.
        if train_dataset.image_weights:
            # class weights
            class_weights = model.class_weights.cpu().numpy() * (1 - maps)**2
            image_weights = labels_to_image_weights(
                train_dataset.labels,
                num_classes=num_classes,
                class_weights=class_weights)
            # rand weighted index
            train_dataset.indices = random.choices(
                range(train_dataset.image_files_num),
                weights=image_weights,
                k=train_dataset.image_files_num)

        mean_losses = torch.zeros(4).to(device)
        print("\n")
        print(("%10s" * 8) % ("Epoch", "memory", "GIoU", "obj", "cls",
                              "total", "targets", " image_size"))
        progress_bar = tqdm(enumerate(train_dataloader), total=batches_num)
        for index, (images, targets, paths, _) in progress_bar:
            # number integrated batches (since train start)
            ni = index + batches_num * epoch

            # uint8 to float32, 0 - 255 to 0.0 - 1.0
            images = images.to(device).float() / 255.0
            targets = targets.to(device)

            # Hyperparameter Burn-in
            if ni <= burns * 2:
                # giou loss ratio (obj_loss = 1.0 or giou)
                model.gr = np.interp(ni, [0, burns * 2], [0.0, 1.0])

                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x["lr"] = np.interp(ni, [0, burns], [
                        0.1 if j == 2 else 0.0,
                        x["initial_lr"] * lr_lambda(epoch)
                    ])
                    if "momentum" in x:
                        x["momentum"] = np.interp(
                            ni, [0, burns], [0.9, parameters["momentum"]])

            # Multi-Scale training
            if args.multi_scale:
                # adjust img_size (67% - 150%) every 1 batch
                if ni / accumulate % 1 == 0:
                    image_size = random.randrange(image_size_min,
                                                  image_size_max + 1) * gs
                scale_ratio = image_size / max(images.shape[2:])
                if scale_ratio != 1:
                    # new shape (stretched to 32-multiple)
                    new_size = [
                        math.ceil(size * scale_ratio / gs) * gs
                        for size in images.shape[2:]
                    ]
                    images = F.interpolate(images,
                                           size=new_size,
                                           mode="bilinear",
                                           align_corners=False)

            # Run model
            output = model(images)

            # Compute loss
            loss, loss_items = compute_loss(output, targets, model)
            if not torch.isfinite(loss):
                warnings.warn(
                    f"WARNING: Non-finite loss, ending training {loss_items}")
                return results

            # Scale loss by nominal batch_size of (16 * 4 = 64)
            # NOTE(review): this expression reduces to 1 / accumulate, which
            # does not match the "nominal batch_size of 64" comment — confirm
            # whether `batch_size / 64` was intended.
            loss *= batch_size / (batch_size * accumulate)

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize accumulated gradient
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print batch results
            # update mean losses (running average over the epoch)
            mean_losses = (mean_losses * index + loss_items) / (index + 1)
            memory = f"{torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0:.2f}G"
            context = ("%10s" * 2 + "%10.3g" * 6) % (
                "%g/%g" % (epoch, args.epochs - 1), memory, *mean_losses,
                len(targets), image_size)
            progress_bar.set_description(context)

        # Update scheduler
        scheduler.step()

        # Process epoch results
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not args.notest or final_epoch:
            # Calculate mAP; COCO-format JSON only on the final epoch
            coco = any([
                coco_name in data
                for coco_name in ["coco.data", "coco2014.data", "coco2017.data"]
            ]) and model.nc == 80
            results, maps = evaluate(cfg,
                                     data,
                                     batch_size=batch_size,
                                     image_size=image_size_val,
                                     model=ema.ema,
                                     save_json=final_epoch and coco,
                                     single_cls=args.single_cls,
                                     dataloader=valid_dataloader)

        # Write epoch results
        with open("results.txt", "a") as f:
            # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            f.write(context + "%10.3g" * 7 % results)
            f.write("\n")

        # Write Tensorboard results
        if tb_writer:
            tags = [
                "train/giou_loss", "train/obj_loss", "train/cls_loss",
                "metrics/precision", "metrics/recall", "metrics/mAP_0.5",
                "metrics/F1", "val/giou_loss", "val/obj_loss", "val/cls_loss"
            ]
            for x, tag in zip(list(mean_losses[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

        # Update best mAP
        # fitness_i = weighted combination of [P, R, mAP, F1]
        fitness_i = fitness(np.array(results).reshape(1, -1))
        if fitness_i > best_fitness:
            best_fitness = fitness_i

        # Save training results
        save = (not args.nosave) or (final_epoch and not args.evolve)
        if save:
            with open("results.txt", "r") as f:
                # Create checkpoint
                # NOTE(review): the condition checks hasattr(model, "module")
                # but unwraps ema.ema.module — confirm the EMA copy is wrapped
                # exactly when the live model is.
                state = {
                    "epoch":
                    epoch,
                    "best_fitness":
                    best_fitness,
                    "training_results":
                    f.read(),
                    "state_dict":
                    ema.ema.module.state_dict()
                    if hasattr(model, "module") else ema.ema.state_dict(),
                    "optimizer":
                    None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(state, "weights/checkpoint.pth")

            # Save best checkpoint
            if (best_fitness == fitness_i) and not final_epoch:
                state = {
                    "epoch": -1,
                    "best_fitness": None,
                    "training_results": None,
                    "state_dict": model.state_dict(),
                    "optimizer": None
                }
                torch.save(state, "weights/model_best.pth")

            # Delete checkpoint
            del state

    if not args.evolve:
        plot_results()  # save as results.png

    print(f"{epoch - start_epoch} epochs completed "
          f"in "
          f"{(time.time() - start_time) / 3600:.3f} hours.\n")
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
def bayesian_opt(w, m, g, a, lcoor, lno, iou_thresh, iou_type, bayes_opt=True):
    """Train YOLO for a few epochs with the proposed hyperparameters and
    return validation mAP as the Bayesian-optimization objective.

    Args:
        w: weight decay.
        m: SGD momentum.
        g: focal-loss gamma.
        a: focal-loss alpha.
        lcoor: coordinate-loss weight.
        lno: no-object-loss weight.
        iou_thresh: IoU ignore threshold.
        iou_type: continuous selector (rounded to 0-3) for the IoU loss
            variant, mapped to a one-hot tuple below.
        bayes_opt: True when called by the optimizer (small train subset,
            no summaries/checkpoints); False for a full logged run.

    Returns:
        float: validation mAP, or 0 if training diverged ("broken").
    """
    # The optimizer proposes iou_type as a float; snap it to one of the four
    # supported one-hot selectors.
    iou_type = int(round(iou_type))
    if iou_type == 0:
        iou_type = (0, 0, 0)
    elif iou_type == 1:
        iou_type = (1, 0, 0)
    elif iou_type == 2:
        iou_type = (0, 1, 0)
    else:
        iou_type = (0, 0, 1)

    hyperparameters = {
        'lr': 0.0001,
        'epochs': 1,
        'resume_from': 0,
        'coco_version': '2017',  # can be either '2014' or '2017'
        'batch_size': 16,
        'weight_decay': w,
        'momentum': m,
        'optimizer': 'sgd',
        'alpha': a,
        'gamma': g,
        'lcoord': lcoor,
        'lno_obj': lno,
        'iou_type': iou_type,
        'iou_ignore_thresh': iou_thresh,
        'inf_confidence': 0.01,
        'inf_iou_threshold': 0.5,
        'wasserstein': False,
        'tfidf': True,
        'idf_weights': True,
        'tfidf_col_names': ['img_freq', 'none', 'none', 'none', 'no_softmax'],
        'augment': 1,
        'workers': 4,
        'pretrained': False,
        'path': 'yolo2017_semiprtnd',
        'reduction': 'sum'
    }

    mode = {
        'bayes_opt': bayes_opt,
        'multi_scale': False,
        'debugging': False,
        'show_output': False,
        'multi_gpu': True,
        'show_temp_summary': False,
        # Summaries only for full (non-search) runs.
        'save_summary': bayes_opt == False
    }

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model, optimizer, hyperparameters = init_model.init_model(hyperparameters,
                                                              mode,
                                                              show=False)

    # Input resolution lives on the wrapped module under DataParallel.
    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
    else:
        inp_dim = model.inp_dim

    coco_version = hyperparameters['coco_version']

    # During the search, train on 10% of the data to keep trials cheap;
    # always evaluate on the full validation subset.
    if bayes_opt == True:
        tr_subset = 0.1
        ts_subset = 1
    else:
        tr_subset = 1
        ts_subset = 1

    if mode['save_summary'] == True:
        writer = SummaryWriter('../results/' + hyperparameters['path'])

    if hyperparameters['augment'] > 0:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=tr_subset,
                             transform=transforms.Compose([
                                 Augment(hyperparameters['augment']),
                                 ResizeToTensor(inp_dim)
                             ]))
    else:
        # FIX: was `subset=subset` — `subset` is never defined in this scope
        # and raised NameError whenever augmentation was disabled.
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=tr_subset,
                             transform=transforms.Compose(
                                 [ResizeToTensor(inp_dim)]))

    batch_size = hyperparameters['batch_size']

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=helper.collate_fn,
                                  num_workers=hyperparameters['workers'])

    mAP = 0  # objective value if no epoch completes (epochs == 0)
    for i in range(hyperparameters['epochs']):
        outcome = yolo_function.train_one_epoch(model, optimizer,
                                                train_dataloader,
                                                hyperparameters, mode)
        if outcome['broken'] == 1:
            # Training diverged — report the worst possible objective.
            return 0

        mAP = test.evaluate(
            model,
            device,
            coco_version,
            confidence=hyperparameters['inf_confidence'],
            iou_threshold=hyperparameters['inf_iou_threshold'],
            subset=ts_subset)

        if mode['save_summary'] == True:
            step = hyperparameters['resume_from']
            writer.add_scalar('Loss/train', outcome['avg_loss'], step)
            writer.add_scalar('AIoU/train', outcome['avg_iou'], step)
            writer.add_scalar('PConf/train', outcome['avg_conf'], step)
            writer.add_scalar('NConf/train', outcome['avg_no_conf'], step)
            writer.add_scalar('PClass/train', outcome['avg_pos'], step)
            writer.add_scalar('NClass/train', outcome['avg_neg'], step)
            writer.add_scalar('mAP/valid', mAP, step)

        hyperparameters['resume_from'] = hyperparameters['resume_from'] + 1

        if mode['bayes_opt'] == False:
            # Checkpoint full runs so they can be resumed later.
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'avg_loss': outcome['avg_loss'],
                    'avg_iou': outcome['avg_iou'],
                    'avg_pos': outcome['avg_pos'],
                    'avg_neg': outcome['avg_neg'],
                    'avg_conf': outcome['avg_conf'],
                    'avg_no_conf': outcome['avg_no_conf'],
                    'epoch': hyperparameters['resume_from']
                }, PATH + hyperparameters['path'] + '.tar')

    return mAP
def main():
    """Train a RU->EN seq2seq translation model on Yandex + ParaCrawl data,
    save the vocabularies and model to data/data.pt, then run evaluation on
    a held-out slice of the Yandex corpus.
    """
    make_deterministic()

    # region Prepare data
    with Timer('\nData preparation time: %s\n'):
        ru_lang = Language()
        en_lang = Language()

        yandex = Yandex(
            'datasets/yandex/corpus.en_ru.1m.ru',
            'datasets/yandex/corpus.en_ru.1m.en',
            ru_lang,
            en_lang,
            data_slice=H.dataset_slice,
        )

        # slice(0) -> ParaCrawl contributes no sentence pairs; it is only
        # used here so its vocabulary registration code runs.
        paracrawl = ParaCrawl(
            'datasets/paracrawl/en-ru.txt',
            ru_lang,
            en_lang,
            data_slice=slice(0),
        )

        # Drop infrequent Russian words (at least the configured percentage,
        # or everything below the count minimum — whichever is larger).
        low = ru_lang.lower_than(H.ru_word_count_minimum)
        infrequent_words_n = max(
            ceil(ru_lang.words_n * H.infrequent_words_percent), len(low))
        if infrequent_words_n > 0:
            # NOTE(review): called with a single list argument here but with
            # unpacked words (*low) below — confirm drop_words accepts both.
            ru_lang.drop_words(ru_lang.lowk(infrequent_words_n))
            print(
                f'{infrequent_words_n:,} infrequent Russian words are dropped')

        # Drop infrequent English words below the count minimum.
        low = en_lang.lower_than(H.en_word_count_minimum)
        if len(low) > 0:
            en_lang.drop_words(*low)
            print(f'{len(low):,} infrequent English words are dropped')

        print(
            f'Russian language: {ru_lang.words_n:,} words, {ru_lang.sentence_length:,} words in a sentence'
        )
        print(
            f'English language: {en_lang.words_n:,} words, {en_lang.sentence_length:,} words in a sentence'
        )

        batch = H.batch_size
        dataset = ConcatDataset((yandex, paracrawl))
        loader = DataLoader(dataset, batch, shuffle=True)
    # endregion

    # region Models and optimizers
    model = Seq2Seq(
        Encoder(ru_lang.words_n, H.encoder_embed_dim, H.encoder_hidden_dim,
                H.encoder_bi, H.decoder_hd),
        Attention(H.encoder_hd, H.decoder_hd),
        Decoder(en_lang.words_n, H.decoder_embed_dim, H.decoder_hidden_dim,
                H.encoder_hd),
    ).to(Device).train()

    optimizer = Adam(model.parameters(), lr=H.learning_rate)
    # PAD tokens are excluded from the loss; 'sum' reduction makes the loss
    # scale with the number of real tokens in the batch.
    criterion = CrossEntropyLoss(ignore_index=Token_PAD, reduction='sum')
    # endregion

    # region Training
    teaching_percent = H.teaching_percent
    total = len(dataset)
    # Log roughly 1000 times per epoch, but never more often than every 5 steps.
    log_interval = max(5, round(total / batch / 1000))

    for epoch in range(1, H.epochs + 1):
        with Printer() as printer:
            printer.print(f'Train epoch {epoch}: starting...')

            for i, ((ru, ru_l), en_sos, en_eos) in enumerate(loader, 1):
                # Zero the parameter gradients
                optimizer.zero_grad()

                # Run data through model
                predictions = model(ru, ru_l, en_sos, teaching_percent)

                # Calculate loss
                loss = criterion(predictions, en_eos)

                # Back propagate and perform optimization
                loss.backward()
                clip_grad_norm_(model.parameters(), H.gradient_norm_clip)
                optimizer.step()

                # Print log
                if i % log_interval == 0:
                    printer.print(
                        f'Train epoch {epoch}: {i * batch / total:.1%} [{i * batch:,}/{total:,}]'
                    )

            printer.print(f'Train epoch {epoch}: completed')
    # endregion

    # Persist vocabularies and model weights.
    # NOTE(review): `model.cpu().eval().data` — nn.Module has no `.data`
    # attribute by default; presumably Seq2Seq defines it. Confirm.
    torch.save(
        (
            ru_lang.__getnewargs__(),
            en_lang.__getnewargs__(),
            model.cpu().eval().data,
        ),
        'data/data.pt',
    )

    # Evaluate on the 100 sentences immediately after the training slice.
    evaluate(model.to(Device), ru_lang, en_lang,
             'datasets/yandex/corpus.en_ru.1m.ru',
             slice(H.dataset_slice.stop + 1, H.dataset_slice.stop + 1 + 100))
wer_results=wer_results, cer_results=cer_results, avg_loss=avg_loss), file_path) del loss, out, float_out avg_loss /= len(train_sampler) epoch_time = time.time() - start_epoch_time print('Training Summary Epoch: [{0}]\t' 'Time taken (s): {epoch_time:.0f}\t' 'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss)) start_iter = 0 # Reset start iteration for next epoch with torch.no_grad(): wer, cer, output_data = evaluate(test_loader=test_loader, device=device, model=model, decoder=decoder, target_decoder=decoder) loss_results[epoch] = avg_loss wer_results[epoch] = wer cer_results[epoch] = cer print('Validation Summary Epoch: [{0}]\t' 'Average WER {wer:.3f}\t' 'Average CER {cer:.3f}\t'.format( epoch + 1, wer=wer, cer=cer)) values = { 'loss_results': loss_results, 'cer_results': cer_results, 'wer_results': wer_results }
source_loader, 0): # source_data: (128,3,28,28), source_data_label: (128,1) train_info = 'Epoch: [{0}][{1}/{2}]'.format(epoch, i + 1, len(source_loader)) source_data, source_data_label = source_data.to( device), source_data_label.to(device) optimizer.zero_grad() cls_output = model(source_data) label_loss = class_criterion(cls_output, source_data_label.squeeze()) label_loss.backward() optimizer.step() train_info += ' loss: {:.4f}'.format(label_loss.data.cpu().numpy()) if i % 50 == 0: print(train_info) if (epoch + 1) % 1 == 0: print("testing.... ") acc = evaluate(model, target_loader, False) print("acc: ", acc) print("best acc so far... ", best_acc) if acc > best_acc: best_acc = acc print("This is the best model!!!") save_model(model, os.path.join(save_dir, 'model_best.pth.tar')) save_model(model, os.path.join(save_dir, 'model_{}.pth.tar'.format(epoch)))
# Build the encoder/decoder pair for description -> department classification.
encoder = EncoderRNN(N_word, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size,
                         CLASS_size,
                         dropout_p=0.1,
                         max_length=max_length).to(device)

n_iterations = train_df.shape[0]
# FIX: removed a live `import pdb; pdb.set_trace()` left in the script — it
# halted every run at this point.
# NOTE(review): training runs a single iteration even though n_iterations is
# computed above; this looks like a debugging leftover — confirm whether
# `trainIters(encoder, decoder, n_iterations, ...)` was intended.
trainIters(encoder, decoder, 1, print_every=50, plot_every=10)

# Smoke-test the trained pair on the first training example.
sentence = normalizeString(train_df.iloc[0]["description"])
input_tensor = embeddedTensorFromSentence(sentence, device, word_emb, N_word)
target_class = train_df.iloc[0]["department_new"]
target_index = class_dict[target_class]
print(target_index)

output, attention = evaluate(encoder, decoder, input_tensor, max_length,
                             device)
topv, topi = output.topk(1)
def _build_components(config, role):
    """Build the logger/data/model/optimizer bundle for one GAN role
    ('generator' or 'discriminator') from the config.

    To disable the LR scheduler, delete every line containing lr_scheduler.
    """
    logger = config.get_logger(role)

    # setup data_loader instances
    data_loader = config.initialize('data_loader', module_data, role)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', model_arch, role)
    logger.info(model)

    # get function handles of loss and metrics
    loss_fn = getattr(module_loss, config[role]['loss'])
    metric_fns = [
        getattr(module_metric, met) for met in config[role]['metrics']
    ]

    # build optimizer and learning rate scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.initialize('optimizer', torch.optim, role,
                                  trainable_params)
    lr_scheduler = config.initialize('lr_scheduler',
                                     torch.optim.lr_scheduler, role,
                                     optimizer)

    return {
        'logger': logger,
        'data_loader': data_loader,
        'valid_data_loader': valid_data_loader,
        'model': model,
        'loss_fn': loss_fn,
        'metric_fns': metric_fns,
        'optimizer': optimizer,
        'lr_scheduler': lr_scheduler
    }


def main(config):
    """Set up generator and discriminator components, train the GAN, and
    evaluate the discriminator afterwards."""
    # The two roles were previously built with two copy-pasted blocks; the
    # shared logic now lives in _build_components.
    generator = _build_components(config, 'generator')
    discriminator = _build_components(config, 'discriminator')

    '''===== Training ====='''
    trainer = Trainer(generator, discriminator, config)
    trainer.train()

    '''===== Testing ====='''
    # Evaluate the discriminator's model on its data loader (this matches the
    # original behavior, where the discriminator's objects were the last ones
    # bound to the reused local names).
    log = evaluate(discriminator['model'], discriminator['metric_fns'],
                   discriminator['data_loader'], discriminator['loss_fn'])

    # FIX: the original logged with an undefined name `logger` (NameError);
    # use the discriminator's logger, since that is the model being evaluated.
    dis_logger = discriminator['logger']
    dis_logger.info('< Evaluation >')
    for key, value in log.items():
        dis_logger.info(' {:15s}: {}'.format(str(key), value))
def train(cfg):
    """Train a YOLOv3 model on a VOC-style dataset driven by a yacs-style
    `cfg` object.

    Reads the module-level ``device`` / ``mixed_precision`` / ``tb_writer`` /
    ``args`` globals; writes progress to ``results.txt`` and checkpoints under
    ``weights/``.
    """
    # Initialize
    init_seeds()
    image_size_min = 6.6  # 320 / 32 / 1.5
    image_size_max = 28.5  # 320 / 32 / 28.5
    if cfg.TRAIN.MULTI_SCALE:
        image_size_min = round(cfg.TRAIN.IMAGE_SIZE / 32 / 1.5)
        image_size_max = round(cfg.TRAIN.IMAGE_SIZE / 32 * 1.5)
        image_size = image_size_max * 32  # initiate with maximum multi_scale size
        print(f"Using multi-scale {image_size_min * 32} - {image_size}")

    # Remove previous results
    for files in glob.glob("results.txt"):
        os.remove(files)

    # Initialize model
    model = YOLOv3(cfg).to(device)

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=cfg.TRAIN.LR,
                          momentum=cfg.TRAIN.MOMENTUM,
                          weight_decay=cfg.TRAIN.DECAY,
                          nesterov=True)

    # Define the loss function calculation formula of the model
    compute_loss = YoloV3Loss(cfg)

    epoch = 0
    start_epoch = 0
    best_maps = 0.0
    context = None

    # Dataset: apply augmentation hyperparameters
    train_dataset = VocDataset(anno_file_type=cfg.TRAIN.DATASET,
                               image_size=cfg.TRAIN.IMAGE_SIZE,
                               cfg=cfg)
    # Dataloader
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.TRAIN.MINI_BATCH_SIZE,
                                  num_workers=cfg.TRAIN.WORKERS,
                                  shuffle=cfg.TRAIN.SHUFFLE,
                                  pin_memory=cfg.TRAIN.PIN_MENORY)

    if cfg.TRAIN.WEIGHTS.endswith(".pth"):
        state = torch.load(cfg.TRAIN.WEIGHTS, map_location=device)
        # load model: keep only tensors whose element count matches this cfg
        try:
            state["state_dict"] = {
                k: v
                for k, v in state["state_dict"].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(state["state_dict"], strict=False)
        except KeyError as e:
            error_msg = f"{cfg.TRAIN.WEIGHTS} is not compatible with {cfg.CONFIG_FILE}. "
            error_msg += f"Specify --weights `` or specify a --config-file "
            error_msg += f"compatible with {cfg.TRAIN.WEIGHTS}. "
            raise KeyError(error_msg) from e

        # load optimizer (best_maps only restored together with it)
        if state["optimizer"] is not None:
            optimizer.load_state_dict(state["optimizer"])
            best_maps = state["best_maps"]

        # load results
        if state.get("training_results") is not None:
            with open("results.txt", "w") as file:
                file.write(state["training_results"])  # write results.txt

        # FIX: was `state["batches"] + 1 // len(train_dataloader)` — `//`
        # binds tighter than `+`, so this resumed at the raw batch count
        # instead of the epoch index. Parenthesized to divide the sum.
        start_epoch = (state["batches"] + 1) // len(train_dataloader)
        del state
    elif len(cfg.TRAIN.WEIGHTS) > 0:
        # possible weights are "*.weights", "yolov3-tiny.conv.15", "darknet53.conv.74" etc.
        load_darknet_weights(model, cfg.TRAIN.WEIGHTS)
    else:
        print("Pre training model weight not loaded.")

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        # skip print amp info
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)

    # source https://arxiv.org/pdf/1812.01187.pdf
    scheduler = CosineDecayLR(optimizer,
                              max_batches=cfg.TRAIN.MAX_BATCHES,
                              lr=cfg.TRAIN.LR,
                              warmup=cfg.TRAIN.WARMUP_BATCHES)

    # Initialize distributed training (single-node, single-rank setup)
    if device.type != "cpu" and torch.cuda.device_count(
    ) > 1 and torch.distributed.is_available():
        dist.init_process_group(
            backend="nccl",  # "distributed backend"
            # distributed training init method
            init_method="tcp://127.0.0.1:9999",
            # number of nodes for distributed training
            world_size=1,
            # distributed training node rank
            rank=0)
        model = torch.nn.parallel.DistributedDataParallel(model)
        model.backbone = model.module.backbone

    # Model EMA
    # TODO: ema = ModelEMA(model, decay=0.9998)

    # Start training
    batches_num = len(train_dataloader)  # number of batches
    # 'loss_GIOU', 'loss_Confidence', 'loss_Classification' 'loss'
    results = (0, 0, 0, 0)
    epochs = cfg.TRAIN.MAX_BATCHES // len(train_dataloader)
    print(f"Using {cfg.TRAIN.WORKERS} dataloader workers.")
    print(
        f"Starting training {cfg.TRAIN.MAX_BATCHES} batches for {epochs} epochs..."
    )
    start_time = time.time()
    for epoch in range(start_epoch, epochs):
        model.train()

        # init batches
        batches = 0
        mean_losses = torch.zeros(4)
        print("\n")
        print(("%10s" * 7) %
              ("Batch", "memory", "GIoU", "conf", "cls", "total",
               " image_size"))
        progress_bar = tqdm(enumerate(train_dataloader), total=batches_num)
        for index, (images, small_label_bbox, medium_label_bbox,
                    large_label_bbox, small_bbox, medium_bbox,
                    large_bbox) in progress_bar:
            # number integrated batches (since train start)
            batches = index + len(train_dataloader) * epoch
            scheduler.step(batches)

            images = images.to(device)
            small_label_bbox = small_label_bbox.to(device)
            medium_label_bbox = medium_label_bbox.to(device)
            large_label_bbox = large_label_bbox.to(device)
            small_bbox = small_bbox.to(device)
            medium_bbox = medium_bbox.to(device)
            large_bbox = large_bbox.to(device)

            # Hyper parameter Burn-in: freeze BN running stats until warmup ends.
            if batches <= cfg.TRAIN.WARMUP_BATCHES:
                # NOTE(review): named_modules() names are attribute paths
                # (e.g. 'backbone.bn1'), which rarely end with 'BatchNorm2d' —
                # confirm this matches any module; an isinstance check may
                # have been intended.
                for m in model.named_modules():
                    if m[0].endswith('BatchNorm2d'):
                        m[1].track_running_stats = batches == cfg.TRAIN.WARMUP_BATCHES

            # Run model
            pred, raw = model(images)

            # Compute loss
            loss, loss_giou, loss_conf, loss_cls = compute_loss(
                pred, raw, small_label_bbox, medium_label_bbox,
                large_label_bbox, small_bbox, medium_bbox, large_bbox)

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize accumulated gradient every (BATCH_SIZE / MINI_BATCH_SIZE)
            # mini-batches.
            # FIX: was `batches % cfg.TRAIN.BATCH_SIZE // cfg.TRAIN.MINI_BATCH_SIZE == 0`,
            # which parses as `(batches % BATCH_SIZE) // MINI_BATCH_SIZE == 0`
            # and steps on the wrong iterations.
            if batches % (cfg.TRAIN.BATCH_SIZE // cfg.TRAIN.MINI_BATCH_SIZE) == 0:
                optimizer.step()
                optimizer.zero_grad()
                # TODO: ema.update(model)

            # Print batch results
            # update mean losses (running average over the epoch)
            loss_items = torch.tensor([loss_giou, loss_conf, loss_cls, loss])
            mean_losses = (mean_losses * index + loss_items) / (index + 1)
            memory = f"{torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0:.2f}G"
            context = ("%10s" * 2 + "%10.3g" * 5) % (
                "%g/%g" % (batches + 1, cfg.TRAIN.MAX_BATCHES), memory,
                *mean_losses, train_dataset.image_size)
            progress_bar.set_description(context)

            # Multi-Scale training: adjust img_size (67% - 150%) periodically
            if cfg.TRAIN.MULTI_SCALE:
                if batches % cfg.TRAIN.RESIZE_INTERVAL == 0:
                    train_dataset.image_size = random.randrange(
                        image_size_min, image_size_max + 1) * 32

            # Write Tensorboard results
            if tb_writer:
                # 'loss_GIOU', 'loss_Confidence', 'loss_Classification' 'loss'
                titles = ["GIoU", "Confidence", "Classification", "Train loss"]
                for xi, title in zip(
                        list(mean_losses) + list(results), titles):
                    tb_writer.add_scalar(title, xi, index)

        # Process epoch results
        # TODO: ema.update_attr(model)
        final_epoch = epoch + 1 == epochs

        # Calculate mAP (skip first epoch)
        maps = 0.
        if epoch > 0:
            maps = evaluate(cfg, args)

        # Write epoch results
        with open("results.txt", "a") as f:
            # 'loss_GIOU', 'loss_Confidence', 'loss_Classification' 'loss', 'maps'
            f.write(context + "%10.3g" * 1 % maps)
            f.write("\n")

        # Update best mAP
        if maps > best_maps:
            best_maps = maps

        # Save training results
        with open("results.txt", 'r') as f:
            # Create checkpoint
            state = {
                'batches': batches,
                'best_maps': maps,
                'training_results': f.read(),
                'state_dict': model.state_dict(),
                'optimizer': None if final_epoch else optimizer.state_dict()
            }

        # Save last checkpoint
        torch.save(state, "weights/checkpoint.pth")

        # Save best checkpoint
        if best_maps == maps:
            state = {
                'batches': -1,
                'best_maps': None,
                'training_results': None,
                'state_dict': model.state_dict(),
                'optimizer': None
            }
            torch.save(state, "weights/model_best.pth")

        # Delete checkpoint
        del state

    print(f"{epoch - start_epoch} epochs completed "
          f"in {(time.time() - start_time) / 3600:.3f} hours.\n")
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
def main(log_dir, model_path, augmentation, dataset, batch_size,
         learning_rate, num_workers, restore_dir, lr_value, lr_steps):
    """Train a voxel-grid ShapeNet/SHREC17 classifier end to end.

    Creates ``log_dir``, snapshots this script and the model file into it for
    reproducibility, builds the dataset/loader, then runs up to 2000 epochs
    with a piecewise-constant learning-rate schedule, checkpointing every
    epoch and evaluating every 100 epochs.

    Args:
        log_dir: fresh directory to create for logs/checkpoints (must not exist yet).
        model_path: path to a Python file that defines ``Model``.
        augmentation: number of cached augmented voxelizations per mesh (``repeat``).
        dataset: SHREC17 split name passed to ``Shrec17``.
        batch_size: training batch size.
        learning_rate: base LR; multiplied by the ``lr_value`` schedule.
        num_workers: DataLoader worker count.
        restore_dir: optional directory with ``state.pkl``/``dynamics.pkl`` to resume from.
        lr_value: LR multipliers; requires ``len(lr_value) == len(lr_steps) + 1``.
        lr_steps: epoch limits for each multiplier in ``lr_value``.
    """
    # Snapshot the call arguments before any other locals exist, for logging.
    arguments = copy.deepcopy(locals())

    # Raises if log_dir already exists -- deliberate guard against clobbering a run.
    os.mkdir(log_dir)
    shutil.copy2(__file__, os.path.join(log_dir, "script.py"))
    shutil.copy2(model_path, os.path.join(log_dir, "model.py"))

    # Log to both stdout and log_dir/log.txt; clear handlers in case of re-entry.
    logger = logging.getLogger("train")
    logger.setLevel(logging.DEBUG)
    logger.handlers = []
    ch = logging.StreamHandler()
    logger.addHandler(ch)
    fh = logging.FileHandler(os.path.join(log_dir, "log.txt"))
    logger.addHandler(fh)
    logger.info("%s", repr(arguments))

    torch.backends.cudnn.benchmark = True
    device = torch.device("cuda:0")

    # Load the model from the copy inside log_dir so the run is self-contained.
    loader = importlib.machinery.SourceFileLoader(
        'model', os.path.join(log_dir, "model.py"))
    mod = types.ModuleType(loader.name)
    loader.exec_module(mod)

    model = mod.Model(55)  # 55 = number of ShapeNet synset classes listed below
    model = torch.nn.DataParallel(model)
    model.to(device)

    if restore_dir is not None:
        # Resume weights from a previous run's checkpoint.
        model.load_state_dict(
            torch.load(os.path.join(restore_dir, "state.pkl")))

    logger.info("{} paramerters in total".format(
        sum(x.numel() for x in model.parameters())))

    # Load the dataset
    # Increasing `repeat` will generate more cached files
    cache = CacheNPY("v64d",
                     transform=Obj2Voxel(64, double=True, rotate=True),
                     repeat=augmentation)

    def transform(x):
        # Mesh -> cached 64^3 voxel grid -> float tensor (scaled by 1/8) -> low-pass.
        x = cache(x)
        x = torch.from_numpy(x.astype(np.float32)).unsqueeze(0) / 8
        x = low_pass_filter(x, 2)
        return x

    def target_transform(x):
        # Map a ShapeNet synset id (first element of the target) to a class index.
        classes = [
            '02691156', '02747177', '02773838', '02801938', '02808440',
            '02818832', '02828884', '02843684', '02871439', '02876657',
            '02880940', '02924116', '02933112', '02942699', '02946921',
            '02954340', '02958343', '02992529', '03001627', '03046257',
            '03085013', '03207941', '03211117', '03261776', '03325088',
            '03337140', '03467517', '03513137', '03593526', '03624134',
            '03636649', '03642806', '03691459', '03710193', '03759954',
            '03761084', '03790512', '03797390', '03928116', '03938244',
            '03948459', '03991062', '04004475', '04074963', '04090263',
            '04099429', '04225987', '04256520', '04330267', '04379243',
            '04401088', '04460130', '04468005', '04530566', '04554684'
        ]
        return classes.index(x[0])

    train_set = Shrec17("shrec17_data", dataset, perturbed=True,
                        download=True, transform=transform,
                        target_transform=target_transform)
    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=batch_size,
                                               sampler=EqSampler(train_set),
                                               num_workers=num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    # lr=0 is a placeholder: the real LR is written into param_groups each epoch.
    optimizer = torch.optim.Adam(model.parameters(), lr=0)

    def train_step(data, target):
        # One optimization step; returns (scalar loss, #correct in the batch).
        model.train()
        data, target = data.to(device), target.to(device)

        prediction = model(data)
        loss = F.cross_entropy(prediction, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        correct = prediction.argmax(1).eq(target).long().sum().item()
        return loss.item(), correct

    def get_learning_rate(epoch):
        # Piecewise-constant schedule: lr_value[i] applies while epoch < lr_steps[i];
        # past the last step, the final multiplier is used.
        assert len(lr_value) == len(lr_steps) + 1
        for lim, lr in zip(lr_steps, lr_value):
            if epoch < lim:
                return lr * learning_rate
        return lr_value[-1] * learning_rate

    dynamics = []
    epoch = 0
    if restore_dir is not None:
        # Resume per-batch training statistics and continue from the next epoch.
        dynamics = torch.load(os.path.join(restore_dir, "dynamics.pkl"))
        epoch = dynamics[-1]['epoch'] + 1
    score = best_score = 0

    for epoch in range(epoch, 2000):
        lr = get_learning_rate(epoch)
        logger.info("learning rate = {} and batch size = {}".format(
            lr, train_loader.batch_size))
        for p in optimizer.param_groups:
            p['lr'] = lr

        total_loss = 0
        total_correct = 0

        time_before_load = time.perf_counter()
        for batch_idx, (data, target) in enumerate(train_loader):
            time_after_load = time.perf_counter()
            time_before_step = time.perf_counter()

            loss, correct = train_step(data, target)

            total_loss += loss
            total_correct += correct
            avg_loss = total_loss / (batch_idx + 1)
            avg_correct = total_correct / len(data) / (batch_idx + 1)

            # time = data-loading time + step time for this batch.
            logger.info(
                "[{}:{}/{}] LOSS={:.2} <LOSS>={:.2} ACC={:.2} <ACC>={:.2} time={:.2}+{:.2}"
                .format(epoch, batch_idx, len(train_loader), loss, avg_loss,
                        correct / len(data), avg_correct,
                        time_after_load - time_before_load,
                        time.perf_counter() - time_before_step))
            time_before_load = time.perf_counter()

            # Record full per-batch training dynamics for offline analysis.
            dynamics.append({
                'epoch': epoch,
                'batch_idx': batch_idx,
                'step': epoch * len(train_loader) + batch_idx,
                'learning_rate': lr,
                'batch_size': len(data),
                'loss': loss,
                'correct': correct,
                'avg_loss': avg_loss,
                'avg_correct': avg_correct,
                'best_score': best_score,
                'score': score,
            })

        # Checkpoint every epoch: weights plus the dynamics log.
        torch.save(model.state_dict(), os.path.join(log_dir, "state.pkl"))
        torch.save(dynamics, os.path.join(log_dir, "dynamics.pkl"))

        if epoch % 100 == 0:
            # NOTE(review): evaluate()'s positional arguments are opaque here
            # (presumably log_dir, augmentation, split, batch size, workers,
            # checkpoint name) -- confirm against its definition.
            micro, macro = evaluate(log_dir, 1, "val", 20, 1, "state.pkl")
            score = micro["mAP"] + macro["mAP"]
            logger.info("Score={} Best={}".format(score, best_score))
            if score > best_score:
                best_score = score
                torch.save(model.state_dict(),
                           os.path.join(log_dir, "best_state.pkl"))
(time.time() - start_time) / (batch_i + 1)) log_str += f"\n---- ETA {time_left}" print(log_str) model.seen += imgs.size(0) if epoch % opt.evaluation_interval == 0: print("\n---- Evaluating Model ----") # Evaluate the model on the validation set precision, recall, AP, f1, ap_class = evaluate( model, path=valid_path, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5, img_size=opt.img_size, batch_size=8, ) evaluation_metrics = [ ("val_precision", precision.mean()), ("val_recall", recall.mean()), ("val_mAP", AP.mean()), ("val_f1", f1.mean()), ] logger.list_of_scalars_summary(evaluation_metrics, epoch) # Print class APs and mAP ap_table = [["Index", "Class name", "AP"]] for i, c in enumerate(ap_class):
if epoch % config['checkpoint_interval'] == 0: torch.save( model.state_dict(), config['checkpoint_path'] + f"yolov3_%s_%d.pth" % (config['type'], epoch)) # torch.save(model.state_dict(), # f"checkpoints/yolov3_ckpt_%d.pth" % epoch) if epoch % config['evaluation_interval'] == 0: print("\n---- Evaluating Model ----") # Evaluate the model on the validation set precision, recall, AP, f1, ap_class, landm = evaluate( model, path=valid_path, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5, img_size=config['img_size'], batch_size=config['vbatch_size'], type=config['type'], ) evaluation_metrics = [ ("val_precision", precision.mean()), ("val_recall", recall.mean()), ("val_mAP", AP.mean()), ("val_f1", f1.mean()), ] if model.type in landm_set: evaluation_metrics.append(("landm", landm.mean())) logger.list_of_scalars_summary(evaluation_metrics, epoch) val_acc.append(evaluation_metrics) with open(config['val_metrics'].format(config['type']), 'w') as f:
# # if opt.verbose: print(log_str) model.module.seen += imgs.size(0) # if batch_i > 30: break scheduler.step() if epoch % opt.evaluation_interval == 0: print("\n---- Evaluating Model ----") # Evaluate the model on the validation set metrics_output = evaluate( model, Loss, path=valid_path, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5, img_size=opt.img_size, batch_size=20, ) if metrics_output is not None: precision, recall, AP, f1, ap_class = metrics_output evaluation_metrics = [ ("validation/precision", precision.mean()), ("validation/recall", recall.mean()), ("validation/mAP", AP.mean()), ("validation/f1", f1.mean()), ] logger.list_of_scalars_summary(evaluation_metrics, epoch)
ev_iterations = 100 n_units = 40 print(' ') print('number of hidden units: ' + str(n_units)) print(' ') test_data_stack = [] test_hh_stack = [] for i in range(ev_iterations): import test # Evaluate on test dataset test_data = test.generate_test_dataset() test_perp, test_hh, test_y_bin_error_sum = test.evaluate(test_data, test=True) test_data_stack.extend(test_data['coordinates']) test_hh_stack.extend(test_hh) input_data = test_hh_stack output_data = [] for i in range(len(test_data_stack)): output_data.append(test_data_stack[i][0] + test_data_stack[i][1] * 9) print('') # data allocation X_train, X_test, y_train, y_test = train_test_split(input_data, output_data) # parameters
b_size = target_data.size()[0] domain_label = torch.full((b_size, ), 1, dtype=torch.long, device=device) # all 1 _, dom_output = model(target_data, alpha) target_domain_loss = domain_criterion(dom_output, domain_label) loss = label_loss + source_domain_loss + target_domain_loss optimizer.zero_grad() loss.backward() optimizer.step() iters += 1 train_info += ' loss: {:.4f}'.format(label_loss.data.cpu().numpy()) if i % 50 == 0: print(train_info) if (epoch + 1) % 1 == 0: print("testing.... ") acc = evaluate(model, test_loader, 0, False) print("acc: ", acc) print("best acc so far... ", best_acc) if acc > best_acc: best_acc = acc print("This is the best model!!!") save_model(model, os.path.join(save_dir, 'model_best.pth.tar')) save_model(model, os.path.join(save_dir, 'model_{}.pth.tar'.format(epoch)))
#%% device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = Darknet(opt.model_def).to(device) model.load_state_dict(torch.load(opt.model)) #加载模型 # 解析config文件 data_config = parse_data_config(opt.data_config) valid_path = data_config["valid"] #获取验证集路径 class_names = load_classes(data_config["names"]) #加载类别对应名字 eval_model = lambda model: evaluate(model, path=valid_path, iou_thres=0.5, conf_thres=0.01, nms_thres=0.5, img_size=model.img_size, batch_size=8) obtain_num_parameters = lambda model: sum( [param.nelement() for param in model.parameters()]) origin_model_metric = eval_model(model) #稀疏化训练的模型的评价指标(还没有剪枝) origin_nparameters = obtain_num_parameters(model) #稀疏化训练的模型的参数 # 返回CBL组件的id,单独的Conv层的id,以及需要被剪枝的层的id CBL_idx, Conv_idx, prune_idx = parse_module_defs(model.module_defs) # 获取CBL组件的BN层的权重,即Gamma参数,我们会根据这个参数来剪枝 bn_weights = gather_bn_weights(model.module_list, prune_idx)
def train(args):
    """Train a PPO/DrAC agent on a Procgen environment.

    Builds the vectorized Procgen envs, selects one of several DrAC agent
    variants (UCB, meta-learned, RL2, or the model-based default), then runs
    the usual rollout-collect / agent-update loop, logging train and test
    episode returns every ``args.log_interval`` updates.

    NOTE(review): ``modelbased``, ``aug_to_func``, ``algo``, ``data_augs``
    etc. are free names resolved at module level -- this function assumes
    they are defined elsewhere in the file.
    """
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    utils.cleanup_log_dir(log_dir)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    log_file = '-{}-{}-reproduce-s{}'.format(args.run_name, args.env_name,
                                             args.seed)
    logger.configure(dir=args.log_dir,
                     format_strs=['csv', 'stdout'],
                     log_suffix=log_file)

    # Build the vectorized Procgen environment stack (RGB obs, reward
    # monitoring, return normalization, torch tensors on `device`).
    venv = ProcgenEnv(num_envs=args.num_processes, env_name=args.env_name, \
        num_levels=args.num_levels, start_level=args.start_level, \
        distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)
    envs = VecPyTorchProcgen(venv, device)

    obs_shape = envs.observation_space.shape
    actor_critic = Policy(obs_shape,
                          envs.action_space.n,
                          base_kwargs={
                              'recurrent': False,
                              'hidden_size': args.hidden_size
                          })
    actor_critic.to(device)

    # The model-based variant needs extra storage (next_obs) in its rollouts.
    if modelbased:
        rollouts = BiggerRolloutStorage(
            args.num_steps,
            args.num_processes,
            envs.observation_space.shape,
            envs.action_space,
            actor_critic.recurrent_hidden_state_size,
            aug_type=args.aug_type,
            split_ratio=args.split_ratio)
    else:
        rollouts = RolloutStorage(args.num_steps,
                                  args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size,
                                  aug_type=args.aug_type,
                                  split_ratio=args.split_ratio)

    batch_size = int(args.num_processes * args.num_steps /
                     args.num_mini_batch)

    # Select the agent variant. The two `elif False:` branches (plain DrAC
    # and model-free planning DrAC) are intentionally disabled dead code;
    # the fall-through default is the model-based ConvDrAC agent.
    if args.use_ucb:
        # UCB bandit over augmentation types.
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]
        agent = algo.UCBDrAC(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             ucb_exploration_coef=args.ucb_exploration_coef,
                             ucb_window_length=args.ucb_window_length)
    elif args.use_meta_learning:
        # Meta-learned convolutional augmentation.
        aug_id = data_augs.Identity
        aug_list = [aug_to_func[t](batch_size=batch_size) \
            for t in list(aug_to_func.keys())]

        aug_model = AugCNN()
        aug_model.to(device)

        agent = algo.MetaDrAC(actor_critic,
                              aug_model,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              meta_grad_clip=args.meta_grad_clip,
                              meta_num_train_steps=args.meta_num_train_steps,
                              meta_num_test_steps=args.meta_num_test_steps,
                              lr=args.lr,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm,
                              aug_id=aug_id,
                              aug_coef=args.aug_coef)
    elif args.use_rl2:
        # RL^2 learner that picks augmentations recurrently.
        aug_id = data_augs.Identity
        aug_list = [
            aug_to_func[t](batch_size=batch_size)
            for t in list(aug_to_func.keys())
        ]

        rl2_obs_shape = [envs.action_space.n + 1]
        rl2_learner = Policy(rl2_obs_shape,
                             len(list(aug_to_func.keys())),
                             base_kwargs={
                                 'recurrent': True,
                                 'hidden_size': args.rl2_hidden_size
                             })
        rl2_learner.to(device)

        agent = algo.RL2DrAC(actor_critic,
                             rl2_learner,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             args.rl2_entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             rl2_lr=args.rl2_lr,
                             rl2_eps=args.rl2_eps,
                             max_grad_norm=args.max_grad_norm,
                             aug_list=aug_list,
                             aug_id=aug_id,
                             aug_coef=args.aug_coef,
                             num_aug_types=len(list(aug_to_func.keys())),
                             recurrent_hidden_size=args.rl2_hidden_size,
                             num_actions=envs.action_space.n,
                             device=device)
    elif False:
        # Regular Drac
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        agent = algo.DrAC(actor_critic,
                          args.clip_param,
                          args.ppo_epoch,
                          args.num_mini_batch,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          max_grad_norm=args.max_grad_norm,
                          aug_id=aug_id,
                          aug_func=aug_func,
                          aug_coef=args.aug_coef,
                          env_name=args.env_name)
    elif False:
        # Model Free Planning Drac
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        actor_critic = PlanningPolicy(obs_shape,
                                      envs.action_space.n,
                                      base_kwargs={
                                          'recurrent': False,
                                          'hidden_size': args.hidden_size
                                      })
        actor_critic.to(device)

        agent = algo.DrAC(actor_critic,
                          args.clip_param,
                          args.ppo_epoch,
                          args.num_mini_batch,
                          args.value_loss_coef,
                          args.entropy_coef,
                          lr=args.lr,
                          eps=args.eps,
                          max_grad_norm=args.max_grad_norm,
                          aug_id=aug_id,
                          aug_func=aug_func,
                          aug_coef=args.aug_coef,
                          env_name=args.env_name)
    else:
        # Model based Drac: replaces the policy with a model-based one and
        # trains transition/reward models alongside PPO.
        aug_id = data_augs.Identity
        aug_func = aug_to_func[args.aug_type](batch_size=batch_size)

        actor_critic = ModelBasedPolicy(obs_shape,
                                        envs.action_space.n,
                                        base_kwargs={
                                            'recurrent': False,
                                            'hidden_size': args.hidden_size
                                        })
        actor_critic.to(device)

        agent = algo.ConvDrAC(actor_critic,
                              args.clip_param,
                              args.ppo_epoch,
                              args.num_mini_batch,
                              args.value_loss_coef,
                              args.entropy_coef,
                              lr=args.lr,
                              eps=args.eps,
                              max_grad_norm=args.max_grad_norm,
                              aug_id=aug_id,
                              aug_func=aug_func,
                              aug_coef=args.aug_coef,
                              env_name=args.env_name)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    if modelbased:
        rollouts.next_obs[0].copy_(obs)  # TODO: is this right?
    rollouts.to(device)

    # Running window of the last 10 completed-episode returns.
    episode_rewards = deque(maxlen=10)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in trange(num_updates):
        actor_critic.train()

        # Collect one rollout of args.num_steps transitions per env.
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = aug_id(rollouts.obs[step])
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Bootstrap value for the final state, then compute GAE returns.
        with torch.no_grad():
            obs_id = aug_id(rollouts.obs[-1])
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        if args.use_ucb and j > 0:
            agent.update_ucb_values(rollouts)

        # ConvDrAC additionally reports transition/reward model losses.
        if isinstance(agent, algo.ConvDrAC):
            value_loss, action_loss, dist_entropy, transition_model_loss, reward_model_loss = agent.update(
                rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # NOTE(review): the format string has 5 placeholders but 8
            # arguments; str.format silently ignores the extras
            # (dist_entropy, value_loss, action_loss).
            print(
                "\nUpdate {}, step {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        dist_entropy, value_loss, action_loss))

            logger.logkv("train/nupdates", j)
            logger.logkv("train/total_num_steps", total_num_steps)

            logger.logkv("losses/dist_entropy", dist_entropy)
            logger.logkv("losses/value_loss", value_loss)
            logger.logkv("losses/action_loss", action_loss)
            if isinstance(agent, algo.ConvDrAC):
                logger.logkv("losses/transition_model_loss",
                             transition_model_loss)
                logger.logkv("losses/reward_model_loss", reward_model_loss)

            logger.logkv("train/mean_episode_reward", np.mean(episode_rewards))
            logger.logkv("train/median_episode_reward",
                         np.median(episode_rewards))

            ### Eval on the Full Distribution of Levels ###
            eval_episode_rewards = evaluate(args,
                                            actor_critic,
                                            device,
                                            aug_id=aug_id)

            logger.logkv("test/mean_episode_reward",
                         np.mean(eval_episode_rewards))
            logger.logkv("test/median_episode_reward",
                         np.median(eval_episode_rewards))

            logger.dumpkvs()
loss = criterion(output, cls) # compute loss optimizer.zero_grad() # set grad of all parameters to zero loss.backward() # compute gradient for each parameters optimizer.step() # update parameters ''' write out information to tensorboard ''' writer.add_scalar('loss vs iters', loss.data.cpu().numpy(), iters) # training loss vs num of iterations train_info += ' loss: {:.4f}'.format(loss.data.cpu().numpy()) print(train_info) if epoch % args.val_epoch == 0: ''' evaluate the model ''' acc = test.evaluate(model, val_loader, 0) writer.add_scalar( 'val_acc vs epoch', acc, epoch) # mIOU score on validation set vs num of epochs writer.add_scalar( 'val_acc vs iters', acc, iters) # mIOU score on validation set vs num of iterations print('Epoch: [{}] ACC:{}'.format(epoch, acc)) ''' save best model ''' if acc > best_acc: if args.model_level == 'baseline': save_model( model, os.path.join(args.save_dir, 'baseline_model.pth.tar')) else: save_model(
def main(args):
    """Train, validate and test a Wave-U-Net source-separation model.

    Builds the model and the MUSDB train/val/test datasets, trains with a
    cyclic learning rate until validation loss fails to improve for
    ``args.patience`` epochs (early stopping), then reloads the best
    checkpoint and reports test loss plus SDR/SIR metrics to TensorBoard.
    """
    #torch.backends.cudnn.benchmark=True # This makes dilated conv much faster for CuDNN 7.5

    # MODEL
    # Per-level feature counts: arithmetic ("add") or geometric growth.
    num_features = [args.features*i for i in range(1, args.levels+1)] if args.feature_growth == "add" else \
                   [args.features*2**i for i in range(0, args.levels)]
    target_outputs = int(args.output_size * args.sr)
    model = Waveunet(args.channels,
                     num_features,
                     args.channels,
                     args.instruments,
                     kernel_size=args.kernel_size,
                     target_output_size=target_outputs,
                     depth=args.depth,
                     strides=args.strides,
                     conv_type=args.conv_type,
                     res=args.res,
                     separate=args.separate)

    if args.cuda:
        model = model_utils.DataParallel(model)
        print("move model to gpu")
        model.cuda()

    print('model: ', model)
    print('parameter count: ',
          str(sum(p.numel() for p in model.parameters())))

    writer = SummaryWriter(args.log_dir)

    ### DATASET
    musdb = get_musdb_folds(args.dataset_dir)
    # If not data augmentation, at least crop targets to fit model output shape
    crop_func = partial(crop_targets, shapes=model.shapes)
    # Data augmentation function for training
    augment_func = partial(random_amplify,
                           shapes=model.shapes,
                           min=0.7,
                           max=1.0)
    train_data = SeparationDataset(musdb, "train", args.instruments, args.sr,
                                   args.channels, model.shapes, True,
                                   args.hdf_dir, audio_transform=augment_func)
    val_data = SeparationDataset(musdb, "val", args.instruments, args.sr,
                                 args.channels, model.shapes, False,
                                 args.hdf_dir, audio_transform=crop_func)
    test_data = SeparationDataset(musdb, "test", args.instruments, args.sr,
                                  args.channels, model.shapes, False,
                                  args.hdf_dir, audio_transform=crop_func)

    dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        worker_init_fn=utils.worker_init_fn)

    ##### TRAINING ####

    # Set up the loss function
    if args.loss == "L1":
        criterion = nn.L1Loss()
    elif args.loss == "L2":
        criterion = nn.MSELoss()
    else:
        raise NotImplementedError("Couldn't find this loss!")

    # Set up optimiser
    optimizer = Adam(params=model.parameters(), lr=args.lr)

    # Set up training state dict that will also be saved into checkpoints
    state = {"step": 0, "worse_epochs": 0, "epochs": 0, "best_loss": np.Inf}

    # LOAD MODEL CHECKPOINT IF DESIRED
    if args.load_model is not None:
        print("Continuing training full model from checkpoint " +
              str(args.load_model))
        state = model_utils.load_model(model, optimizer, args.load_model,
                                       args.cuda)

    print('TRAINING START')
    # Early stopping: train until val loss hasn't improved for `patience` epochs.
    while state["worse_epochs"] < args.patience:
        print("Training one epoch from iteration " + str(state["step"]))
        avg_time = 0.
        model.train()
        with tqdm(total=len(train_data) // args.batch_size) as pbar:
            # Re-seed so DataLoader workers don't repeat augmentation randomness.
            np.random.seed()
            for example_num, (x, targets) in enumerate(dataloader):
                if args.cuda:
                    x = x.cuda()
                    for k in list(targets.keys()):
                        targets[k] = targets[k].cuda()

                t = time.time()

                # Set LR for this iteration
                utils.set_cyclic_lr(optimizer, example_num,
                                    len(train_data) // args.batch_size,
                                    args.cycles, args.min_lr, args.lr)
                writer.add_scalar("lr", utils.get_lr(optimizer),
                                  state["step"])

                # Compute loss for each instrument/model
                # (compute_grad=True runs backward inside the helper).
                optimizer.zero_grad()
                outputs, avg_loss = model_utils.compute_loss(
                    model, x, targets, criterion, compute_grad=True)

                optimizer.step()

                state["step"] += 1

                # Exponential-free running mean of per-batch wall time.
                t = time.time() - t
                avg_time += (1. / float(example_num + 1)) * (t - avg_time)

                writer.add_scalar("train_loss", avg_loss, state["step"])

                if example_num % args.example_freq == 0:
                    input_centre = torch.mean(
                        x[0, :, model.shapes["output_start_frame"]:model.
                          shapes["output_end_frame"]],
                        0)  # Stereo not supported for logs yet
                    writer.add_audio("input",
                                     input_centre,
                                     state["step"],
                                     sample_rate=args.sr)

                    for inst in outputs.keys():
                        writer.add_audio(inst + "_pred",
                                         torch.mean(outputs[inst][0], 0),
                                         state["step"],
                                         sample_rate=args.sr)
                        writer.add_audio(inst + "_target",
                                         torch.mean(targets[inst][0], 0),
                                         state["step"],
                                         sample_rate=args.sr)

                pbar.update(1)

        # VALIDATE
        val_loss = validate(args, model, criterion, val_data)
        print("VALIDATION FINISHED: LOSS: " + str(val_loss))
        writer.add_scalar("val_loss", val_loss, state["step"])

        # EARLY STOPPING CHECK
        checkpoint_path = os.path.join(args.checkpoint_dir,
                                       "checkpoint_" + str(state["step"]))
        if val_loss >= state["best_loss"]:
            state["worse_epochs"] += 1
        else:
            print("MODEL IMPROVED ON VALIDATION SET!")
            state["worse_epochs"] = 0
            state["best_loss"] = val_loss
            state["best_checkpoint"] = checkpoint_path

        # CHECKPOINT
        print("Saving model...")
        model_utils.save_model(model, optimizer, state, checkpoint_path)

        state["epochs"] += 1

    #### TESTING ####
    # Test loss
    print("TESTING")

    # Load best model based on validation loss
    state = model_utils.load_model(model, None, state["best_checkpoint"],
                                   args.cuda)
    test_loss = validate(args, model, criterion, test_data)
    print("TEST FINISHED: LOSS: " + str(test_loss))
    writer.add_scalar("test_loss", test_loss, state["step"])

    # Mir_eval metrics
    test_metrics = evaluate(args, musdb["test"], model, args.instruments)

    # Dump all metrics results into pickle file for later analysis if needed
    with open(os.path.join(args.checkpoint_dir, "results.pkl"), "wb") as f:
        pickle.dump(test_metrics, f)

    # Write most important metrics into Tensorboard log
    # (mean over songs of the per-song nan-mean SDR/SIR, per instrument).
    avg_SDRs = {
        inst:
        np.mean([np.nanmean(song[inst]["SDR"]) for song in test_metrics])
        for inst in args.instruments
    }
    avg_SIRs = {
        inst:
        np.mean([np.nanmean(song[inst]["SIR"]) for song in test_metrics])
        for inst in args.instruments
    }
    for inst in args.instruments:
        writer.add_scalar("test_SDR_" + inst, avg_SDRs[inst], state["step"])
        writer.add_scalar("test_SIR_" + inst, avg_SIRs[inst], state["step"])
    overall_SDR = np.mean([v for v in avg_SDRs.values()])
    # NOTE(review): this add_scalar call passes no global_step, unlike the
    # others -- confirm whether state["step"] was intended here.
    writer.add_scalar("test_SDR", overall_SDR)
    print("SDR: " + str(overall_SDR))

    writer.close()
''' move data to gpu ''' imgs, seg = imgs.cuda(), seg.cuda() ''' forward path ''' output = model(imgs) ''' compute loss, backpropagation, update parameters ''' loss = criterion(output, seg) # compute loss optimizer.zero_grad() # set grad of all parameters to zero loss.backward() # compute gradient for each parameters optimizer.step() # update parameters ''' write out information to tensorboard ''' writer.add_scalar('loss', loss.data.cpu().numpy(), iters) train_info += ' loss: {:.4f}'.format(loss.data.cpu().numpy()) print(train_info) if epoch % args.val_epoch == 0: ''' evaluate the model ''' acc = evaluate(model, val_loader) writer.add_scalar('val_acc', acc, iters) print('Epoch: [{}] ACC:{}'.format(epoch, acc)) ''' save best model ''' if acc > best_acc: save_model( model, os.path.join(args.save_improved_dir, 'model_best.pth.tar')) best_acc = acc ''' save model ''' #save_model(model, os.path.join(args.save_improved_dir, 'model_{}.pth.tar'.format(epoch))) sched.step()
def train(args, seeds):
    """Train a PPO agent with prioritized level replay on Procgen/MiniGrid.

    Sets up the level-sampling vectorized envs (or the full training
    distribution when ``args.full_train_distribution``), then runs the
    standard rollout/update loop. Every ``args.log_interval`` updates it
    evaluates on held-out test levels and on the training levels, logs
    stats via ``plogger``, and periodically checkpoints the model.

    Args:
        args: parsed command-line namespace.
        seeds: list of training level seeds (set to None when training on
            the full level distribution).

    Uses the module-level global ``last_checkpoint_time`` to rate-limit
    checkpoint writes across calls.
    """
    global last_checkpoint_time
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    if 'cuda' in device.type:
        print('Using CUDA\n')

    torch.set_num_threads(1)
    utils.seed(args.seed)

    # Configure logging
    if args.xpid is None:
        # Auto-generate an experiment id from the current timestamp.
        args.xpid = "lr-%s" % time.strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.expandvars(os.path.expanduser(args.log_dir))
    plogger = FileWriter(
        xpid=args.xpid,
        xp_args=args.__dict__,
        rootdir=log_dir,
        seeds=seeds,
    )
    stdout_logger = HumanOutputFormat(sys.stdout)
    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (log_dir, args.xpid, "model.tar")))

    # Configure actor envs
    start_level = 0
    if args.full_train_distribution:
        # num_levels=0 means "use the full distribution"; no level sampler.
        num_levels = 0
        level_sampler_args = None
        seeds = None
    else:
        num_levels = 1
        level_sampler_args = dict(
            num_actors=args.num_processes,
            strategy=args.level_replay_strategy,
            replay_schedule=args.level_replay_schedule,
            score_transform=args.level_replay_score_transform,
            temperature=args.level_replay_temperature,
            eps=args.level_replay_eps,
            rho=args.level_replay_rho,
            nu=args.level_replay_nu,
            alpha=args.level_replay_alpha,
            staleness_coef=args.staleness_coef,
            staleness_transform=args.staleness_transform,
            staleness_temperature=args.staleness_temperature)
    envs, level_sampler = make_lr_venv(
        num_envs=args.num_processes,
        env_name=args.env_name,
        seeds=seeds,
        device=device,
        num_levels=num_levels,
        start_level=start_level,
        no_ret_normalization=args.no_ret_normalization,
        distribution_mode=args.distribution_mode,
        paint_vel_info=args.paint_vel_info,
        level_sampler_args=level_sampler_args)

    is_minigrid = args.env_name.startswith('MiniGrid')

    actor_critic = model_for_env_name(args, envs)
    actor_critic.to(device)
    print(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    batch_size = int(args.num_processes * args.num_steps /
                     args.num_mini_batch)

    def checkpoint():
        # Save model + optimizer + args to `checkpointpath`.
        # Uses `agent`, defined just below, via closure.
        if args.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": actor_critic.state_dict(),
                "optimizer_state_dict": agent.optimizer.state_dict(),
                "args": vars(args),
            },
            checkpointpath,
        )

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm,
                     env_name=args.env_name)

    # One level seed per actor; the sampler's reset also returns the seeds.
    level_seeds = torch.zeros(args.num_processes)
    if level_sampler:
        obs, level_seeds = envs.reset()
    else:
        obs = envs.reset()
    level_seeds = level_seeds.unsqueeze(-1)
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Running window of the last 10 completed-episode returns.
    episode_rewards = deque(maxlen=10)

    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    timer = timeit.default_timer
    update_start_time = timer()
    for j in range(num_updates):
        actor_critic.train()

        # Collect one rollout of args.num_steps transitions per actor.
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                obs_id = rollouts.obs[step]
                value, action, action_log_dist, recurrent_hidden_states = actor_critic.act(
                    obs_id, rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                action_log_prob = action_log_dist.gather(-1, action)

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            # Reset all done levels by sampling from level sampler
            for i, info in enumerate(infos):
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                if level_sampler:
                    level_seeds[i][0] = info['level_seed']

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, action_log_dist, value, reward,
                            masks, bad_masks, level_seeds)

        # Bootstrap value for the final state, then compute GAE returns.
        with torch.no_grad():
            obs_id = rollouts.obs[-1]
            next_value = actor_critic.get_value(
                obs_id, rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.gamma, args.gae_lambda)

        # Update level sampler
        if level_sampler:
            level_sampler.update_with_rollouts(rollouts)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        if level_sampler:
            level_sampler.after_update()

        # Log stats every log_interval updates or if it is the last update
        if (j % args.log_interval == 0
                and len(episode_rewards) > 1) or j == num_updates - 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            update_end_time = timer()
            num_interval_updates = 1 if j == 0 else args.log_interval
            # Steps-per-second over the interval since the last log.
            sps = num_interval_updates * (args.num_processes *
                                          args.num_steps) / (
                                              update_end_time -
                                              update_start_time)
            update_start_time = update_end_time
            logging.info(f"\nUpdate {j} done, {total_num_steps} steps\n ")
            logging.info(
                f"\nEvaluating on {args.num_test_seeds} test levels...\n ")
            eval_episode_rewards, transitions = evaluate(
                args, actor_critic, args.num_test_seeds, device)
            plogger._save_data(transitions, f'test_trajectories_{j}.pkl')
            logging.info(
                f"\nEvaluating on {args.num_test_seeds} train levels...\n ")
            train_eval_episode_rewards, transitions = evaluate(
                args,
                actor_critic,
                args.num_test_seeds,
                device,
                start_level=0,
                num_levels=args.num_train_seeds,
                seeds=seeds,
                level_sampler=level_sampler)

            stats = {
                "step": total_num_steps,
                "pg_loss": action_loss,
                "value_loss": value_loss,
                "dist_entropy": dist_entropy,
                "train:mean_episode_return": np.mean(episode_rewards),
                "train:median_episode_return": np.median(episode_rewards),
                "test:mean_episode_return": np.mean(eval_episode_rewards),
                "test:median_episode_return":
                np.median(eval_episode_rewards),
                "train_eval:mean_episode_return":
                np.mean(train_eval_episode_rewards),
                "train_eval:median_episode_return":
                np.median(train_eval_episode_rewards),
                "sps": sps,
            }
            if is_minigrid:
                # MiniGrid episodes give positive return only on success.
                stats["train:success_rate"] = np.mean(
                    np.array(episode_rewards) > 0)
                stats["train_eval:success_rate"] = np.mean(
                    np.array(train_eval_episode_rewards) > 0)
                stats["test:success_rate"] = np.mean(
                    np.array(eval_episode_rewards) > 0)

            if j == num_updates - 1:
                # Final, larger test-seed evaluation recorded separately.
                logging.info(
                    f"\nLast update: Evaluating on {args.num_test_seeds} test levels...\n "
                )
                final_eval_episode_rewards, transitions = evaluate(
                    args, actor_critic, args.final_num_test_seeds, device)

                mean_final_eval_episode_rewards = np.mean(
                    final_eval_episode_rewards)
                median_final_eval_episide_rewards = np.median(
                    final_eval_episode_rewards)

                plogger.log_final_test_eval({
                    'num_test_seeds':
                    args.final_num_test_seeds,
                    'mean_episode_return':
                    mean_final_eval_episode_rewards,
                    'median_episode_return':
                    median_final_eval_episide_rewards
                })

            plogger.log(stats)
            if args.verbose:
                stdout_logger.writekvs(stats)

        # Log level weights
        if level_sampler and j % args.weight_log_interval == 0:
            plogger.log_level_weights(level_sampler.sample_weights())

        # Checkpoint
        timer = timeit.default_timer
        if last_checkpoint_time is None:
            last_checkpoint_time = timer()
        try:
            # Checkpoint on the last update or after save_interval minutes.
            if j == num_updates - 1 or \
                (args.save_interval > 0 and timer() - last_checkpoint_time > args.save_interval * 60):  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()
        except KeyboardInterrupt:
            return
'D:\py_pro\YOLOv3-PyTorch\yolo_cfg\\' + model_name + '.cfg', 'weights': 'D:\py_pro\YOLOv3-PyTorch\weights\\' + map_name + '\\yolov3_ep43-map82.67-loss0.15187.pt', 'train_path': 'D:\py_pro\YOLOv3-PyTorch\data\\' + map_name + '\\train.txt', 'val_path': 'D:\py_pro\YOLOv3-PyTorch\data\\' + map_name + '\\val.txt', 'prune_num': 16, # YOLOv3标准网络中有23个res块,这里代表剪掉多少块 } model = YOLOv3(import_param['cfg_path']).cuda() model.load_state_dict(torch.load(import_param['weights'])) precision, recall, before_AP, f1, ap_class = evaluate( model, path=import_param['val_path'], iou_thres=import_param['iou_thres'], conf_thres=import_param['conf_thres'], nms_thres=import_param['nms_thres'], img_size=import_param['img_size'], batch_size=import_param['batch_size'], ) # 剪枝前模型参数总量 before_parameters = sum([param.nelement() for param in model.parameters()]) print(f'稀疏化训练后模型mAP:{before_AP.mean():.4f}') CBL_idx, _, shortcut_idx = parse_blocks_layer(model.blocks) # 将所有要剪枝的BN层的绝对值化γ参数,拷贝到bn_weights一维tensor上 bn_weights = gather_bn_weights(model.module_list, shortcut_idx) # torch.sort return: (value, index) 是排序后的值列表,排序后的值在排序前的索引 默认从小到大排序 sorted_bn = torch.sort(bn_weights)[0]
print('Finished Training!') if __name__ == "__main__": device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') print(device) bert_model = "Musixmatch/umberto-commoncrawl-cased-v1" # bert_model = "idb-ita/gilberto-uncased-from-camembert" num_classes = 11 bert = UmbertoCustom(bert_model=bert_model, num_classes=num_classes).to(device) train_iter, valid_iter, test_iter = get_train_valid_test_fine(bert_model=bert_model, max_seq_lenght=512) opt = optim.Adam(bert.parameters(), lr=2e-5) init_time = time.time() train(model=bert, optimizer=opt, train_loader=train_iter, valid_loader=valid_iter, num_epochs=5, eval_every=len(train_iter) // 2, file_path="../data/models/") tot_time = time.time() - init_time print("time taken:", int(tot_time // 60), "minutes", int(tot_time % 60), "seconds") best_model = UmbertoCustom(bert_model=bert_model, num_classes=num_classes).to(device) load_checkpoint("../data/models" + '/model2.pt', best_model, device) evaluate(best_model, test_iter, num_classes, device)
def main(argv):
    """Train the anchor-based detector described by --data_config.

    Parses CLI options from *argv*, builds the model/dataset/optimizer,
    runs the epoch loop with exponential LR decay, and periodically saves
    weights and evaluates on the validation split.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10000, help="number of epochs")
    parser.add_argument("--batch_size", type=int, default=20, help="size of each image batch")
    parser.add_argument("--data_config", type=str, default="config/adc.data", help="path to data config file")
    # parser.add_argument("--pretrained_weights", type=str, default="config/yolov3_ckpt_5.pth")
    # models/model1/yolov3_ckpt_73.pth
    parser.add_argument("--pretrained_weights", type=str)
    parser.add_argument("--n_cpu", type=int, default=0, help="number of cpu threads to use during batch generation")
    parser.add_argument("--img_size", type=int, default=[768, 1024], help="size of each image dimension")
    parser.add_argument("--evaluation_interval", type=int, default=1, help="interval evaluations on validation set")
    parser.add_argument("--multiscale", default='False', choices=['True', 'False'])
    parser.add_argument("--augment", default='False', choices=['True', 'False'])
    parser.add_argument("--save_path", type=str, default='models/weights_1350_0102', help="save model path")
    parser.add_argument("--debug", type=str, default='False', choices=['True', 'False'], help="debug")
    parser.add_argument("--lr", type=float, default=0.01, help="learning rate")
    args = parser.parse_args(argv)

    # Flags arrive as the strings 'True'/'False'; convert to real booleans.
    args.debug = args.debug == 'True'
    args.multiscale = args.multiscale == 'True'
    args.augment = args.augment == 'True'
    print_args(args)
    print(datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'))

    if args.debug:
        print('debug...')
        import shutil
        # if os.path.exists(args.save_path):
        #     shutil.rmtree(args.save_path)
        # In debug mode: (optionally) wipe save_path and validate every epoch.
        args.evaluation_interval = 1
    # assert not os.path.exists(args.save_path)
    # os.makedirs(args.save_path)

    # The .data config holds the train/valid list paths and the anchors file.
    data_config = parse_data_config(args.data_config)
    train_path = data_config["train"]
    valid_path = data_config["valid"]
    if args.debug:
        # Validate on the training list so debug runs are self-contained.
        valid_path = train_path

    anchors = get_anchors(data_config['anchors']).to('cuda')
    model = ResNet(anchors).to('cuda')
    if args.pretrained_weights:
        print('pretrained weights: ', args.pretrained_weights)
        model.load_pretrained_weights(args.pretrained_weights)

    dataset = ListDataset(train_path, img_size=args.img_size, augment=args.augment, multiscale=args.multiscale)
    # Renamed from `eval`: the original shadowed the `eval` builtin.
    evaluator = evaluate(path=valid_path, img_size=args.img_size, batch_size=args.batch_size, debug=args.debug)
    if args.debug:
        # Shrink the training set to 10 batches for fast iteration.
        dataset.img_files = dataset.img_files[:10 * args.batch_size]
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.n_cpu,
        collate_fn=dataset.collate_fn,
    )
    print('Number train sample: ', len(dataset))

    # TODO(review): optimizer choice / LR schedule may need tuning.
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=5e-5)

    print('\n### train ...')
    for epoch in range(args.epochs):
        model.train()
        # Exponential decay (x0.95 per epoch) with a floor of 1e-10.
        lr = max(1e-10, args.lr * (0.95 ** epoch))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        for batch_i, (imgs, targets, _) in enumerate(dataloader):
            # The training set goes through augment_sequential; validation does not.
            imgs = Variable(imgs.to('cuda'))
            # targets rows are (class, cx, cy, w, h) in normalized coords, e.g.
            # [[0.0000, 0.7328, 0.2808, 0.0934, 0.0808], ...]
            targets = Variable(targets.to('cuda'), requires_grad=False)

            # Each yolo_map is batch x fh x fw x anchor_num x (x, y, w, h, conf).
            yolo_map, _ = model(imgs)
            loss, metrics = model.loss(yolo_map, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if (batch_i + 1) % 100 == 0 or (batch_i + 1) == len(dataloader):
                time_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
                lr = optimizer.param_groups[0]['lr']
                # Use distinct names so the `loss` tensor is not clobbered by the float metric.
                loss_val = metrics["loss"]
                xy = metrics["xy"]
                wh = metrics["wh"]
                conf = metrics["conf"]
                loss_str = 'loss: {:<8.2f}'.format(loss_val)
                loss_str += 'xy: {:<8.2f}'.format(xy)
                loss_str += 'wh: {:<8.2f}'.format(wh)
                loss_str += 'conf: {:<8.2f}'.format(conf)
                epoch_str = 'Epoch: {:4}({:4}/{:4})'.format(epoch, batch_i + 1, len(dataloader))
                print('[{}]{} {} lr:{}'.format(time_str, epoch_str, loss_str, lr))
        print()

        if epoch % args.evaluation_interval == 0:
            print("\n---- Evaluating Model ----")
            save_model_epoch = 'yolov3_ckpt_{}.pth'.format(epoch)
            model.save_weights(os.path.join(args.save_path, save_model_epoch))
            print(save_model_epoch)

            # Only the first confidence threshold saves example images.
            example_save_path = args.save_path
            for conf_th in [0.1, 0.3, 0.5, 0.7]:
                metrics = evaluator(model, iou_thres=0.5, conf_thres=conf_th, nms_thres=0.5,
                                    save_path=example_save_path)
                example_save_path = None
                print('image_acc: {}\t{}\tbbox_acc: {}\tbbox_recall: {}'.format(*metrics[1:]))
                # NOTE(review): 'ture' typo kept byte-identical in case logs are parsed downstream.
                names = ['image', 'ture', 'det', 'box_acc', 'image_acc']
                print('{:<10}{:<10}{:<10}{:<10}{:<10}'.format(*names))
                print('{:<10}{:<10}{:<10}{:<10}{:<10}'.format(*metrics[0][0]))
                print('{:<10}{:<10}{:<10}{:<10}{:<10}'.format(*metrics[0][1]))
                print()
# Collect hidden-state features from repeated evaluations of freshly
# generated test datasets, then split them for a downstream classifier.
ev_iterations = 100
n_units = 40

print(' ')
print('number of hidden units: ' + str(n_units))
print(' ')

test_data_stack = []
test_hh_stack = []

# Hoisted out of the loop: re-importing each iteration was a cached no-op.
import test

for _ in range(ev_iterations):
    # Evaluate on a newly generated test dataset
    test_data = test.generate_test_dataset()
    test_perp, test_hh = test.evaluate(test_data, test=True)
    test_data_stack.extend(test_data['coordinates'])
    test_hh_stack.extend(test_hh)

input_data = test_hh_stack
# Encode each (col, row) coordinate pair as one class label.
# NOTE(review): assumes a 9-column grid — confirm against the generator.
output_data = [coord[0] + coord[1] * 9 for coord in test_data_stack]

print('')

# data allocation
X_train, X_test, y_train, y_test = train_test_split(input_data, output_data)

# parameters
def train(check_every=0, save_every=5):
    """Run the training loop over loader_train, updating the globals
    model/epoch/step.

    check_every: if > 0, evaluate train/test mAP every `check_every` steps.
    save_every:  checkpoint via save() every `save_every` epochs; also
                 saves and decays the LR at each epoch in decay_epochs.
    """
    global model, epoch, step
    optimizer = get_optimizer()
    model.train()
    start = tic = time()

    # Seed the displayed mAPs with the last recorded values, if any.
    # NOTE(review): assumes summary['map']['test'] is non-empty whenever
    # summary['map']['train'] is — confirm.
    train_mAP = test_mAP = 0.
    if summary['map']['train']:
        train_mAP = summary['map']['train'][-1][1]
        test_mAP = summary['map']['test'][-1][1]

    if pretty:
        pretty_head()

    for e in range(epoch, num_epochs):
        if not pretty:
            print('- Epoch {}'.format(e))
        for x, y, a in loader_train:
            if len(y) == 0:
                continue  # no target in this image
            loss = train_step(x, y, a, optimizer)
            toc = time()
            iter_time = toc - tic
            tic = toc

            # Periodic mAP evaluation on train (val subset) and test loaders.
            if check_every and step > 0 and step % check_every == 0:
                # Keep quiet while evaluating
                voc_train.mute = True
                voc_test.mute = True
                if pretty:
                    pretty_tail()
                print('Checking mAP ...')
                train_mAP = evaluate(model, loader_val, 200)
                summary['map']['train'].append((step, train_mAP))
                test_mAP = evaluate(model, loader_test, 200)
                summary['map']['test'].append((step, test_mAP))
                if pretty:
                    pretty_head()
                else:
                    print('train mAP = {:.1f}%'.format(100 * train_mAP))
                    print('test mAP = {:.1f}%'.format(100 * test_mAP))
                # Restore muting to match the pretty-printing mode.
                voc_train.mute = pretty
                voc_test.mute = pretty

            step += 1
            if pretty:
                pretty_body(summary, start, iter_time, learning_rate, epoch,
                            step, a['image_id'], train_mAP, test_mAP)
            else:
                print('Use time: {:.2f}s'.format(iter_time))
                print('-- Iteration {it}, loss = {loss:.4f}\n'.format(
                    it=step, loss=loss))

        epoch += 1
        # save model
        if epoch % save_every == 0:
            save()
        # At scheduled epochs: checkpoint, then decay the learning rate.
        if epoch in decay_epochs:
            save()
            optimizer = lr_decay()

    if pretty:
        pretty_tail()
# BiSeNet face-parsing label ids for the regions we want to recolor.
table = {'left_eye': 4, 'right_eye': 5, 'upper_lip': 12, 'lower_lip': 13}

image_path = args.img_path
cp = 'cp/79999_iter.pth'  # pretrained parsing checkpoint

try:
    # cv2.imread returns None on failure, so .copy() raises AttributeError.
    image = cv2.imread(image_path)
    ori = image.copy()
    im2 = image.copy()
    # Blank white canvas (assumes a 1080x1080 input — TODO confirm).
    im2 = cv2.rectangle(im2, (0, 0), (1080, 1080), (255, 255, 255), thickness=1080)
except AttributeError:
    print('Image not found. Please enter a valid path.')
    quit()

parsing = evaluate(image_path, cp)
# BUG FIX: cv2.resize takes dsize as (width, height), but image.shape[:2]
# is (height, width); the original transposed non-square images.
parsing = cv2.resize(parsing, (image.shape[1], image.shape[0]),
                     interpolation=cv2.INTER_NEAREST)

parts = [
    table['left_eye'], table['right_eye'], table['upper_lip'],
    table['lower_lip']
]
color = [139, 0, 139]  # BGR

# Paint every selected facial region on both the image and the canvas.
for part in parts:
    image = mask(image, parsing, part, color)
    im2 = mask(im2, parsing, part, color)

cv2.imshow('image', cv2.resize(ori, (512, 512)))
# Tensorboard logging tensorboard_log = [] for j, yolo in enumerate(yolo_layers): for name, metric in yolo.metrics.items(): if name != "grid_size": tensorboard_log += [(f"{name}_{j + 1}", metric)] tensorboard_log += [("loss", loss.item())] logger_fakenight.list_of_scalars_summary(tensorboard_log, epoch) print("\n---- Evaluating Model on Daytime ----") # Evaluate the model on the validation set precision, recall, AP, f1, ap_class = evaluate( model_list, path=valid_path_daytime, iou_thres=0.5, conf_thres=0.5, nms_thres=0.5, img_size=opt.img_size, batch_size=8, ) evaluation_metrics = [ ("val_precision", precision.mean()), ("val_recall", recall.mean()), ("val_mAP", AP.mean()), ("val_f1", f1.mean()), ] logger_daytime.list_of_scalars_summary(evaluation_metrics, epoch) # Print class APs and mAP ap_table = [["Index", "Class name", "AP"]] for i, c in enumerate(ap_class):
# Collect hidden-state features from repeated evaluations of freshly
# generated test datasets (variant that also returns a binary-error sum).
ev_iterations = 100
n_units = 40

print(' ')
print('number of hidden units: ' + str(n_units))
print(' ')

test_data_stack = []
test_hh_stack = []

# Hoisted out of the loop: re-importing each iteration was a cached no-op.
import test

for _ in range(ev_iterations):
    # Evaluate on a newly generated test dataset
    test_data = test.generate_test_dataset()
    test_perp, test_hh, test_y_bin_error_sum = test.evaluate(test_data, test=True)
    test_data_stack.extend(test_data['coordinates'])
    test_hh_stack.extend(test_hh)

# 80/20 split sizes. NOTE(review): true division yields floats (8000.0 /
# 2000.0); kept as-is for compatibility — confirm whether ints are expected.
N_train = ev_iterations * 100 * 4 / 5
N_test = ev_iterations * 100 / 5

input_data = test_hh_stack
# Encode each (col, row) coordinate pair as one class label.
# NOTE(review): assumes a 9-column grid — confirm against the generator.
output_data = [coord[0] + coord[1] * 9 for coord in test_data_stack]

print('')

# data allocation
#!/usr/bin/env python3.4
"""Thin CLI wrapper: evaluate a classifier on a classify-<env>.txt dataset."""
import os
import sys

from data import DataDir
from test import evaluate

# Require at least the environment name; otherwise show usage and bail out.
if len(sys.argv) == 1:
    print("Execute with:")
    print(" ./evaluate.py foo [classifier]")
    print(
        "Where foo indicates a classify-foo.txt file in data/,")
    print(
        "and classifier is one of "
        "baseline|tandem|reluctant-tandem|bigram|frequency.)")
    sys.exit(1)

env = sys.argv[1]
# Optional second argument selects the classifier; default is "tandem".
classifier = "tandem" if len(sys.argv) < 3 else sys.argv[2]

evaluate(env, classifier)