Code example #1
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to meta data (optional).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        '--classifier',
        default='guoday',
        type=str,
        required=True,
        help='classifier type, guoday or MLP or GRU_MLP or ...')
    parser.add_argument('--optimizer',
                        default='RAdam',
                        type=str,
                        required=True,
                        help='optimizer we use, RAdam or ...')
    parser.add_argument("--do_label_smoothing",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to do label smoothing. yes or no.")
    parser.add_argument('--draw_loss_steps',
                        default=1,
                        type=int,
                        required=True,
                        help='number of training examples between loss plot updates')
    parser.add_argument('--label_name',
                        default='label',
                        type=str,
                        required=True,
                        help='label name in original train set index')

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to run training. yes or no.")
    parser.add_argument("--do_test",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to run testing. yes or no.")
    parser.add_argument("--do_eval",
                        default='yes',
                        type=str,
                        required=True,
                        help="Whether to run eval on the dev set. yes or no.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=200, type=int,
                        help="Report training results every eval_steps effective batches.")
    parser.add_argument("--lstm_hidden_size", default=300, type=int,
                        help="LSTM hidden size.")
    parser.add_argument("--lstm_layers", default=2, type=int,
                        help="Number of LSTM layers.")
    parser.add_argument("--dropout", default=0.5, type=float,
                        help="Dropout probability (used as BERT's hidden_dropout_prob).")

    parser.add_argument("--train_steps", default=-1, type=int,
                        help="Total number of training steps to perform.")
    parser.add_argument("--report_steps", default=-1, type=int,
                        help="Reporting interval in steps.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    # tensorboard_log_dir = args.output_dir

    # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now')
    # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean')
    # loss_now_variable = loss_now
    # loss_mean_variable = loss_mean
    # train_loss = tf.summary.scalar('train_loss', loss_now_variable)
    # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable)
    # merged = tf.summary.merge([train_loss, dev_loss_mean])

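    # 3-way classification: num_labels=3; BERT's hidden dropout is overridden by --dropout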
    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)
    config.hidden_dropout_prob = args.dropout

    # Prepare model
    if args.do_train == 'yes':
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, args, config=config)

        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

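    # effective batch sizes scale with the number of visible GPUs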
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    if args.do_train == 'yes':
        print(
            '________________________now training______________________________'
        )
        # Prepare data loader

        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True,
                                       label_name=args.label_name)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        # print('train_feature_size=', train_features.__sizeof__())
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        # print('train_data=',train_data[0])
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # and thus produces None grads that break apex
        # (note: the comprehension below is a no-op, so pooler parameters are in fact kept)
        param_optimizer = [n for n in param_optimizer]

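        # bias and LayerNorm parameters are excluded from weight decay in the groups below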
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        if args.optimizer == 'RAdam':
            optimizer = RAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate)
        else:
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        loss_batch = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
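        # cycle() makes the dataloader an infinite iterator, so training length is governed by train_steps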
        train_dataloader = cycle(train_dataloader)

        # with tf.Session() as sess:
        #     summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph)
        #     sess.run(tf.global_variables_initializer())

        list_loss_mean = []
        bx = []
        eval_F1 = []
        ax = []

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            loss_batch += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)

            bar.set_description("loss {}".format(train_loss))
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if args.fp16:
                # optimizer.backward(loss)
                loss.backward()
            else:
                loss.backward()

            # plot the running loss every draw_loss_steps examples
            draw_interval = max(1, int(args.draw_loss_steps /
                                       (args.train_batch_size /
                                        args.gradient_accumulation_steps)))
            if (step + 1) % draw_interval == 0:
                list_loss_mean.append(round(loss_batch, 4))
                bx.append(step + 1)
                plt.plot(bx,
                         list_loss_mean,
                         label='loss_mean',
                         linewidth=1,
                         color='b',
                         marker='o',
                         markerfacecolor='green',
                         markersize=2)
                plt.savefig(args.output_dir + '/labeled.jpg')
                loss_batch = 0

            # parameter update after every gradient_accumulation_steps micro-batches
            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # manually apply the BERT-style warmup schedule to the learning rate
                    # (warmup_linear and args.warmup_proportion are not defined in this snippet,
                    # so this branch requires them to be provided elsewhere)
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            # report results every eval_steps effective batches
            if step % (args.eval_steps *
                       args.gradient_accumulation_steps) == 0 and step > 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            # run evaluation roughly 10 times over the course of training
            if args.do_eval == 'yes' and (step + 1) % int(
                    num_train_optimization_steps / 10) == 0 and step > 450:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True,
                                                  label_name=args.label_name)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                            logits = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_labels = np.concatenate(inference_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    ###############################################
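                    # per-class precision / recall / F1 over the 3 labels (1e-5 guards against division by zero)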
                    num_gold_0 = np.sum(gold_labels == 0)
                    num_gold_1 = np.sum(gold_labels == 1)
                    num_gold_2 = np.sum(gold_labels == 2)

                    right_0 = 0
                    right_1 = 0
                    right_2 = 0
                    error_0 = 0
                    error_1 = 0
                    error_2 = 0

                    for gold_label, inference_label in zip(
                            gold_labels, inference_labels):
                        if gold_label == inference_label:
                            if gold_label == 0:
                                right_0 += 1
                            elif gold_label == 1:
                                right_1 += 1
                            else:
                                right_2 += 1
                        elif inference_label == 0:
                            error_0 += 1
                        elif inference_label == 1:
                            error_1 += 1
                        else:
                            error_2 += 1

                    recall_0 = right_0 / (num_gold_0 + 1e-5)
                    recall_1 = right_1 / (num_gold_1 + 1e-5)
                    recall_2 = right_2 / (num_gold_2 + 1e-5)
                    precision_0 = right_0 / (error_0 + right_0 + 1e-5)
                    precision_1 = right_1 / (error_1 + right_1 + 1e-5)
                    precision_2 = right_2 / (error_2 + right_2 + 1e-5)
                    f10 = 2 * precision_0 * recall_0 / (precision_0 +
                                                        recall_0 + 1e-5)
                    f11 = 2 * precision_1 * recall_1 / (precision_1 +
                                                        recall_1 + 1e-5)
                    f12 = 2 * precision_2 * recall_2 / (precision_2 +
                                                        recall_2 + 1e-5)

                    output_dev_result_file = os.path.join(
                        args.output_dir, "dev_results.txt")
                    with open(output_dev_result_file, 'a',
                              encoding='utf-8') as f:
                        f.write('precision:' + str(precision_0) + ' ' +
                                str(precision_1) + ' ' + str(precision_2) +
                                '\n')
                        f.write('recall:' + str(recall_0) + ' ' +
                                str(recall_1) + ' ' + str(recall_2) + '\n')
                        f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' +
                                str(f12) + '\n' + '\n')

                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)
                    # plot eval F1/accuracy against training steps
                    eval_F1.append(round(eval_accuracy, 4))
                    ax.append(step)
                    plt.plot(ax,
                             eval_F1,
                             label='eval_F1',
                             linewidth=1,
                             color='r',
                             marker='o',
                             markerfacecolor='blue',
                             markersize=2)
                    for a, b in zip(ax, eval_F1):
                        plt.text(a, b, b, ha='center', va='bottom', fontsize=8)
                    plt.savefig(args.output_dir + '/labeled.jpg')

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("more accurate model arises, now best F1 = ",
                              eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model; only save the model itself
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    '''
                    if (step+1) / int(num_train_optimization_steps/10) > 9.5:
                        print("=" * 80)
                        print("End of training. Saving Model......")
                        # Save a trained model, only save the model it-self
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    '''

    if args.do_test == 'yes':
        start_time = time.time()
        print(
            '___________________now testing for best eval f1 model_________________________'
        )
        try:
            del model
        except NameError:
            pass
        gc.collect()
        args.do_train = 'no'
        model = BertForSequenceClassification.from_pretrained(os.path.join(
            args.output_dir, "pytorch_model.bin"),
                                                              args,
                                                              config=config)
        model.half()
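        # keep any BatchNorm layers in fp32 while the rest of the model runs in half precision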
        for layer in model.modules():
            if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm):
                layer.float()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from "
                    "https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False,
                                          label_name=args.label_name)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                    # print('test_logits=', logits)
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            if flag == 'dev':
                print(flag, accuracy(logits, gold_labels))
            elif flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub.csv"),
                                       index=False)
                # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
            else:
                raise ValueError('flag not in [dev, test]')
        print('inference time used = {}s'.format(time.time() - start_time))
Code example #2
def run():
    d = {
        'image_id': os.listdir(config.TRAIN_IMAGE_PATH),
        'mask_id': os.listdir(config.TRAIN_MASK_PATH)
    }
    df = pd.DataFrame(data=d)

    folds = df.copy()

    kf = KFold(n_splits=config.N_FOLDS, shuffle=True, random_state=42)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(folds)):

        print(f'FOLD: {fold+1}/{config.N_FOLDS}')

        train_test = folds.iloc[train_idx]
        valid_test = folds.iloc[valid_idx]

        train_test.reset_index(drop=True, inplace=True)
        valid_test.reset_index(drop=True, inplace=True)

        train_dataset = dataset.HuBMAPDataset(
            train_test, transforms=transforms.transforms_train)
        train_loader = DataLoader(train_dataset,
                                  batch_size=config.TRAIN_BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=config.NUM_WORKERS)

        valid_dataset = dataset.HuBMAPDataset(
            valid_test, transforms=transforms.transforms_valid)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=config.VALID_BATCH_SIZE,
                                  shuffle=False,
                                  num_workers=config.NUM_WORKERS)

        loss_history = {"train": [], "valid": []}

        dice_history = {"train": [], "valid": []}

        jaccard_history = {"train": [], "valid": []}

        dice_max = 0.0
        kernel_type = 'unext50'
        best_file = f'../drive/MyDrive/{kernel_type}_best_fold{fold}_strong_aug_70_epochs.bin'

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model = UneXt50().to(device)
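        # RAdam wrapped in Lookahead (the combination commonly known as the Ranger optimizer)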
        optimizer = Lookahead(RAdam(filter(lambda p: p.requires_grad,
                                           model.parameters()),
                                    lr=config.LR),
                              alpha=0.5,
                              k=5)
        # base_opt = optim.Adam(model.parameters(), lr=3e-4)
        # optimizer = SWA(base_opt)
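        # Apex AMP mixed-precision initialization at optimization level O1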
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, config.N_EPOCHS)
        # scheduler = GradualWarmupSchedulerV2(optimizer, multiplier=10, total_epoch=config.WARMUP_EPO, after_scheduler=scheduler_cosine)
        loss_fn = metrics.symmetric_lovasz

        for epoch in range(config.N_EPOCHS):

            scheduler.step(epoch)
            avg_train_loss, train_dice_scores, train_jaccard_scores = engine.train_loop_fn(
                model, train_loader, optimizer, loss_fn,
                metrics.dice_coef_metric, metrics.jaccard_coef_metric, device)

            # if epoch > 10 and epoch % 5 == 0:
            #   optimizer.update_swa()

            loss_history["train"].append(avg_train_loss)
            dice_history["train"].append(train_dice_scores)
            jaccard_history["train"].append(train_jaccard_scores)

            avg_val_loss, val_dice_scores, val_jaccard_scores = engine.val_loop_fn(
                model, valid_loader, optimizer, loss_fn,
                metrics.dice_coef_metric, metrics.jaccard_coef_metric, device)

            loss_history["valid"].append(avg_val_loss)
            dice_history["valid"].append(val_dice_scores)
            jaccard_history["valid"].append(val_jaccard_scores)

            print(
                f"Epoch: {epoch+1} | lr: {optimizer.param_groups[0]['lr']:.7f} | train loss: {avg_train_loss:.4f} | val loss: {avg_val_loss:.4f}"
            )
            print(
                f"train dice: {train_dice_scores:.4f} | val dice: {val_dice_scores:.4f} | train jaccard: {train_jaccard_scores:.4f} | val jaccard: {val_jaccard_scores:.4f}"
            )

            if val_dice_scores > dice_max:
                print('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(
                    dice_max, val_dice_scores))
                torch.save(model.state_dict(), best_file)
                dice_max = val_dice_scores
Code example #3
if args.optimizer.lower()=='sgd':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower()=='sgdwm':
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower()=='adam':
    optimizer = torch.optim.Adam(net.parameters(), lr=args.lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(net.parameters(),lr=args.lr, momentum=args.momentum,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':#no tensorboardX
    from lars import LARS
    optimizer = LARS(net.parameters(), lr=args.lr,momentum=args.momentum,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer  = Lamb(net.parameters(),lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(net.parameters(), lr=args.lr,weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'dyna':
    from dyna import Dyna
    optimizer = Dyna(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
else:
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.weight_decay)
Code example #4
File: train.py  Project: VladAIacob/VoiceCloning
def train(validate, n_gpus, rank, output_directory, epochs, optim_algo, learning_rate,
          weight_decay, sigma, iters_per_checkpoint, batch_size, seed,
          checkpoint_path, ignore_layers, include_layers, finetune_layers,
          warmstart_checkpoint_path, with_tensorboard, grad_clip_val,
          fp16_run):
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

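    # when finetune_layers is non-empty, freeze every parameter except the listed ones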
    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
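    # GradScaler performs dynamic loss scaling; it is a no-op when fp16_run is False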
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("Output directory", output_directory)

    if with_tensorboard and rank == 0:
        tboard_out_path = os.path.join(output_directory, 'logs')
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            try:
                model.zero_grad()

                mel, embeds, text, in_lens, out_lens, gate_target, attn_prior = batch
                mel, embeds, text = mel.cuda(), embeds.cuda(), text.cuda()

                in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()
                attn_prior = attn_prior.cuda() if valset.use_attn_prior else None
                with amp.autocast(enabled=fp16_run):
                    z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                        mel, embeds, text, in_lens, out_lens, attn_prior)

                    loss_nll, loss_gate = criterion(
                        (z, log_s_list, gate_pred, mean, log_var, prob),
                        gate_target, out_lens)
                    loss = loss_nll + loss_gate

                if n_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                    reduced_gate_loss = reduce_tensor(loss_gate.data, n_gpus).item()
                    reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item()
                else:
                    reduced_loss = loss.item()
                    reduced_gate_loss = loss_gate.item()
                    reduced_nll_loss = loss_nll.item()

                scaler.scale(loss).backward()
                if grad_clip_val > 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)

                scaler.step(optimizer)
                scaler.update()

                if rank == 0:
                    print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True)

                if with_tensorboard and rank == 0:
                    logger.add_scalar('training_loss', reduced_loss, iteration)
                    logger.add_scalar('training_loss_gate', reduced_gate_loss, iteration)
                    logger.add_scalar('training_loss_nll', reduced_nll_loss, iteration)
                    logger.add_scalar('learning_rate', learning_rate, iteration)

                if iteration % iters_per_checkpoint == 0:
                    if validate:
                        val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss(
                            model, criterion, valset, collate_fn, batch_size, n_gpus)
                        if rank == 0:
                            print("Validation loss {}: {:9f}  ".format(iteration, val_loss))
                            if with_tensorboard:
                                logger.log_validation(
                                    val_loss, val_loss_nll, val_loss_gate, attns,
                                    gate_pred, gate_target, iteration)
                
                    checkpoint_path = "{}/model_{}.pt".format(
                                           output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                        checkpoint_path)

                iteration += 1
            except Exception as e:
                print("SOMETHING BAD HAPPENED:", e)
Code example #5
    optimizer = optim.Adam(model.parameters(),
                           lr=args.init_lr,
                           betas=betas,
                           eps=1e-9)
elif args.optim == '1cycle':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.init_lr,
                           betas=betas,
                           eps=1e-9)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr=args.max_lr,
                                              steps_per_epoch=len(train_data),
                                              epochs=args.max_epochs)
elif args.optim == 'radam':
    optimizer = RAdam(model.parameters(),
                      lr=args.init_lr,
                      betas=betas,
                      eps=1e-9)
elif args.optim == 'schedule':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.init_lr,
                           betas=betas,
                           eps=1e-9)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=args.schedule_factor,
        patience=args.schedule_patience)

# **************** TRAINING ******************
print('Training starts...')

alignment = None
Code example #6
def main():
    n_envs = len(os.sched_getaffinity(0))
    factory = FallingEnvFactory()
    # factory = HalfCheetahEnvFactory()
    # factory = HumanoidFallingEnvFactory()
    env: Env = factory.make_env()
    envs: VectorEnv = AsyncVectorEnv([factory.make_env for _ in range(n_envs)])
    env_container = EnvContainer(env, envs)

    state_dim, = env.observation_space.shape
    action_dim, = env.action_space.shape
    relu = nn.ReLU()
    tanh = nn.Tanh()
    identity = nn.Identity()

    actor = ProbMLPConstantLogStd(state_dim, action_dim, [256, 256], relu, tanh, -1.0)
    critic = MultiLayerPerceptron(state_dim, 1, [256, 256], relu, identity)
    scaler_ = StandardScaler()
    print("Fit scaler")
    env.reset()
    state_seq = []
    for _ in tqdm(range(512)):
        action = env.action_space.sample()
        state, _, done, _ = env.step(action)
        state_seq.append(state)
        if done:
            env.reset()
    state_seq = np.stack(state_seq)
    scaler_.fit(state_seq)
    scaler = ScalerNet(scaler_)

    module_dict = ModuleDict()
    module_dict.set(ModuleKey.actor, actor)
    module_dict.set(ModuleKey.scaler, scaler)
    module_dict.set(ModuleKey.critic, critic)

    action_getter: ActionGetter = ActionGetterModule(actor, scaler)
    sample_collector: SampleCollector = SampleCollectorV0(env_container, action_getter, 2048, 1)

    mse_loss = nn.MSELoss()
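    # critic update: regress predicted values onto observed cumulative rewards with MSE loss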
    critic_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic, TensorKey.cumulative_reward_predictions_tensor)
    critic_loss_calculator: LossCalculator = \
        LossCalculatorInputTarget(TensorKey.cumulative_reward_predictions_tensor, TensorKey.cumulative_rewards_tensor,
                                  mse_loss)

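    # actor update: advantage = cumulative reward - critic prediction, fed into the PPO surrogate objective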
    actor_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.actions, TensorKey.actions_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic,
                              TensorKey.cumulative_reward_predictions_tensor) + \
        TensorInserterLambda([TensorKey.cumulative_rewards_tensor, TensorKey.cumulative_reward_predictions_tensor],
                             lambda x, y: x - y, TensorKey.advantages_tensor) + \
        TensorInserterModuleLambda(ModuleKey.actor, [TensorKey.states_tensor, TensorKey.actions_tensor],
                                   lambda actor, state, action: actor.get_log_prob(state, action),
                                   TensorKey.new_log_probs_tensor) + \
        TensorInserterLambda([TensorKey.new_log_probs_tensor, TensorKey.log_probs_tensor, TensorKey.advantages_tensor],
                             get_ppo_surrogate_tensor, TensorKey.ppo_surrogates_tensor)

    actor_loss_calculator: LossCalculator = \
        LossCalculatorLambda([TensorKey.ppo_surrogates_tensor], lambda x: -torch.mean(x))

    actor_optimizer = RAdam(params=actor.parameters(), lr=3e-4)
    actor_updater: ModuleUpdater = ModuleUpdaterOptimizer(actor_optimizer)
    critic_optimizer = RAdam(params=critic.parameters(), lr=3e-4)
    critic_updater: ModuleUpdater = ModuleUpdaterOptimizer(critic_optimizer)

    actor_trainee = Trainee([actor], actor_updater, actor_tensor_inserter, actor_loss_calculator, 10)
    critic_trainee = Trainee([critic], critic_updater, critic_tensor_inserter, critic_loss_calculator, 10)

    trainer = RLTrainer(sample_collector, [critic_trainee, actor_trainee], 100000, 128)
    trainer.train(module_dict)
Code example #7
def main(lr=0.1):
    global best_acc
    args.lr = lr
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='/tmp/cifar10',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=100,
                                             shuffle=False,
                                             num_workers=2)

    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')

    # Model
    print('==> Building model..')
    # net = VGG('VGG19')
    # net = ResNet18()
    # net = PreActResNet18()
    # net = GoogLeNet()
    # net = DenseNet121()
    # net = ResNeXt29_2x64d()
    # net = MobileNet()
    # net = MobileNetV2()
    # net = DPN92()
    # net = ShuffleNetG2()
    # net = SENet18()
    # net = ShuffleNetV2(1)
    # net = EfficientNetB0()
    # net = RegNetX_200MF()
    net = ResNet50()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    ckpt = './checkpoint/' + args.optimizer + str(lr) + '_ckpt.pth'

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isdir(
            'checkpoint'), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(ckpt)
        net.load_state_dict(checkpoint['net'])
        best_acc = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    criterion = nn.CrossEntropyLoss()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'sgdwm':
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adam':
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(),
                                  lr=args.lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        from radam import RAdam
        optimizer = RAdam(net.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lars':  #no tensorboardX
        from lars import LARS
        optimizer = LARS(net.parameters(),
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lamb':
        from lamb import Lamb
        optimizer = Lamb(net.parameters(),
                         lr=args.lr,
                         weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'novograd':
        from novograd import NovoGrad
        optimizer = NovoGrad(net.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    else:
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    # lrs = create_lr_scheduler(args.warmup_epochs, args.lr_decay)
    # lr_scheduler = LambdaLR(optimizer,lrs)
    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_decay, gamma=0.1)
    train_acc = []
    valid_acc = []

    # Training
    def train(epoch):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            print(batch_idx)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        print(100. * correct / total)
        train_acc.append(correct / total)

    def test(epoch):
        global best_acc
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        print('test')
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print(batch_idx)
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

        # Save checkpoint.
        acc = 100. * correct / total
        print(acc)
        valid_acc.append(correct / total)

        if acc > best_acc:
            print('Saving..')
            state = {
                'net': net.state_dict(),
                'acc': acc,
                'epoch': epoch,
            }
            if not os.path.isdir('checkpoint'):
                os.mkdir('checkpoint')
            torch.save(state, ckpt)
            best_acc = acc

    for epoch in range(200):
        if epoch in args.lr_decay:
            checkpoint = torch.load(ckpt)
            net.load_state_dict(checkpoint['net'])
            best_acc = checkpoint['acc']
            args.lr *= 0.1
            if args.optimizer.lower() == 'sgd':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'sgdwm':
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adam':
                optimizer = optim.Adam(net.parameters(),
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'rmsprop':
                optimizer = optim.RMSprop(net.parameters(),
                                          lr=args.lr,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'adagrad':
                optimizer = optim.Adagrad(net.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'radam':
                from radam import RAdam

                optimizer = RAdam(net.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'lars':  # no tensorboardX
                optimizer = LARS(net.parameters(),
                                 lr=args.lr,
                                 momentum=args.momentum,
                                 weight_decay=args.weight_decay,
                                 dampening=args.damping)
            elif args.optimizer.lower() == 'lamb':
                optimizer = Lamb(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
            elif args.optimizer.lower() == 'novograd':
                optimizer = NovoGrad(net.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
            else:
                optimizer = optim.SGD(net.parameters(),
                                      lr=args.lr,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay)
        train(epoch)
        test(epoch)
    # dump the accuracy curves for this optimizer / learning-rate setting
    with open(args.optimizer + str(args.lr) + 'log.json', 'w+') as log_file:
        json.dump([train_acc, valid_acc], log_file)
    return best_acc
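The example above rebuilds its optimizer from a command-line name; that dispatch can be condensed into a small factory. The sketch below is not from the original script: build_optimizer and the toy net are illustrative names, and it assumes PyTorch >= 1.10 so RAdam is available from torch.optim (the snippets on this page import it from a standalone radam module instead).

import torch.nn as nn
import torch.optim as optim


def build_optimizer(name, params, lr, momentum=0.9, weight_decay=5e-4):
    """Map an optimizer name (as passed on the command line) to an instance."""
    name = name.lower()
    if name == 'adam':
        return optim.Adam(params, lr=lr, weight_decay=weight_decay)
    if name == 'radam':  # torch.optim.RAdam needs PyTorch >= 1.10
        return optim.RAdam(params, lr=lr, weight_decay=weight_decay)
    if name == 'rmsprop':
        return optim.RMSprop(params, lr=lr, momentum=momentum,
                             weight_decay=weight_decay)
    if name == 'adagrad':
        return optim.Adagrad(params, lr=lr, weight_decay=weight_decay)
    # fall back to SGD with momentum, mirroring the final else branch above
    return optim.SGD(params, lr=lr, momentum=momentum,
                     weight_decay=weight_decay)


net = nn.Linear(10, 2)  # toy model, for illustration only
optimizer = build_optimizer('radam', net.parameters(), lr=1e-3)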
コード例 #8
0
    return batch_1D, idxs, batch_truths_1D, idxs_truths, steps


print('Using device:', args.device)

model = ModifiedUNet(args, in_channels=1, out_channels=1, bottleneck_out=None, init_features=32).to(args.device)

now = datetime.datetime.now()
dir_name = now.strftime("%B_%d_at_%H_%M_%p")
save_dir = './save/' + dir_name
imgs_dir = './imgs/' + dir_name

if args.optimizer == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
else:
    optimizer = RAdam(model.parameters(), args.lr)

if args.loss == 'mse':
    loss_fn = torch.nn.MSELoss()
else:
    loss_fn = torch.nn.BCELoss()

# loading checkpoint: resume model and optimizer from the newest file in load_path
if args.load_path:
    files = os.listdir(args.load_path)
    files = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))
    last_path = os.path.join(args.load_path, files[-1])

    checkpoint = torch.load(last_path)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
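The loader above expects checkpoints named by step number and containing both model and optimizer state. A matching save routine might look like the following hedged sketch; save_checkpoint and the ./checkpoints directory are illustrative assumptions, not the original project's layout.

import os

import torch


def save_checkpoint(model, optimizer, step, ckpt_dir='./checkpoints'):
    """Write model and optimizer state under a numeric filename, e.g. 1200.pt."""
    os.makedirs(ckpt_dir, exist_ok=True)
    path = os.path.join(ckpt_dir, '{}.pt'.format(step))
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)
    return path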
コード例 #9
0
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.base_lr,
                                 weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'rmsprop':
    optimizer = optim.RMSprop(model.parameters(),
                              lr=args.base_lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(),
                              lr=args.base_lr,
                              weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'radam':
    from radam import RAdam
    optimizer = RAdam(model.parameters(),
                      lr=args.base_lr,
                      weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lars':  #no tensorboardX
    from lars import LARS
    optimizer = LARS(model.parameters(),
                     lr=args.base_lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'lamb':
    from lamb import Lamb
    optimizer = Lamb(model.parameters(),
                     lr=args.base_lr,
                     weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'novograd':
    from novograd import NovoGrad
    optimizer = NovoGrad(model.parameters(),
コード例 #10
0
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--bs', metavar='bs', type=int, default=2)
    parser.add_argument('--path', type=str, default='../../data')
    parser.add_argument('--results', type=str, default='../../results/model')
    parser.add_argument('--nw', type=int, default=0)
    parser.add_argument('--max_images', type=int, default=None)
    parser.add_argument('--val_size', type=int, default=None)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.003)
    parser.add_argument('--lr_decay', type=float, default=0.99997)
    parser.add_argument('--kernel_lvl', type=float, default=1)
    parser.add_argument('--noise_lvl', type=float, default=1)
    parser.add_argument('--motion_blur', type=bool, default=False)
    parser.add_argument('--homo_align', type=bool, default=False)
    parser.add_argument('--resume', type=bool, default=False)

    args = parser.parse_args()

    print()
    print(args)
    print()

    if not os.path.isdir(args.results): os.makedirs(args.results)

    PATH = args.results
    if not args.resume:
        f = open(PATH + "/param.txt", "a+")
        f.write(str(args))
        f.close()

    writer = SummaryWriter(PATH + '/runs')

    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda:0' if use_cuda else "cpu")

    # Parameters
    params = {'batch_size': args.bs, 'shuffle': True, 'num_workers': args.nw}

    # Generators
    print('Initializing training set')
    training_set = Dataset(args.path + '/train/', args.max_images,
                           args.kernel_lvl, args.noise_lvl, args.motion_blur,
                           args.homo_align)
    training_generator = data.DataLoader(training_set, **params)

    print('Initializing validation set')
    validation_set = Dataset(args.path + '/test/', args.val_size,
                             args.kernel_lvl, args.noise_lvl, args.motion_blur,
                             args.homo_align)

    validation_generator = data.DataLoader(validation_set, **params)

    # Model
    model = UNet(in_channel=3, out_channel=3)
    if args.resume:
        models_path = get_newest_model(PATH)
        print('loading model from ', models_path)
        model.load_state_dict(torch.load(models_path))

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model)

    model.to(device)

    # Loss + optimizer
    criterion = BurstLoss()
    optimizer = RAdam(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=8 // args.bs, gamma=args.lr_decay)
    if args.resume:
        n_iter = np.loadtxt(PATH + '/train.txt', delimiter=',')[:, 0][-1]
    else:
        n_iter = 0

    # Loop over epochs
    for epoch in range(args.epochs):
        train_loss = 0.0

        # Training
        model.train()
        for i, (X_batch, y_labels) in enumerate(training_generator):
            # Alter the burst length for each mini batch

            burst_length = np.random.randint(2, 9)
            X_batch = X_batch[:, :burst_length, :, :, :]

            # Transfer to GPU
            X_batch = X_batch.to(device).type(torch.float)
            y_labels = y_labels.to(device).type(torch.float)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X_batch)
            loss = criterion(pred, y_labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss.detach().cpu().numpy()
            writer.add_scalar('training_loss', loss.item(), n_iter)

            if i % 100 == 0 and i > 0:
                loss_printable = str(np.round(train_loss, 2))

                f = open(PATH + "/train.txt", "a+")
                f.write(str(n_iter) + "," + loss_printable + "\n")
                f.close()

                print("training loss ", loss_printable)

                train_loss = 0.0

            if i % 1000 == 0:
                if torch.cuda.device_count() > 1:
                    torch.save(
                        model.module.state_dict(),
                        os.path.join(PATH,
                                     'model_' + str(int(n_iter)) + '.pt'))
                else:
                    torch.save(
                        model.state_dict(),
                        os.path.join(PATH,
                                     'model_' + str(int(n_iter)) + '.pt'))

            if i % 1000 == 0:
                # Validation
                val_loss = 0.0
                with torch.set_grad_enabled(False):
                    model.eval()
                    for v, (X_batch,
                            y_labels) in enumerate(validation_generator):
                        # Alter the burst length for each mini batch

                        burst_length = np.random.randint(2, 9)
                        X_batch = X_batch[:, :burst_length, :, :, :]

                        # Transfer to GPU
                        X_batch = X_batch.to(device).type(torch.float)
                        y_labels = y_labels.to(device).type(torch.float)

                        # forward pass only (no backward during validation)
                        pred = model(X_batch)
                        loss = criterion(pred, y_labels)

                        val_loss += loss.detach().cpu().numpy()

                        if v < 5:
                            im = make_im(pred, X_batch, y_labels)
                            writer.add_image('image_' + str(v), im, n_iter)

                    writer.add_scalar('validation_loss', val_loss, n_iter)

                    loss_printable = str(np.round(val_loss, 2))
                    print('validation loss ', loss_printable)

                    f = open(PATH + "/eval.txt", "a+")
                    f.write(str(n_iter) + "," + loss_printable + "\n")
                    f.close()

            n_iter += args.bs
コード例 #11
0
ファイル: train.py プロジェクト: chicm/yt8m
def train(args):
    print('start training...')
    model, model_file = create_model(args)
    train_loader, val_loader = get_train_val_loaders(
        batch_size=args.batch_size, val_batch_size=args.val_batch_size)
    train_loader = get_frame_train_loader(batch_size=args.batch_size)
    #model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)

    if args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=0.0001)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    else:
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=0.0001)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode='min',
                                         factor=args.factor,
                                         patience=args.patience,
                                         min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         args.t_max,
                                         eta_min=args.min_lr)

    model = model.cuda()
    if torch.cuda.device_count() > 1:
        model_name = model.name
        model = DataParallel(model)
        model.name = model_name

    #model=model.train()

    best_f2 = 99999.
    best_key = 'loss'

    print(
        'epoch |    lr     |       %        |  loss  |  avg   |  loss  |  0.01  |  0.20  |  0.50  |  best  | time |  save |'
    )

    if not args.no_first_val:
        val_metrics = validate(args, model, val_loader)
        print(
            'val   |           |                |        |        | {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.4f} |       |        |'
            .format(val_metrics['loss'], val_metrics['f2_th_0.01'],
                    val_metrics['f2_th_0.20'], val_metrics['f2_th_0.50'],
                    val_metrics[best_key]))

        best_f2 = val_metrics[best_key]

    if args.val:
        return

    model.train()

    if args.lrs == 'plateau':
        lr_scheduler.step(best_f2)
    else:
        lr_scheduler.step()

    train_iter = 0

    for epoch in range(args.start_epoch, args.num_epochs):
        #train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size, val_batch_size=args.val_batch_size, val_num=args.val_num)

        train_loss = 0

        current_lr = get_lrs(optimizer)
        bg = time.time()
        for batch_idx, data in enumerate(train_loader):
            train_iter += 1
            if train_loader.seg:
                rgb, audio, labels = [x.cuda() for x in data]
            else:
                rgb, audio, labels = data[0].cuda(), data[2].cuda(), data[4].cuda()

            output = model(rgb, audio)

            loss = criterion(output, labels)
            batch_size = rgb.size(0)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            #with amp.scale_loss(loss, optimizer) as scaled_loss:
            #    scaled_loss.backward()

            train_loss += loss.item()
            print('\r {:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1),
                train_loader.num, loss.item(), train_loss / (batch_idx + 1)),
                  end='')

            if train_iter > 0 and train_iter % args.iter_val == 0:
                if isinstance(model, DataParallel):
                    torch.save(model.module.state_dict(),
                               model_file + '_latest')
                else:
                    torch.save(model.state_dict(), model_file + '_latest')

                val_metrics = validate(args, model, val_loader)

                _save_ckp = ''
                if args.always_save or val_metrics[best_key] < best_f2:
                    best_f2 = val_metrics[best_key]
                    if isinstance(model, DataParallel):
                        torch.save(model.module.state_dict(), model_file)
                    else:
                        torch.save(model.state_dict(), model_file)
                    _save_ckp = '*'
                print(
                    ' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} |  {:4s} |'
                    .format(val_metrics['loss'], val_metrics['f2_th_0.01'],
                            val_metrics['f2_th_0.20'],
                            val_metrics['f2_th_0.50'], best_f2,
                            (time.time() - bg) / 60, _save_ckp))

                model.train()
                if args.lrs == 'plateau':
                    lr_scheduler.step(best_f2)
                else:
                    lr_scheduler.step()
                current_lr = get_lrs(optimizer)
コード例 #12
0
def get_reward_model_optimizer(params, *, model_lr, model_weight_decay):
    return RAdam(params, lr=model_lr, weight_decay=model_weight_decay)
コード例 #13
0
print("Load duration : {}".format(time.time()-st))
print("[!] load data end")

# torch.cuda.set_device(4)
# torch.distributed.init_process_group(backend='nccl',
#                                      init_method='env://')

model = FCAE()
model.cuda()

# criterion = nn.MSELoss()
critierion = CosineDistanceLoss()
# criterion.cuda()
# optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)
optimizer = RAdam(model.parameters(), lr=learning_rate)
optimizer = Lookahead(optimizer, alpha=0.5, k=5)

opt_level = 'O1'
# assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
scheduler = StepLR(optimizer, step_size=50, gamma=0.1)
model = nn.DataParallel(model)


print("[*] training ...")

log = open(os.path.join(save_path,'log.csv'), 'w', encoding='utf-8', newline='')
log_writer = csv.writer(log)

best_val_loss = 100
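The training loop itself is truncated above. As a rough, hedged sketch of the step that usually follows amp.initialize (it assumes NVIDIA apex is installed and a CUDA device is available; the tiny model and MSE loss are placeholders, not the project's FCAE and cosine-distance loss):

import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(16, 16).cuda()          # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
loss_fn = nn.MSELoss()

x = torch.randn(4, 16, device='cuda')
optimizer.zero_grad()
loss = loss_fn(model(x), x)               # autoencoder-style reconstruction loss
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()                # backward on the loss scaled for mixed precision
optimizer.step()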
コード例 #14
0
ファイル: train.py プロジェクト: oleges1/cells-cells
def train(config, num_classes=1108):
    model = model_whale(num_classes=num_classes,
                        inchannels=6,
                        model_name=config.train.model_name,
                        pretrained=config.train.pretrained).cuda()
    if config.train.freeze:
        model.freeze()

    base_opt = RAdam(model.parameters(), lr=config.train.lr)
    optimizer = Lookahead(base_opt)
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr,  betas=(0.9, 0.99), weight_decay=0.0002)

    resultDir = config.train.result_dir
    checkPoint = join(resultDir, 'checkpoint')
    #     if not config.train.in_colab:
    #         os.makedirs(checkPoint, exist_ok=True)
    train_dataset = CustomDataset(config.train.csv_file,
                                  config.train.img_dir,
                                  transforms=transforms['train'])
    dataset_size = len(train_dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(config.train.validation_split * dataset_size))
    if config.train.shuffle_dataset:
        np.random.seed(config.train.random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train.batch_size,
        sampler=train_sampler,
        num_workers=config.train.num_workers)
    validation_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train.batch_size,
        sampler=valid_sampler,
        num_workers=config.train.num_workers)

    train_loss = 0.

    # load from cpk:
    if config.train.load_cpk:
        model.load_pretrain(os.path.join(
            checkPoint, '%08d_model.pth' % (config.train.start_epoch)),
                            skip=[])
        cpk = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (config.train.start_epoch)))
        optimizer.load_state_dict(cpk['optimizer'])
        adjust_learning_rate(optimizer, config.train.lr)
        start_epoch = cpk['epoch']
    else:
        start_epoch = 0

    top1_batch, map5_batch = 0, 0

    for epoch in range(start_epoch + 1, config.train.epochs):
        print('Starting:', epoch, 'Iterations:', len(train_loader))
        for i, data in enumerate(train_loader):
            model.train()
            model.mode = 'train'
            images, labels = data
            images = images.cuda()
            labels = labels.cuda().long()
            global_feat, local_feat, results = data_parallel(model, images)
            model.getLoss(global_feat,
                          local_feat,
                          results,
                          labels,
                          config,
                          verbose=(i % config.loss.verbose_interval == 0))
            batch_loss = model.loss

            optimizer.zero_grad()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=5.0,
                                           norm_type=2)
            optimizer.step()
            results = torch.sigmoid(results)
            train_loss += batch_loss.data.cpu().numpy()
            top1_batch += accuracy(results, labels, topk=[1])[0]
            map5_batch += mapk(labels, results, k=5)

            if i % config.train.verbose_interval == 0:
                print(
                    'epoch: %03d, iter: %05d, train_loss: %f, top1_batch: %f, map5_batch: %f'
                    % (epoch, i,
                       float(train_loss / config.train.verbose_interval),
                       float(top1_batch / config.train.verbose_interval),
                       float(map5_batch / config.train.verbose_interval)))

                #                 print(f'epoch: {epoch}, iter: {i}, train_loss: {float(train_loss / config.train.verbose_interval)}, top1_batch: {float(top1_batch / config.train.verbose_interval)}, map5_batch: {float(map5_batch / config.train.verbose_interval)}')
                train_loss, top1_batch, map5_batch = 0, 0, 0

                valid_loss, top1_valid, map5_valid = valid_eval(
                    config, model, validation_loader)
                print(
                    'epoch: %03d, iter: %05d, valid_loss: %f, valid_top1_batch: %f, valid_map5_batch: %f'
                    % (epoch, i, valid_loss, top1_valid, map5_valid))


#                 print(f'epoch: {epoch}, iter: {i}, valid_loss: {valid_loss}, top1_batch: {top1_valid}, map5_batch: {map5_valid}')

        if epoch % config.train.save_period == 0:
            os.system("touch " + resultDir + "/checkpoint/%08d_model.pth" %
                      (epoch))
            os.system("touch " + resultDir + "/checkpoint/%08d_optimizer.pth" %
                      (epoch))
            time.sleep(1)
            torch.save(model.state_dict(),
                       resultDir + '/checkpoint/%08d_model.pth' % (epoch))
            torch.save({
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }, resultDir + '/checkpoint/%08d_optimizer.pth' % (epoch))
コード例 #15
0
    def __init__(self,
                 TasNET,
                 batch_size,
                 checkpoint="checkpoint",
                 log_folder="./log",
                 rnn_arch="LSTM",
                 optimizer='radam',
                 rerun_mode=False,
                 lr=1e-5,
                 momentum=0.9,
                 weight_decay=0,
                 num_epoches=20,
                 clip_norm=False,
                 sr=8000,
                 cudnnBenchmark=True):
        
        logger.info('---Experiment Variables---')
        logger.info('RNN Architecture: '+rnn_arch)
        logger.info('Batch Size      : '+str(batch_size))
        logger.info('Optimizer       : '+optimizer)
        logger.info('--------------------------\n')

        logger.info('Rerun mode: '+str(rerun_mode))
       
        self.TasNET = TasNET

        self.log_folder = log_folder
        self.writer = SummaryWriter(log_folder)
        self.all_log = 'all_log.log' #all log filename
        self.log('Progress Log save path: '+log_folder)

        self.log("TasNET:\n{}".format(self.TasNET))
        if type(lr) is str:
            lr = float(lr)
            logger.info("Transfrom lr from str to float => {}".format(lr))
        
        self.log('Batch size used: '+str(batch_size))

        if optimizer == 'radam':
            self.optimizer = RAdam(self.TasNET.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            self.optimizer = torch.optim.Adam(
                self.TasNET.parameters(),
                lr=lr,
                weight_decay=weight_decay)
        
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min', factor=0.5, patience=3, verbose=True)
        self.TasNET.to(device)

        self.checkpoint = checkpoint
        self.log('Model save path: '+checkpoint)
        
        self.num_epoches = num_epoches
        self.clip_norm = clip_norm
        self.sr = sr

        if self.clip_norm:
            self.log("Clip gradient by 2-norm {}".format(clip_norm))

        if not os.path.exists(self.checkpoint):
            os.makedirs(checkpoint)
            
        torch.backends.cudnn.benchmark=cudnnBenchmark
        self.log('cudnn benchmark status: '+str(torch.backends.cudnn.benchmark)+'\n')
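The ReduceLROnPlateau scheduler created above is metric-driven: the run loop (not shown here) has to call step() with the monitored quantity. A minimal, self-contained sketch of that contract, using a dummy model and a placeholder validation loss:

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 'min', factor=0.5, patience=3)

for epoch in range(10):
    # ... train and validate here ...
    val_loss = 1.0 / (epoch + 1)  # placeholder metric
    scheduler.step(val_loss)      # halves the lr after `patience` epochs without improvement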
コード例 #16
0
    def __init__(self, log_dir, cfg):

        self.path = log_dir
        self.cfg = cfg

        if cfg.TRAIN.FLAG:
            self.model_dir = os.path.join(self.path, 'Model')
            self.log_dir = os.path.join(self.path, 'Log')
            mkdir_p(self.model_dir)
            mkdir_p(self.log_dir)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            sys.stdout = Logger(logfile=os.path.join(self.path, "logfile.log"))

        self.data_dir = cfg.DATASET.DATA_DIR
        self.max_epochs = cfg.TRAIN.MAX_EPOCHS
        self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

        s_gpus = cfg.GPU_ID.split(',')
        self.gpus = [int(ix) for ix in s_gpus]
        self.num_gpus = len(self.gpus)

        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.lr = cfg.TRAIN.LEARNING_RATE

        torch.cuda.set_device(self.gpus[0])
        cudnn.benchmark = True

        # load dataset
        cogent = cfg.DATASET.congent
        if not args.eval and not args.test:
            self.dataset = ClevrDataset(data_dir=self.data_dir, split="train" + cogent)
            self.dataloader = DataLoader(dataset=self.dataset, batch_size=cfg.TRAIN.BATCH_SIZE, shuffle=True,
                                         num_workers=cfg.WORKERS, drop_last=False, collate_fn=collate_fn)

        self.dataset_val = ClevrDataset(data_dir=self.data_dir, split="val" + cogent)
        self.dataloader_val = DataLoader(dataset=self.dataset_val, batch_size=256, drop_last=False,
                                         shuffle=False, num_workers=cfg.WORKERS, collate_fn=collate_fn)

        # load model
        self.vocab = load_vocab(cfg)
        self.model, self.model_ema = mac.load_MAC(cfg, self.vocab)
        self.weight_moving_average(alpha=0)
        if cfg.TRAIN.RADAM:
            self.optimizer = RAdam(self.model.parameters(), lr=self.lr)
        else:
            self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.previous_best_acc = 0.0
        self.previous_best_epoch = 0

        self.total_epoch_loss = 0
        self.prior_epoch_loss = 10

        self.print_info()
        self.loss_fn = torch.nn.CrossEntropyLoss().cuda()

    def print_info(self):
        print('Using config:')
        pprint.pprint(self.cfg)
        print("\n")

        pprint.pprint("Size of dataset: {}".format(len(self.dataset)))
        print("\n")

        print("Using MAC-Model:")
        pprint.pprint(self.model)
        print("\n")

    def weight_moving_average(self, alpha=0.999):
        for param1, param2 in zip(self.model_ema.parameters(), self.model.parameters()):
            param1.data *= alpha
            param1.data += (1.0 - alpha) * param2.data

    def set_mode(self, mode="train"):
        if mode == "train":
            self.model.train()
            self.model_ema.train()
        else:
            self.model.eval()
            self.model_ema.eval()

    def reduce_lr(self):
        epoch_loss = self.total_epoch_loss # / float(len(self.dataset) // self.batch_size)
        lossDiff = self.prior_epoch_loss - epoch_loss
        if ((lossDiff < 0.015 and self.prior_epoch_loss < 0.5 and self.lr > 0.00002) or \
            (lossDiff < 0.008 and self.prior_epoch_loss < 0.15 and self.lr > 0.00001) or \
            (lossDiff < 0.003 and self.prior_epoch_loss < 0.10 and self.lr > 0.000005)):
            self.lr *= 0.5
            print("Reduced learning rate to {}".format(self.lr))
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.lr
        self.prior_epoch_loss = epoch_loss
        self.total_epoch_loss = 0

    def save_models(self, iteration):
        save_model(self.model, self.optimizer, iteration, self.model_dir, model_name="model")
        save_model(self.model_ema, None, iteration, self.model_dir, model_name="model_ema")

    def train_epoch(self, epoch):
        cfg = self.cfg
        total_loss = 0
        total_correct = 0
        total_samples = 0

        self.labeled_data = iter(self.dataloader)
        self.set_mode("train")

        dataset = tqdm(self.labeled_data, total=len(self.dataloader))

        for data in dataset:
            ######################################################
            # (1) Prepare training data
            ######################################################
            image, question, question_len, answer = data['image'], data['question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if cfg.CUDA:
                image = image.cuda()
                question = question.cuda()
                answer = answer.cuda().squeeze()
            else:
                question = question
                image = image
                answer = answer.squeeze()

            ############################
            # (2) Train Model
            ############################
            self.optimizer.zero_grad()

            scores = self.model(image, question, question_len)
            loss = self.loss_fn(scores, answer)
            loss.backward()

            if self.cfg.TRAIN.CLIP_GRADS:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.cfg.TRAIN.CLIP)

            self.optimizer.step()
            self.weight_moving_average()

            ############################
            # (3) Log Progress
            ############################
            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            total_loss += loss.item() * answer.size(0)
            total_samples += answer.size(0)

            avg_loss = total_loss / total_samples
            train_accuracy = total_correct / total_samples
            # accuracy = correct.sum().cpu().numpy() / answer.shape[0]

            # if avg_loss == 0:
            #     avg_loss = loss.item()
            #     train_accuracy = accuracy
            # else:
            #     avg_loss = 0.99 * avg_loss + 0.01 * loss.item()
            #     train_accuracy = 0.99 * train_accuracy + 0.01 * accuracy
            # self.total_epoch_loss += loss.item() * answer.size(0)

            dataset.set_description(
                'Epoch: {}; Avg Loss: {:.5f}; Avg Train Acc: {:.5f}'.format(epoch + 1, avg_loss, train_accuracy)
            )

        self.total_epoch_loss = avg_loss
        print(self.total_epoch_loss)

        dict = {
            "avg_loss": avg_loss,
            "train_accuracy": train_accuracy
        }
        return dict

    def train(self):
        cfg = self.cfg
        print("Start Training")
        for epoch in range(self.max_epochs):
            dict = self.train_epoch(epoch)
            self.reduce_lr()
            self.log_results(epoch, dict)
            if cfg.TRAIN.EALRY_STOPPING:
                if epoch - cfg.TRAIN.PATIENCE == self.previous_best_epoch:
                    break

        self.save_models(self.max_epochs)
        self.writer.close()
        print("Finished Training")
        print("Highest validation accuracy: {} at epoch {}")

    def log_results(self, epoch, dict, max_eval_samples=None):
        epoch += 1
        self.writer.add_scalar("avg_loss", dict["avg_loss"], epoch)
        self.writer.add_scalar("train_accuracy", dict["train_accuracy"], epoch)

        val_accuracy, val_accuracy_ema = self.calc_accuracy("validation", max_samples=max_eval_samples)
        self.writer.add_scalar("val_accuracy_ema", val_accuracy_ema, epoch)
        self.writer.add_scalar("val_accuracy", val_accuracy, epoch)

        print("Epoch: {}\tVal Acc: {},\tVal Acc EMA: {},\tAvg Loss: {},\tLR: {}".
              format(epoch, val_accuracy, val_accuracy_ema, dict["avg_loss"], self.lr))

        if val_accuracy > self.previous_best_acc:
            self.previous_best_acc = val_accuracy
            self.previous_best_epoch = epoch

        if epoch % self.snapshot_interval == 0:
            self.save_models(epoch)

    def calc_accuracy(self, mode="train", max_samples=None):
        self.set_mode("validation")

        if mode == "train":
            loader = self.dataloader
            # num_imgs = len(self.dataset)
        elif mode == "validation":
            loader = self.dataloader_val
            # num_imgs = len(self.dataset_val)

        # batch_size = 256
        # total_iters = num_imgs // batch_size
        # if max_samples is not None:
        #     max_iter = max_samples // batch_size
        # else:
        #     max_iter = None

        # all_accuracies = []
        total_correct = 0
        total_correct_ema = 0
        total_samples = 0
        # all_accuracies_ema = []

        for data in tqdm(loader, total=len(loader)):
            # try:
            #     data = next(eval_data)
            # except StopIteration:
            #     break
            # if max_iter is not None and _iteration == max_iter:
            #     break

            image, question, question_len, answer = data['image'], data['question'], data['question_length'], data['answer']
            answer = answer.long()
            question = Variable(question)
            answer = Variable(answer)

            if self.cfg.CUDA:
                image = image.cuda()
                question = question.cuda()
                answer = answer.cuda().squeeze()

            with torch.no_grad():
                scores = self.model(image, question, question_len)
                scores_ema = self.model_ema(image, question, question_len)

            correct_ema = scores_ema.detach().argmax(1) == answer
            total_correct_ema += correct_ema.sum().cpu().item()
            # accuracy_ema = correct_ema.sum().cpu().numpy() / answer.shape[0]

            # all_accuracies_ema.append(accuracy_ema)

            correct = scores.detach().argmax(1) == answer
            total_correct += correct.sum().cpu().item()
            # accuracy = correct.sum().cpu().numpy() / answer.shape[0]
            # all_accuracies.append(accuracy)
            total_samples += answer.size(0)

        accuracy_ema = total_correct_ema / total_samples
        accuracy = total_correct / total_samples

        return accuracy, accuracy_ema
コード例 #17
0
ファイル: srnet.py プロジェクト: steganalyzer/bsn
def get_optimizer(model, lr):
    return RAdam(model.parameters(), lr=lr, weight_decay=1e-5)
コード例 #18
0
class DDPG(nn.Module):
    def __init__(
        self,
        d_state,
        d_action,
        device,
        gamma,
        tau,
        policy_lr,
        value_lr,
        value_loss,
        value_n_layers,
        value_n_units,
        value_activation,
        policy_n_layers,
        policy_n_units,
        policy_activation,
        grad_clip,
        policy_noise=0.2,
        noise_clip=0.5,
        expl_noise=0.1,
        tdg_error_weight=0,
        td_error_weight=1,
    ):
        super().__init__()

        self.actor = Actor(d_state, d_action, policy_n_layers, policy_n_units,
                           policy_activation).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = RAdam(self.actor.parameters(), lr=policy_lr)

        self.critic = ActionValueFunction(d_state, d_action, value_n_layers,
                                          value_n_units,
                                          value_activation).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = RAdam(self.critic.parameters(), lr=value_lr)

        self.discount = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.expl_noise = expl_noise
        self.normalizer = None
        self.value_loss = value_loss
        self.grad_clip = grad_clip
        self.device = device

        self.tdg_error_weight = tdg_error_weight
        self.td_error_weight = td_error_weight
        self.step_counter = 0

    def setup_normalizer(self, normalizer):
        self.normalizer = copy.deepcopy(normalizer)

    def get_action(self, states, deterministic=False):
        states = states.to(self.device)
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
        actions = self.actor(states)
        if not deterministic:
            actions = actions + torch.randn_like(actions) * self.expl_noise
        return actions.clamp(-1, +1)

    def get_action_with_logp(self, states):
        states = states.to(self.device)
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
        a = self.actor(states)
        return a, torch.ones(
            a.shape[0], device=a.device) * np.inf  # inf: should not be used

    def get_action_value(self, states, actions):
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
        with torch.no_grad():
            states = states.to(self.device)
            actions = actions.to(self.device)
            return self.critic(states, actions)[0]  # just q1

    def update(self, states, actions, logps, rewards, next_states, masks):
        if self.normalizer is not None:
            states = self.normalizer.normalize_states(states)
            next_states = self.normalizer.normalize_states(next_states)
        self.step_counter += 1

        # Select action according to policy and add clipped noise
        noise = (torch.randn_like(actions) * self.policy_noise).clamp(
            -self.noise_clip, self.noise_clip)
        raw_next_actions = self.actor_target(next_states)
        next_actions = (raw_next_actions + noise).clamp(-1, 1)

        # Compute the target Q value
        next_Q = self.critic_target(next_states, next_actions)
        q_target = rewards.unsqueeze(
            1) + self.discount * masks.float().unsqueeze(1) * next_Q
        zero_targets = torch.zeros_like(q_target, device=self.device)

        q = self.critic(states, actions)  # Q(s,a)
        q_td_error = q_target - q
        critic_loss = torch.tensor(0, device=self.device)
        standard_loss = torch.tensor(0, device=self.device)
        gradient_loss = torch.tensor(0, device=self.device)
        if self.td_error_weight > 0:
            if self.value_loss == 'huber':
                standard_loss = 0.5 * F.smooth_l1_loss(q_td_error,
                                                       zero_targets)
            elif self.value_loss == 'mse':
                standard_loss = 0.5 * F.mse_loss(q_td_error, zero_targets)
            critic_loss = critic_loss + self.td_error_weight * standard_loss
        if self.tdg_error_weight > 0:
            gradients_error_norms = torch.autograd.grad(
                outputs=q_td_error,
                inputs=actions,
                grad_outputs=torch.ones(q_td_error.size(), device=self.device),
                retain_graph=True,
                create_graph=True,
                only_inputs=True)[0].flatten(start_dim=1).norm(dim=1,
                                                               keepdim=True)
            if self.value_loss == 'huber':
                gradient_loss = 0.5 * F.smooth_l1_loss(gradients_error_norms,
                                                       zero_targets)
            elif self.value_loss == 'mse':
                gradient_loss = 0.5 * F.mse_loss(gradients_error_norms,
                                                 zero_targets)
            critic_loss = critic_loss + self.tdg_error_weight * gradient_loss

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_value_(self.critic.parameters(),
                                        self.grad_clip)
        self.critic_optimizer.step()

        # Compute actor loss
        q = self.critic(states, self.actor(states))  # Q(s,pi(s))
        actor_loss = -q.mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_value_(self.actor.parameters(),
                                        self.grad_clip)
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        return (raw_next_actions[0, 0].item(),
                self.td_error_weight * standard_loss.item(),
                self.tdg_error_weight * gradient_loss.item(),
                actor_loss.item())

    @staticmethod
    def catastrophic_divergence(q_loss, pi_loss):
        return q_loss > 1e2 or (pi_loss is not None and abs(pi_loss) > 1e5)
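The target networks in the class above are kept in sync with a soft (Polyak) update, theta_target <- tau * theta + (1 - tau) * theta_target. A standalone restatement of that step follows; soft_update is an illustrative name, not part of the original class.

import torch
import torch.nn as nn


@torch.no_grad()
def soft_update(net, target_net, tau):
    """Polyak-average net's parameters into target_net, in place."""
    for p, p_targ in zip(net.parameters(), target_net.parameters()):
        p_targ.data.mul_(1.0 - tau).add_(tau * p.data)


net, target_net = nn.Linear(3, 3), nn.Linear(3, 3)
soft_update(net, target_net, tau=0.005)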
コード例 #19
0
ファイル: train_4cWM.py プロジェクト: xll426/urban_watershed
def main():
    outPath = f"{args.results}_{args.model_name}_CBAM_WM"
    if not os.path.exists(outPath):
        os.makedirs(outPath)

    ts = str(datetime.datetime.now()).split(".")[0].replace(" ", "_")
    ts = ts.replace(":", "_").replace("-", "_")
    file_path = os.path.join(outPath,
                             "{}_run_{}.json".format(args.model_name, ts))
    ##############choose model##########################
    net = get_model(args.num_classes, args.model_name).to(device)

    if args.pre_train:
        net = torch.load(args.ckp)["model_state"]  #load the pretrained model
        print("load pre-trained model sucessfully")
    if torch.cuda.device_count() > 1:
        print("using multi gpu")
        net = torch.nn.DataParallel(net, device_ids=[0, 1])
    else:
        print('using one gpu')

    ##########hyper parameters setting#################
    # optimizer = Adam(net.parameters(), lr=args.lr)
    optimizer = RAdam(params=net.parameters(), lr=args.lr, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'max',
                                                           factor=0.2,
                                                           patience=4,
                                                           verbose=True)
    # milestones = [x*40 for x in range(10)]
    # print(milestones)
    # scheduler = CyclicCosAnnealingLR(optimizer,milestones=milestones,eta_min=1e-7)

    # criterion = FocalLoss2d().to(device)
    # criterion = BCEDiceLossWeighted().to(device)
    criterion = WeightedBceLoss().to(device)
    criterion2 = SoftDiceLoss().to(device)

    ##########prepare dataset################################
    train_loader, val_loader = build_loader(batch_size=args.batch_size,
                                            num_workers=4)

    history = collections.defaultdict(list)

    best_miou = -100

    for epoch in range(args.num_epochs):
        print("Epoch: {}/{}".format(epoch + 1, args.num_epochs))
        # optimizer.step()
        # scheduler.step(epoch)
        ####################train####################################
        train_hist = train(train_loader, args.num_classes, device, net,
                           optimizer, criterion)
        print('loss', train_hist["loss"], 'miou', train_hist["miou"], 'fg_iou',
              train_hist["fg_iou"], 'mcc', train_hist["mcc"])

        for k, v in train_hist.items():
            history["train " + k].append(v)

    ######################valid##################################
        val_hist = validate(val_loader, args.num_classes, device, net,
                            scheduler, criterion2)
        print('loss', val_hist["loss"], 'miou', val_hist["miou"], 'fg_iou',
              val_hist["fg_iou"], 'mcc', val_hist["mcc"])

        if val_hist["miou"] > best_miou:
            state = {
                "epoch": epoch + 1,
                "model_state": net,
                "best_miou": val_hist["miou"]
            }
            checkpoint = f'{args.model_name}_val_{val_hist["miou"]}_epoch{epoch + 1}.pth'
            torch.save(state, os.path.join(outPath, checkpoint))  # save model
            print("The model has saved successfully!")
            best_miou = val_hist["miou"]

        for k, v in val_hist.items():
            history["val " + k].append(v)

        f = open(file_path, "w+")
        f.write(json.dumps(history))
        f.close()
コード例 #20
0
##### Build Model #####
from model.CADVFi import CAIN
print("Building model: CADVFi")
model = CAIN(depth=args.depth)
# Just make every model to DataParallel
model = torch.nn.DataParallel(model).to(device)
#print(model)

##### Define Loss & Optimizer #####
criterion = Loss(args)

# args.radam is overridden here, so the RAdam branch below is never taken
args.radam = False
if args.radam:
    from radam import RAdam
    optimizer = RAdam(model.parameters(),
                      lr=args.lr,
                      betas=(args.beta1, args.beta2))
else:
    from torch.optim import Adam
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     betas=(args.beta1, args.beta2))
print('# of parameters: %d' % sum(p.numel() for p in model.parameters()))

# If resume, load checkpoint: model + optimizer
if args.resume:
    utils.load_checkpoint(args, model, optimizer)

# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min',
コード例 #21
0
def main():
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('data', metavar='DIR', help='path to dataset')
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('--batch-size-val',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='number of epochs to train (default: 1000)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-4,
                        metavar='LR',
                        help='learning rate (default: 1e-4)')
    parser.add_argument('--image-size',
                        type=float,
                        default=80,
                        metavar='IMSIZE',
                        help='input image size (default: 80)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--multi-gpu',
                        action='store_true',
                        default=False,
                        help='parallel training on multiple GPUs')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    parser.add_argument('--model-save-path',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='For Saving the current Model')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    args = parser.parse_args()

    torch.set_default_tensor_type('torch.FloatTensor')

    device = torch.device("cpu" if args.no_cuda else "cuda")

    train_data = dataset(args.data,
                         "train",
                         args.image_size,
                         transform=transforms.Compose([ToTensor()]),
                         shuffle=True)
    valid_data = dataset(args.data,
                         "val",
                         args.image_size,
                         transform=transforms.Compose([ToTensor()]))

    trainloader = DataLoader(train_data,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=8)
    validloader = DataLoader(valid_data,
                             batch_size=args.batch_size_val,
                             shuffle=False,
                             num_workers=8)

    model = Model(args.image_size, args.image_size)

    optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=1e-8)

    if not args.no_cuda:
        model.cuda()

    if torch.cuda.device_count() > 1 and args.multi_gpu:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model)

    if args.resume:
        model.load_state_dict(
            torch.load(os.path.join(args.resume, model_save_name)))
        optimizer.load_state_dict(
            torch.load(os.path.join(args.resume, optimizer_save_name)))

    train(model, optimizer, trainloader, validloader, device, args)
コード例 #22
0
if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_loader, validation_loader, test_loader, classes = load_CIFAR10_data()

    # model = CIFAR10Model()
    # model = CIFAR10ModelTest()
    # model = resnet18(3, 10)
    # model = CIFAR10ResLayer()
    # model = CIFAR100Model()
    # model, criterion, optimizer = transfer_radam_resnet101(100)

    model = resnet18(3, 10, drop=False)
    summary(model, (3, 32, 32), device="cpu")
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = RAdam(model.parameters())
    # optimizer = optim.Adam(model.parameters())

    accuracies = []
    train_losses = []
    losses = []

    for epoch in range(1, EPOCH + 1):
        loss = train(model, train_loader, optimizer, criterion, device, epoch,
                     1)
        train_losses.append(loss)
        if not TEST:
            acc, loss = validate(model, validation_loader, criterion, device,
                                 epoch, classes, 1)
            accuracies.append(acc)
            losses.append(loss)
コード例 #23
0
def main(args):
    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        AffineAugmenter(min_scale=0.9, max_offset=0.1, rotate=True),
        BrightnessContrastAugmenter(brightness=0.3, contrast=0.3),
        BlurAugmenter(max_kernel=5),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ("image", )),
    ])
    test_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                             train_transforms,
                                             split="train")
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=4,
                                       pin_memory=True,
                                       shuffle=True,
                                       drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                           test_transforms,
                                           split="val")
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=4,
                                     pin_memory=True,
                                     shuffle=False,
                                     drop_last=False)

    print("Creating model...")
    device = torch.device("cuda: 0") if args.gpu else torch.device("cpu")
    model = models.resnext50_32x4d(pretrained=True)
    # for param in model.parameters():
    #     param.requires_grad = False

    model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)

    # model.fc = nn.Sequential(
    #     # nn.BatchNorm1d(model.fc.in_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
    #     # nn.Linear(model.fc.in_features, model.fc.in_features, bias=True),
    #     # nn.ReLU(),
    #     nn.BatchNorm1d(model.fc.in_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
    #     nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True))

    model.to(device)

    # optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=0.01, amsgrad=True)
    optimizer = RAdam(model.parameters(),
                      lr=args.learning_rate)  # , weight_decay=0.01)

    optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=3)
    # optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.2)

    loss_fn = fnn.mse_loss

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           loss_fn,
                           optimizer,
                           device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}".format(
            epoch, train_loss, val_loss))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)
        # with open(f"{args.name}_{epoch}_{train_loss:7.4}_{val_loss:7.4}.pth", "wb") as fp:
        #     torch.save(model.state_dict(), fp)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'),
                                            test_transforms,
                                            split="test")
    test_dataloader = data.DataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=4,
                                      pin_memory=True,
                                      shuffle=False,
                                      drop_last=False)

    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump(
            {
                "image_names": test_dataset.image_names,
                "landmarks": test_predictions
            }, fp)

    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")

    if args.draw:
        print("Drawing landmarks...")
        directory = os.path.join("result",
                                 test_dataset.image_names[0].split('.')[0])
        if not os.path.exists(directory):
            os.makedirs(directory)
        random_idxs = np.random.choice(len(test_dataset.image_names),
                                       size=1000,
                                       replace=False)
        for i, idx in enumerate(random_idxs, 1):
            image = cv2.imread(test_dataset.image_names[idx])
            image = draw_landmarks(image, test_predictions[idx])
            cv2.imwrite(os.path.join("result", test_dataset.image_names[idx]),
                        image)
Code example #24
File: train.py Project: ravehun/fast-transformers
    def configure_optimizers(self):
        from radam import RAdam
        optimizer = RAdam(self.parameters())
        return optimizer
Code example #25
class Trainer:
    def __init__(self, args, train_loader, test_loader, tokenizer):
        self.args = args
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size
        self.pad_id = tokenizer.pad_token_id
        self.eos_id = tokenizer.eos_token_id
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() and not args.no_cuda else
            'cpu', args.local_rank)
        self.writer = SummaryWriter() if args.local_rank in [-1, 0] else None
        self.n_gpus = torch.distributed.get_world_size(
        ) if args.distributed else torch.cuda.device_count()
        assert args.pretrain != args.finetune  # exactly one of pretrain / finetune must be enabled

        if args.pretrained_model:
            self.gpt = torch.load(args.pretrained_model)
        else:
            self.gpt = GPT(vocab_size=self.vocab_size,
                           seq_len=args.max_seq_len,
                           d_model=args.hidden,
                           n_layers=args.n_layers,
                           n_heads=args.n_attn_heads,
                           d_ff=args.ffn_hidden,
                           embd_pdrop=args.embd_dropout,
                           attn_pdrop=args.attn_dropout,
                           resid_pdrop=args.resid_dropout,
                           pad_id=self.pad_id)

        if args.pretrain:
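            # pre-training: plain language-modeling head on top of the shared GPT backbone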
            self.model = GPTLMHead(self.gpt)
            self.model.to(self.device)
        if args.finetune:
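            # fine-tuning: classification head keyed on the EOS token; LM logits are kept for the auxiliary loss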
            with open(args.cached_label_dict, 'r') as file:
                label_dict = json.load(file)
            self.model = GPTClsHead(self.gpt,
                                    n_class=len(label_dict),
                                    cls_token_id=self.eos_id)
            self.model.to(self.device)

        if args.distributed:
            self.model = DistributedDataParallel(self.model,
                                                 device_ids=[args.local_rank],
                                                 output_device=args.local_rank)

        self.optimizer = RAdam(self.model.parameters(), args.lr)
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id).to(
            self.device)
        self.cls_criterion = nn.CrossEntropyLoss().to(self.device)

    @timeit
    def train(self, epoch):
        if self.args.pretrain:
            self.pretrain(epoch)
        if self.args.finetune:
            self.finetune(epoch)

    def pretrain(self, epoch):
        losses = 0
        n_batches, n_samples = len(self.train_loader), len(
            self.train_loader.dataset)

        self.model.train()
        for i, batch in enumerate(self.train_loader):
            inputs = batch[0].to(self.device)
            targets = inputs[:, 1:].contiguous()
            # |inputs| : (batch_size, seq_len), |targets| : (batch_size, seq_len-1)

            lm_logits = self.model(inputs)
            lm_logits = lm_logits[:, :-1].contiguous()
            # |lm_logits| : (batch_size, seq_len-1, vocab_size)

            loss = self.criterion(lm_logits.view(-1, self.vocab_size),
                                  targets.view(-1))
            losses += loss.item()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.args.local_rank in [-1, 0]:
                self.writer.add_scalar('Loss/pre-train', loss.item(),
                                       ((epoch - 1) * n_batches) + i)
                if i % (n_batches // 5) == 0 and i != 0:
                    print('Iteration {} ({}/{})\tLoss: {:.4f}'.format(
                        i, i, n_batches, losses / i))

        print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f}'.format(
            epoch, self.args.local_rank, losses / n_batches))

    def finetune(self, epoch):
        losses, accs = 0, 0
        n_batches, n_samples = len(self.train_loader), len(
            self.train_loader.dataset)  # number of batches / samples seen by this process

        self.model.train()
        for i, batch in enumerate(self.train_loader):
            inputs, labels = map(lambda x: x.to(self.device), batch)
            # |inputs| : (batch_size, seq_len), |labels| : (batch_size)

            lm_logits, cls_logits = self.model(inputs)
            lm_logits = lm_logits[:, :-1].contiguous()
            # |lm_logits| : (batch_size, seq_len-1, vocab_size), |cls_logits| : (batch_size, n_class)

            lm_loss = self.criterion(lm_logits.view(-1, self.vocab_size),
                                     inputs[:, 1:].contiguous().view(-1))
            cls_loss = self.cls_criterion(cls_logits, labels)
            loss = cls_loss + (self.args.auxiliary_ratio * lm_loss)
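            # joint objective: classification loss plus the LM loss weighted by auxiliary_ratio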

            losses += loss.item()
            acc = (cls_logits.argmax(dim=-1) == labels).to(
                dtype=cls_logits.dtype).mean()
            accs += acc

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.args.local_rank in [-1, 0]:
                self.writer.add_scalar('Loss/fine-tune', loss.item(),
                                       ((epoch - 1) * n_batches) + i)
                self.writer.add_scalar('Accuracy/fine-tune', acc,
                                       ((epoch - 1) * n_batches) + i)
                if i % (n_batches // 5) == 0 and i != 0:
                    print('Iteration {} ({}/{})\tLoss: {:.4f} Acc: {:.1f}%'.
                          format(i, i, n_batches, losses / i, accs / i * 100.))

        print(
            'Train Epoch {} [rank: {}]\t>\tLoss: {:.4f} / Acc: {:.1f}%'.format(
                epoch, self.args.local_rank, losses / n_batches,
                accs / n_batches * 100.))

    def evaluate(self, epoch):
        losses, accs = 0, 0
        n_batches, n_samples = len(self.test_loader), len(
            self.test_loader.dataset)

        self.model.eval()
        with torch.no_grad():
            for i, batch in enumerate(self.test_loader):
                if self.args.pretrain:
                    inputs = batch.to(self.device)
                    targets = inputs[:, 1:].contiguous()

                    lm_logits = self.model(inputs)
                    lm_logits = lm_logits[:, :-1].contiguous()

                    loss = self.criterion(lm_logits.view(-1, self.vocab_size),
                                          targets.view(-1))
                    losses += loss.item()

                    if self.args.local_rank in [-1, 0]:
                        self.writer.add_scalar('Loss/pre-train(eval)',
                                               loss.item(),
                                               ((epoch - 1) * n_batches) + i)

                elif self.args.finetune:
                    inputs, labels = map(lambda x: x.to(self.device), batch)

                    lm_logits, cls_logits = self.model(inputs)
                    lm_logits = lm_logits[:, :-1].contiguous()

                    lm_loss = self.criterion(
                        lm_logits.view(-1, self.vocab_size),
                        inputs[:, 1:].contiguous().view(-1))
                    cls_loss = self.cls_criterion(cls_logits, labels)
                    loss = cls_loss + (self.args.auxiliary_ratio * lm_loss)

                    losses += loss.item()
                    acc = (cls_logits.argmax(dim=-1) == labels).to(
                        dtype=cls_logits.dtype).mean()
                    accs += acc

                    if self.args.local_rank in [-1, 0]:
                        self.writer.add_scalar('Loss/fine-tune(eval)',
                                               loss.item(),
                                               ((epoch - 1) * n_batches) + i)
                        self.writer.add_scalar('Accuracy/fine-tune(eval)', acc,
                                               ((epoch - 1) * n_batches) + i)

        print(
            'Eval Epoch {} [rank: {}]\t>\tLoss: {:.4f} / Acc: {:.1f}%'.format(
                epoch, self.args.local_rank, losses / n_batches,
                accs / n_batches * 100.))

    def save(self, epoch, model_prefix='model', root='.model'):
        path = Path(root) / (model_prefix + '.ep%d' % epoch)
        if not path.parent.exists():
            path.parent.mkdir()

        if self.args.distributed:
            if self.args.local_rank == 0:
                torch.save(self.gpt, path)
        else:
            torch.save(self.gpt, path)
Code example #26
File: train.py Project: markson14/Kaggle
def train_model(train_parameters):

    k = train_parameters["k"]
    loaders = train_parameters["loaders"]
    num_epochs = train_parameters["num_epochs"]
    net = train_parameters["net"]
    ENCODER = train_parameters["ENCODER"]
    ENCODER_WEIGHTS = train_parameters["ENCODER_WEIGHTS"]
    ACTIVATION = train_parameters["ACTIVATION"]

    model = load_model(net, ENCODER, ENCODER_WEIGHTS, ACTIVATION)
    """ multi-gpu """
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    model.to("cuda")

    #     if k==0:
    #         summary(model.module.encoder,(3,384,576))

    logdir = "./logs/segmentation_{}_{}Fold".format(net, k)

    # model, criterion, optimizer
    optimizer = RAdam([
        {
            'params': model.module.decoder.parameters(),
            'lr': 1e-2
        },
        {
            'params': model.module.encoder.parameters(),
            'lr': 1e-3
        },
        #         {'params': model.decoder.parameters(), 'lr': 1e-2},
        #         {'params': model.encoder.parameters(), 'lr': 1e-3},
    ])
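    # two parameter groups: the randomly initialized decoder gets a higher LR (1e-2) than the pretrained encoder (1e-3)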

    criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
    #     criterion = FocalLoss()
    #     criterion = FocalDiceLoss()
    # criterion = smp.utils.losses.DiceLoss(eps=1.)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
    runner = SupervisedRunner()

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=[
            EarlyStoppingCallback(patience=10, min_delta=0.001),
            DiceCallback()
        ],
        #                    AUCCallback(),
        #                    IouCallback()],
        logdir=logdir,
        num_epochs=num_epochs,
        verbose=True)

    del loaders, optimizer, scheduler, model, runner
    torch.cuda.empty_cache()
    gc.collect()
    print("Collect GPU cache")
Code example #27
File: t5_assin.py Project: unicamp-dl/PTT5
    def configure_optimizers(self):
        return RAdam(self.parameters(), lr=self.hparams.lr)
Code example #28
class TasNET_trainer(object):
    def __init__(self,
                 TasNET,
                 batch_size,
                 checkpoint="checkpoint",
                 log_folder="./log",
                 rnn_arch="LSTM",
                 optimizer='radam',
                 rerun_mode=False,
                 lr=1e-5,
                 momentum=0.9,
                 weight_decay=0,
                 num_epoches=20,
                 clip_norm=False,
                 sr=8000,
                 cudnnBenchmark=True):
        
        logger.info('---Experiment Variables---')
        logger.info('RNN Architecture: '+rnn_arch)
        logger.info('Batch Size      : '+str(batch_size))
        logger.info('Optimizer       : '+optimizer)
        logger.info('--------------------------\n')

        logger.info('Rerun mode: '+str(rerun_mode))
       
        self.TasNET = TasNET

        self.log_folder = log_folder
        self.writer = SummaryWriter(log_folder)
        self.all_log = 'all_log.log' #all log filename
        self.log('Progress Log save path: '+log_folder)

        self.log("TasNET:\n{}".format(self.TasNET))
        if type(lr) is str:
            lr = float(lr)
            logger.info("Transform lr from str to float => {}".format(lr))
        
        self.log('Batch size used: '+str(batch_size))

        if optimizer == 'radam':
            self.optimizer = RAdam(self.TasNET.parameters(), lr=lr, weight_decay=weight_decay)
        else:
            self.optimizer = torch.optim.Adam(
                self.TasNET.parameters(),
                lr=lr,
                weight_decay=weight_decay)
        
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,
                            'min', factor=0.5, patience=3,verbose=True)
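        # halve the learning rate after 3 epochs without improvement in the monitored (dev) loss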
        self.TasNET.to(device)

        self.checkpoint = checkpoint
        self.log('Model save path: '+checkpoint)
        
        self.num_epoches = num_epoches
        self.clip_norm = clip_norm
        self.sr = sr

        if self.clip_norm:
            self.log("Clip gradient by 2-norm {}".format(clip_norm))

        if not os.path.exists(self.checkpoint):
            os.makedirs(checkpoint)
            
        torch.backends.cudnn.benchmark=cudnnBenchmark
        self.log('cudnn benchmark status: '+str(torch.backends.cudnn.benchmark)+'\n')


    def SISNR(self, output, target):
        #output:(128,4000)
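        # returns the negative SI-SNR in dB for each example, so minimizing the loss maximizes SI-SNR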
        batchsize = np.shape(output)[0]
        target = target.view(batchsize,-1)
        output = output - torch.mean(output,1,keepdim=True)
        target = target - torch.mean(target,1,keepdim=True)

        s_shat = torch.sum(output*target,1,keepdim=True)
        s_2 = torch.sum(target**2,1,keepdim=True)
        s_target = (s_shat / s_2) * target   #(128,4000)

        e_noise = output - s_target    

        return 10*torch.log10(torch.sum(e_noise**2,1,keepdim=True)\
                    /torch.sum(s_target**2,1,keepdim=True))        #(128,1)


    def loss(self, output1, output2, target1, target2):
        # PIT (permutation-invariant training) loss: keep the better of the two speaker assignments
        loss1 = self.SISNR(output1, target1) + self.SISNR(output2, target2)
        loss2 = self.SISNR(output1, target2) + self.SISNR(output2, target1)
        min_loss = torch.min(loss1, loss2)  # (128, 1)
        return torch.mean(min_loss)  # scalar

    def train(self, dataloader, epoch):
        self.TasNET.train()
        self.log("Training...")
        tot_loss = 0
        tot_batch = len(dataloader)
        batch_indx = (epoch-1)*tot_batch

        currProcess = 0
        fivePercentProgress = max(1, tot_batch // 20)  # avoid modulo-by-zero on very short epochs

        for mix_speech, speech1, speech2 in dataloader:
            self.optimizer.zero_grad()

            if torch.cuda.is_available():
                mix_speech= mix_speech.cuda()
                speech1 = speech1.cuda()
                speech2 = speech2.cuda()

            mix_speech = Variable(mix_speech)
            speech1 = Variable(speech1)
            speech2 = Variable(speech2)

            output1, output2 = self.TasNET(mix_speech)
            cur_loss = self.loss(output1,output2,speech1,speech2)
            tot_loss += cur_loss.item()
            
            #write summary
            batch_indx += 1
            self.writer.add_scalar('train_loss', cur_loss, batch_indx)
            cur_loss.backward()
            if self.clip_norm:
                nn.utils.clip_grad_norm_(self.TasNET.parameters(),
                                         self.clip_norm)
            self.optimizer.step()
            currProcess+=1
            if currProcess % fivePercentProgress == 0:
                self.log('batch {}: {:.2f}% progress ({}/{})| LR: {}'.format(batch_indx, currProcess*100/tot_batch, currProcess, tot_batch, str(self.get_curr_lr())))
                
        return tot_loss / tot_batch, tot_batch

    def validate(self, dataloader, epoch):
        """one epoch"""
        self.TasNET.eval()
        self.log("Evaluating...")
        tot_loss = 0
        tot_batch = len(dataloader)
        batch_indx = (epoch-1)*tot_batch

        currProcess = 0
        fivePercentProgress = max(1, tot_batch // 20)  # avoid modulo-by-zero on very short epochs
        #print(tot_batch)

        with torch.no_grad():
            for mix_speech,speech1,speech2 in dataloader:
                if torch.cuda.is_available():
                    mix_speech = mix_speech.cuda()
                    speech1 = speech1.cuda()
                    speech2 = speech2.cuda()

                mix_speech = Variable(mix_speech)
                speech1 = Variable(speech1)
                speech2 = Variable(speech2)

                output1, output2 = self.TasNET(mix_speech)
                cur_loss = self.loss(output1,output2,speech1,speech2)
                tot_loss += cur_loss.item()
                #write summary
                batch_indx += 1
                currProcess += 1
                if currProcess % fivePercentProgress == 0:
                    self.log('batch {}: {:.2f}% progress ({}/{})| LR: {}'.format(batch_indx, currProcess*100/tot_batch, currProcess, tot_batch, str(self.get_curr_lr())))
                self.writer.add_scalar('dev_loss', cur_loss, batch_indx)
        return tot_loss / tot_batch, tot_batch

    def run(self, train_set, dev_set):
        init_loss, _ = self.validate(dev_set,1)
        self.log("Start training for {} epoches".format(self.num_epoches))
        self.log("Epoch {:2d}: dev loss ={:.4e}".format(0, init_loss))
        torch.save(self.TasNET.state_dict(), os.path.join(self.checkpoint, 'TasNET_0.pkl'))
        for epoch in range(1, self.num_epoches+1):
            train_start = time.time()
            train_loss, train_num_batch = self.train(train_set, epoch)
            valid_start = time.time()
            valid_loss, valid_num_batch = self.validate(dev_set, epoch)
            valid_end = time.time()
            self.scheduler.step(valid_loss)
            self.log(
                "Epoch {:2d}: train loss = {:.4e}({:.2f}s/{:d}) |"
                " dev loss= {:.4e}({:.2f}s/{:d})".format(
                    epoch, train_loss, valid_start - train_start,
                    train_num_batch, valid_loss, valid_end - valid_start,
                    valid_num_batch))
            save_path = os.path.join(
                self.checkpoint, "TasNET_{:d}_trainloss_{:.4e}_valloss_{:.4e}.pkl".format(
                    epoch, train_loss, valid_loss))
            torch.save(self.TasNET.state_dict(), save_path)
        self.log("Training for {} epoches done!".format(self.num_epoches))
    
    def rerun(self, train_set, dev_set, model_path, epoch_done):
        self.TasNET.load_state_dict(torch.load(model_path))
        # init_loss, _ = self.validate(dev_set,epoch_done)
        # logger.info("Start training for {} epoches".format(self.num_epoches))
        # logger.info("Epoch {:2d}: dev loss ={:.4e}".format(0, init_loss))
        # torch.save(self.TasNET.state_dict(), os.path.join(self.checkpoint, 'TasNET_0.pkl'))
        for epoch in range(epoch_done+1, self.num_epoches+1):
            train_start = time.time()
            train_loss, train_num_batch = self.train(train_set,epoch)
            valid_start = time.time()
            valid_loss, valid_num_batch = self.validate(dev_set,epoch)
            valid_end = time.time()
            self.scheduler.step(valid_loss)
            self.log(
                "Epoch {:2d}: train loss = {:.4e}({:.2f}s/{:d}) |"
                " dev loss= {:.4e}({:.2f}s/{:d})".format(
                    epoch, train_loss, valid_start - train_start,
                    train_num_batch, valid_loss, valid_end - valid_start,
                    valid_num_batch))
            save_path = os.path.join(
                self.checkpoint, "TasNET_{:d}_trainloss_{:.4e}_valloss_{:.4e}.pkl".format(
                    epoch, train_loss, valid_loss))
            torch.save(self.TasNET.state_dict(), save_path)
        self.log("Training for {} epoches done!".format(self.num_epoches))

    def get_curr_lr(self):
        for i, param_group in enumerate(self.optimizer.param_groups):
            curr_lr = float(param_group['lr'])
            return curr_lr
    
    def log(self, log_data):
        logger.info(log_data)
        try:
            with open(self.log_folder + '/' + self.all_log, 'a+') as f:
                f.write(log_data + '\n')
        except OSError:
            logger.info('failed to save last log')
Code example #29
File: coconut_train.py Project: AlanChaw/WTI
def train():
    # Init Training
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased',
                                           output_hidden_states=True,
                                           output_attentions=True)
    bert_model.eval()
    center_loss = None

    data_loaders = get_data_loader()
    coconut_model = CoconutModel(num_of_classes=args.num_of_classes,
                                 num_of_group=args.num_of_group,
                                 feature_size=args.feature_size)

    if torch.cuda.is_available():
        coconut_model = coconut_model.cuda()
        bert_model = bert_model.cuda()

    optimizer = RAdam(params=coconut_model.parameters(),  # the optimizer needs the parameter iterable, not the module
                      lr=args.lr,
                      betas=(0.0, 0.999),
                      eps=1e-3,
                      weight_decay=args.l2_reg)

    # optimizer = torch.optim.SGD(params,
    #                             lr=args.lr,
    #                             momentum=0.9,
    #                             nesterov=True,
    #                             weight_decay=args.l2_reg)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                        milestones=[80, 150],
                                                        gamma=0.1)

    starting_epoch = 0

    if args.resume:
        checkpoints = load_model(model=coconut_model,
                                 optimizer=optimizer,
                                 lr_scheduler=lr_scheduler,
                                 center_loss=center_loss)
        (starting_epoch, coconut_model, optimizer, lr_scheduler,
         center_loss) = checkpoints

    for epoch in range(starting_epoch, args.epoch):
        train_model(epoch=epoch,
                    model=coconut_model,
                    optimizer=optimizer,
                    loader=data_loaders["train_loader"],
                    tokenizer=tokenizer,
                    bert_model=bert_model,
                    center_loss=center_loss)
        lr_scheduler.step()
        eval_model(epoch=epoch,
                   model=coconut_model,
                   loader=data_loaders["dev_loader"],
                   tokenizer=tokenizer,
                   bert_model=bert_model)
        save_mode(epoch=epoch,
                  model=coconut_model,
                  optimizer=optimizer,
                  lr_scheduler=lr_scheduler,
                  center_loss=center_loss)
    return
Code example #30
File: hack_train.py Project: gggrafff/MadeCvHw1
def main(args):
    # 1. prepare data & models
    train_transforms = transforms.Compose([
        ScaleMinSideToSize((CROP_SIZE, CROP_SIZE)),
        CropCenter(CROP_SIZE),
        TransformByKeys(transforms.ToPILImage(), ("image", )),
        TransformByKeys(transforms.ToTensor(), ("image", )),
        TransformByKeys(
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ("image", )),
    ])

    print("Reading data...")
    train_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                             train_transforms,
                                             split="train")
    train_dataloader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       num_workers=1,
                                       pin_memory=True,
                                       shuffle=True,
                                       drop_last=True)
    val_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'train'),
                                           train_transforms,
                                           split="val")
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=args.batch_size,
                                     num_workers=1,
                                     pin_memory=True,
                                     shuffle=False,
                                     drop_last=False)

    print("Creating model...")
    device = torch.device("cuda:0") if args.gpu else torch.device("cpu")
    #model = models.resnet18(pretrained=True)
    #model.fc = nn.Linear(model.fc.in_features, 2 * NUM_PTS, bias=True)

    #model = models.densenet161(pretrained=True)
    #model.classifier = nn.Linear(model.classifier.in_features, 2 * NUM_PTS, bias=True)

    model = EfficientNet.from_pretrained('efficientnet-b4',
                                         num_classes=2 * NUM_PTS)
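    # EfficientNet-B4 backbone with its classifier resized to regress 2 * NUM_PTS landmark coordinates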

    if args.oldname:
        with open(f"{args.oldname}_best.pth", "rb") as fp:
            best_state_dict = torch.load(fp, map_location="cpu")
            model.load_state_dict(best_state_dict)

    model.to(device)

    #optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, amsgrad=True)
    optimizer = RAdam(model.parameters(), lr=args.learning_rate)

    loss_fn = fnn.mse_loss
    lr_scheduler = ReduceLROnPlateau(optimizer)

    # 2. train & validate
    print("Ready for training...")
    best_val_loss = np.inf
    for epoch in range(args.epochs):
        train_loss = train(model,
                           train_dataloader,
                           loss_fn,
                           optimizer,
                           device=device)
        val_loss = validate(model, val_dataloader, loss_fn, device=device)
        print("Epoch #{:2}:\ttrain loss: {:5.2}\tval loss: {:5.2}\tlr: {:5.2}".
              format(epoch, train_loss, val_loss, get_lr(optimizer)))
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            with open(f"{args.name}_best.pth", "wb") as fp:
                torch.save(model.state_dict(), fp)
            os.system(
                f'cp /content/MadeCvHw1/{args.name}_best.pth "/content/drive/My Drive/MADE/CV_HW1/"'
            )
        lr_scheduler.step(train_loss)

    # 3. predict
    test_dataset = ThousandLandmarksDataset(os.path.join(args.data, 'test'),
                                            train_transforms,
                                            split="test")
    test_dataloader = data.DataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=1,
                                      pin_memory=True,
                                      shuffle=False,
                                      drop_last=False)

    with open(f"{args.name}_best.pth", "rb") as fp:
        best_state_dict = torch.load(fp, map_location="cpu")
        model.load_state_dict(best_state_dict)

    test_predictions = predict(model, test_dataloader, device)
    with open(f"{args.name}_test_predictions.pkl", "wb") as fp:
        pickle.dump(
            {
                "image_names": test_dataset.image_names,
                "landmarks": test_predictions
            }, fp)

    create_submission(args.data, test_predictions, f"{args.name}_submit.csv")