def test(config):
    """Restore a trained QA model from a hard-coded NSML session/checkpoint
    and bind it for submission.

    Args:
        config: experiment configuration namespace carrying ``token``,
            ``data_reader``, ``model`` sub-configs, ``seed_num`` and the
            ``nsml.pause`` flag.

    Fixes vs. original: ``NSML_SESSEION`` typo renamed (local only),
    ``type(x) != dict`` replaced with ``isinstance``.
    NOTE(review): relies on a module-level ``device`` — confirm it is defined.
    """
    NSML_SESSION = 'team_6/19_tcls_qa/80'  # NOTE: need to hard code
    NSML_CHECKPOINT = '13800'  # NOTE: need to hard code
    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSION is not None, "You must insert NSML Session's name for submit"

    set_global_seed(config.seed_num)

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    # The data reader consumes the tokenizers directly.
    config.data_reader.tokenizers = tokenizers
    data_reader = create_by_factory(DataReaderFactory, config.data_reader)

    def bind_load_vocabs(config, token_makers):
        # Register an NSML `load` hook that rebuilds vocabularies from the
        # checkpoint's stored vocab texts and attaches them to token makers.
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})
                texts = checkpoint["vocab_texts"][token_name]
                # Namespace-style configs are converted to plain dicts so
                # they can be splatted into Vocab(**...).
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name, **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    # Raw to Tensor Function
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device)
    trainer = Trainer(model, metric_key="f1")

    if nsml.IS_ON_NSML:
        bind_nsml(model, trainer=trainer, raw_to_tensor_fn=raw_to_tensor_fn)
        if config.nsml.pause:
            nsml.paused(scope=locals())
def main():
    """Train the Inpaint model under NSML: L1 loss on composed outputs,
    periodic sample dumps and local evaluation.

    NOTE(review): relies on module-level `dir_data_root`, `use_nsml`,
    `l1_loss`, `compose`, `local_eval`, `bind_nsml` — confirm against the
    rest of the file.
    """
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = Inpaint()
    model = model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=args.lr,
                             betas=(args.beta1, args.beta2))
    # bind_nsml returns local save/load callbacks used when running off-NSML.
    save, load = bind_nsml(model, optim)
    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode == 'train':
        path_train = os.path.join(dir_data_root, 'train')
        path_train_data = os.path.join(dir_data_root, 'train', 'train_data')
        tr_loader, val_loader = data_loader_with_split(path_train,
                                                       batch_size=args.batch_size)
        postfix = dict()
        total_step = 0
        for epoch in trange(args.num_epochs, disable=use_nsml):
            pbar = tqdm(enumerate(tr_loader), total=len(tr_loader),
                        disable=use_nsml)
            for step, (_, x_input, mask, x_GT) in pbar:
                total_step += 1
                x_GT = x_GT.to(device)
                x_input = x_input.to(device)
                mask = mask.to(device)
                # Generator input is the masked image concatenated with the
                # mask along the channel axis.
                x_mask = torch.cat([x_input, mask], dim=1)

                model.zero_grad()
                x_hat = model(x_mask)
                # Loss is computed on the composition of prediction and
                # ground truth outside the mask.
                x_composed = compose(x_input, x_hat, mask)
                loss = l1_loss(x_composed, x_GT)
                loss.backward()
                optim.step()

                postfix['loss'] = loss.item()
                if use_nsml:
                    postfix['epoch'] = epoch
                    postfix['step_'] = step
                    postfix['total_step'] = total_step
                    postfix['steps_per_epoch'] = len(tr_loader)
                # NOTE(review): this also fires at step 0 of every epoch.
                if step % args.eval_every == 0:
                    vutils.save_image(x_GT, 'x_GT.png', normalize=True)
                    vutils.save_image(x_input, 'x_input.png', normalize=True)
                    vutils.save_image(x_hat, 'x_hat.png', normalize=True)
                    vutils.save_image(mask, 'mask.png', normalize=True)
                    metric_eval = local_eval(model, val_loader, path_train_data)
                    postfix['metric_eval'] = metric_eval
                if use_nsml:
                    if step % args.print_every == 0:
                        print(postfix)
                    nsml.report(**postfix, scope=locals(), step=total_step)
                else:
                    pbar.set_postfix(postfix)
            if use_nsml:
                nsml.save(epoch)
            else:
                save(epoch)
def main(args):
    """Load six pretrained tower models from fixed NSML checkpoints, merge
    them into one multi-input/multi-output Keras model, and save it.

    Args:
        args: parsed CLI namespace with ``mode``, ``task`` and ``pause``.

    Fix vs. original: the six hand-copied ``bind_nsml``/``nsml.load`` pairs
    are replaced by one data-driven loop over (checkpoint, session) pairs —
    same bindings, same load order.
    """
    search_file(DATASET_PATH)

    if args.mode == 'train':
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE)
    else:
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE,
                                            use_imagenet=None)

    # (checkpoint, session) for each ensemble member, in load order.
    checkpoints = [
        ('193_base_611', 'team_27/airush2/229'),
        ('193_base_202', 'team_27/airush2/645'),
        ('part_03_114', 'team_27/airush2/671'),
        ('part_03_146', 'team_27/airush2/673'),
        ('part_03_94', 'team_27/airush2/684'),
        ('part_03_57', 'team_27/airush2/688'),
    ]
    models = [build_model(2600) for _ in checkpoints]

    print('feature_ext_model.output.shape[1]', feature_ext_model.output.shape[1])

    if use_nsml:
        # Each bind re-targets NSML's load hook at the next tower so the
        # following nsml.load restores that tower's weights.
        for member, (checkpoint, session) in zip(models, checkpoints):
            bind_nsml(feature_ext_model, member, args.task)
            nsml.load(checkpoint=checkpoint, session=session)

    merge_model = Model(
        inputs=[m.input for m in models],
        outputs=[m.output for m in models],
    )

    bind_nsml(feature_ext_model, merge_model, args.task)
    nsml.save('dgu_final')  # merging

    if args.pause:
        nsml.paused(scope=locals())
def main():
    """Build model/optimizer/scheduler from CLI args, bind to NSML, train,
    then run a forward pass over the test split and save a CSV result.

    Fix vs. original: the ``args.use_dropout == 0`` normalization was
    duplicated verbatim; the second copy is removed.
    """
    args = get_args()
    # Normalize the integer CLI flag to a proper boolean.
    if args.use_dropout == 0:
        args.use_dropout = False

    for x in vars(args).items():
        print(x)

    # Cosine annealing (lr_sch == 5) is only supported on torch 0.4.0 here.
    if args.lr_sch == 5 and torch.__version__ != '0.4.0':
        print("for cosine annealing, change to torch==0.4.0 in setup.py")
        raise AssertionError()
    elif args.lr_sch != 5 and torch.__version__ == '0.4.0':
        print("warning : this is torch version {}! nsml report will not be recorded".format(torch.__version__))

    model, optimizer, scheduler = model_all.get_model(args)

    if args.use_gpu:
        if torch.cuda.device_count() > 1:
            print("[gpu] Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            model = torch.nn.DataParallel(model)
        elif torch.cuda.device_count() == 1:
            print("[gpu] Let's use", torch.cuda.device_count(), "GPUs!")
        else:
            print("[gpu] no available gpus")
        model = model.cuda()

    nsml.bind(infer=infer, model=model, optimizer=optimizer)
    if args.pause:
        nsml.paused(scope=locals())
    nsml.save()

    if args.mode == 'train':
        dataloaders, dataset_sizes = utils.data_loader(args, train=True,
                                                       batch_size=args.batch_size)
        model = train.train_test(model, optimizer, scheduler, dataloaders,
                                 dataset_sizes, args)
        utils.save_model(model, 'model_state')
        with open('args.pickle', 'wb') as farg:
            pickle.dump(args, farg)

        # Forward pass over the test split; results are persisted via a
        # custom NSML save hook that writes a CSV.
        loader = utils.data_loader(args, train=False, batch_size=1)
        predict, acc = utils.get_forward_result(model, loader, args)
        predict = torch.cat(predict, 0)
        nsml.bind(save=lambda x: utils.save_csv(
            x,
            data_csv_fname=os.path.join(DATASET_PATH, 'train', 'test') + '/test_data',
            results=predict,
            test_loader=loader))
        nsml.save('result')
def main(config, scope):
    """GAN driver: set up output dirs, data loader and Solver, then register
    NSML callbacks and run train/sample mode."""
    # Create directories if not exist.
    if not os.path.exists(config.log_path):
        os.makedirs(config.log_path)
    if not os.path.exists(config.model_save_path):
        os.makedirs(config.model_save_path)
    if not os.path.exists(config.sample_path):
        os.makedirs(config.sample_path)

    # In sample mode the batch size is repurposed as the sample count.
    if config.mode == 'sample':
        config.batch_size = config.sample_size

    # Data loader
    data_loader = get_loader(config.image_path, config.image_size,
                             config.batch_size, config.num_workers)
    # Solver
    solver = Solver(data_loader, config)

    def load(filename, *args):
        solver.load(filename)

    def save(filename, *args):
        solver.save(filename)

    def infer(input):
        result = solver.infer(input)
        # convert tensor to dataurl
        # NOTE(review): `[''] * input` only works if `input` is an int
        # (a sample count). If `input` is a batch, this should probably be
        # `len(result)` — confirm the NSML infer contract.
        data_url_list = [''] * input
        for idx, sample in enumerate(result):
            # Scale [0,1] float CHW tensors to uint8 HWC images.
            numpy_array = np.uint8(sample.cpu().numpy() * 255)
            image = Image.fromarray(np.transpose(numpy_array, axes=(1, 2, 0)), 'RGB')
            temp_out = BytesIO()
            image.save(temp_out, format=OUTPUT_FORMAT)
            byte_data = temp_out.getvalue()
            data_url_list[idx] = u'data:image/{format};base64,{data}'.\
                format(format=OUTPUT_FORMAT,
                       data=base64.b64encode(byte_data).decode('ascii'))
        return data_url_list

    def evaluate(test_data, output):
        pass

    def decode(input):
        return input

    # NOTE(review): callbacks passed positionally; nsml.bind is usually
    # called with keywords (save=, load=, infer=...) — verify this order
    # matches the installed nsml client's signature.
    nsml.bind(save, load, infer, evaluate, decode)
    if config.pause:
        nsml.paused(scope=scope)

    if config.mode == 'train':
        solver.train()
    elif config.mode == 'sample':
        solver.sample()
def main(args):
    """Train a LightGBM binary classifier on pre-extracted CNN features and
    save the result under NSML.

    Fix vs. original: the model was fit twice back to back — the first fit
    (without an eval set) was entirely discarded by the second. The redundant
    first fit and the unused ``eval_set``/``start_time`` locals are removed;
    the surviving fit is unchanged.
    """
    cnn_model = build_cnn_model(backbone=MobileNetV2, use_imagenet=None)
    gbm_model = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=3,  # Updated from 'nthread'
        silent=False,
        max_depth=params['max_depth'],
        max_bin=params['max_bin'],
        subsample_for_bin=params['subsample_for_bin'],
        subsample=params['subsample'],
        subsample_freq=params['subsample_freq'],
        min_split_gain=params['min_split_gain'],
        min_child_weight=params['min_child_weight'],
        min_child_samples=params['min_child_samples'],
        scale_pos_weight=params['scale_pos_weight'])

    if use_nsml:
        bind_nsml(cnn_model, gbm_model)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == 'train':
        TotalX = np.load('TrainX.npy')
        TotalY = np.load('TrainY.npy')
        print('TotalX.shape', TotalX.shape, 'TotalY.shape', TotalY.shape)

        X_train, X_test, Y_train, Y_test = train_test_split(
            TotalX, TotalY, test_size=0.05, random_state=777)
        print('X_train.shape', X_train.shape, 'X_test.shape', X_test.shape,
              'Y_train.shape', Y_train.shape, 'Y_test.shape', Y_test.shape)

        # Single fit with early stopping monitored on the held-out split.
        gbm_model.fit(X_train, Y_train,
                      eval_set=[(X_test, Y_test)],
                      eval_metric='binary_error',
                      early_stopping_rounds=50)
        nsml.save('last')
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train'):
    """Build the model described by the named experiment config, bind it to
    NSML, and fit it when running in train mode.

    Args:
        experiment_name: module suffix under ``spam.training.experiments``.
        pause: hand control to NSML (submission/serving) when True.
        mode: only ``'train'`` triggers fitting.
    """
    config = import_module(f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)

    if pause:
        nsml.paused(scope=locals())
    if mode != 'train':
        return
    model.fit(**config['fit_kwargs'])
def main(args):
    """Build the feature extractor + classifier for each configured model
    and bind them to NSML.

    Fix vs. original: the flag check referenced a bare, undefined name
    ``use_history_image_f`` (NameError at runtime); it now reads the key
    from ``model_info``, which every config dict defines.
    """
    search_file(DATASET_PATH)

    model_list = []
    model1 = {
        'backbone': MobileNetV2,
        'input_shape': (224, 224, 3),
        'use_history_image_f': True,
        'Generator': AiRushDataGenerator
    }
    # Kept for reference: alternative config used by another session.
    model229 = {
        'backbone': MobileNetV2,
        'input_shape': (224, 224, 3),
        'use_history_image_f': True,
        'Generator': AiRushDataGenerator
    }
    model_list.append(model1)

    for model_info in model_list:
        if args.mode == 'train':
            feature_ext_model = build_cnn_model(
                backbone=model_info['backbone'],
                input_shape=model_info['input_shape'])
        else:
            feature_ext_model = build_cnn_model(
                backbone=model_info['backbone'],
                input_shape=model_info['input_shape'],
                use_imagenet=None)

        # 97 + 84 + 9 tabular features, plus one or two image embeddings
        # depending on whether the history image is used.
        if model_info['use_history_image_f']:
            in_feature_num = int(97 + 84 + 9 + feature_ext_model.output.shape[1] * 2)
        else:
            in_feature_num = int(97 + 84 + 9 + feature_ext_model.output.shape[1])
        print('in_feature_num', in_feature_num)

        model = build_model(in_feature_num)
        print('feature_ext_model.output.shape[1]', feature_ext_model.output.shape[1])

        if use_nsml:
            bind_nsml(feature_ext_model, model, args.task)

    if args.pause:
        nsml.paused(scope=locals())
def main():
    """Super-resolution training entry point: dynamically import the network
    architecture named by ``--network_archi``, bind to NSML, and train."""
    global opt, model
    opt = parser.parse_args()
    cudnn.benchmark = True
    log = Logger()

    # Building model — the architecture module must expose `Net` and
    # `criterion` factories.
    module_net = import_module('model.' + opt.network_archi)
    model = getattr(module_net, 'Net')()
    criterion = getattr(module_net, 'criterion')()
    model = model.cuda()
    criterion = criterion.cuda()

    # Setting Optimizer — only trainable parameters are optimized.
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=opt.lr)

    # *** Reserved for nsml ***
    bind_nsml(model, optimizer)
    if opt.pause:
        nsml.paused(scope=locals())
    # *** Reserved for nsml *** (end)

    if opt.mode == "train":
        if IS_ON_NSML:
            opt.dataset_path = os.path.join(DATASET_PATH, 'train', 'train_data')
        else:
            opt.dataset_path = '/home/data/nipa_faces_sr_tmp2/train/train_data'  # local datapath
        training_data_loader, val_loader = data_loader_with_split(
            opt.dataset_path, train_split=0.9, batch_size=opt.batchSize)

        # Training
        # NOTE(review): training only happens for "edsr*" architectures —
        # other values of network_archi silently do nothing per epoch.
        for epoch in range(opt.nEpochs):
            if opt.network_archi.startswith("edsr"):
                average_epoch_loss_train = train(training_data_loader, val_loader,
                                                 optimizer, model, criterion, epoch)
                info = {'train_loss': average_epoch_loss_train}
                nsml.save(str(epoch + 1))
                for tag, value in info.items():
                    log.scalar_summary(tag, value, epoch)
def main(config, local):
    """Seed all RNGs, build vocab/feature/datasets, and run the Trainer
    under NSML.

    Args:
        config: parsed configuration namespace.
        local: caller's scope dict, forwarded to ``nsml.paused``.
    """
    # random seed
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.random.manual_seed(config.random_seed)
    if config.device == 'cuda':
        torch.cuda.manual_seed_all(config.random_seed)

    vocab = Vocabulary(config)
    print(f'Vocabulary loaded')
    feature = Feature(config)
    print(f'Feature data loaded')

    # NOTE(review): shape knobs injected into the shared config; confirm
    # downstream consumers expect char_vocab_size=0 / class_size=1.
    setattr(config, 'char_vocab_size', 0)
    setattr(config, 'class_size', 1)

    if config.mode == 'train':
        train_question_file_path = os.path.join(config.data_dir, config.train_file_name)
        train_label_file_path = os.path.join(config.data_dir, config.train_label_file_name)
        train_dataset = Dataset(train_question_file_path, train_label_file_path,
                                vocab, feature, mode='train')
        train_data_loader = DataLoader(train_dataset, batch_size=config.batch_size,
                                       shuffle=True)

        validation_question_file_path = os.path.join(config.data_dir, config.validation_file_name)
        validation_label_file_path = os.path.join(config.data_dir, config.validation_label_file_name)
        validation_dataset = Dataset(validation_question_file_path,
                                     validation_label_file_path,
                                     vocab, feature, mode='validation')
        validation_data_loader = DataLoader(validation_dataset,
                                            batch_size=config.batch_size)
    else:
        # In pause/inference mode the trainer is built without data loaders.
        train_data_loader = None
        validation_data_loader = None
    print(f'{config.mode} Dataset loaded')

    trainer = Trainer(config, feature, train_data_loader, validation_data_loader)
    print(f'Trainer loaded')

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, feature, config)
    if config.pause:
        nsml.paused(scope=local)

    if config.mode == 'train':
        print(f'Starting training')
        trainer.train()
        print(f'Finishing training')
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train'):
    """Load a previously tuned checkpoint into the experiment's model,
    re-save it as 'best', then continue fitting.

    The session/checkpoint to restore is hard-coded; earlier candidates are
    kept below for reference.
    """
    config = import_module(f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)

    if pause:
        nsml.paused(scope=locals())
    if mode != 'train':
        return

    # Earlier checkpoint candidates:
    #   nsml.load(checkpoint='last_layer_tuning', session='hi-space/spam-2/14')
    #   nsml.load(checkpoint='best', session='hi-space/spam-1/147')
    nsml.load(checkpoint='full_tuning_21', session='hi-space/spam-3/3')
    nsml.save('best')
    print('best model saved')

    print('-----------')
    print(config)
    print('-----------')
    model.fit(**config['fit_kwargs'])
def main():
    """Restore the first ensemble member's checkpoint and re-save it as 0."""
    seed_everything()

    first = ensemble_checkpoints[0]  # (session, checkpoint, config_path)
    config = utils.config.load(first[2])
    model = get_model(config).cuda()
    bind_model(model)

    args = get_args()
    if args.pause:  # test / serving mode
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode == 'train':  # training mode
        print('Training Start...')
        nsml.load(session=first[0], checkpoint=first[1])
        nsml.save(0)
        exit()
def main(config):
    """Train a ResNet-18 classifier with Adam and cross-entropy under NSML,
    saving a checkpoint every ``config.save_every`` epochs."""
    model = get_resnet18(num_classes=config.num_classes)
    model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    if use_nsml:
        bind_nsml(model, optimizer, config.task)
        if config.pause:
            nsml.paused(scope=locals())

    if config.mode == 'train':
        train_loader = get_loader(root=DATASET_PATH, phase='train',
                                  task=config.task,
                                  batch_size=config.batch_size)

        # start training
        start_time = datetime.datetime.now()
        iter_per_epoch = len(train_loader)
        print('start training...!')

        for epoch in range(config.num_epochs):
            # Loader yields (image, <unused>, label) triples.
            for i, (images, _, labels) in enumerate(train_loader):
                images = images.cuda()
                labels = labels.cuda()

                # forward
                logits = model(images)
                loss = F.cross_entropy(logits, labels)

                # backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if (i + 1) % config.print_every == 0:
                    elapsed = datetime.datetime.now() - start_time
                    print(
                        'Elapsed [%s], Epoch [%i/%i], Step [%i/%i], Loss: %.4f'
                        % (elapsed, epoch + 1, config.num_epochs, i + 1,
                           iter_per_epoch, loss.item()))

            if (epoch + 1) % config.save_every == 0:
                nsml.save(str(epoch + 1))
def main():
    """Reload a fixed fundus-model checkpoint and re-save it as 0."""
    seed_everything()
    pprint.pprint(config, indent=2)

    model = get_model(config).cuda()
    bind_model(model)
    args = get_args()

    if args.pause:  # test / serving mode
        print('Inferring Start...')
        nsml.paused(scope=locals())
    if args.mode != 'train':
        return

    # training mode
    print('Training Start...')
    nsml.load(checkpoint='18', session='team146/KHD2019_FUNDUS/20')
    nsml.save(0)
    exit()
def main(config, local):
    """Seed RNGs, build train/val loaders (train mode only) and run the
    Trainer under NSML.

    Args:
        config: parsed configuration namespace.
        local: caller's scope dict, forwarded to ``nsml.paused``.
    """
    gpu_count = int(GPU_NUM)
    n_gpu = 1 if gpu_count == 0 else gpu_count

    np.random.seed(config.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Create data instances
    vocab = Vocabulary(config.vocab_path)

    train_loader = None
    val_loader = None
    if config.mode == 'train':
        # Prepare train data loader; batch size scales with GPU count.
        effective_batch = config.batch_size * n_gpu
        train_dataset = Dataset(vocab)
        val_dataset = Dataset(vocab)
        train_dataset.create_instances(
            os.path.join(config.data_dir, 'train_data/train_data'),
            config.max_seq_length, type='train')
        val_dataset.create_instances(
            os.path.join(config.data_dir, 'train_data/val_data'),
            config.max_seq_length, type='val')
        train_loader = DataLoader(train_dataset, batch_size=effective_batch,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=effective_batch)

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)
    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, config)
    if config.pause:
        nsml.paused(scope=local)
    if config.mode == 'train':
        trainer.train()
def main(args, scope):
    """Assemble the GAN trainer, register its NSML callbacks, and dispatch
    on the requested run mode."""
    loader, _ = get_loader(args.dataset,
                           batch_size=args.batch_size,
                           num_workers=args.workers)
    generator = Generator(args)
    discriminator = Discriminator(args)
    trainer = Trainer(loader, generator, discriminator, args)

    save, load, infer = get_bindings(trainer)
    nsml.bind(save=save, load=load, infer=infer)
    if args.pause:
        nsml.paused(scope=scope)

    if args.mode == 'train':
        if args.verbose:
            trainer.show_current_model()
        trainer.train()
    elif args.mode == 'sample':
        trainer.sample()
def train(experiment_name: str = 'v_res', pause: bool = False, mode: str = 'train'):
    """Instantiate the model defined by the named experiment config, bind it
    to NSML, and fit it in train mode.

    Args:
        experiment_name: module suffix under ``spam.training.experiments``.
        pause: hand control to NSML (submission/serving) when True.
        mode: only ``'train'`` triggers fitting.
    """
    config = import_module(f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)

    if pause:
        nsml.paused(scope=locals())
    if mode != 'train':
        return
    model.fit(**config['fit_kwargs'])
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train',
          ST_name: str = 'v0'):
    """Two-phase self-training: fit the ST (teacher) experiment first, then
    feed its output directory into the main experiment and fit that.

    NOTE(review): the repeated pause/mode guards mirror the single-phase
    `train` helpers; reconstructed nesting assumes phase two runs inside
    phase one's train branch — confirm against the original layout.
    """
    # Phase 1: self-training (teacher) model.
    config = import_module(f'spam.training.experiments.{ST_name}').config
    model = config['model'](**config['model_kwargs'])  # model: STModel(network_fn = frozen_networks, network_kwargs = [input_size, len(classes)])
    STModel.bind_model(model)
    if pause:
        nsml.paused(scope=locals())
    if mode == 'train':
        # fit() returns the directory holding the teacher's outputs.
        base_dir = model.fit(**config['fit_kwargs'])

        # Phase 2: main model consumes the teacher's output directory.
        config = import_module(f'spam.training.experiments.{experiment_name}').config
        config['model_kwargs']['dataset_kwargs']['base_dir'] = base_dir  # self training add
        model = config['model'](**config['model_kwargs'])  # model: BasicModel(network_fn = frozen_networks, network_kwargs = [input_size, len(classes)])
        BasicModel.bind_model(model)
        if pause:
            nsml.paused(scope=locals())
        if mode == 'train':
            model.fit(**config['fit_kwargs'])
# Top-level training script fragment (mammography classifier).
# NOTE(review): relies on names defined outside this view — `config`,
# `learning_rate`, `batch_size`, `DATASET_PATH`, `bind_model`, `data_loader`,
# `preprocessing`, `MammoDataset` — confirm against the rest of the file.
lr_scheduler = config.lr_scheduler
random_seed = 2019
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# model = se_resnet101(pretrained=False)
model = shufflenet_v2_x2_0(pretrained=False)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = RAdam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999),
                  weight_decay=1e-4)

bind_model(model)

if config.pause:  # only this branch runs in test (serving) mode
    print('Inferring Start...')
    nsml.paused(scope=locals())

if config.mode == 'train':  # only this branch runs in training mode
    print('Training Start...')
    # path setup for train mode
    # nsml.load(checkpoint='1', session='team059/KHD2019_MAMMO/48')  # must be edited before loading!
    # nsml.save(100)
    # print('model_tmp_save')
    img_path = DATASET_PATH + '/train/'
    data, y = data_loader(img_path)
    X = preprocessing(data)

    # Data loader (PyTorch DataLoader over the preprocessed arrays)
    batch_loader = DataLoader(dataset=MammoDataset(X, y),
                              batch_size=batch_size,
                              shuffle=True)
def main():
    """Speech-recognition (seq2seq) training entry point: parse CLI args,
    build encoder/decoder, then run the epoch loop with threaded loaders
    and NSML checkpointing."""
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional', action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention', action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    args = parser.parse_args()

    # Label maps and special-token ids are stored in module globals so the
    # NSML infer hooks can reach them.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional, rnn_cell='gru',
                     variable_lengths=False)
    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # lnw add get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    model = nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # lnw valid_ratio=0.05 -> valid_ratio=0.1 or 0.03
    #train_batch_num, train_dataset_list, valid_dataset = split_dataset(args, wav_paths, script_paths, valid_ratio=0.05)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.03)

    #lnw add
    lstart_time = datetime.now()
    print("Start time : " + str(lstart_time))

    #lnw block
    #logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        #lnw add
        lepoch_start = datetime.now()
        print(epoch, "epoch Start time : " + str(lepoch_start))

        # Threaded producer/consumer loading: loaders fill a bounded queue.
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        #lnw modified print_batch 10 -> 100, 450
        #train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer, device, train_begin, args.workers, 10, args.teacher_forcing)
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 450,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        # A checkpoint is saved every epoch; 'best' additionally when the
        # validation loss improves.
        nsml.save(args.save_name)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
            #lnw add. save best model
            torch.save(model, 'ModelBestSave.pt')

        #lnw end time, duration
        lepoch_end = datetime.now()
        print(epoch, "epoch End time: " + str(lepoch_end),
              "Duration:", str(lepoch_end - lepoch_start),
              "SratTime-NowTime:", str(lepoch_end - lstart_time))

    #lnw add
    lend_time = datetime.now()
    print("End time : " + str(lend_time))
    print('Duration: {}'.format(lend_time - lstart_time))
def main(args):
    """Stratified k-fold training entry point (NSML).

    Fixes vs. original:
      * the validation dataset was built from ``trn_img_paths``/``trn_labels``
        (the training split) — it now uses ``val_img_paths``/``val_labels``,
        so validation metrics reflect held-out data;
      * ``--train_folds`` parsing accepts both "0,1" and "0, 1" forms
        (``int`` tolerates surrounding whitespace).
    """
    # fix seed for train reproduction
    seed_everything(args.SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("device", device)

    ############ DONOTCHANGE ###############
    if args.pause:
        print('Inferring Start...')
        nsml.paused(scope=locals())
    #######################################

    if args.mode == 'train':
        ############ DONOTCHANGE: Path loader ###############
        root_path = os.path.join(DATASET_PATH, 'train')
        image_keys, image_path = path_loader(root_path)
        labels = label_loader(root_path, image_keys)

        if args.DEBUG:
            total_num = 100
            image_path = image_path[:total_num]
            labels = labels[:total_num]

        train_folds = [int(num) for num in args.train_folds.split(',')]
        print("train_folds", train_folds)

        skf = StratifiedKFold(n_splits=args.n_folds, shuffle=True,
                              random_state=args.SEED)
        for fold_num, (trn_idx, val_idx) in enumerate(skf.split(image_path, labels)):
            if fold_num not in train_folds:
                continue
            print(f"fold {fold_num} training starts...")

            trn_img_paths = np.array(image_path)[trn_idx]
            trn_labels = np.array(labels)[trn_idx]
            val_img_paths = np.array(image_path)[val_idx]
            val_labels = np.array(labels)[val_idx]

            default_transforms = transforms.Compose(
                [transforms.Resize(args.input_size)])
            train_transforms = get_transform(
                target_size=(args.input_size, args.input_size),
                transform_list=args.train_augments,
                augment_ratio=args.augment_ratio)
            valid_transforms = get_transform(
                target_size=(args.input_size, args.input_size),
                transform_list=args.valid_augments,
                augment_ratio=args.augment_ratio,
                is_train=False)

            train_dataset = PathDataset(trn_img_paths, trn_labels,
                                        default_transforms, train_transforms)
            # BUG FIX: validate on the held-out fold, not the training split.
            valid_dataset = PathDataset(val_img_paths, val_labels,
                                        default_transforms, valid_transforms)

            train_loader = DataLoader(dataset=train_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      shuffle=True,
                                      pin_memory=True)
            valid_loader = DataLoader(dataset=valid_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      shuffle=False,
                                      pin_memory=True)

            # define model
            model = build_model(args, device)
            bind_model(model, args)

            # optimizer definition
            optimizer = build_optimizer(args, model)
            scheduler = build_scheduler(args, optimizer, len(train_loader))
            criterion = nn.BCELoss()

            trn_cfg = {
                'train_loader': train_loader,
                'valid_loader': valid_loader,
                'model': model,
                'criterion': criterion,
                'optimizer': optimizer,
                'scheduler': scheduler,
                'device': device,
            }

            train(args, trn_cfg)

            # Free per-fold objects before the next fold's allocations.
            del model, train_loader, valid_loader, train_dataset, valid_dataset
            gc.collect()
def main():
    """GAN inpainting training: adversarial + mask-weighted L1 losses, with
    NSML checkpointing or local TensorBoard/checkpoint files."""
    seed_everything()
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    netG = InpaintGeneratorLight()
    netD = Discriminator()
    print('################################################################')
    # *4: rough parameter-size figure assuming 4 bytes per parameter.
    print('Total number of parameters * 4:',
          (count_parameters(netG) + count_parameters(netD)) * 4)
    print('################################################################')
    netG = netG.to(device)
    netD = netD.to(device)

    # Discriminator learns at 1/10 of the generator's learning rate.
    optimG = torch.optim.Adam(netG.parameters(), lr=args.lr, betas=(0.0, 0.999))
    optimD = torch.optim.Adam(netD.parameters(), lr=args.lr * 0.1, betas=(0.0, 0.999))

    save, load = bind_nsml(netG, optimG)
    if args.pause == 1:
        nsml.paused(scope=locals())

    adversarial_loss = AdversarialLoss()
    l1_loss = nn.L1Loss()

    # load
    current_epoch = 0
    if not use_nsml:
        writer = SummaryWriter(os.path.join('logs', args.nickname))
        # NOTE(review): reconstructed nesting places local checkpoint
        # resumption inside the off-NSML branch (files live under
        # ./checkpoints) — confirm against the original layout.
        if args.load:
            netG_name = os.path.join('checkpoints', args.nickname,
                                     'netG_%03d.pth' % args.load_epoch)
            netD_name = os.path.join('checkpoints', args.nickname,
                                     'netD_%03d.pth' % args.load_epoch)
            netG_dict = torch.load(netG_name)
            netD_dict = torch.load(netD_name)
            netG.load_state_dict(netG_dict['state_dict'])
            netD.load_state_dict(netD_dict['state_dict'])
            current_epoch = args.load_epoch + 1
            print('loaded')

    if args.mode == 'train':
        path_train = os.path.join(dir_data_root, 'train')
        path_train_data = os.path.join(dir_data_root, 'train', 'train_data')

        # fold
        fnames = os.listdir(path_train_data)
        if args.debug:
            fnames = fnames[:1000]
        random.shuffle(fnames)
        val_ratio = 0.1
        train_fnames = fnames[:-int(len(fnames) * val_ratio)]
        val_fnames = fnames[-int(len(fnames) * val_ratio):]

        postfix = dict()
        total_step = 0
        start = time.time()
        # for epoch in trange(args.num_epochs, disable=use_nsml):
        for epoch in range(current_epoch, args.num_epochs):
            # Curriculum: allowed bbox size grows with the epoch index.
            if epoch < args.bbox_epochs[0]:
                bbox_constraint = 0.25
            elif epoch < args.bbox_epochs[1]:
                bbox_constraint = 0.75
            else:
                bbox_constraint = 1.0
            tr_loader = get_dataloader(path_train_data, train_fnames, 'train',
                                       bbox_constraint, args.mask_channels,
                                       args.batch_size, args.num_workers)
            val_loader = get_dataloader(path_train_data, val_fnames, 'val',
                                        bbox_constraint, args.mask_channels,
                                        args.batch_size, args.num_workers)
            print('train:', len(tr_loader) * args.batch_size,
                  'val:', len(val_loader) * args.batch_size)

            # if epoch >= args.lr_decay_epoch:
            #     optim.param_groups[0]['lr'] *= 0.1

            pbar = tqdm(enumerate(tr_loader), total=len(tr_loader), disable=True)
            for step, (_, x_input, mask, x_GT) in pbar:
                total_step += 1
                x_input = x_input.to(device)
                mask = mask.to(device)
                x_GT = x_GT.to(device)
                x_mask = torch.cat([x_input, mask], dim=1)

                x_hat = netG(x_mask)
                x_composed = compose(x_input, x_hat, mask)

                ###########################################
                # update D network
                ###########################################
                netD.zero_grad()
                netD_real = netD(x_GT)
                net_D_real_loss = adversarial_loss(netD_real, True)
                netD_fake = netD(x_hat)
                netD_fake_loss = adversarial_loss(netD_fake, False)
                netD_loss = net_D_real_loss + netD_fake_loss
                # NOTE(review): x_hat is not detached here, so the D backward
                # also deposits gradients into netG — confirm intended.
                netD_loss.backward(retain_graph=True)
                optimD.step()

                ###########################################
                # update G network
                ###########################################
                # NOTE(review): this is likely meant to be netG.zero_grad();
                # as written the generator's gradients from the D pass are
                # never cleared before optimG.step() — confirm.
                netD.zero_grad()
                netG_fake = netD(x_hat)  # may need .view(-1)
                netG_fake_loss = adversarial_loss(netG_fake, True) * 0.1
                # netG_L1_loss = inpainting_loss(x_hat, x_GT, mask)
                # L1 is normalized by the mean mask coverage.
                netG_L1_loss = l1_loss(x_hat, x_GT) / torch.mean(mask)
                netG_loss = netG_fake_loss + netG_L1_loss
                netG_loss.backward()
                optimG.step()

                postfix['netD_loss'] = netD_loss.item()
                postfix['netG_loss'] = netG_loss.item()
                postfix['epoch'] = epoch
                postfix['step_'] = step
                postfix['total_step'] = total_step
                postfix['steps_per_epoch'] = len(tr_loader)

                if step != 0 and step % (args.eval_every - 1) == 0:
                    metric_eval = local_eval(netG, val_loader, path_train_data)
                    postfix['metric_eval'] = metric_eval
                    print('metric eval:', metric_eval)
                    if not use_nsml:
                        sample_dir = os.path.join('samples', args.nickname)
                        os.makedirs(sample_dir, exist_ok=True)
                        vutils.save_image(
                            x_GT,
                            os.path.join(sample_dir, 'x_GT_%03d.png' % epoch),
                            normalize=True)
                        vutils.save_image(x_input,
                                          os.path.join(
                                              sample_dir,
                                              'x_input_%03d.png' % epoch),
                                          normalize=True)
                        vutils.save_image(x_hat,
                                          os.path.join(
                                              sample_dir,
                                              'x_hat_%03d.png' % epoch),
                                          normalize=True)
                        vutils.save_image(
                            mask,
                            os.path.join(sample_dir, 'mask_%03d.png' % epoch),
                            normalize=True)
                        vutils.save_image(x_composed,
                                          os.path.join(
                                              sample_dir,
                                              'x_composed_%03d_%.1f.png' % (epoch, metric_eval)),
                                          normalize=True)
                        writer.add_scalar('train/netD_loss', netD_loss.item(), epoch)
                        writer.add_scalar('train/netG_loss', netG_loss.item(), epoch)

                if step % args.print_every == 0:
                    print(
                        "[%d/%d][%d/%d] time: %.2f,"
                        "netG_gan_loss: %.2f, netG_L1_loss: %.2f, netD_loss: %.2f"
                        % (epoch, args.num_epochs, step, len(tr_loader),
                           time.time() - start, netG_fake_loss.item(),
                           netG_L1_loss.item(), netD_loss.item()))

                if use_nsml:
                    nsml.report(**postfix, scope=locals(), step=total_step)

            if use_nsml:
                nsml.save(epoch)
            else:
                checkpoint_dir = os.path.join('checkpoints', args.nickname)
                os.makedirs(checkpoint_dir, exist_ok=True)
                netG_dict = {'state_dict': netG.state_dict()}
                netD_dict = {'state_dict': netD.state_dict()}
                torch.save(
                    netG_dict,
                    os.path.join(checkpoint_dir, 'netG_%03d.pth' % epoch))
                torch.save(
                    netD_dict,
                    os.path.join(checkpoint_dir, 'netD_%03d.pth' % epoch))
                print('saved')
def main():
    """Entry point: parse CLI args, set up (optionally distributed) CUDA and
    logging, load an ELECTRA QA model/tokenizer, bind to NSML, and run training
    when --do_train is given.

    NOTE(review): source was recovered from a line-collapsed file; statement
    order is preserved, block nesting was reconstructed from standard NSML /
    transformers boilerplate — confirm against the original layout.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    # NOTE(review): default=True combined with action="store_true" means this
    # option is always True and cannot be disabled from the CLI — confirm intent.
    parser.add_argument("--evaluate_during_training",
                        default=True,
                        action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )
    parser.add_argument("--logging_steps", type=int, default=100,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=10000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML: rebase the data dir onto the platform-provided dataset root
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        # Single-process mode: use every visible GPU (DataParallel-style).
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        logger.warning('IF args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1  # one GPU per distributed process
        logger.warning('ELSE args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    args.device = device

    # Setup logging (file-based; only rank 0 / single-process logs at INFO)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.warning("Model Loading ..")
    config = ElectraConfig.from_pretrained(args.model_name_or_path)
    model = ElectraForQuestionAnswering.from_pretrained(
        args.model_name_or_path, config=config)
    # do_lower_case is hard-coded False here, independent of --do_lower_case.
    tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path,
                                                 do_lower_case=False)
    logger.warning("Model Loading Completed")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
def main():
    """Entry point for KoELECTRA QA with optional span/QA pretraining stages.

    Flow: parse args -> CUDA/distributed setup -> load the
    koelectra-base-v3-finetuned-korquad model/tokenizer -> optional apex fp16
    and (D)DP wrapping -> NSML bind/pause -> optional checkpoint load ->
    optional initial validation -> optional span pretraining -> optional QA
    pretraining -> optional fine-tuning.

    NOTE(review): source was recovered from a line-collapsed file; statement
    order is preserved, block nesting was reconstructed from standard NSML /
    transformers boilerplate — confirm against the original layout.
    """
    parser = argparse.ArgumentParser()

    # Required parameters, we defined additional arguments for experiment
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--load_cache",
        action="store_true",
        help="load data from cached session",
    )
    parser.add_argument(
        "--save_cache", action="store_true", help="save loaded dataset into cache"
    )
    parser.add_argument(
        "--cached_session_pretrain",
        default="",
        type=str,
        help="Path to cache where 'Span-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_pretrain_qa",
        default="",
        type=str,
        help="Path to cache where 'QA-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_train",
        default="",
        type=str,
        help="Path to cache where given 'training' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_dev",
        default="",
        type=str,
        help="Path to cache where given 'development set' is stored",
    )
    parser.add_argument(
        "--load_model",
        action="store_true",
        help="use pretrained model from previous sessions",
    )
    parser.add_argument(
        "--load_model_session",
        default="",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--load_model_checkpoint",
        default="",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--just_for_save",
        action="store_true",
        help="save checkpoint and terminate immediately",
    )
    parser.add_argument(
        "--freeze_embedding",
        action="store_true",
        help="finetuning just classification layer",
    )
    parser.add_argument(
        "--mix_qa",
        action="store_true",
        help="mix qa set for variance",
    )
    parser.add_argument(
        "--mix_portion", type=float, default=0.5,
        help="defines portion of qa pairs to be reconstructed"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there"
        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str,
        help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_pretrain_span", action="store_true",
                        help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true",
                        help="Whether to run qa-pretraining.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    # NOTE(review): default=True combined with action="store_true" means this
    # option is always True and cannot be disabled from the CLI — confirm intent.
    parser.add_argument(
        "--evaluate_during_training", default=True, action="store_true",
        help="Run evaluation during training at each logging step."
    )
    parser.add_argument("--do_initial_validation", action="store_true",
                        help="Whether to run initial validation")
    parser.add_argument(
        "--do_lower_case", action="store_true",
        help="Set this flag if you are using an uncased model."
    )
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int,
        help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float,
        help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )
    parser.add_argument("--logging_steps", type=int, default=100,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true",
        help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true",
        help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML: rebase the data dir onto the platform-provided dataset root
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        # Single-process mode: use every visible GPU (DataParallel-style).
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1  # one GPU per distributed process
    args.device = device

    # Setup logging (file-based; only rank 0 / single-process logs at INFO)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log'
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    # Model/tokenizer checkpoint is hard-coded; --model_name_or_path is not used here.
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens" : ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        # NOTE(review): `optimizer` is never defined in this function before this
        # call — the fp16 path will raise NameError at runtime. Confirm whether
        # amp.initialize was meant to happen inside train() instead.
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 0:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)
    # bind_nsml(model, tokenizer, args)

    # Optionally restore weights from a previous NSML session; args are copied
    # out/in around the load so nsml.load cannot clobber this run's CLI args.
    if args.load_model:
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)

    if args.just_for_save:
        nsml.save("test")
        return

    # initial validation
    if args.do_initial_validation:
        logger.info("Initinal Validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]
        logger.info(
            "f1_val = {}, exact_val = {}" \
            .format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' Pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False,
                                                output_examples=False, is_pretrain=True, qa_style=False)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' Pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False,
                                                output_examples=False, is_pretrain=True, qa_style=True)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
            # Freeze the ELECTRA backbone; model is DataParallel-wrapped here,
            # hence the .module indirection.
            for param in model.module.electra.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def main():
    """Speech-recognition training loop: a CTC network (net / Mel2SeqNet_v2)
    produces jamo sequences, and a seq2seq corrector (net_B / Seq2SeqNet_v2)
    maps jamo predictions (or ground-truth jamo as a "reference" path) to the
    final Korean script. Per epoch: train both nets, then evaluate, report to
    NSML, and checkpoint (always `save_name`, plus 'best' on CER improvement).

    NOTE(review): source was recovered from a line-collapsed file; statement
    order is preserved, block nesting was reconstructed — confirm against the
    original layout (especially the lr-reset loops under `args.load`).
    """
    # These are shared with module-level code (e.g. the NSML infer path).
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(
        description='Speech hackathon lilililill model')
    parser.add_argument(
        '--max_epochs', type=int, default=1000,
        help='number of max epochs in training (default: 1000)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--lr', type=float, default=1e-03,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--num_mels', type=int, default=80,
                        help='number of the mel bands (default: 80)')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size in training (default: 128)')
    parser.add_argument("--num_thread", type=int, default=4,
                        help='number of the loading thread (default: 4)')
    parser.add_argument('--num_hidden_enc', type=int, default=1024,
                        help='hidden size of model (default: 1024)')
    parser.add_argument('--num_hidden_dec', type=int, default=512,
                        help='hidden size of model decoder (default: 512)')
    parser.add_argument(
        '--nsc_in_ms', type=int, default=50,
        help='Number of sample size per time segment in ms (default: 50)')
    parser.add_argument(
        '--ref_repeat', type=int, default=1,
        help='Number of repetition of reference seq2seq (default: 1)')
    parser.add_argument('--loss_lim', type=float, default=0.05,
                        help='Minimum loss threshold (default: 0.05)')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--memo', type=str, default='',
                        help='Comment you wish to leave')
    parser.add_argument('--debug', type=str, default='False',
                        help='debug mode')
    parser.add_argument('--load', type=str, default=None)
    args = parser.parse_args()

    batch_size = args.batch_size
    num_thread = args.num_thread
    num_mels = args.num_mels

    # Character/label tables for decoding predictions back to text.
    char2index, index2char = load_label('./hackathon.labels')
    SOS_token = char2index['<s>']  # '<sos>' or '<s>'
    EOS_token = char2index['</s>']  # '<eos>' or '</s>'
    PAD_token = char2index['_']  # '-' or '_'

    unicode_jamo_list = My_Unicode_Jamo_v2()
    # logger.info(''.join(unicode_jamo_list))
    # logger.info('This is a new main2.py')
    tokenizer = Tokenizer(unicode_jamo_list)
    jamo_tokens = tokenizer.word2num(unicode_jamo_list)
    # logger.info('Tokens: {}'.format(jamo_tokens))

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # CTC network: mel spectrogram -> jamo sequence.
    net = Mel2SeqNet_v2(num_mels, args.num_hidden_enc, args.num_hidden_dec,
                        len(unicode_jamo_list), device)
    net_optimizer = optim.Adam(net.parameters(), lr=args.lr)
    ctc_loss = nn.CTCLoss().to(device)

    # Seq2seq corrector: jamo sequence -> Korean script.
    # net_B = Seq2SeqNet(512, jamo_tokens, char2index, device)  #########
    net_B = Seq2SeqNet_v2(1024, jamo_tokens, char2index, device)  #########
    net_B_optimizer = optim.Adam(net_B.parameters(), lr=args.lr)  #########
    net_B_criterion = nn.NLLLoss(reduction='none').to(device)  #########

    bind_model(net, net_B, net_optimizer, net_B_optimizer, index2char,
               tokenizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    # Resume fine-tuning from a previous session with a much smaller lr.
    if args.load != None:  # NOTE(review): prefer `is not None`
        # nsml.load(checkpoint='saved', session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.load(checkpoint='model',
                  session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.save('saved')
        for g in net_optimizer.param_groups:
            g['lr'] = 1e-06
        for g in net_B_optimizer.param_groups:
            g['lr'] = 1e-06

    # Log effective learning rates (whether or not a checkpoint was loaded).
    for g in net_optimizer.param_groups:
        logger.info(g['lr'])
    for g in net_B_optimizer.param_groups:
        logger.info(g['lr'])

    wav_paths, script_paths, korean_script_paths = get_paths(DATASET_PATH)
    logger.info('Korean script path 0: {}'.format(korean_script_paths[0]))
    logger.info('wav_paths len: {}'.format(len(wav_paths)))
    logger.info('script_paths len: {}'.format(len(script_paths)))
    logger.info('korean_script_paths len: {}'.format(len(korean_script_paths)))

    # Load Korean Scripts
    korean_script_list, jamo_script_list = get_korean_and_jamo_list_v2(
        korean_script_paths)
    logger.info('Korean script 0: {}'.format(korean_script_list[0]))
    logger.info('Korean script 0 length: {}'.format(len(
        korean_script_list[0])))
    logger.info('Jamo script 0: {}'.format(jamo_script_list[0]))
    logger.info('Jamo script 0 length: {}'.format(len(jamo_script_list[0])))

    script_path_list = get_script_list(script_paths, SOS_token, EOS_token)
    # Jamo targets wrapped with <s>/</s> markers, tokenized to ids.
    ground_truth_list = [
        (tokenizer.word2num(['<s>'] + list(jamo_script_list[i]) + ['</s>']))
        for i in range(len(jamo_script_list))
    ]

    # 90% of the data will be used as train
    # NOTE(review): comment says 90% but the split below is 95%.
    split_index = int(0.95 * len(wav_paths))

    wav_path_list_train = wav_paths[:split_index]
    ground_truth_list_train = ground_truth_list[:split_index]
    korean_script_list_train = korean_script_list[:split_index]
    script_path_list_train = script_path_list[:split_index]

    wav_path_list_eval = wav_paths[split_index:]
    ground_truth_list_eval = ground_truth_list[split_index:]
    korean_script_list_eval = korean_script_list[split_index:]
    script_path_list_eval = script_path_list[split_index:]

    logger.info('Total:Train:Eval = {}:{}:{}'.format(len(wav_paths),
                                                     len(wav_path_list_train),
                                                     len(wav_path_list_eval)))

    # NOTE(review): is_train flags look swapped here (eval preloader gets
    # is_train=True, train preloader gets is_train=False) — confirm against
    # Threading_Batched_Preloader_v2's semantics.
    preloader_eval = Threading_Batched_Preloader_v2(wav_path_list_eval,
                                                    ground_truth_list_eval,
                                                    script_path_list_eval,
                                                    korean_script_list_eval,
                                                    batch_size,
                                                    num_mels,
                                                    args.nsc_in_ms,
                                                    is_train=True)
    preloader_train = Threading_Batched_Preloader_v2(wav_path_list_train,
                                                     ground_truth_list_train,
                                                     script_path_list_train,
                                                     korean_script_list_train,
                                                     batch_size,
                                                     num_mels,
                                                     args.nsc_in_ms,
                                                     is_train=False)

    best_loss = 1e10  # NOTE(review): never updated below — apparently unused
    best_eval_cer = 1e10

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(args.max_epochs):
        logger.info((datetime.now().strftime('%m-%d %H:%M:%S')))

        # ---- training phase -------------------------------------------------
        net.train()
        net_B.train()
        preloader_train.initialize_batch(num_thread)
        loss_list_train = list()
        seq2seq_loss_list_train = list()
        seq2seq_loss_list_train_ref = list()
        logger.info("Initialized Training Preloader")
        count = 0
        total_dist = 0
        total_length = 1      # start at 1 to avoid division by zero
        total_dist_ref = 0
        total_length_ref = 1  # start at 1 to avoid division by zero

        while not preloader_train.end_flag:
            batch = preloader_train.get_batch()
            # logger.info(psutil.virtual_memory())
            # logger.info("Got Batch")
            if batch is not None:
                # logger.info("Training Batch is not None")
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                # CTC step on net.
                pred_tensor, loss = train(net, net_optimizer, ctc_loss,
                                          tensor_input.to(device),
                                          ground_truth.to(device),
                                          length_list.to(device), device)
                loss_list_train.append(loss)
                ####################################################
                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)
                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                # "Reference" path: train net_B on ground-truth jamo inputs,
                # repeated ref_repeat times per batch.
                for i in range(args.ref_repeat):
                    lev_input_ref = ground_truth
                    lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_train(
                        lev_input_ref.to(device),
                        batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list_ref = Decode_Lev_Prediction(
                        lev_pred_ref, index2char)
                    seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                    dist_ref, length_ref = char_distance_list(
                        true_string_list, pred_string_list_ref)

                pred_string_list = [None]
                dist = 0
                length = 0
                # Only feed CTC predictions into net_B once CTC loss is low
                # enough to produce usable jamo sequences.
                if (loss < args.loss_lim):
                    lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                    lev_pred, attentions, seq2seq_loss = net_B.net_train(
                        lev_input.to(device),
                        batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list = Decode_Lev_Prediction(
                        lev_pred, index2char)
                    seq2seq_loss_list_train.append(seq2seq_loss)
                    dist, length = char_distance_list(true_string_list,
                                                      pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref
                total_dist += dist
                total_length += length
                count += 1
                if count % 25 == 0:
                    logger.info("Train: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))
                    logger.info("Train: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))
            else:
                logger.info("Training Batch is None")

        # del preloader_train
        # logger.info(loss_list_train)
        train_loss = np.mean(np.asarray(loss_list_train))
        train_cer = np.mean(np.asarray(total_dist / total_length))
        train_cer_ref = np.mean(np.asarray(total_dist_ref / total_length_ref))
        logger.info("Mean Train Loss: {}".format(train_loss))
        logger.info("Total Train CER: {}".format(train_cer))
        logger.info("Total Train Reference CER: {}".format(train_cer_ref))

        # ---- evaluation phase -----------------------------------------------
        preloader_eval.initialize_batch(num_thread)
        loss_list_eval = list()
        seq2seq_loss_list_eval = list()       # NOTE(review): never appended to
        seq2seq_loss_list_eval_ref = list()   # NOTE(review): never appended to
        logger.info("Initialized Evaluation Preloader")
        count = 0
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1
        net.eval()
        net_B.eval()

        while not preloader_eval.end_flag:
            batch = preloader_eval.get_batch()
            if batch is not None:
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                pred_tensor, loss = evaluate(net, ctc_loss,
                                             tensor_input.to(device),
                                             ground_truth.to(device),
                                             length_list.to(device), device)
                loss_list_eval.append(loss)
                ####################
                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)
                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                # Reference path (ground-truth jamo input), eval mode.
                lev_input_ref = ground_truth
                lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_eval(
                    lev_input_ref.to(device),
                    batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)
                pred_string_list_ref = Decode_Lev_Prediction(
                    lev_pred_ref, index2char)
                # NOTE(review): appends to the *train* ref list during eval —
                # likely a copy-paste bug (should be seq2seq_loss_list_eval_ref).
                seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(
                    true_string_list, pred_string_list_ref)

                lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                lev_pred, attentions, seq2seq_loss = net_B.net_eval(
                    lev_input.to(device),
                    batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)
                pred_string_list = Decode_Lev_Prediction(lev_pred, index2char)
                # NOTE(review): same copy-paste issue as above (train vs eval list).
                seq2seq_loss_list_train.append(seq2seq_loss)
                dist, length = char_distance_list(true_string_list,
                                                  pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref
                total_dist += dist
                total_length += length
                count += 1
                ####################
                if count % 10 == 0:
                    logger.info("Eval: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))
                    logger.info("Eval: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))
            else:
                # NOTE(review): message says "Training" in the eval loop.
                logger.info("Training Batch is None")

        eval_cer = total_dist / total_length
        eval_cer_ref = total_dist_ref / total_length_ref
        eval_loss = np.mean(np.asarray(loss_list_eval))
        logger.info("Mean Evaluation Loss: {}".format(eval_loss))
        logger.info("Total Evaluation CER: {}".format(eval_cer))
        logger.info("Total Evaluation Reference CER: {}".format(eval_cer_ref))

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    train_epoch__cer_ref=train_cer_ref,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer,
                    eval__cer_ref=eval_cer_ref)
        nsml.save(args.save_name)

        # Extra checkpoint whenever eval CER improves.
        best_model = (eval_cer < best_eval_cer)
        if best_model:
            nsml.save('best')
            best_eval_cer = eval_cer

    # NOTE(review): placement reconstructed — assumed to run after the epoch
    # loop; confirm against the original layout.
    logger.info("Inference Check")
def main(args, local):
    """NSML entry point for the xDeepFM CTR model.

    Args:
        args:  parsed CLI namespace; reads .arch, .mode, .task, .pause,
               .dry_run, .num_epochs.
        local: the caller's locals(), handed to nsml.paused() when paused.
    """
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()  # wall-clock timer for the whole preprocessing stage
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        # Raw impression log, tab-separated; read_article_ids is a comma-joined
        # history string (NaN/float when the user has no history).
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int, 'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           }, sep='\t')
        # Label file sits beside the data dir as "<prefix>_label".
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        # NOTE(review): assigns a whole (single-column) DataFrame into one
        # column — relies on pandas index alignment; verify against the
        # installed pandas version.
        item['label'] = label
        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature']
        target = ['label']

        # Derive 'len' = size of each user's read-article history; NaN history
        # parses as float, which is mapped to length 0.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        # Quantile-bin the history length into at most 6 buckets; duplicate
        # bin edges are dropped so this also works on skewed distributions.
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        # Raw article ids, captured BEFORE LabelEncoder overwrites the column.
        artics = item['article_id'].tolist()
        # article_id -> precomputed image feature vector.
        with open(os.path.join(DATASET_PATH, 'train', 'train_data',
                               'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)

        # Integer-encode every sparse feature in place.
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique())
                                  for feat in sparse_features]
        # Dense image feature: dimensionality taken from one sample vector.
        fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]]))
                                   for feat in dense_features]

        # Encoded ids, read AFTER the LabelEncoder pass above; maps encoded
        # article id -> original raw article id (first occurrence wins).
        idx_artics_all = item['article_id'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        # Can be fetched later via image_feature_dict[article_id], so skip for now.

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        # NOTE: `global` covers the whole function body, so the test branch's
        # unpack below also writes this module-level name.
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        # TODO: also persist the preprocessed artifacts so this does not have
        # to be recomputed on every run.
        print(time.time() - s, 'seconds')

    # NOTE(review): if mode=='train' but arch!='xDeepFM', `model` is never
    # defined and this raises NameError — confirm only xDeepFM is used.
    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)

    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        # Rebuild the model and preprocessing artifacts, then restore weights
        # from a fixed NSML checkpoint/session pair.
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401', 'team_62/airush2/176']
        nsml.load(checkpoint=str(checkpoint_session[0]),
                  session=str(checkpoint_session[1]))
        print('successfully loaded')

    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # Build everything up front; no need for a generator here.
        nsml.save('infer')  # checkpoint name the inference session loads
        print('end')
    print('end_main')
    if args.pause:
        nsml.paused(scope=local)
def main():
    """Train or evaluate the shopping-review image-tagging classifier on NSML.

    Builds the model from CLI args, binds it to NSML, then either runs the
    training loop (mode='train') or restores a checkpoint and evaluates
    (mode='test'). No return value; results are logged / saved via nsml.
    """
    # Argument Settings
    parser = argparse.ArgumentParser(
        description='Image Tagging Classification from Naver Shopping Reviews')
    parser.add_argument('--sess_name', default='example', type=str,
                        help='Session name that is loaded')
    parser.add_argument('--checkpoint', default='best', type=str, help='Checkpoint')
    parser.add_argument('--batch_size', default=256, type=int, help='batch size')
    parser.add_argument('--num_workers', default=16, type=int,
                        help='The number of workers')
    parser.add_argument('--num_epoch', default=100, type=int,
                        help='The number of epochs')
    parser.add_argument('--model_name', default='mobilenet_v2', type=str,
                        help='[resnet50, rexnet, dnet1244, dnet1222]')
    parser.add_argument('--weight_file', default='model.pth', type=str)
    parser.add_argument('--optimizer', default='SGD', type=str)
    parser.add_argument('--lr', default=1e-2, type=float)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--learning_anneal', default=1.1, type=float)
    parser.add_argument('--annealing_period', default=10, type=int)
    parser.add_argument('--num_gpu', default=1, type=int)
    parser.add_argument('--pretrain', action='store_true', default=False)
    parser.add_argument('--mode', default='train', help='Mode')
    parser.add_argument('--pause', default=0, type=int)
    parser.add_argument('--iteration', default=0, type=str)
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Model
    logger.info('Build Model')
    model = select_model(args.model_name, pretrain=args.pretrain, n_class=41)
    total_param = sum([p.numel() for p in model.parameters()])
    logger.info(f'Model size: {total_param} tensors')
    load_weight(model, args.weight_file)
    model = model.to(device)

    nu.bind_model(model)
    nsml.save('best')  # snapshot the freshly-initialized weights immediately
    if args.pause:
        nsml.paused(scope=locals())
    if args.num_epoch == 0:
        return

    # Set the dataset
    logger.info('Set the dataset')
    df = pd.read_csv(f'{DATASET_PATH}/train/train_label')
    train_size = int(len(df) * 0.8)  # 80/20 train/validation split
    trainset = TagImageDataset(data_frame=df[:train_size],
                               root_dir=f'{DATASET_PATH}/train/train_data',
                               transform=train_transform)
    testset = TagImageDataset(data_frame=df[train_size:],
                              root_dir=f'{DATASET_PATH}/train/train_data',
                              transform=test_transform)
    train_loader = DataLoader(dataset=trainset, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    test_loader = DataLoader(dataset=testset, batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = select_optimizer(model.parameters(), args.optimizer,
                                 args.lr, args.weight_decay)
    criterion = criterion.to(device)

    if args.mode == 'train':
        logger.info('Start to train!')
        train_process(args=args, model=model, train_loader=train_loader,
                      test_loader=test_loader, optimizer=optimizer,
                      criterion=criterion, device=device)
    elif args.mode == 'test':
        nsml.load(args.checkpoint, session=args.sess_name)
        logger.info('[NSML] Model loaded from {}'.format(args.checkpoint))
        model.eval()
        logger.info('Start to test!')
        test_loss, test_acc, test_f1 = evaluate(model=model,
                                                test_loader=test_loader,
                                                device=device,
                                                criterion=criterion)
        # BUG FIX: the original `logger.info(test_loss, test_acc, test_f1)`
        # used the float test_loss as the logging format string with two
        # %-args, which fails to format at emit time. Use lazy %-style args.
        logger.info('test_loss: %.4f, test_acc: %.4f, test_f1: %.4f',
                    test_loss, test_acc, test_f1)
def main(args):
    """Train the CNN-feature + tabular click model on NSML.

    Args:
        args: parsed CLI namespace; reads .mode, .task, .pause.
    Relies on module-level config: CNN_BACKBONE, use_history_image_f, debug,
    balancing, batch_size, use_nsml, DATASET_PATH.
    """
    search_file(DATASET_PATH)
    # Image-feature extractor; ImageNet weights only for training
    # (inference restores weights from the NSML checkpoint instead).
    if args.mode == 'train':
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE)
    else:
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE, use_imagenet=None)

    # Input width of the dense model: 97 + 84 + 9 hand-crafted tabular dims
    # plus one (or two, when history image features are on) CNN feature blocks.
    if use_history_image_f == True:
        in_feature_num = int(97 + 84 + 9 + feature_ext_model.output.shape[1] * 2)
    else:
        in_feature_num = int(97 + 84 + 9 + feature_ext_model.output.shape[1])
    print('in_feature_num', in_feature_num)
    model = build_model(in_feature_num)
    print('feature_ext_model.output.shape[1]', feature_ext_model.output.shape[1])

    if use_nsml:
        bind_nsml(feature_ext_model, model, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == 'train':
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        # Raw impression log (tab-separated).
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int, 'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           }, sep='\t')
        print('item.shape', item.shape)
        print(item.head())

        # Per-article category metadata; only article_id/category_id are kept.
        category_text_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                          'train_data_article.tsv')
        category_text = pd.read_csv(category_text_file,
                                    dtype={
                                        'article_id': str,
                                        'category_id': int,
                                        'title': str
                                    }, sep='\t')
        print('category_text.shape', category_text.shape)
        print(category_text.head())
        category_text = category_text[['article_id', 'category_id']]
        print('category_id].values.max()', category_text['category_id'].values.max())
        print('category_id].values.min()', category_text['category_id'].values.min())

        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        print('train label csv')
        print(label.head())

        # Optional truncation for quick debugging runs.
        if debug is not None:
            item = item[:debug]
            label = label[:debug]
        # Optional 1:1 down-sampling of the majority (label==0) class.
        if balancing == True:
            one_label = label[label['label'] == 1]
            print(one_label.head())
            zero_label = label[label['label'] == 0].sample(one_label.shape[0])
            print(zero_label.head())
            label = pd.concat([one_label, zero_label])
            # print(label.index.to_list())
            # Keep only the rows whose labels survived the balancing.
            item = item.loc[label.index.to_list()]
            print('item.shape', item.shape)
            print(item.head())
            print(label.head())

        # class_weights = class_weight.compute_class_weight('balanced', np.unique(label), label)
        # print('class_weights', class_weights)

        # Feature engineering: adds count features and returns the article
        # vocabulary plus the full history-article list.
        item, article_list, total_list_article = count_process(item, category_text)
        print('preprocess item.shape', item.shape)
        print(item.head())
        print(item.columns)

        # Image features / distribution counts for train-set articles only.
        img_features, img_distcnts = make_features_and_distcnt(
            os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image'),
            feature_ext_model, article_list, 'features.pkl', 'distr_cnt.pkl')
        # Distribution counts over train-history articles only.
        history_distcnts = make_history_distcnt(total_list_article,
                                                'history_distr_cnt.pkl')

        train_df, valid_df, train_dfy, valid_dfy = train_test_split(
            item, label, test_size=0.05, random_state=888)  # ,stratify=label)
        print('train_df.shape, valid_df.shape, train_dfy.shape, valid_dfy.shape',
              train_df.shape, valid_df.shape, train_dfy.shape, valid_dfy.shape)

        # Generators
        # root=os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image')
        training_generator = AiRushDataGenerator(
            train_df, label=train_dfy, shuffle=True, batch_size=batch_size,
            mode='train', image_feature_dict=img_features,
            distcnts=img_distcnts, history_distcnts=history_distcnts,
            featurenum=in_feature_num, use_image_feature=True,
            use_history_image_f=use_history_image_f)
        # Validation uses a smaller batch (batch_size//20) and no shuffling.
        validation_generator = AiRushDataGenerator(
            valid_df, label=valid_dfy, shuffle=False, batch_size=batch_size // 20,
            mode='valid', image_feature_dict=img_features,
            distcnts=img_distcnts, history_distcnts=history_distcnts,
            featurenum=in_feature_num, use_image_feature=True,
            use_history_image_f=use_history_image_f)
        # pctr = Metrics()  # next(training_generator.flow())
        # x, y = training_generator.__getitem__(0)
        # print(x.shape, y.shape)
        # print(len(test), test[0].shape, test[1].shape)
        metrics = ['accuracy', f1_score]  # ,pctr]

        # opt = optimizers.SGD(lr=0.01, clipvalue=0.5)
        opt = Adam(lr=0.001)
        # KerasFocalLoss
        model.compile(loss=f1_loss, optimizer=opt, metrics=metrics)
        model.summary()

        """ Callback """
        monitor = 'val_f1_score'
        best_model_path = 'dgu_model.h5'
        reduce_lr = ReduceLROnPlateau(monitor=monitor, patience=30, factor=0.2,
                                      verbose=1, mode='max')
        # NOTE(review): early_stop is created but NOT included in `callbacks`
        # below — confirm whether early stopping was meant to be active.
        early_stop = EarlyStopping(monitor=monitor, patience=9, mode='max')
        # checkpoint = ModelCheckpoint(best_model_path, monitor=monitor, verbose=1, save_best_only=True)
        report = report_nsml(prefix='dgu')
        callbacks = [reduce_lr, report]

        # Train model on dataset
        model.fit_generator(generator=training_generator, steps_per_epoch=100,
                            epochs=10000,
                            # class_weight=class_weights,
                            validation_data=validation_generator,
                            use_multiprocessing=True,
                            workers=2, callbacks=callbacks)
def main():
    """Train the seq2seq (GRU encoder/decoder) speech recognition baseline.

    Parses CLI args, builds the model, binds it to NSML, then runs the
    epoch loop: threaded train pass, single-threaded validation pass,
    NSML reporting and checkpointing ('best' tracks lowest eval loss).
    """
    # Label-index maps are shared with train/evaluate via module globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    # NOTE(review): default is 512 but the help text says 256 — help string
    # left untouched (runtime text), flagging the mismatch here.
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional', action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention', action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--feature', type=str, default='mel',
        help='select feature extraction function. mel or log_mel ')
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed python/torch RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py ; N_FFT = size of the Fourier Transform
    # NOTE(review): true division yields a float (e.g. 257.0) — presumably
    # EncoderRNN tolerates/casts this; confirm, or it should be N_FFT // 2 + 1.
    feature_size = N_FFT / 2 + 1  # N_FFT size = 512

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional, rnn_cell='gru',
                     variable_lengths=False)
    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token, n_layers=args.layer_size,
                     rnn_cell='gru', bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # initial distribution of model weights
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # make tensors able to be computed on multiple devices in parallel and copy tensors to GPU
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # PAD positions are excluded from the loss; 'sum' reduction.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    # data_list rows are "aaa.wav,aaa.label" pairs relative to train_data/.
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # val ratio can be adjusted -> 10% ??
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # Bounded queue decouples the loader threads from the train loop.
        train_queue = queue.Queue(args.workers * 2)

        # load train data
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # train epoch
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f' %
              (epoch, train_loss, train_cer))
        train_loader.join()

        # eval for each epoch
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
              (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch, train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer, eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Save every epoch; additionally tag the best (lowest eval loss) one.
        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
def main():
    """Train the Transformer speech recognition model (char or word labels).

    Parses CLI args, builds encoder/decoder + warmup optimizer, binds to
    NSML, optionally restores a pre-model checkpoint, then runs the epoch
    loop; validation/checkpointing happens only on selected epochs.
    """
    # Label-index maps are shared with train/evaluate via module globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word', action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index', action='store_true',
                        help='Generate word label index map(default: False)')
    parser.add_argument('--iteration', type=str, help='Iteratiom')
    parser.add_argument('--premodel_session', type=str,
                        help='Session name of premodel')

    # transformer model parameter
    parser.add_argument('--d_model', type=int, default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head', type=int, default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers', type=int, default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers', type=int, default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward', type=int, default=2048,
                        help='transformer_d_model')
    parser.add_argument('--dropout', type=float, default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier', type=int, default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch', type=int, default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()

    # Character-level label map is always loaded; word-level map replaces it
    # when --word is set.
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            # Regenerate the word-label index file from the char labels.
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            # Debug: dump the generated label index file to stdout.
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed python/torch RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # NOTE(review): encoder/decoder below use hard-coded hyperparameters
    # (d_model=128, 6 layers, ...), NOT the --d_model/--n_head/... CLI args.
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    # dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)
    encoder = Encoder(d_input=128, n_layers=6, n_head=4, d_k=128, d_v=128,
                      d_model=128, d_inner=2048, dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token, eos_id=EOS_token,
                      n_tgt_vocab=len(char2index), d_word_vec=128,
                      n_layers=6, n_head=4, d_k=128, d_v=128, d_model=128,
                      d_inner=2048, dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    # Adam wrapped in a warmup/decay schedule (Noam-style wrapper).
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), lr=0.0004, betas=(0.9, 0.98),
                         eps=1e-09))
    ############/

    # Uniform weight initialization.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)

    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    # data_list rows are "aaa.wav,aaa.label" pairs relative to train_data/.
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    # Optionally warm-start from a previous NSML checkpoint (and session).
    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        train_loader.join()

        print("~~~~~~~~~~~~")

        # Validation only at epoch 10 and every 10th epoch after 48
        # (epochs 59, 69, ...) to save time.
        # NOTE(review): the report/save tail is placed inside this branch —
        # eval_loss is undefined on other epochs, so the outdented reading
        # would raise NameError; confirm against the original formatting.
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len,
                                           args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))
            valid_loader.join()

            nsml.report(False, step=epoch, train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer, eval__loss=eval_loss,
                        eval__cer=eval_cer)

            # Save on eval epochs; tag the best (lowest eval loss) one.
            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)
            if best_model:
                nsml.save('best')
                best_loss = eval_loss