def test_upsample():
    """Smoke-test ``UpSampleConv`` on one real batch per dataset phase.

    Builds the LJSpeech data loaders, takes the first batch of each phase,
    and pushes the conditioning features through a freshly constructed
    ``UpSampleConv``. Only verifies that the forward pass executes; no
    assertion on the output values.
    """
    data_loaders = get_data_loaders('../data/ljspeech', -1)
    for phase, data_loader in data_loaders.items():
        for step, (x, y, c, g, input_lengths) in enumerate(data_loader):
            # Insert a channel axis so the conv sees a 4-D input.
            # NOTE(review): assumes c is (batch, freq, time) — confirm.
            c = c.unsqueeze(1)
            upconv1 = UpSampleConv()
            c1 = upconv1(c)
            # One batch per phase is enough for a smoke test.
            break
# NOTE(review): mangled fragment — newlines were stripped, so everything after
# the first inline "# args" comment below is dead text in this collapsed form.
# It appears to be (a) the tail of a PredictionWriter method (uses `self` with
# no visible `def`), (b) a __main__ guard that builds the writer and runs it,
# and (c) the start of an unterminated triple-quoted block of commented-out
# experiment code (the closing \"\"\" is outside this view). The enclosing
# method header and the string terminator are not visible here, so the code is
# left byte-identical rather than reconstructed — restore from the original
# source file.
image_patches['orig'] = cv2.resize(image, self.patch_size) img_probas = classify_image(model, image_patches) name_arch = re.search('b[0-9]', model_arch).group(0) image_preds_to_dataFrame(img_probas, name_arch, image_name) self.data_frame.to_csv( '/Users/eugeneolkhovik/python_files/ML/melanoma/derma_classifier/meta_study/ensemble_pred.csv' ) if __name__ == "__main__": cfg = ConfigTwoClasses() data_manager = DataManager(cfg) _, data_loader = get_data_loaders(cfg, ) # args patches_name = ['tl', 'tr', 'bl', 'br', 'center', 'orig'] writer = PredictionWriter() writer.create_dataframe(patches_name) writer.run() """ ### experiment -> models directories all_dirs = os.walk(experiments_dir) models_res = [] model_path = [] for i in next(all_dirs)[1]: model_resualts_dir = os.path.join(experiments_dir, i) models_res.append(model_resualts_dir)
# Fragment of a wavenet_vocoder synthesis script (top-level statements).
# NOTE(review): `initial_value`, `args`, `hparams`, `data_root`, `speaker_id`,
# `checkpoint_path` and `dst_dir` are bound earlier in the original file.
# Reconstructed with newlines restored — in the collapsed form everything
# after the first inline comment was dead text.

# Keep None as "no initial value"; otherwise coerce the CLI string to int.
initial_value = None if initial_value is None else int(initial_value)
file_name_suffix = args["--file-name-suffix"]
output_html = args["--output-html"]

# Override hyper parameters
hparams.parse(args["--hparams"])
assert hparams.name == "wavenet_vocoder"

from train import build_model, get_data_loaders
from synthesis import wavegen

# Data
# Use exactly same testset used in training script
# disable shuffle for convenience
test_data_loader = get_data_loaders(
    data_root, speaker_id, test_shuffle=False)["test"]
test_dataset = test_data_loader.dataset

# Model
model = build_model()

# Load checkpoint
print("Load checkpoint from {}".format(checkpoint_path))
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint["state_dict"])
checkpoint_name = splitext(basename(checkpoint_path))[0]

os.makedirs(dst_dir, exist_ok=True)
dst_dir_name = basename(os.path.normpath(dst_dir))
# Fragment of a wavenet_vocoder evaluation script (top-level statements).
# NOTE(review): `preset`, `args`, `Tee`, `checkpoint_path`, `data_root`,
# `speaker_id`, `device`, `use_cuda` and `dst_dir` are bound earlier in the
# original file. Reconstructed with newlines restored — in the collapsed form
# everything after the first inline comment was dead text.

# Load preset hparams from a JSON file, then apply CLI overrides on top.
with open(preset) as f:
    hparams.parse_json(f.read())
# Override hyper parameters
hparams.parse(args["--hparams"])
assert hparams.name == "wavenet_vocoder"

# tee sys.stdout to an additional log file in checkpoint_dir
tee = Tee(join(os.path.dirname(checkpoint_path), 'evaluate.stdout'))

from train import build_model, get_data_loaders
from synthesis import wavegen

# Data
# Use exactly same testset used in training script
# disable shuffle for convenience
test_data_loader = get_data_loaders(
    data_root, speaker_id, test_shuffle=False, phases=("test",))["test"]
test_dataset = test_data_loader.dataset

# Model
model = build_model().to(device)

# Load checkpoint
print("Load checkpoint from {}".format(checkpoint_path))
if use_cuda:
    checkpoint = torch.load(checkpoint_path)
else:
    # Remap CUDA-saved tensors onto CPU when no GPU is available.
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["state_dict"])
checkpoint_name = splitext(basename(checkpoint_path))[0]

os.makedirs(dst_dir, exist_ok=True)
"DEL", # DELETION "INS", # INSERTION "SUB", # SUBTRACTION "W-PER", # WHOLE WORD PERMUTATION "W-DEL", # WHOLE WORD DELETION "W-INS", # WHOLE WORD INSERTION "W-SUB" # WHOLE WORD SUBTRACTION ] MAX_CHARS = 24 BSZ = 8 NT = random.choice(TYPES) ntype_chckp = CHECKPOINTS_PATH.joinpath(f"MUDE_{NT}") checkpoint_path = best_checkpoint_selector(ntype_chckp) test_ld = get_data_loaders(NT, BSZ, MAX_CHARS)[2] CHAR_VOCAB_SIZE = len(test_ld.dataset.vect.chars) TGT_VOCAB_SIZE = len(test_ld.dataset.vocab) DIM = 512 DIM_FFT = int(DIM * 4) ATTN_HEADS = 8 DEPTH = 2 DIM_HIDDEN = 650 DROPOUT_RATE = .01 LR = 1e-4 mude = MUDE(dim=DIM, characters_vocab_size=CHAR_VOCAB_SIZE, tokens_vocab_size=TGT_VOCAB_SIZE, encoder_depth=DEPTH,
def train():
    """Fine-tune a RoBERTa/BERT sequence classifier with pytorch-ignite.

    Parses CLI arguments, loads the model/tokenizer, builds train/valid
    data loaders, wires up ignite trainer + evaluator with metrics,
    TensorBoard logging and checkpointing, then runs training.

    Reconstructed from a whitespace-mangled paste; fixes the original
    ``if arg.test:`` typo (NameError) to ``if args.test:``.
    """
    parser = ArgumentParser()
    parser.add_argument("--train_path", type=str,
                        default='data/yesands_train_iter4.json',
                        help="Set data path")
    parser.add_argument("--valid_path", type=str,
                        default='data/yesands_valid.json',
                        help="Set data path")
    # NOTE(review): argparse type=bool treats any non-empty string as True
    # (e.g. "--correct_bias False" becomes True) — consider store_true.
    parser.add_argument("--correct_bias", type=bool, default=False,
                        help="Set to true to correct bias for Adam optimizer")
    parser.add_argument("--lr", type=float, default=2e-5,
                        help="Set learning rate")
    parser.add_argument("--n_epochs", type=int, default=4,
                        help="Set number of epochs")
    parser.add_argument("--num_warmup_steps", type=float, default=1000,
                        help="Set number of warm-up steps")
    parser.add_argument("--num_total_steps", type=float, default=10000,
                        help="Set number of total steps")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--max_grad_norm", type=float, default=1.0,
                        help="Set maximum gradient normalization.")
    parser.add_argument(
        "--pretrained_path", type=str, default='roberta-base',
        help="Choose which pretrained model to use (bert-base-uncased, "
             "roberta-base, roberta-large, roberta-large-mnli)")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Provide the batch size")
    parser.add_argument("--random_seed", type=int, default=42,
                        help="Set the random seed")
    parser.add_argument(
        "--test", action='store_true',
        help="If true, run with small dataset for testing code")
    parser.add_argument(
        "--base", action='store_true',
        help="If true, run with base experiment configuration (training with "
             "spont only) for comparison")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger.info("Arguments: {}".format(pformat(args)))

    if 'roberta' in args.pretrained_path:
        # initialize tokenizer and model
        logger.info("Initialize model and tokenizer.")
        tokenizer = RobertaTokenizer.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')
        model = RobertaForSequenceClassification.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')
        ### START MODEL MODIFICATION
        # Pretrained model was not trained with token type ids.
        # fix token type embeddings for finetuning. Without this, the model
        # can only take 0s as valid input for token_type_ids
        model.config.type_vocab_size = 2
        model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(
            2, model.config.hidden_size)
        model.roberta.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=model.config.initializer_range)
        ### END MOD
    elif 'bert' in args.pretrained_path:
        model = BertForSequenceClassification.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')
        tokenizer = BertTokenizer.from_pretrained(
            args.pretrained_path, cache_dir='../pretrained_models')
    model.to(args.device)

    # Weight decay on everything except bias/LayerNorm-style parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.lr,
                      correct_bias=args.correct_bias)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.num_warmup_steps,
                                     t_total=args.num_total_steps)

    logger.info("Prepare datasets")
    train_data = get_data(args.train_path)
    valid_data = get_data(args.valid_path)
    # BUG FIX: original read `if arg.test:` (undefined name `arg`), which
    # raised NameError whenever --test was meant to truncate the datasets.
    if args.test:
        train_data = train_data[:100]
        valid_data = valid_data[:100]
    logger.info("Loading train set...")
    train_loader, train_sampler = get_data_loaders(args, train_data,
                                                   args.train_path, tokenizer)
    logger.info("Loading validation set...")
    valid_loader, valid_sampler = get_data_loaders(args, valid_data,
                                                   args.valid_path, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        b_input_ids, b_input_mask, b_input_segment, b_labels = batch
        optimizer.zero_grad()
        #roberta has issues with token_type_ids
        loss, logits = model(b_input_ids,
                             token_type_ids=b_input_segment,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        scheduler.step()
        return loss.item(), logits, b_labels

    trainer = Engine(update)

    val_result_f = Path('turn_val_prediction.txt').open('w')

    # Evaluation function and evaluator
    def inference(engine, batch):
        model.eval()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        b_input_ids, b_input_mask, b_input_segment, b_labels = batch
        with torch.no_grad():
            #roberta has issues with token_type_ids
            # loss, logits = model(b_input_ids, token_type_ids = None, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = model(b_input_ids,
                                 token_type_ids=b_input_segment,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            label_ids = b_labels
        return logits, label_ids, loss.item()

    evaluator = Engine(inference)
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(valid_loader))

    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, "loss")
    RunningAverage(Accuracy(output_transform=lambda x: (x[1], x[2]))).attach(
        trainer, "accuracy")
    if torch.cuda.is_available():
        GpuInfo().attach(trainer, name='gpu')
    recall = Recall(output_transform=lambda x: (x[0], x[1]))
    precision = Precision(output_transform=lambda x: (x[0], x[1]))
    F1 = (precision * recall * 2 / (precision + recall)).mean()
    accuracy = Accuracy(output_transform=lambda x: (x[0], x[1]))
    metrics = {
        "recall": recall,
        "precision": precision,
        "f1": F1,
        "accuracy": accuracy,
        "loss": Average(output_transform=lambda x: x[2])
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=['loss', 'accuracy'])
    pbar.attach(trainer, metric_names=['gpu:0 mem(%)', 'gpu:0 util(%)'])
    evaluator.add_event_handler(
        Events.COMPLETED, lambda _: pbar.log_message(
            "Validation metrics:\n %s" % pformat(evaluator.state.metrics)))

    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer,
                     log_handler=OptimizerParamsHandler(optimizer),
                     event_name=Events.ITERATION_STARTED)
    tb_logger.attach(evaluator,
                     log_handler=OutputHandler(tag="valid",
                                               metric_names=list(
                                                   metrics.keys()),
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # tb_logger.writer.log_dir -> tb_logger.writer.logdir (this is the correct
    # attribute name as seen in:
    # https://tensorboardx.readthedocs.io/en/latest/_modules/tensorboardX/writer.html#SummaryWriter)
    checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir,
                                         'checkpoint',
                                         save_interval=1,
                                         n_saved=5)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'mymodel': getattr(model, 'module', model)
         })  # "getattr" take care of distributed encapsulation
    torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin')
    getattr(model, 'module', model).config.to_json_file(
        os.path.join(tb_logger.writer.logdir, CONFIG_NAME))
    tokenizer.save_vocabulary(tb_logger.writer.logdir)

    trainer.run(train_loader, max_epochs=args.n_epochs)

    if args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
    tb_logger.close()
    val_result_f.close()
def run():
    """Evaluate a GPT-2/OpenAI-GPT dialogue model with entmax-family losses.

    Parses CLI arguments, selects the loss/generation function (softmax,
    sparsemax, entmax15, entmax), loads the pretrained model/tokenizer, and
    then either:
      * ``--metric f1``: generates replies token-by-token on the validation
        split of PersonaChat and reports F1 and distinct-n, or
      * otherwise: computes JSD, sparsemax score and perplexity over the
        validation loader.

    Reconstructed from a whitespace-mangled paste; fixes the generation loop
    that reused index ``i`` and clobbered the outer dataset index.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path", type=str, default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model", type=str, default="gpt2",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt', 'gpt2', 'gpt2-medium']
    )  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history", type=int, default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--num_candidates", type=int, default=1)
    parser.add_argument("--personality_permutations", type=int, default=1)
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--distributed", action='store_true')
    parser.add_argument("--temperature", type=int, default=1,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k", type=int, default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p", type=float, default=0,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--entmax_alpha", type=float, default=1.5)
    parser.add_argument("--entmax_k", type=int, default=512)
    parser.add_argument("--entmax_bisect_iter", type=int, default=50)
    parser.add_argument("--loss", default="cross_entropy", type=str)
    parser.add_argument("--metric", default="jsd", type=str)
    parser.add_argument("--epsilon", default=0.000001, type=float)
    parser.add_argument("--name", default='', type=str)
    parser.add_argument("--temp", type=float, default=0)
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    args.train_batch_size = args.batch_size
    args.valid_batch_size = args.batch_size

    # Map the --loss name to a loss class and a matching generation function.
    generic_entmax_loss = partial(EntmaxBisectLoss,
                                  alpha=args.entmax_alpha,
                                  n_iter=args.entmax_bisect_iter)
    loss_funcs = {
        "cross_entropy": nn.CrossEntropyLoss,
        "sparsemax": partial(SparsemaxLoss, k=args.entmax_k),
        "entmax15": partial(Entmax15Loss, k=args.entmax_k),
        "entmax": generic_entmax_loss,
        "entmax_alpha": "entmax_alpha"
    }
    assert args.loss in loss_funcs
    loss_func = loss_funcs[args.loss]

    generic_entmax = partial(entmax_bisect,
                             alpha=args.entmax_alpha,
                             n_iter=args.entmax_bisect_iter)
    gen_funcs = {
        "softmax": torch.softmax,
        "sparsemax": partial(sparsemax, k=args.entmax_k),
        "entmax15": partial(entmax15, k=args.entmax_k),
        "entmax": generic_entmax,
        "entmax_alpha": "entmax_alpha"
    }
    if args.loss == "cross_entropy":
        gen_func = gen_funcs["softmax"]
    elif args.loss == "sparsemax":
        gen_func = gen_funcs["sparsemax"]
    elif args.loss == "entmax15":
        gen_func = gen_funcs["entmax15"]
    elif args.loss == "entmax":
        gen_func = gen_funcs["entmax"]
    elif args.loss == "entmax_alpha":
        gen_func = gen_funcs["entmax_alpha"]

    if args.model_checkpoint == "":
        if args.model == 'gpt2' or args.model == 'gpt2-medium':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        GPT2Tokenizer, GPT2LMHeadModel
    ) if args.model == 'gpt2' or args.model == 'gpt2-medium' else (
        OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    bos, eos, speaker1, speaker2, pad = tokenizer.convert_tokens_to_ids(
        SPECIAL_TOKENS)
    model.eval()

    if args.metric == 'f1':
        # Build tokenized inputs/replies for every non-train dialog.
        datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
        for dataset_name, dataset in personachat.items():
            num_candidates = 1
            if dataset_name != 'train':
                for dialog in dataset:
                    persona = dialog["personality"].copy()
                    for utterance in dialog["utterances"]:
                        history = utterance["history"]  #[-(2*2+1):]
                        for j, candidate in enumerate(
                                utterance["candidates"][-num_candidates:]):
                            instance = build_input(persona, history, candidate,
                                                   tokenizer)
                            for input_name, input_array in instance.items():
                                datasets[dataset_name][input_name].append(
                                    input_array)
        logger.info("Pad inputs and convert to Tensor")
        tensor_datasets = {"train": [], "valid": []}
        for dataset_name, dataset in datasets.items():
            if dataset_name != 'train':
                inputs = dataset['input_ids']
                replies = dataset['reply']
                token_type_ids = dataset['token_type_ids']
        special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
        predictions = []
        references = []
        preds = []
        refs = []
        histories = []
        for i in range(len(inputs)):
            if i % 100 == 0:
                print(str(i) + ' out of ' + str(len(inputs)))
            inpu = torch.tensor(inputs[i]).unsqueeze(0).cuda()
            token_ids = torch.tensor(token_type_ids[i]).unsqueeze(0).cuda()
            current_output = []
            # BUG FIX: the original inner loop was `for i in range(...)`,
            # shadowing the outer dataset index so replies[i]/inputs[i] below
            # decoded the wrong example. Renamed to `step`.
            for step in range(args.max_length):
                if step > 0:
                    # Append the previously sampled token and extend the
                    # token-type ids to match (50260/50261 = speaker ids).
                    inpu = torch.cat([inpu, prev.unsqueeze(0)], 1)
                    if token_ids[0][-1] == 50260:
                        token_ids = torch.cat([
                            token_ids,
                            torch.tensor([50260]).cuda().unsqueeze(0)
                        ], 1)
                    else:
                        token_ids = torch.cat([
                            token_ids,
                            torch.tensor([50261]).cuda().unsqueeze(0)
                        ], 1)
                logits = model(inpu, token_type_ids=token_ids)
                if isinstance(logits, tuple):
                    logits = logits[0]
                logits = logits[0, -1, :]
                if args.top_k != 0 or args.top_p != 0:
                    logits = top_filtering(logits,
                                           top_k=args.top_k,
                                           top_p=args.top_p)
                if args.temp != 0:
                    probs = softmax_temperature(logits.unsqueeze(0),
                                                temperature=args.temp,
                                                axis=1).squeeze(0)
                else:
                    probs = gen_func(logits, dim=-1)
                prev = torch.multinomial(probs, 1)
                if prev.item() in special_tokens_ids:
                    break
                current_output.append(prev.item())
            out_text = tokenizer.decode(current_output,
                                        skip_special_tokens=True)
            target = tokenizer.decode(replies[i])
            history = tokenizer.decode(inputs[i])
            predictions.append(out_text)
            references.append(target)
            preds.append(current_output)
            refs.append(replies[i])
        f1_score = eval_utils.f1(preds, refs)
        print('F1_score:', f1_score)
        distinct_1, distinct_2, distinct_3, distinct_4 = eval_utils.distinct(
            predictions)
        print('distinct_1:', distinct_1)
        print('distinct_2:', distinct_2)
        print('distinct_3:', distinct_3)
        print('distinct_4:', distinct_4)
    else:
        _, val_loader, _, valid_sampler = get_data_loaders(args, tokenizer)
        jsd = 0
        sp = 0
        perp = 0.0
        nb_eval_steps = 0
        v = 0
        for batch in val_loader:
            v += 1
            if v % 100 == 0:
                print(str(v) + ' out of ' + str(7801))
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            # NOTE(review): lm_labels is unpacked twice; the first element of
            # the batch is discarded by the second binding — confirm intended.
            input_ids, lm_labels, lm_labels, mc_labels, token_type_ids = batch
            lm_logits = model(input_ids, token_type_ids=token_type_ids)
            lm_logits = lm_logits[0]
            # Shift logits/labels for next-token prediction, then drop
            # pad/eos positions from both.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            lm_logits_flat_shifted = list(
                lm_logits_flat_shifted.cpu().detach().numpy())
            lm_labels_flat_shifted = list(lm_labels_flat_shifted.cpu().numpy())
            # Replace pad/eos rows with the sentinel [0] so they can be
            # filtered out by length below.
            lm_logits_flat_shifted = [
                lm_logits_flat_shifted[i]
                if lm_labels_flat_shifted[i] != pad
                and lm_labels_flat_shifted[i] != eos else [0]
                for i in range(len(lm_labels_flat_shifted))
            ]
            lm_logits_flat_shifted = torch.tensor(
                list(filter(lambda a: len(a) != 1, lm_logits_flat_shifted)))
            lm_labels_flat_shifted = torch.tensor(
                list(
                    filter(lambda a: a != pad and a != eos,
                           lm_labels_flat_shifted)))
            if args.top_p > 0 or args.top_k > 0:
                j = 0
                for l in lm_logits_flat_shifted:
                    j += 1
                    if j > 1:
                        shift_logits = torch.cat([
                            shift_logits,
                            top_filtering(l,
                                          top_p=args.top_p,
                                          top_k=args.top_k).unsqueeze(0)
                        ], 0)
                    else:
                        shift_logits = top_filtering(
                            l, top_p=args.top_p,
                            top_k=args.top_k).unsqueeze(0)
            else:
                shift_logits = lm_logits_flat_shifted
            if args.temp != 0:
                probs = softmax_temperature(shift_logits,
                                            temperature=args.temp,
                                            axis=1)
            else:
                probs = gen_func(shift_logits, dim=1)
            # JSD between predicted distribution and the one-hot gold label.
            jsd_batch = []
            labels = torch.zeros(len(lm_labels_flat_shifted),
                                 shift_logits.size(-1))
            for i in range(len(lm_labels_flat_shifted)):
                labels[i, lm_labels_flat_shifted[i]] = 1
                jsd_ = compute_jsd(probs[i], labels[i])
                jsd_batch.append(jsd_)
            jsd_batch = torch.tensor(jsd_batch).mean()
            jsd += jsd_batch
            sp_batch = []
            for i in range(len(lm_labels_flat_shifted)):
                sp_batch.append(
                    compute_sp(probs.squeeze(0)[i], lm_labels_flat_shifted[i]))
            sp_batch = torch.tensor(sp_batch).mean()
            sp += sp_batch
            # Sparse output distributions assign exact zeros; epsilon-smooth
            # and renormalize so log-perplexity stays finite.
            # NOTE(review): nesting of the renormalization under this `if`
            # was reconstructed from a collapsed paste — confirm.
            if len(probs[0].nonzero()) != len(probs[0]):
                probs = probs[:, :] + args.epsilon
                sums = [probs[i].sum().item() for i in range(probs.size(0))]
                probs = [probs[i] / sums[i] for i in range(len(sums))]
                probs = torch.stack(probs)
            p = [
                probs[i, lm_labels_flat_shifted.squeeze(0)[i].item()]
                for i in range(len(lm_labels_flat_shifted.squeeze(0)))
            ]
            p = torch.stack(p)
            perp += torch.log(p**-1).mean().item()
            nb_eval_steps += 1
        jsd = jsd / nb_eval_steps
        sp = sp / nb_eval_steps
        a = perp / nb_eval_steps
        perplexity = torch.exp(torch.tensor(a))
        print('perplexity:', perplexity)
        print('jsd:', jsd)
        print('sp:', sp)