def __init__(self, opt):
    self.opt = opt
    # Build (or load a cached) tokenizer over the train and test files.
    tokenizer = build_tokenizer(
        fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
        max_length=opt.max_length,
        data_file='{0}_tokenizer.dat'.format(opt.dataset))
    # Build (or load a cached) pretrained embedding matrix for the vocabulary.
    embedding_matrix = build_embedding_matrix(
        vocab=tokenizer.vocab,
        embed_dim=opt.embed_dim,
        data_file='{0}d_{1}_embedding_matrix.dat'.format(
            str(opt.embed_dim), opt.dataset))
    trainset = SentenceDataset(opt.dataset_file['train'], tokenizer,
                               target_dim=self.opt.polarities_dim)
    testset = SentenceDataset(opt.dataset_file['test'], tokenizer,
                              target_dim=self.opt.polarities_dim)
    self.train_dataloader = DataLoader(dataset=trainset,
                                       batch_size=opt.batch_size,
                                       shuffle=True)
    self.test_dataloader = DataLoader(dataset=testset,
                                      batch_size=opt.batch_size,
                                      shuffle=False)
    self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    if opt.device.type == 'cuda':
        print('cuda memory allocated:',
              torch.cuda.memory_allocated(self.opt.device.index))
    self._print_args()
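# Hypothetical usage sketch (not part of the repo): the constructor above only
# reads the `opt` fields referenced in its body, so a namespace like the one
# below would exercise it. `Instructor` and `MyModel` are assumed names; the
# real training script's option wiring may differ.
#
# from argparse import Namespace
#
# opt = Namespace(
#     dataset='example',                              # assumed dataset tag
#     dataset_file={'train': 'train.txt', 'test': 'test.txt'},
#     max_length=80,
#     embed_dim=300,
#     polarities_dim=3,
#     batch_size=64,
#     model_class=MyModel,                            # hypothetical model class
#     device=torch.device('cpu'),
# )
# instructor = Instructor(opt)                        # assumed enclosing class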
def main():
    import json

    option_file_path = 'dump/sentlm_base/options.json'
    with open(option_file_path, 'r') as fin:
        options = json.load(fin)

    with tf.variable_scope('lm'):
        model = SentenceLanguageModel(options, True)
        init = tf.initializers.global_variables()

    init_state_tensors = [model.init_lstm_state]
    final_state_tensors = [model.final_lstm_state]

    batch_size = options['batch_size']
    max_seq_length = options['unroll_steps']
    max_chars = options['char_cnn']['max_characters_per_token']

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    sess.run(init)

    # Fetch the initial LSTM state with a dummy all-zero character batch.
    feed_dict = {
        model.tokens_characters:
            np.zeros([batch_size, max_seq_length, max_chars], dtype=np.int32)
    }
    init_state_values = sess.run(init_state_tensors, feed_dict=feed_dict)

    from data import SentenceDataset, UnicodeCharsVocabularyPad
    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'data/vocabulary/vocab_bnc_5.txt'
    vocabulary = UnicodeCharsVocabularyPad(vocab_path, max_word_length=max_chars)
    dataset = SentenceDataset(test_prefix, vocabulary)

    # Score a single batch as a smoke test.
    a = dataset.iter_batches(batch_size=batch_size, seq_length=max_seq_length)
    b = next(a)
    feed_dict = {
        model.tokens_characters: b['tokens_characters'],
        model.seq_length: b['lengths'],
        model.next_token_id: b['next_token_id']
    }
    total_loss = sess.run(model.total_loss, feed_dict=feed_dict)
    losses = sess.run(model.losses, feed_dict=feed_dict)
    print(f'Loss: {total_loss} (should be around 12)')

    # Drop into an interactive shell for inspection, then hard-exit.
    from IPython import embed
    embed()
    import os
    os._exit(1)
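# Hedged aside: if `model.losses` holds per-token cross-entropy with zero loss
# at padded positions (an assumption; the graph definition is not shown here),
# sentence-level perplexity can be recovered from it and the true lengths:
#
# import numpy as np
#
# def batch_perplexity(losses, lengths):
#     """Per-sentence perplexity from a [batch, unroll_steps] loss matrix."""
#     per_sent = losses.sum(axis=1) / np.asarray(lengths)
#     return np.exp(per_sent)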
def main():
    # c = Collection()
    # c.load(Path("./2021/ref/training/medline.1200.es.txt"))
    # pickle_postag(c)
    file = './2021/ref/training/medline.1200.es.txt'
    data = SentenceDataset(
        file,
        transform=sentence_to_tensor,
        target_transform=lambda l: torch.stack(tuple(map(label_to_tensor, l))))
    data_loader = DataLoader(data, batch_size=4, collate_fn=my_collate_fn,
                             shuffle=True)

    n = MyLSTM(50, 50, len(TAGS), 113, 50)
    n.to(DEVICE)
    optimizer = torch.optim.SGD(n.parameters(), lr=learning_rate)
    metrics = {
        'acc': lambda pred, true: Accuracy()(pred, true),
        # F1Score expects float tensors; take the argmax over classes first.
        'f1': lambda pred, true: F1Score()(
            pred.argmax(dim=1).float(),
            torch.as_tensor(true, dtype=torch.float32))
    }
    train(data_loader, n, criterion, optimizer, 5,
          filename='test_lstm.pth', metrics=metrics)
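# `my_collate_fn` is referenced above but not shown in this excerpt. A minimal
# sketch of what such a collate function typically does for variable-length
# tagged sentences (an assumption, not the repo's actual implementation):
#
# import torch
# from torch.nn.utils.rnn import pad_sequence
#
# def my_collate_fn_sketch(batch):
#     # batch is a list of (sentence_tensor, label_tensor) pairs.
#     sents, labels = zip(*batch)
#     lengths = torch.tensor([len(s) for s in sents])
#     sents = pad_sequence(sents, batch_first=True)    # pad token ids with 0
#     labels = pad_sequence(labels, batch_first=True)  # pad tag rows with 0
#     return sents, labels, lengths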
def main(args):
    config = Config(args)
    options, ckpt_file = load_options_latest_checkpoint(config.save_path)

    # Load the vocab.
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
        vocab = UnicodeCharsVocabularyPad(args.vocab_file, max_word_length)
    else:
        # Not tested yet.
        vocab = VocabularyPad(args.vocab_file)

    test_path = 'data/Selectional_Restrictions/Pylkkanen2007_processed.txt'
    # test_path = 'data/Selectional_Restrictions/Warren2015_processed.txt'
    # test_path = 'data/CSR/WSC_sent.txt'
    with open(test_path) as f:
        sents = [l.rstrip() for l in f.readlines()]
    num_per_group = 2 if 'WSC' in test_path else 3
    positions = _get_changed_positions(sents, num_per_group)

    data = SentenceDataset(test_path, vocab, test=True, shuffle_on_load=False,
                           tokenizer=nltk.word_tokenize)
    # if options.get('bidirectional'):
    #     data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    # else:
    #     data = LMDataset(test_prefix, vocab, **kwargs)

    all_losses, all_lengths = test(options, ckpt_file, data,
                                   batch_size=args.batch_size)

    # Full score: mean per-token loss over the whole sentence.
    print('Full probability results')
    scores = all_losses.sum(axis=1) / all_lengths
    scores = np.array(scores).reshape(-1, num_per_group)
    res = scores.argmax(axis=1)
    for i in range(num_per_group):
        print(sum(res == i) / len(res))

    # Partial score: mean loss restricted to tokens up to the changed position.
    print('Partial probability results')
    seq_mask = sequence_mask(np.array(positions) + 1, options['unroll_steps'])
    partial_losses = seq_mask * all_losses
    loss_mask = partial_losses > 0
    scores = partial_losses.sum(axis=1) / loss_mask.sum(axis=1)
    scores = np.array(scores).reshape(-1, num_per_group)
    res = scores.argmax(axis=1)
    for i in range(num_per_group):
        print(sum(res == i) / len(res))

    from IPython import embed
    embed()
    import os
    os._exit(1)
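# `sequence_mask` above is used to zero out losses past the changed position.
# A minimal numpy version consistent with that call, following the semantics
# of tf.sequence_mask (lengths -> 0/1 mask); the repo's own helper may differ
# in dtype or broadcasting details:
#
# import numpy as np
#
# def sequence_mask_sketch(lengths, maxlen):
#     """Return a [len(lengths), maxlen] mask with mask[i, :lengths[i]] = 1."""
#     return (np.arange(maxlen)[None, :]
#             < np.asarray(lengths)[:, None]).astype(np.float32)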
def main():
    import json
    from data import SentenceDataset, VocabularyPad

    option_file_path = 'dump/sentpad_test/options.json'
    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'data/vocabulary/vocab_bnc_5.txt'
    with open(option_file_path, 'r') as fin:
        options = json.load(fin)

    with tf.variable_scope('lm'):
        model = SentenceLanguageModel(options, is_training=False)
        init = tf.initializers.global_variables()

    batch_size = options['batch_size']
    max_seq_length = options['unroll_steps']

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    sess.run(init)

    vocabulary = VocabularyPad(vocab_path)
    dataset = SentenceDataset(test_prefix, vocabulary)

    # Score a single batch as a smoke test.
    a = dataset.iter_batches(batch_size=batch_size, seq_length=max_seq_length)
    b = next(a)
    feed_dict = {
        model.token_ids: b['token_ids'],
        model.seq_length: b['lengths'],
        model.next_token_id: b['next_token_id']
    }
    total_loss = sess.run(model.total_loss, feed_dict=feed_dict)
    losses = sess.run(model.losses, feed_dict=feed_dict)
    print(f'Loss: {total_loss} (should be around 12)')

    from IPython import embed
    embed()
    import os
    os._exit(1)
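# Hedged extension: the smoke test above scores a single batch. Averaging
# `total_loss` over the whole generator with the same placeholders would look
# roughly like this; it assumes iter_batches stops at the end of the corpus.
#
# def mean_corpus_loss(sess, model, dataset, batch_size, seq_length):
#     losses = []
#     for b in dataset.iter_batches(batch_size=batch_size,
#                                   seq_length=seq_length):
#         losses.append(sess.run(model.total_loss, feed_dict={
#             model.token_ids: b['token_ids'],
#             model.seq_length: b['lengths'],
#             model.next_token_id: b['next_token_id'],
#         }))
#     return sum(losses) / len(losses)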
def main(args):
    config = Config(args)
    options = config.get_options()

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
        vocab = UnicodeCharsVocabularyPad(args.vocab_file, max_word_length)
    else:
        vocab = VocabularyPad(args.vocab_file)

    data = SentenceDataset(args.prefix, vocab, test=False, shuffle_on_load=True)

    train(options, data, int(args.ngpus), config.save_path, config.save_path,
          config.get_logger(), restart_ckpt_file=args.start_from)
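# Hypothetical command-line entry point (not shown in this excerpt): main(args)
# reads the four attributes below; Config(args) may require more, so treat this
# as a sketch rather than the repo's actual launcher.
#
# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--vocab_file', required=True)
#     parser.add_argument('--prefix', required=True,
#                         help='prefix of the training data files')
#     parser.add_argument('--ngpus', default='1')
#     parser.add_argument('--start_from', default=None,
#                         help='checkpoint to restart from')
#     main(parser.parse_args())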
if __name__ == "__main__":
    cfg = ConfigBinaryClassification()
    cuda = True
    device = torch.device("cuda:1" if cuda else "cpu")

    model_path = "checkpoints/roberta24"
    model = BertForSequenceClassification.from_pretrained(model_path,
                                                          num_labels=2)
    model.to(device)
    model.eval()
    model.zero_grad()

    tokenizer_path = "hfl/chinese-roberta-wwm-ext"
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    # shuffle=False keeps the saved predictions aligned with the dataset order.
    train_dataset = SentenceDataset(tokenizer, cfg.DATA_PATH,
                                    dataset="train", cuda=False)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)

    # Collect raw logits for every training example.
    preds = []
    for tokens, label in train_loader:
        tokens = {key: item.to(device) for key, item in tokens.items()}
        label = label.to(device)
        pred = model(**tokens)[0]
        preds.append(pred.detach().cpu().numpy())
    preds = np.concatenate(preds)
    np.save("checkpoints/PTM-pred.npy", preds)
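# Hedged follow-up: the saved file holds raw logits, one row per training
# example, in dataset order (shuffle=False above). Turning them into hard
# labels or probabilities later:
#
# logits = np.load("checkpoints/PTM-pred.npy")
# hard_labels = logits.argmax(axis=1)
# probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # softmax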
def finetune(args, cfg):
    device = torch.device("cuda:%d" % args.cuda)
    model_config = args.model_config
    tokenizer = BertTokenizer.from_pretrained(model_config)
    train_dataset = SentenceDataset(tokenizer, cfg.DATA_PATH,
                                    dataset="train", cuda=False)
    valid_dataset = SentenceDataset(tokenizer, cfg.DATA_PATH,
                                    dataset="valid", cuda=False)
    train_loader = DataLoader(train_dataset, batch_size=16)
    valid_loader = DataLoader(valid_dataset, batch_size=16)

    model = BertForSequenceClassification.from_pretrained(
        model_config, num_labels=args.class_num)
    model.to(device)

    # Standard BERT fine-tuning: no weight decay on biases or LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
    criterion = nn.CrossEntropyLoss()
    # criterion = FocalLoss(classes=3, device=device).to(device)

    for epoch in range(args.epoch_num):
        for tokens, label in tqdm(train_loader):
            model.train()
            optimizer.zero_grad()
            tokens = {key: item.to(device) for key, item in tokens.items()}
            label = label.to(device)
            pred = model(**tokens)[0]
            loss = criterion(pred, label)
            loss.backward()
            optimizer.step()
            tokens = {key: item.cpu() for key, item in tokens.items()}
            label = label.cpu()
            del tokens, label

        # Evaluate on the validation set after each epoch.
        with torch.no_grad():
            model.eval()
            preds = []
            labels = []
            for tokens, label in tqdm(valid_loader):
                tokens = {key: item.to(device) for key, item in tokens.items()}
                pred = model(**tokens)[0]
                preds += pred.argmax(1).cpu().tolist()
                labels += label.tolist()
            # classification_report expects (y_true, y_pred) in that order.
            report = classification_report(labels, preds)
            print(report)

        model.save_pretrained(
            os.path.join(args.save_dir, args.save_config + str(epoch)))
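# The commented-out FocalLoss above is not defined in this excerpt. A common
# multi-class formulation, sketched here as an assumption (its constructor
# differs from the `FocalLoss(classes=3, device=device)` signature used above;
# torch and nn are assumed imported as in the rest of this file):
#
# class FocalLossSketch(nn.Module):
#     def __init__(self, gamma=2.0):
#         super().__init__()
#         self.gamma = gamma
#
#     def forward(self, logits, target):
#         # Cross-entropy gives -log p_t; focal loss down-weights easy
#         # examples by the factor (1 - p_t) ** gamma.
#         ce = nn.functional.cross_entropy(logits, target, reduction='none')
#         pt = torch.exp(-ce)
#         return ((1 - pt) ** self.gamma * ce).mean()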