from transformers import (BertConfig, BertTokenizer,
                          BertForSequenceClassification,
                          DistilBertConfig, DistilBertTokenizer,
                          DistilBertForSequenceClassification,
                          RobertaConfig, RobertaTokenizer,
                          RobertaForSequenceClassification)


def build_model(args):
    """Build the tokenizer and classifier selected by args.clf_model.

    CNN_Text is the project's local CNN classifier; the remaining branches
    load Hugging Face sequence-classification models.
    """
    if args.clf_model.lower() == "cnn":
        # The CNN only needs a pretrained tokenizer for text tokenization.
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)
    elif args.clf_model.lower() == "robert":  # selects RoBERTa
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)
        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # Freeze the transformer weights so only the classifier head trains.
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # Freezing is intentionally left disabled for BERT:
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False
    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # expand_class_head is a project-specific extension that adds
        # args.multi_head classification heads to the DistilBert model.
        model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
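# A minimal usage sketch for build_model, assuming an argparse.Namespace
# stands in for the parsed training arguments. The checkpoint name, label
# count, and task name below are illustrative, not the project's defaults.
from argparse import Namespace

import torch

demo_args = Namespace(
    clf_model="bert",
    model_name_or_path="bert-base-uncased",
    do_lower_case=True,
    num_labels=2,
    task_name="sst-2",
    freeze=False,
    multi_head=1,  # only consumed by the DistilBert fallback branch
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
tokenizer, model = build_model(demo_args)
print(type(tokenizer).__name__, type(model).__name__)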
import re

import dill
import torch
from torchtext import data


def main_train():
    # data_path, batch_size, device, channel_dim, embed_dim, output_dim,
    # kernel_sizes, dropout_rate, lr, weight_decay, and epochs are
    # module-level hyperparameters; MR, CNN_Text, and train_model are
    # project-local helpers.
    def clean_str(string):
        # Tokenization/string cleaning, following Yoon Kim's CNN preprocessing.
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        # Raw replacement strings keep the escaped parens/question mark as
        # literal tokens without triggering invalid-escape warnings.
        string = re.sub(r"\(", r" \( ", string)
        string = re.sub(r"\)", r" \) ", string)
        string = re.sub(r"\?", r" \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    TEXT.preprocessing = data.Pipeline(clean_str)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

    trainset, valset = MR.splits(data_path, fields=[("text", TEXT),
                                                    ("label", LABEL)])
    TEXT.build_vocab(trainset)

    # Persist the text field (including its vocab) so inference can reuse it.
    with open("text.field", "wb") as f:
        dill.dump(TEXT, f)

    trainiter = data.BucketIterator(trainset, batch_size=batch_size,
                                    sort_key=lambda x: len(x.text),
                                    shuffle=True, device=device)
    valiter = data.BucketIterator(valset, batch_size=batch_size,
                                  sort_key=lambda x: len(x.text),
                                  shuffle=True, device=device)

    model = CNN_Text(channel_dim, len(TEXT.vocab), embed_dim, output_dim,
                     kernel_sizes, is_static=False, dropout_rate=dropout_rate)
    model = model.to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr,
                                 weight_decay=weight_decay)

    train_model(epochs, model, trainiter, valiter, optimizer, criterion)
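# A minimal sketch of reusing the persisted "text.field" at inference time,
# assuming it was serialized by main_train above; the sample sentence is
# illustrative.
import dill

with open("text.field", "rb") as f:
    TEXT_loaded = dill.load(f)

sample = "It's a great movie, isn't it?"
tokens = TEXT_loaded.preprocess(sample)   # tokenize, lowercase, then clean_str
batch = TEXT_loaded.process([tokens])     # LongTensor of shape (1, seq_len)
print(tokens, batch.shape)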
import torch
from flask import Flask
from torchtext import data

# MR, Args, CNN_Text, predict, and device are project-local helpers.
app = Flask(__name__)

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_data, dev_data = MR.splits(text_field, label_field)
text_field.build_vocab(train_data, dev_data)
label_field.build_vocab(train_data, dev_data)

args = Args()
args.dropout = 0.5
args.max_norm = 3.0
args.embed_dim = 128
args.kernel_num = 100
args.kernel_sizes = '3,4,5'
args.static = False
args.snapshot = 'snapshot/best.pt'
args.embed_num = len(text_field.vocab)
# label_field's vocab includes an <unk> entry, hence the -1.
args.class_num = len(label_field.vocab) - 1
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

model = CNN_Text(args)
model.load_state_dict(torch.load(args.snapshot, map_location='cpu'))
model = model.to(device)
model.eval()  # disable dropout for inference


@app.route('/cls/<text>')
def classify_text(text):
    app.logger.warning(text)
    result, conf = predict(text, model, text_field, label_field, device)
    app.logger.warning(conf)
    return result
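# A quick smoke test for the /cls/<text> route using Flask's built-in test
# client; the sample sentence is illustrative.
if __name__ == "__main__":
    with app.test_client() as client:
        resp = client.get("/cls/this movie was surprisingly good")
        print(resp.status_code, resp.get_data(as_text=True))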
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import nsml  # NSML experiment platform, used for logging and checkpointing

# CNN_Text, sort_batch, ic_metric, and device are project-local helpers.


class Trainer:
    def __init__(self, config, n_gpu, vocab, train_loader=None, val_loader=None):
        self.config = config
        self.vocab = vocab
        self.n_gpu = n_gpu
        self.train_loader = train_loader
        self.val_loader = val_loader

        # Build model
        vocab_size = self.vocab.vocab_size()
        self.model = CNN_Text(self.config, vocab_size, self.config.n_label)
        self.model.to(device)
        if self.n_gpu > 1:
            self.model = nn.DataParallel(self.model)

        # Build optimizer
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.lr, weight_decay=0.0005)

        # Build criterion
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        best_f1 = 0.0
        best_acc = 0.0
        global_step = 0
        # Note: batch_f1/batch_acc accumulate across the whole run, so the
        # logged means are running averages rather than per-interval ones.
        batch_f1 = []
        batch_acc = []
        for epoch in range(self.config.num_epoch):
            batch_loss = []
            for step, batch in enumerate(self.train_loader):
                self.model.train()
                batch = tuple(t.to(device) for t in batch)
                batch = sort_batch(batch)
                input_ids, input_lengths, labels = batch

                outputs = self.model(input_ids)
                loss = self.criterion(
                    outputs['logits'].view(-1, self.config.n_label),
                    labels.view(-1))
                f1, acc = ic_metric(labels.cpu(),
                                    outputs['predicted_intents'].cpu())

                if self.n_gpu > 1:
                    loss = loss.mean()
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()

                global_step += 1
                batch_loss.append(loss.float().item())
                batch_f1.append(f1)
                batch_acc.append(acc)

                if (global_step == 1) or (global_step % self.config.log_interval == 0):
                    mean_loss = np.mean(batch_loss)
                    mean_f1 = np.mean(batch_f1)
                    mean_acc = np.mean(batch_acc)
                    batch_loss = []
                    nsml.report(summary=True, scope=locals(), epoch=epoch,
                                train_loss=mean_loss, step=global_step)

                if (global_step > 0) and (global_step % self.config.val_interval == 0):
                    val_loss, val_f1, val_acc = self.evaluation()
                    nsml.report(summary=True, scope=locals(), epoch=epoch,
                                val_loss=val_loss, val_f1=val_f1,
                                val_acc=val_acc, step=global_step)
                    if val_f1 > best_f1:
                        best_f1 = val_f1
                        best_acc = val_acc
                        nsml.save(global_step)

    def evaluation(self):
        self.model.eval()
        total_loss = []
        preds = []
        targets = []
        with torch.no_grad():
            for step, batch in enumerate(self.val_loader):
                batch = tuple(t.to(device) for t in batch)
                batch = sort_batch(batch)
                input_ids, input_lengths, labels = batch

                outputs = self.model(input_ids)
                loss = self.criterion(
                    outputs['logits'].view(-1, self.config.n_label),
                    labels.view(-1))

                pred = outputs['predicted_intents'].squeeze(-1).cpu().numpy().tolist()
                target = labels.cpu().numpy().tolist()
                preds.extend(pred)
                targets.extend(target)
                total_loss.append(loss.float().item())

        mean_loss = np.mean(total_loss)
        mean_f1, mean_acc = ic_metric(targets, preds)
        return mean_loss, mean_f1, mean_acc
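# A minimal sketch of driving the Trainer. Assumptions: `config` exposes
# n_label, lr, num_epoch, log_interval, and val_interval; `vocab` exposes
# vocab_size(); and both loaders yield (input_ids, input_lengths, labels)
# batches, as consumed by train() and evaluation().
n_gpu = torch.cuda.device_count()
trainer = Trainer(config, n_gpu, vocab,
                  train_loader=train_loader, val_loader=val_loader)
trainer.train()
val_loss, val_f1, val_acc = trainer.evaluation()
print(val_loss, val_f1, val_acc)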