def predict(self, tasks, **kwargs):
    """Classify Label Studio tasks with the fine-tuned BERT model.

    Args:
        tasks: list of Label Studio task dicts; the text to classify is
            read from ``task['data'][self.value]``.
        **kwargs: ignored, accepted for ML-backend API compatibility.

    Returns:
        One prediction dict per task, in Label Studio format:
        ``{'result': [choices-result], 'score': float}`` where ``score``
        is the softmax probability of the predicted class.
    """
    texts = [task['data'][self.value] for task in tasks]
    predict_dataloader = prepare_texts(
        texts, self.tokenizer, self.maxlen, SequentialSampler, self.batch_size)

    pred_labels, pred_scores = [], []
    for batch in predict_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs[0]
        # FIX: report softmax probabilities instead of raw logits — the
        # max logit is unbounded and not comparable across tasks, while a
        # Label Studio score is expected to be a confidence in [0, 1].
        # Softmax is monotonic per row, so the predicted label is unchanged.
        probs = torch.softmax(logits.detach(), dim=-1).cpu().numpy()
        argmax_batch_preds = np.argmax(probs, axis=-1)
        pred_labels.extend(str(self.labels[i]) for i in argmax_batch_preds)
        pred_scores.extend(float(s) for s in np.max(probs, axis=-1))

    predictions = []
    for predicted_label, score in zip(pred_labels, pred_scores):
        result = [{
            'from_name': self.from_name,
            'to_name': self.to_name,
            'type': 'choices',
            'value': {'choices': [predicted_label]}
        }]
        predictions.append({'result': result, 'score': score})
    return predictions
def fit(self, completions, workdir=None, cache_dir=None,
        pretrained_model='bert-base-multilingual-cased', maxlen=64,
        batch_size=32, num_epochs=100, logging_steps=1, train_logs=None,
        **kwargs):
    """Fine-tune a BERT sequence classifier on labeled completions.

    Args:
        completions: Label Studio completion dicts; the input text comes
            from ``completion['data'][self.value]`` and the label from the
            first choices result of the first completion.
        workdir: directory the trained model and tokenizer are saved to.
        cache_dir: HuggingFace cache directory for pretrained weights.
        pretrained_model: pretrained BERT checkpoint name.
        maxlen: max token sequence length.
        batch_size: training batch size.
        num_epochs: max number of epochs (early stopping may end sooner).
        logging_steps: interval (in optimizer steps) for loss logging.
        train_logs: if set, TensorBoard logs are written under this path.
        **kwargs: ignored, accepted for ML-backend API compatibility.

    Returns:
        dict with the saved model path and the train-time settings needed
        to reload it (batch_size, maxlen, pretrained_model, labels).
    """
    input_texts = []
    output_labels, output_labels_idx = [], []
    label2idx = {l: i for i, l in enumerate(self.labels)}
    for completion in completions:
        # Skip tasks the annotator explicitly skipped.
        if completion['completions'][0].get('skipped'):
            continue
        input_texts.append(completion['data'][self.value])
        # Take the annotation from the first result of the first completion.
        output_label = completion['completions'][0]['result'][0]['value'][
            'choices'][0]
        output_labels.append(output_label)
        output_labels_idx.append(label2idx[output_label])

    # If annotators introduced labels not seen before, rebuild the label
    # set and re-index every collected target against it.
    new_labels = set(output_labels)
    added_labels = new_labels - set(self.labels)
    if len(added_labels) > 0:
        print('Label set has been changed. Added ones: ' + str(list(added_labels)))
        self.labels = list(sorted(new_labels))
        label2idx = {l: i for i, l in enumerate(self.labels)}
        output_labels_idx = [label2idx[label] for label in output_labels]

    tokenizer = BertTokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)
    train_dataloader = prepare_texts(input_texts, tokenizer, maxlen,
                                     RandomSampler, batch_size, output_labels_idx)
    model = self.reset_model(pretrained_model, cache_dir, device)

    total_steps = len(train_dataloader) * num_epochs
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    global_step = 0
    total_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(num_epochs, desc='Epoch')
    if train_logs:
        # BUG FIX: the original referenced an undefined name `output_dir`
        # here, raising NameError whenever train_logs was set. The model is
        # saved to `workdir`, so its basename names the TensorBoard run.
        tb_writer = SummaryWriter(
            logdir=os.path.join(train_logs, os.path.basename(workdir)))
    else:
        tb_writer = None
    loss_queue = deque(maxlen=10)
    # BUG FIX: early stopping must end training, not just the current
    # epoch — the original `break` only exited the inner batch loop, so
    # every remaining epoch still ran.
    stop_training = False
    for epoch in train_iterator:
        if stop_training:
            break
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if global_step % logging_steps == 0:
                last_loss = (total_loss - logging_loss) / logging_steps
                loss_queue.append(last_loss)
                if tb_writer:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', last_loss, global_step)
                logging_loss = total_loss

            # Slope-based early stopping: once the trend over the last 10
            # logged losses is nearly flat, stop training entirely.
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if tb_writer:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    stop_training = True
                    break

    if tb_writer:
        tb_writer.close()

    model_to_save = model.module if hasattr(
        model, 'module'
    ) else model  # Take care of distributed/parallel training  # noqa
    model_to_save.save_pretrained(workdir)
    tokenizer.save_pretrained(workdir)
    return {
        'model_path': workdir,
        'batch_size': batch_size,
        'maxlen': maxlen,
        'pretrained_model': pretrained_model,
        'labels': self.labels
    }
from utils import load_data, prepare_texts, query_for_answers
import argparse

if __name__ == '__main__':
    # Command-line client: evaluate a remotely hosted model against a
    # local test file and report its accuracy.
    arg_parser = argparse.ArgumentParser(
        description='Client for querying docker image hosted on specified server')
    arg_parser.add_argument('test_path', type=str,
                            help='path for test.txt file')
    arg_parser.add_argument('-s', '--separator', default=';',
                            help='separator in test.txt file')
    arg_parser.add_argument(
        '-u', '--server_url',
        default='http://35.234.121.157:8501/v1/models/cnn:predict',
        help='google cloud platform kubernetes cluster ip address with docker image and trained model within')
    cli = arg_parser.parse_args()

    # Load and tokenize the evaluation set.
    samples, labels = load_data(filename=cli.test_path, separator=cli.separator)
    samples = prepare_texts(samples)

    print('Connecting with gcp...')
    accuracy = query_for_answers(samples, labels, cli.server_url)
    print('Accuracy for data test set:')
    print(accuracy)