def predict_single(model, tokenizer, prompt, top=1, device=None, max_length=None, beams=None):
    if device is None:
        device = cfg('device')
    if max_length is None:
        max_length = cfg('max_gen')
    if beams is None:
        beams = cfg('beams')
    prompt = tokenize_query(tokenizer, prompt, device)
    model = model.to(device)
    output_sequences = model.generate(input_ids=prompt,
                                      max_length=max_length,
                                      num_beams=max(top, beams),
                                      do_sample=False,
                                      num_return_sequences=top,
                                      pad_token_id=tokenizer.eos_token_id)
    output = decode_batch(tokenizer, output_sequences)
    return output
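
# Illustrative usage sketch for predict_single. This is not part of the training
# pipeline; it assumes load_cfg() has populated the config and that a fine-tuned
# checkpoint exists under cfg('out_path') + cfg('name') (both are assumptions
# here, as is the example prompt).
def example_predict_single():
    tokenizer = get_tokenizer()
    model = get_model(tokenizer, resume=True)
    # top=3 returns the three highest-scoring beam hypotheses
    return predict_single(model, tokenizer, "list all files larger than 1GB", top=3)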
def validate(model, tokenizer, dev_nls, dev_cms):
    outputs = [
        predict_single_mod(model, tokenizer, dev_nl, top=cfg('val_n'))
        for dev_nl in dev_nls
    ]
    confidences = [x[1] for x in outputs]
    predictions = [x[0] for x in outputs]
    scores_template = [
        get_template_score(dev_cm[0], pred_cm)
        for (pred_cm, dev_cm) in zip(predictions, dev_cms)
    ]
    print(f"[DEBUG]: TM score {np.mean(scores_template)}")
    scores_bleu = [
        sentence_bleu(dev_cm, pred_cm[0])
        for (pred_cm, dev_cm) in zip(predictions, dev_cms)
    ]
    print(f"[DEBUG]: BLEU score {np.mean(scores_bleu)}")
    if cfg('val_metric') == 'BLUE':
        scores = scores_bleu
    elif cfg('val_metric') == 'template':
        scores = scores_template
    else:
        assert False, f"Unknown validation metric '{cfg('val_metric')}'"
    return scores, predictions
def get_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(cfg('model'))  # gpt2 has no padding by default
    try:
        if cfg('eos') != tokenizer.eos_token:
            print(f"Warning: non-default eos token (default is {tokenizer.eos_token})")
    except Exception:
        print("Warning: no default eos token")
    tokenizer.add_tokens(cfg('eos'))
    tokenizer.eos_token = cfg('eos')
    print("EOS", tokenizer.eos_token, tokenizer.eos_token_id)
    # use eos as pad_token if there is no pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    print("PAD", tokenizer.pad_token, tokenizer.pad_token_id)
    try:
        if cfg('add_tokens'):
            # ensure there are tokens for separators and common bash tools
            #added = tokenizer.add_tokens([cfg('sep1'), cfg('sep2')])
            added = tokenizer.add_tokens(bashinfo.top_100_utilities)
            print(f"added {added} tokens")
    except Exception:
        pass
    return tokenizer
def main():
    print("PREPROCESSING DATA")
    preprocess()

    print("LOADING TOKENIZER")
    tokenizer = get_tokenizer()
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    print("LOADING MODEL", cfg('model'))
    model = get_model(tokenizer)

    print("LOADING DATA")
    if cfg('encoding') == 'LBL':
        train_dataset = LBLDataset(tokenizer=tokenizer, file_path=filename('train'))
    elif cfg('encoding') == 'blocked':
        train_dataset = BlockedDataset(tokenizer=tokenizer, file_path=filename('train'))
    elif cfg('encoding') == 'text':
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=filename('train'),
                                    block_size=cfg('max_block'))
    elif cfg('encoding').startswith('inter'):
        if cfg('encoding').endswith('LBL'):
            loader = LBLDataset
        elif cfg('encoding').endswith('blocked'):
            loader = BlockedDataset
        d1 = loader(tokenizer=tokenizer, file_path=filename('train'))
        d2 = loader(tokenizer=tokenizer, file_path=filename('dirty'))
        train_dataset = CombinedDataset(d1, d2)
    else:
        raise ValueError("Unknown encoding")

    trainer = get_trainer(train_dataset, data_collator, model)

    def validator(x, y):
        global BEST_metric
        model.save_pretrained(session)
        metric, pred = validate(model, tokenizer, x, y)
        if np.mean(metric) > BEST_metric:
            print("NEW BEST (saving)")
            BEST_metric = np.mean(metric)
            # save predictions and model
            save(session + "metric.txt", str(metric) + "\n")
            save(session + "pred.txt", str(pred) + "\n\n")
        return metric, pred

    trainer.validator = validator
    trainer.val_dataset = get_validation_data()

    # saving configuration
    print("SAVING...")
    session = get_session_path()
    print(session)
    save(session + "conf.txt", repr(cfg()))

    print("STARTING TRAINING...")
    trainer.train()
def decode(tokenizer, v):
    text = tokenizer.decode(v, clean_up_tokenization_spaces=False)
    # remove query at the start
    start = text.find(cfg('sep2')) + len(cfg('sep2'))
    text = text[start:]
    # remove possible junk at the end
    end = text.find("\n")
    if end != -1:
        text = text[:end]
    text = text.strip('\n ')
    return text
def get_model(tokenizer, resume=False):
    if cfg('random_init'):
        # load randomly initialized model instead of pretrained
        model_config = transformers.GPT2Config()
        model = transformers.GPT2LMHeadModel(model_config)
    elif resume:
        # resume from previous best
        model = AutoModelForCausalLM.from_pretrained(cfg('out_path') + cfg('name'))
    else:
        # load pretrained model
        model = AutoModelForCausalLM.from_pretrained(cfg('model'))
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(cfg('device'))
    return model
def predict_batch(model, tokenizer, input_ids, top=1):
    """ Requires all inputs to be of equal length """
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.to(cfg('device'))
    # prediction
    output_sequences = model.generate(input_ids=input_ids,
                                      max_length=cfg('max_gen'),
                                      num_beams=cfg('beams'),
                                      do_sample=False,
                                      num_return_sequences=top,
                                      pad_token_id=tokenizer.eos_token_id)
    print(output_sequences.shape)
    output = decode_batch(tokenizer, output_sequences)
    return output
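
# Illustrative sketch of the equal-length requirement: pad a batch of encoded
# prompts to a common length before calling predict_batch. The prompts below are
# invented, and a real call would encode them with the same query format that
# tokenize_query applies to single prompts.
def example_predict_batch(model, tokenizer):
    prompts = ["count the lines in every .py file", "find files modified today"]
    batch = tokenizer(prompts, padding=True)["input_ids"]
    return predict_batch(model, tokenizer, batch, top=1)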
def preprocess():
    for NAME in ('dev', 'train', 'dirty'):
        try:
            cm = read_data(NAME + '_cm.txt')
            nl = read_data(NAME + '_nl.txt')
        except FileNotFoundError:
            print(f"[WARNING]: {NAME} data not found")
            continue
        al = [context(x) for x in zip(nl, cm)]
        al = "".join(al)
        if not al.endswith(cfg('eos')):
            al += cfg('eos')
        save_data(NAME + ".txt", al)
def context(pair):
    if '' in pair:
        return ''
    enc = f"{encode(pair[0])} {pair[1]}"
    if cfg('encoding').endswith('LBL'):
        return f"{enc} {cfg('eos')}\n"
    else:
        return f"{enc}\n"
def get_trainer(train_dataset, collator, model):
    training_args = TrainingArguments(
        output_dir='output/bash',
        overwrite_output_dir=True,
        do_train=True,
        no_cuda=cfg('device') == 'cpu',
        num_train_epochs=cfg('epochs'),
        per_device_train_batch_size=cfg('batch_size'),
        gradient_accumulation_steps=cfg('grad_acc'),
        logging_steps=5,
        save_steps=0,
        seed=random.randint(0, 2**32 - 1))
    trainer = MTrainer(model=model,
                       args=training_args,
                       data_collator=collator,
                       train_dataset=train_dataset,
                       prediction_loss_only=True)
    return trainer
def predict_single_mod(model, tokenizer, prompt, top=1, device=None, beams=None):
    if device is None:
        device = cfg('device')
    if beams is None:
        beams = cfg('beams')
    prompt = tokenize_query(tokenizer, prompt, device)
    # bit hacky: patch in the modified beam search so generate() also returns scores
    PreTrainedModel._generate_beam_search = mbs._generate_beam_search
    output_sequences, output_scores = model.generate(
        input_ids=prompt,
        max_length=300,  # max_length less relevant as mod does early stopping
        num_beams=max(top, beams),
        do_sample=False,
        num_return_sequences=top,
        pad_token_id=tokenizer.eos_token_id)
    output = decode_batch(tokenizer, output_sequences)
    return output, output_scores
def predict_diverse(model, tokenizer, prompt, temp, top_p, top=1):
    prompt = tokenize_query(tokenizer, prompt)
    output_sequences = model.generate(input_ids=prompt,
                                      max_length=cfg('max_gen'),
                                      temperature=temp,
                                      top_p=top_p,
                                      do_sample=True,
                                      num_return_sequences=top,
                                      pad_token_id=tokenizer.eos_token_id)
    output = decode_batch(tokenizer, output_sequences)
    if len(output) == 1:
        return output[0]
    else:
        return output
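
# Illustrative sketch of diverse sampling: a moderate temperature plus nucleus
# (top-p) filtering. The prompt and the sampling values are arbitrary examples,
# not tuned settings from this project.
def example_predict_diverse(model, tokenizer):
    return predict_diverse(model, tokenizer,
                           "compress the logs directory into an archive",
                           temp=0.8, top_p=0.9, top=5)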
def prepare_onnx_generation(model_path, device):
    load_cfg(model_path)
    print("# Converting model")
    convert_to_onnx(model_path, cfg('model'), 'onnx/model.onnx')
    print("# Loading model into huggingface")
    model = transformers.AutoModelForCausalLM.from_pretrained(model_path)
    model.to(device)
    if device == 'cuda':
        provider = 'CUDAExecutionProvider'
    else:
        raise ValueError("Unknown device")
    print("# Loading model into ONNX")
    onnx_model = create_model_for_provider('onnx/model.onnx', provider)
    print("# Loading tokenizer")
    tokenizer = get_tokenizer()
    return onnx_model, model.lm_head, tokenizer
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, debug=False):
    assert os.path.isfile(file_path)
    block_size = cfg('max_block')
    block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
    self.examples = []
    print(file_path)
    with open(file_path, encoding="utf-8") as f:
        text = f.read()
    tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    eos = tokenizer.eos_token_id
    # pack eos-terminated entries into fixed-size blocks, padding full blocks with eos
    cblock = [tokenized_text.pop(0)]
    while tokenized_text:
        if eos not in tokenized_text:
            break
        t = tokenized_text.index(eos)
        if t > block_size:
            # if entry doesn't fit in a block, throw it away
            tokenized_text = tokenized_text[t + 1:]
            print(f"Throwing away {t} tokens.")
        elif t + len(cblock) <= block_size:
            # entry still fits into the current block
            cblock += tokenized_text[:t + 1]
            tokenized_text = tokenized_text[t + 1:]
        else:
            # current block is full: pad with eos and start a new block
            rest = block_size - len(cblock)
            cblock = cblock + [eos] * rest
            self.examples.append(cblock)
            cblock = [eos]
    # note: any tokens remaining in the final partial block are dropped
    if debug:
        for i in self.examples[:1]:
            print(i)
            print(tokenizer.decode(i))
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, debug=False):
    assert os.path.isfile(file_path)
    with open(file_path, encoding="utf-8") as f:
        lines = f.read().splitlines()
    # join consecutive lines into training examples
    pairs = [x + "\n" + y for x, y in zip(lines[::2], lines[1::2])]
    if debug:
        for i in pairs[:2]:
            print(i)
    batch_encoding = tokenizer(pairs,
                               add_special_tokens=True,
                               padding=True,
                               truncation=True,
                               max_length=cfg('max_line'))
    self.examples = batch_encoding["input_ids"]
    if debug:
        for i in self.examples[:2]:
            print(i)
def __getitem__(self, i) -> torch.Tensor:
    return torch.tensor(self.examples[i], dtype=torch.long, device=cfg('device'))
def _setup_wandb(self):
    if cfg('wandb'):
        super(MTrainer, self)._setup_wandb()
def save_data(filename, content):
    if isinstance(content, list):
        content = "\n".join(content)
    with open(cfg('data_path') + filename, 'w+') as handle:
        handle.write(content)
def read_data(filename):
    with open(cfg('data_path') + filename, 'r') as handle:
        content = handle.readlines()
    return [x.strip() for x in content]
def get_session_path():
    path = cfg('out_path') + datetime.now().strftime("%m-%d_%H:%M:%S") + '/'
    os.mkdir(path)
    return path