import os

import torch
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast


class KoBARTConditionalGeneration(Base):  # Base: shared Lightning module defined elsewhere
    def __init__(self, hparams, **kwargs):
        super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
        self.model = BartForConditionalGeneration.from_pretrained(
            self.hparams.model_path)
        self.model.train()
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        # Load the fast tokenizer from the serialized tokenizers file and
        # register the special tokens KoBART expects.
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(self.hparams.tokenizer_path, 'model.json'),
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>')

    def forward(self, inputs):
        return self.model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            decoder_input_ids=inputs['decoder_input_ids'],
            decoder_attention_mask=inputs['decoder_attention_mask'],
            labels=inputs['labels'],
            return_dict=True)

    def training_step(self, batch, batch_idx):
        outs = self(batch)
        loss = outs.loss
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outs = self(batch)
        return outs['loss']

    def validation_epoch_end(self, outputs):
        # `outputs` is the list of per-batch losses returned by validation_step.
        self.log('val_loss', torch.stack(outputs).mean(), prog_bar=True)

    def chat(self, text):
        input_ids = ([self.tokenizer.bos_token_id]
                     + self.tokenizer.encode(text)
                     + [self.tokenizer.eos_token_id])
        res_ids = self.model.generate(
            torch.tensor([input_ids]),
            max_length=self.hparams.max_seq_len,
            num_beams=5,
            eos_token_id=self.tokenizer.eos_token_id,
            bad_words_ids=[[self.tokenizer.unk_token_id]])
        answer = self.tokenizer.batch_decode(res_ids.tolist())[0]
        return answer.replace('<s>', '').replace('</s>', '').replace('<usr>', '')
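
# Minimal usage sketch (assumptions: `Base` accepts a plain namespace of
# hyperparameters, and the checkpoint/tokenizer paths below are placeholders
# rather than the project's real ones):
from argparse import Namespace

hparams = Namespace(model_path='kobart_checkpoint',
                    tokenizer_path='kobart_tokenizer',
                    max_seq_len=128)
chatbot = KoBARTConditionalGeneration(hparams)
chatbot.model.eval()
print(chatbot.chat('안녕하세요'))  # "Hello" in Korean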
import os

import h5py
from tqdm import tqdm

tokenizer.add_special_tokens({
    'pad_token': '[PAD]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'unk_token': '[UNK]',
    'mask_token': '[MASK]',
    'bos_token': '[BOS]',
    'eos_token': '[EOS]'
})
print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-" * 50)
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

goals = "has_anything(robot),on_surface(blue_block, tabletop),stacked(blue_block, red_block),on_surface(yellow_block, tabletop)"
values = [False, True, True, False]
input = process_goals(goals, values, return_string=True)
print("-" * 50)
print("INPUT: ", input)
encoded = tokenizer.encode(input, add_special_tokens=add_special_tokens)
# optionally: padding=True, truncation=True, return_tensors="pt"
print(encoded)
print("DECODED: ", tokenizer.decode(encoded, skip_special_tokens=False))


# Unit testing ;)
def compare(debug=False):
    # Iterate through all inputs.
    diffs = 0
    total = 0
    # Open files one by one.
    for filename in tqdm(sierra_files):
        # Load the file.
        h5 = h5py.File(os.path.join(sierra_path, filename), 'r')
        # print("Symbolic Goal: ", h5["sym_goal"][()], '\n')
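
# Side note, a sketch: add_special_tokens() above grows the vocabulary, so any
# pretrained model later paired with this tokenizer needs its embedding matrix
# resized to match. The checkpoint here is illustrative, not this script's:
from transformers import AutoModel
model = AutoModel.from_pretrained("bert-base-uncased")
model.resize_token_embeddings(len(tokenizer))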
import csv

# tokenizer = PreTrainedTokenizerFast(tokenizer_object=loaded_tokenizer)
# Load from the serialized tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
# tokenizer.add_special_tokens({'pad_token': '[PAD]'}) <- this works!
# tokenizer.pad_token
print(f"\nFinal tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-" * 50)
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

input = "has_anything(robot),on_surface(blue_block, tabletop),stacked(blue_block, red_block),on_surface(yellow_block, tabletop)"
print("INPUT: ", input)
encoded = tokenizer.encode(input)
# optionally: padding=True, truncation=True, return_tensors="pt"
print(encoded)
print("DECODED: ", tokenizer.decode(encoded, skip_special_tokens=True))


# Unit testing ;)
def compare(filename, debug=False):
    # Iterate through all inputs.
    diffs = 0
    total = 0
    with open(filename, "r") as f:
        csvreader = csv.reader(f, delimiter=';')
        for row in csvreader:
            for input in row:
                # Skip empty inputs.
                if not input:
                    continue
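
# For context, a sketch of how a word-level tokenizer file like the one loaded
# above could be produced with the `tokenizers` library. The training corpus
# and special-token inventory here are assumptions, not the project's actual ones:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

tok = Tokenizer(WordLevel(unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()
trainer = WordLevelTrainer(special_tokens=["[PAD]", "[UNK]"])
tok.train_from_iterator(
    ["has_anything(robot) on_surface(blue_block, tabletop)"], trainer)
tok.save("decoder_tokenizer.json")  # path then passed as decoder_tokenizer_path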
print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n") print("-" * 50) for k, v in tokenizer.get_vocab().items(): print(k, ": ", v) print("-" * 50) # Now, let's use it: input = "approach_obj(yellow_block),grasp_obj_on_red_block(yellow_block),lift_obj_from_red_block(yellow_block),place_on_center(yellow_block),approach_obj(red_block),grasp_obj(red_block),lift_obj_from_tabletop(red_block),align_red_block_with(blue_block),stack_red_block_on(blue_block),approach_obj(green_block),grasp_obj(green_block),lift_obj_from_far(green_block),place_on_center(green_block),approach_obj(yellow_block),grasp_obj(yellow_block),lift_obj_from_tabletop(yellow_block),align_yellow_block_with(red_block),stack_yellow_block_on(red_block),go_home(robot)" num_actions = len(input.split("),")) print(f"Input (number of actions: {num_actions}): {input}\n") input, _ = process_plan(input, cfg.skip_actions, return_string=True) print(f"Processed input: {input}\n") encoded = tokenizer.encode(input, add_special_tokens=False) print(f"Tokenized input (number of plan tokens {len(encoded)}): {encoded}\n") print("Detokenized: ", tokenizer.decode(encoded, skip_special_tokens=False)) # Unit testing ;) def compare(debug=False): min_plan_length = 100000 avg_plan_length = 0 max_plan_length = 0 # Iterate through all inputs. diffs = 0 total = 0 # Open files one by one.
import os

import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, PreTrainedTokenizerFast


class PolishRoberta(nn.Module):
    def __init__(self,
                 pretrained_path,
                 n_labels,
                 hidden_size=768,
                 dropout_p=0.2,
                 label_ignore_idx=0,
                 head_init_range=0.04,
                 device='cuda'):
        super().__init__()
        self.n_labels = n_labels
        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.classification_head = nn.Linear(hidden_size, n_labels)
        self.label_ignore_idx = label_ignore_idx
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(pretrained_path, "tokenizer.json"))
        self.model = AutoModel.from_pretrained(pretrained_path)
        self.dropout = nn.Dropout(dropout_p)
        self.device = device
        # Initialize the classification head.
        self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)

    def forward(self, inputs_ids, labels, labels_mask, valid_mask):
        '''
        Computes a forward pass through the sequence-tagging model.

        Args:
            inputs_ids: tensor of size (bsz, max_seq_len). padding idx = 1
            labels: tensor of size (bsz, max_seq_len)
            labels_mask and valid_mask: indicate where loss gradients should be
                propagated and where labels should be ignored

        Returns:
            logits: unnormalized model outputs.
            loss: cross-entropy loss between labels and logits
        '''
        self.model.train()
        transformer_out = self.model(inputs_ids, return_dict=True)[0]
        out_1 = F.relu(self.linear_1(transformer_out))
        out_1 = self.dropout(out_1)
        logits = self.classification_head(out_1)
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
            # Only keep active parts of the loss.
            if labels_mask is not None:
                active_loss = valid_mask.view(-1) == 1
                active_logits = logits.view(-1, self.n_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
            return loss
        else:
            return logits

    def encode_word(self, s):
        """
        Takes a string and returns a list of token ids.
        """
        tensor_ids = self.tokenizer.encode(s)
        # Remove <s> and </s> ids.
        return tensor_ids[1:-1]
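
# Minimal usage sketch (assumption: `pretrained_path` points at a local
# RoBERTa-style checkpoint directory that also contains tokenizer.json; the
# path and label count below are placeholders):
tagger = PolishRoberta(pretrained_path="polish-roberta-base", n_labels=9)
token_ids = tagger.encode_word("Warszawa")  # <s>/</s> already stripped
print(token_ids)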
import json

from transformers import PreTrainedTokenizerFast

import utils

# This will tokenize and add special tokens.
# TODO
ast_tok = "<ast>"
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer/code-tokenizer.json")

with open("output/new_ast_raw.json", "r") as fin, \
        open("output/converted_train.txt", "w") as fout:
    for line in utils.file_tqdm(fin):
        json_line = json.loads(line)
        json_tokens = json_line["nodes"]
        is_ext = json_line["ext"]
        # Non-external ASTs get the <ast> marker prepended before encoding.
        if not is_ext:
            encoded = tokenizer.encode(ast_tok + " " + " ".join(json_tokens))
        else:
            encoded = tokenizer.encode(" ".join(json_tokens))
        fout.write(" ".join(str(e) for e in encoded) + " \n")
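
# Quick round-trip check, a sketch: read the first converted line back and
# detokenize it (file name matches the script above; this assumes the script
# has already been run once):
with open("output/converted_train.txt") as f:
    ids = [int(t) for t in f.readline().split()]
print(tokenizer.decode(ids))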