def setup_class(self):
    self.use_gpu = torch.cuda.is_available()
    self.test_dir = Path(tempfile.mkdtemp())

    self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyCtrlTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['ctrl']),
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['merges_file']['ctrl']),
        do_lower_case=True
    )
    self.model = CTRLModel.from_pretrained('ctrl', output_attentions=False).eval()
    if self.use_gpu:
        self.model.cuda()

    self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                          'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                          ' all the dolphins had ever done was muck about in the water having a good time. But '
                          'conversely, the dolphins had always believed that they were far more intelligent than '
                          'man—for precisely the same reasons.'] * 1

    # Pre-allocate GPU memory by running one forward pass over the tokenized sentences
    tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
    features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
    features = [self.base_tokenizer.prepare_for_model(input, None, add_special_tokens=True, max_length=128)
                for input in features]
    all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

    if self.use_gpu:
        all_input_ids = all_input_ids.cuda()

    with torch.no_grad():
        _ = self.model(all_input_ids)[0].cpu().numpy()
def test_TFCTRL(self):
    from transformers import CTRLTokenizer, TFCTRLModel

    tokenizer = CTRLTokenizer.from_pretrained('ctrl')
    model = TFCTRLModel.from_pretrained('ctrl')
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=True)
def setup_class(self):
    self.processor = Sst2Processor()
    self.test_dir = Path(tempfile.mkdtemp())
    sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
    contents = requests.get(sst2_url)
    (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
    with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
        zipObj.extractall(self.test_dir)
    self.examples = self.processor.get_train_examples(self.test_dir / 'SST-2')
    self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl', do_lower_case=False, cache_dir=self.test_dir)
    self.rust_tokenizer = PyCtrlTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['ctrl']),
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['merges_file']['ctrl']),
        do_lower_case=False)
def test_tokenization_ctrl(self):
    # Given
    self.base_tokenizer = CTRLTokenizer.from_pretrained(
        'ctrl', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyCtrlTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['ctrl']),
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['merges_file']['ctrl']),
        do_lower_case=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python {baseline["input_ids"]}'
        assert rust.segment_ids == baseline['token_type_ids']
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
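# The assertion message above calls a get_token_diff helper that is not part of this excerpt.
# The sketch below is a hypothetical implementation (assumed, not the original helper): it finds
# the first index where the two id sequences diverge and decodes a small window around it so the
# failure message is readable.
def get_token_diff(self, rust_ids, python_ids):
    min_len = min(len(rust_ids), len(python_ids))
    # First position where the ids differ (or the length of the shorter sequence)
    first_diff = next((i for i in range(min_len) if rust_ids[i] != python_ids[i]), min_len)
    window = slice(max(0, first_diff - 2), first_diff + 3)
    return (self.base_tokenizer.convert_ids_to_tokens(rust_ids[window]),
            self.base_tokenizer.convert_ids_to_tokens(python_ids[window]))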
def setup_base_tokenizer(self):
    self.base_tokenizer = CTRLTokenizer.from_pretrained('ctrl', do_lower_case=True, cache_dir=self.test_dir)
import torch
from transformers import CTRLTokenizer, CTRLModel

tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')

# Batch size 1: unsqueeze adds a leading batch dimension because there is only one sentence.
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
# The first element of the output is the hidden states; the second is the cached past key/values, which is not needed here.
outputs = model(input_ids)
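# Follow-up sanity check (not part of the original snippet): inspect the first output.
# Assuming the standard 'ctrl' checkpoint, whose hidden size is 1280, the shape should be
# (batch_size, sequence_length, 1280).
last_hidden_states = outputs[0]
print(last_hidden_states.shape)  # e.g. torch.Size([1, sequence_length, 1280])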
print("Warning: Please set args.num_gens higher, unable to make equal spread across p values and prompt lengths") exit(0) if args.control_code and args.control_code not in CONTROL_CODES and args.control_code != "Politics": print("Warning: Invalid control code selection, please select a valid control code") exit(0) # Download the sampling file from the Google Cloud bucket filename = 'prompts-{}.json'.format('-'.join([args.dataset, args.split])) file_url = 'gs://roft_datasets/prompts/' + args.dataset + '/' + filename local_file_path = './' + args.dataset + '/' + filename command = "gsutil cp {0} {1}".format(file_url, local_file_path) process = subprocess.Popen(command.split(), stdout=subprocess.PIPE) # Initialize the tokenizer and the model tokenizer = CTRLTokenizer.from_pretrained(args.model_name) model = TFCTRLLMHeadModel.from_pretrained(args.model_name, return_dict=True) # Calculate the 99 percentile length of a prompt with MIN_NUM_SENTS sentences in it with open(local_file_path, 'r') as f: data = json.load(f) tokenized_prompts = [] for prompt in data['prompts']: inputs = tokenizer(' '.join([args.control_code] + prompt[:MIN_NUM_SENTS]), return_tensors="tf") tokenized_prompts.append(float(len(inputs['input_ids'][0])) / float(MIN_NUM_SENTS)) sorted_lens = sorted(tokenized_prompts) ninety_percentile_sent_len = sorted_lens[int(0.90*len(sorted_lens))] nlp = spacy.load("en") set_seed(RANDOM_SEED)