def __init__(self, n_labels, hidden_size, dropout=0.2, label_ignore_idx=0,
             max_seq_length=128, batch_size=32, head_init_range=0.04,
             device='cuda', vocab_size=320):
    super().__init__()
    self.n_labels = n_labels
    self.linear_1 = nn.Linear(hidden_size, hidden_size)
    self.classification_head = nn.Linear(hidden_size, n_labels)
    self.label_ignore_idx = label_ignore_idx

    self.tokenizer = ReformerTokenizer.from_pretrained(
        'google/reformer-crime-and-punishment')
    config = ReformerConfig(
        axial_pos_shape=[batch_size, int(max_seq_length / batch_size)])
    self.model = ReformerModel(config)

    self.dropout = nn.Dropout(dropout)
    self.device = device

    # initializing classification head
    self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)
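# A minimal companion `forward` sketch (not part of the original snippet): it
# assumes the class above is an nn.Module token-classification head and that
# the surrounding module already imports torch and torch.nn as nn. The
# argument names below are illustrative, not the author's API.
def forward(self, input_ids, labels=None, attention_mask=None):
    # run the Reformer backbone and take the last hidden states
    transformer_out = self.model(input_ids, attention_mask=attention_mask)[0]
    out = torch.relu(self.linear_1(transformer_out))
    out = self.dropout(out)
    logits = self.classification_head(out)
    if labels is not None:
        # ignore padded / masked positions when computing the loss
        loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
        loss = loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
        return loss, logits
    return logits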
def prepare_dataset(max_length):
    # get pretrained tokenizer
    tokenizer = ReformerTokenizer.from_pretrained(
        "patrickvonplaten/reformer-crime-and-punish")

    # define our map function to reduce the dataset to one sample
    def flatten_and_tokenize(batch):
        all_input_text = ["".join(batch["line"])]
        input_ids_dict = tokenizer.batch_encode_plus(
            all_input_text,
            pad_to_max_length=True,
            max_length=max_length,
        )

        # duplicate data 8 times to have 8 examples in the dataset
        for key in input_ids_dict.keys():
            input_ids_dict[key] = [8 * [x] for x in input_ids_dict[key]][0]

        return input_ids_dict

    # load the dataset
    dataset = nlp.load_dataset("crime_and_punish", split="train")

    # reduce the dataset
    dataset = dataset.map(flatten_and_tokenize,
                          batched=True,
                          batch_size=-1,
                          remove_columns=["line"])

    # prepare dataset to be in torch format
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    return dataset
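# Hedged usage sketch (not in the original snippet): wrap the prepared dataset
# in a torch DataLoader. The max_length and batch_size values here are
# illustrative placeholders, not values taken from the source.
from torch.utils.data import DataLoader

dataset = prepare_dataset(max_length=2 ** 19)  # illustrative sequence length
loader = DataLoader(dataset, batch_size=2)
for batch in loader:
    print(batch["input_ids"].shape, batch["attention_mask"].shape)
    break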
def test_pretrained_generate_use_cache_equality(self):
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
    tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    model.eval()
    input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
    output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)
    output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)

    output_with_cache = tokenizer.decode(output_ids_with_cache[0])
    output_without_cache = tokenizer.decode(output_ids_without_cache[0])

    self.assertEqual(output_with_cache, output_without_cache)
def test_pretrained_generate_crime_and_punish(self):
    model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
    tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
    model.eval()

    input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
    output_ids = model.generate(
        input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
    )
    output_text = tokenizer.decode(output_ids[0])

    self.assertEqual(
        output_text,
        "A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
    )
def test_tokenization_reformer(self):
    # Given
    self.base_tokenizer = ReformerTokenizer.from_pretrained(
        'google/reformer-crime-and-punishment',
        do_lower_case=False,
        cache_dir=self.test_dir)
    self.rust_tokenizer = PyReformerTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['google/reformer-crime-and-punishment']),
        do_lower_case=True)

    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], \
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n' \
            f'Sentence a: {self.examples[idx].text_a} \n' \
            f'Sentence b: {self.examples[idx].text_b} \n' \
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
            f'Rust: {rust.token_ids} \n' \
            f'Python {baseline["input_ids"]}'
        assert rust.special_tokens_mask == baseline['special_tokens_mask']
def test_full_tokenizer(self):
    tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)

    tokens = tokenizer.tokenize("This is a test")
    self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(tokens),
        [285, 46, 10, 170, 382],
    )

    tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(
        tokens,
        [
            SPIECE_UNDERLINE + "I",
            SPIECE_UNDERLINE + "was",
            SPIECE_UNDERLINE + "b",
            "or",
            "n",
            SPIECE_UNDERLINE + "in",
            SPIECE_UNDERLINE + "",
            "9",
            "2",
            "0",
            "0",
            "0",
            ",",
            SPIECE_UNDERLINE + "and",
            SPIECE_UNDERLINE + "this",
            SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f",
            "al",
            "s",
            "é",
            ".",
        ],
    )
    ids = tokenizer.convert_tokens_to_ids(tokens)
    self.assertListEqual(
        ids,
        [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
    )

    back_tokens = tokenizer.convert_ids_to_tokens(ids)
    self.assertListEqual(
        back_tokens,
        [
            SPIECE_UNDERLINE + "I",
            SPIECE_UNDERLINE + "was",
            SPIECE_UNDERLINE + "b",
            "or",
            "n",
            SPIECE_UNDERLINE + "in",
            SPIECE_UNDERLINE + "",
            "<unk>",
            "2",
            "0",
            "0",
            "0",
            ",",
            SPIECE_UNDERLINE + "and",
            SPIECE_UNDERLINE + "this",
            SPIECE_UNDERLINE + "is",
            SPIECE_UNDERLINE + "f",
            "al",
            "s",
            "<unk>",
            ".",
        ],
    )
def setUp(self):
    super().setUp()

    tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
    tokenizer.save_pretrained(self.tmpdirname)
def big_tokenizer(self):
    return ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # parse all arguments from the JSON file passed on the command line
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    tokenizer = ReformerTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = ReformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples, data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples, data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
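# Hedged usage note (assumption, not part of the original file): the argument
# parsing above accepts either a single JSON file or ordinary CLI flags, e.g.
#
#   python run_reformer_qa.py args.json
#   python run_reformer_qa.py --model_name_or_path <checkpoint> --output_dir ./out --do_train --do_eval
#
# The script name and flag values are illustrative. A standard entry point
# would look like this:
if __name__ == "__main__":
    main()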
import torch
from sklearn.manifold import TSNE
from transformers import ReformerTokenizer, ReformerModel

MODEL_MAX_LENGTH = 4608

tokenizer_config_path = "protein_reformer/spiece.model"
tokenizer = ReformerTokenizer(vocab_file=tokenizer_config_path,
                              do_lower_case=True,
                              model_max_length=MODEL_MAX_LENGTH)

model_checkpoint = 'output/checkpoint-6500/'
model = ReformerModel.from_pretrained(model_checkpoint)

sequence_file_path = "data/yeast/yeast.txt"
with open(sequence_file_path, "r") as f:
    sequence_txt = f.readlines()

input_sequence_list = [
    tokenizer(sequence.strip(), truncation=True,
              return_tensors='pt')['input_ids'].cuda()
    for sequence in sequence_txt
]

model.cuda()

# mean-pool the last hidden states into one vector per protein sequence
protein_vectors_list = [
    torch.mean(model(inp)[1][-1], dim=1) for inp in input_sequence_list
]
protein_vectors = torch.cat(protein_vectors_list, dim=0)

# the original snippet breaks off mid-call here; passing the pooled vectors
# (detached and moved to CPU) is an assumption about the intended argument
protein_vectors_tsne = TSNE(n_components=2).fit_transform(
    protein_vectors.detach().cpu().numpy())
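# Hedged follow-up sketch (not in the original snippet): visualise the 2-D
# t-SNE projection computed above with matplotlib. The output path is an
# illustrative choice.
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 6))
plt.scatter(protein_vectors_tsne[:, 0], protein_vectors_tsne[:, 1], s=5)
plt.title("t-SNE of Reformer protein embeddings")
plt.savefig("protein_tsne.png")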
# from transformers import pipeline
# nlp = pipeline("sentiment-analysis")
# result = nlp("I hate you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
# result = nlp("I love you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

from transformers import ReformerTokenizer, ReformerModel
import torch

tokenizer = ReformerTokenizer.from_pretrained(
    'google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment',
                                      return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)
from transformers import ReformerConfig, ReformerModelWithLMHead, ReformerTokenizer, EncoderDecoderConfig, EncoderDecoderModel
from torch.utils.data import DataLoader, Dataset

NUM_BATCHES = None
BATCH_SIZE = 20
LEARNING_RATE = 0.001  # 1e-4
VALIDATE_EVERY = 10
SEQ_LEN = 4608

# spm.SentencePieceTrainer.Train("--input=./data/tokenizer_training/AAresiduals.txt \
#                                 --vocab_size=28 \
#                                 --model_prefix=sequence_tokenizer \
#                                 --model_type=char \
#                                 --character_coverage=1.0")

tokenizer = ReformerTokenizer(vocab_file="sequence_tokenizer.model",
                              do_lower_case=False,
                              model_max_length=SEQ_LEN)
tokenizer.max_model_input_sizes = SEQ_LEN

# def split_file(file, out1, out2, percentage=0.75, isShuffle=True, seed=42):
#     random.seed(seed)
#     with open(file, 'r', encoding="utf-8") as fin, open(out1, 'w') as foutBig, open(out2, 'w') as foutSmall:
#         nLines = sum(1 for line in fin)
#         fin.seek(0)
#         nTrain = int(nLines * percentage)
#         nValid = nLines - nTrain
#         i = 0
#         for line in fin:
#             r = random.random() if isShuffle else 0  # so that always evaluated to true when not isShuffle
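# Hedged sketch (assumption, not taken from the original file): a minimal
# Dataset pairing the tokenizer configured above with a text file containing
# one sequence per line. The class name, file-handling details, and padding
# strategy are illustrative choices.
class SequenceDataset(Dataset):
    def __init__(self, path, tokenizer, seq_len):
        with open(path, encoding="utf-8") as f:
            self.lines = [line.strip() for line in f if line.strip()]
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        if tokenizer.pad_token is None:
            # illustrative fallback so fixed-length padding works
            tokenizer.pad_token = tokenizer.eos_token

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        # tokenize one sequence, padded/truncated to a fixed length
        encoded = self.tokenizer(self.lines[idx],
                                 truncation=True,
                                 padding="max_length",
                                 max_length=self.seq_len,
                                 return_tensors="pt")
        return {key: value.squeeze(0) for key, value in encoded.items()}

# train_loader = DataLoader(SequenceDataset("data/train.txt", tokenizer, SEQ_LEN),
#                           batch_size=BATCH_SIZE, shuffle=True)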
from transformers import ReformerConfig, ReformerTokenizer, ReformerModel
import sentencepiece as spm
import os

assert os.path.exists('protein_reformer/training_vocab.txt'), \
    f'build a lower case amino acid txt file to train the tokenizer. content should be: {"ARNDCQEGHILKMFPSTWYVOUBZX".lower()}'

MODEL_MAX_LENGTH = 4608

# train a character-level SentencePiece model on the amino-acid vocabulary
spm.SentencePieceTrainer.Train(
    "--input=protein_reformer/training_vocab.txt --model_prefix=spiece --vocab_size=30 --pad_id=29 --character_coverage=1.0"
)
os.system("mv spiece.model spiece.vocab protein_reformer")

tokenizer = ReformerTokenizer(vocab_file="protein_reformer/spiece.model",
                              do_lower_case=True,
                              model_max_length=MODEL_MAX_LENGTH)
tokenizer.save_pretrained("protein_reformer")

configuration = ReformerConfig.from_pretrained(
    "google/reformer-crime-and-punishment")
configuration.axial_pos_shape = (64, 72)  # 64 * 72 == MODEL_MAX_LENGTH
configuration.max_position_embeddings = MODEL_MAX_LENGTH
configuration.vocab_size = tokenizer.vocab_size
configuration.pad_token_id = tokenizer.pad_token_id
# configuration.attn_layers = ["local", "lsh", "local", "lsh"]
configuration.output_hidden_states = True
configuration.save_pretrained('protein_reformer/')
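# Hedged follow-up sketch (not in the original snippet): reload the artifacts
# saved above and instantiate a freshly initialised Reformer from the adapted
# configuration. ReformerModelWithLMHead is used here as an illustrative
# choice of head for pre-training.
from transformers import ReformerModelWithLMHead

loaded_config = ReformerConfig.from_pretrained("protein_reformer/")
loaded_tokenizer = ReformerTokenizer.from_pretrained("protein_reformer")
protein_model = ReformerModelWithLMHead(loaded_config)  # randomly initialised weights

print(loaded_config.axial_pos_shape, loaded_tokenizer.model_max_length)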