def document_encoder(title: str, doc_sents: list, tokenizer: LongformerTokenizer):
    title_res = SPECIAL_TITLE_START + title + SPECIAL_TITLE_END
    title_tokens = tokenizer.tokenize(text=title_res)
    title_encode_ids = tokenizer.encode(text=title_tokens, add_special_tokens=False)
    assert len(title_tokens) == len(title_encode_ids)
    title_len = len(title_encode_ids)

    encode_id_lens = [title_len]
    doc_encode_id_list = [title_encode_ids]
    for sent_idx, sent_text in enumerate(doc_sents):
        sent_text_res = sent_text + SPECIAL_SENTENCE_TOKEN
        sent_tokens = tokenizer.tokenize(text=sent_text_res)
        sent_encode_ids = tokenizer.encode(text=sent_tokens, add_special_tokens=False)
        assert len(sent_tokens) == len(sent_encode_ids)
        doc_encode_id_list.append(sent_encode_ids)
        encode_id_lens.append(len(sent_encode_ids))

    # Cumulative lengths give the (start, end) token span of each sentence
    # within the concatenated document (the title occupies the first span).
    doc_sent_len_cum_list = list(itertools.accumulate(encode_id_lens, operator.add))
    sent_start_end_pair = [(doc_sent_len_cum_list[i], doc_sent_len_cum_list[i + 1] - 1)
                           for i in range(len(encode_id_lens) - 1)]
    doc_encode_ids = list(itertools.chain.from_iterable(doc_encode_id_list))
    assert len(doc_encode_ids) == doc_sent_len_cum_list[-1]
    assert len(sent_start_end_pair) == len(doc_sents)
    return doc_encode_ids, sent_start_end_pair, len(doc_encode_ids), title_len
def query_encoder(query: str, tokenizer: LongformerTokenizer):
    query_res = CLS_TOKEN + SPECIAL_QUERY_START + query + SPECIAL_QUERY_END
    query_tokens = tokenizer.tokenize(text=query_res)
    query_encode_ids = tokenizer.encode(text=query_tokens, add_special_tokens=False)
    assert len(query_tokens) == len(query_encode_ids)
    query_len = len(query_encode_ids)
    return query_encode_ids, query_len
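# Usage sketch (added for illustration): one plausible way the two encoders
# above combine into a single Longformer input. It assumes a tokenizer with
# the matching special tokens already registered (see
# get_hotpotqa_longformer_tokenizer below); the query and sentences here are
# made-up examples.
tokenizer = get_hotpotqa_longformer_tokenizer()
query_ids, query_len = query_encoder(query="Who wrote Hamlet?", tokenizer=tokenizer)
doc_ids, sent_spans, doc_len, title_len = document_encoder(
    title="Hamlet",
    doc_sents=["Hamlet is a tragedy.", "It was written by Shakespeare."],
    tokenizer=tokenizer)
input_ids = query_ids + doc_ids
# Sentence spans were computed relative to the document, so shift by query_len.
sent_spans = [(s + query_len, e + query_len) for s, e in sent_spans]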
def load_transformer(model_type):
    if model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=1)
    elif model_type == "bert_x12":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=1)
    elif model_type == "bert_x24":
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=1)
    elif model_type == "albert_v2_x12":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1)
    elif model_type == "longformer_x12":
        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-base-4096", num_labels=1)
    elif model_type == "longformer_x24":
        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-large-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-large-4096", num_labels=1)
    else:
        raise ValueError(model_type + " was invalid")
    return model, tokenizer
def answer_span_token_finder(norm_answer: str, sentence: str, tokenizer: LongformerTokenizer):
    answer_encode_ids = tokenizer.encode(text=norm_answer, add_special_tokens=False)
    sentence_encode_ids = tokenizer.encode(text=sentence, add_special_tokens=False)
    idx = sub_list_finder(target=answer_encode_ids, source=sentence_encode_ids)
    flag = idx >= 0
    return flag, answer_encode_ids, sentence_encode_ids
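# sub_list_finder is not shown in this snippet. A plausible reconstruction,
# assuming it returns the start index of `target` as a contiguous sub-list of
# `source`, or -1 when absent (which is how answer_span_token_finder uses it):
def sub_list_finder(target: list, source: list) -> int:
    # Hypothetical helper: naive O(len(source) * len(target)) scan.
    if not target or len(target) > len(source):
        return -1
    for i in range(len(source) - len(target) + 1):
        if source[i:i + len(target)] == target:
            return i
    return -1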
def get_hotpotqa_longformer_tokenizer(model_name=PRE_TAINED_LONFORMER_BASE, do_lower_case=True):
    tokenizer = LongformerTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    special_tokens_dict = {'additional_special_tokens': ['<q>', '</q>', '<d>', '<p>']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('Number of added tokens = {}: {}'.format(num_added_toks, special_tokens_dict))
    print('*' * 75)
    return tokenizer
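# One caveat worth flagging: whenever add_special_tokens grows the vocabulary,
# any model paired with this tokenizer needs its embedding matrix resized to
# match, or the new <q>/</q>/<d>/<p> ids fall outside the embedding table.
# A minimal sketch (model name assumed):
from transformers import LongformerModel

tokenizer = get_hotpotqa_longformer_tokenizer(model_name='allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model.resize_token_embeddings(len(tokenizer))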
def __init__(self, config_path):
    config = configparser.ConfigParser()
    config.read(config_path)

    self.save_dir = Path(config.get("general", "save_dir"))
    if not self.save_dir.exists():
        self.save_dir.mkdir(parents=True)
    self.clf_th = config.getfloat("general", "clf_th")

    self.mlp_model_path = config.get("model", "mlp")
    assert Path(self.mlp_model_path).exists()

    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    bert_config_path = config.get("bert", "config_path")
    assert Path(bert_config_path).exists()
    self.bert_config = LongformerConfig.from_json_file(bert_config_path)
    self.max_seq_length = self.bert_config.max_position_embeddings - 2
    self.bert_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    # bert_tokenizer_path = config.get("bert", "tokenizer_path")
    # assert Path(bert_tokenizer_path).exists()
    # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)

    bert_model_path = config.get("bert", "model_path")
    assert Path(bert_model_path).exists()
    self.bert_model = LongformerModel.from_pretrained(bert_model_path, config=self.bert_config)
    self.bert_model.to(self.device)
    self.bert_model.eval()

    gold_dir = Path(config.get("data", "gold_dir"))
    assert gold_dir.exists()
    self.gold_dataset = ConllDataset(gold_dir)
    target_dir = Path(config.get("data", "target_dir"))
    assert target_dir.exists()
    self.target_dataset = ConllDataset(target_dir)
def __init__(self, config):
    super(LongformerForBinaryClassification, self).__init__()
    self.config = config
    # Hub id needs the 'allenai/' org prefix; bare 'longformer-base-4096' fails to resolve.
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.longformer = LongformerModel(config)
    self.classifier = nn.Linear(config.hidden_size, 1)
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Initialize it if not already initialized.

    Args:
        lm (string): the name of the language model
            (bert, distilbert, albert, roberta, xlnet, or longformer)

    Returns:
        BertTokenizer or DistilBertTokenizer or AlbertTokenizer or
        RobertaTokenizer or XLNetTokenizer or LongformerTokenizer
    """
    global tokenizer
    if tokenizer is None:
        if lm == 'bert':
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif lm == 'distilbert':
            from transformers import DistilBertTokenizer
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        elif lm == 'albert':
            from transformers import AlbertTokenizer
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        elif lm == 'roberta':
            from transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif lm == 'xlnet':
            from transformers import XLNetTokenizer
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        elif lm == 'longformer':
            from transformers import LongformerTokenizer
            tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        else:
            # Previously an unknown name silently returned None.
            raise ValueError(f"unknown language model: {lm}")
    return tokenizer
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(
        os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')
    text_model = LongformerModel.from_pretrained("allenai/longformer-base-4096").to("cuda")
    text_model.eval()
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    # pool = Pool(processes=48)
    # processed_text = list(tqdm(pool.map(process_text, dataset), total=len(dataset)))
    # pool.close()
    batch_size = 8
    # np.float is removed in recent NumPy; np.float64 keeps the original behavior.
    all_embeddings_avg = np.zeros((len(dataset), 768), dtype=np.float64)
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)),
                         total=len(dataset) / batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk, return_tensors="pt",
                                       truncation=True, padding="max_length")
            model_out = text_model(**(tokenized_text.to("cuda")))
            all_embeddings_avg[i * batch_size:i * batch_size + len(chunk), :] = torch.mean(
                model_out[0], dim=1).cpu().numpy()
    data_df = pd.DataFrame(zip(ids, images, all_embeddings_avg))
    data_df.to_pickle(
        os.path.join(dataset_directory,
                     f"longformer_{jsonlines_filename.split('.')[0]}.pkl"))
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(
        os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    batch_size = 512
    # np.float is removed in recent NumPy; np.float64 keeps the original behavior
    # (the tokens are cast to integers below before saving).
    all_tokens = np.zeros((len(dataset), 2, 4096), dtype=np.float64)
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)),
                         total=len(dataset) / batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk, return_tensors="pt",
                                       truncation=True, padding="max_length")
            all_tokens[i * batch_size:i * batch_size + len(chunk), 0, :] = \
                tokenized_text["input_ids"].numpy()
            all_tokens[i * batch_size:i * batch_size + len(chunk), 1, :] = \
                tokenized_text["attention_mask"].numpy()
    data_df = pd.DataFrame(zip(ids, images, all_tokens.astype(np.int_)))
    data_df.to_pickle(
        os.path.join(dataset_directory,
                     f"longformer_tokens_{jsonlines_filename.split('.')[0]}.pkl"))
def __init__(self):
    self.model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.led_tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
    self.led_model = LEDModel.from_pretrained('allenai/led-base-16384')
def _test_TFLongformer(self, size, large=False):
    from transformers import LongformerTokenizer, TFLongformerModel
    tokenizer = LongformerTokenizer.from_pretrained(size)
    model = TFLongformerModel.from_pretrained(size)
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict, max_length=512)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
def __init__(self, args):
    super().__init__()
    self.args = args
    self.save_hyperparameters(args)
    self.tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    self.model = LongformerForSequenceClassification.from_pretrained(
        "allenai/longformer-base-4096")
def __init__(self):
    self.train = None
    self.test = None
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.processor = squad.SquadV2Processor()
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def build_model(self):
    super().build_model()
    self.tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    self.model = LongformerForSequenceClassification.from_pretrained(
        "allenai/longformer-base-4096",
        num_labels=self.num_categories,
    )
    self.model.to("cuda")
def __init__(self, data_path):
    super(MafiascumDataset, self).__init__()
    # Hub id needs the 'allenai/' org prefix; bare 'longformer-base-4096' fails to resolve.
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    config = LongformerConfig()

    df = pd.read_pickle(data_path, compression="gzip")
    grouped_df = df.groupby(["author", "game_id"])

    labels = []
    inputs = []
    attention_masks = []
    for key, item in grouped_df:
        posts = grouped_df.get_group(key).content.values  # All the posts made by a user in a game
        label = grouped_df.get_group(key).scum.values[0]  # Boolean
        label = 1 if label else 0  # Int

        # Only consider games where the user has spoken at least once.
        # (The original placed this check inside the loop over posts, where it
        # was always true; hoisting it here matches the stated intent.)
        if len(posts) == 0:
            continue

        num_sentences_in_game = 0
        all_sentences_in_game = []
        all_attention_masks_in_game = []
        for post in posts:
            sentences = post.split('\n\n')
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 0:
                    input_ids = tokenizer.encode(sentence, max_length=MAX_SENTENCE_LEN,
                                                 truncation=True)
                    # 1 for local attention, 2 for global attention, 0 for none (padding)
                    # (for our task, mark <s> start of sentence with 2 to have global attention)
                    attention_mask = [1 for _ in range(len(input_ids))]
                    attention_mask[0] = 2

                    all_sentences_in_game += input_ids
                    all_attention_masks_in_game += attention_mask
                    num_sentences_in_game += 1

        # If the player said fewer than 10 sentences in a game, ignore this sample.
        if num_sentences_in_game < 10:
            continue

        input_ids = torch.LongTensor(all_sentences_in_game[:MAX_DOC_LEN])
        attention_mask = torch.LongTensor(all_attention_masks_in_game[:MAX_DOC_LEN])
        label = torch.FloatTensor([label])

        inputs.append(input_ids)
        attention_masks.append(attention_mask)
        labels.append(label)

    self.inputs = inputs
    self.attention_masks = attention_masks
    self.labels = labels
def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=100, never_split=None):
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word
    self.never_split = never_split
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
def __init__(self, hparams):
    # super().__init__()
    super(TransformerMarco, self).__init__()
    self.hparams = hparams
    self.tokenizer = LongformerTokenizer.from_pretrained(hparams.model_name)
    self.model = LongformerForSequenceClassification.from_pretrained(hparams.model_name)
    self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None
    self.DatasetClass = MarcoDataset
def summarise_longformer(long_text_to_summarise):
    model_to_load = "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
    tok_to_load = "allenai/longformer-base-4096"
    tokeniser = LongformerTokenizer.from_pretrained(tok_to_load)
    model = EncoderDecoderModel.from_pretrained(model_to_load)
    input_ids = tokeniser(long_text_to_summarise,
                          return_tensors="pt").input_ids  # .to(device).input_ids
    outputs = model.generate(input_ids)  # .to(device)
    summary = tokeniser.decode(outputs[0], skip_special_tokens=True)
    return summary
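# Usage sketch (added for illustration; the input file name is hypothetical,
# the checkpoint download is large, and summary length follows the model's
# default generate() settings):
with open("article.txt") as f:
    long_article = f.read()
print(summarise_longformer(long_article))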
def make_dataset(self, data_root: str) -> None:
    """Make Dataset

    Make dataset from json files and save it as csv.

    Args:
        data_root: Root directory for document json files.
    """
    log.info("Making dataset...")
    json_paths = glob.glob(f"{data_root}/**/*.json", recursive=True)

    # nltk settings
    nltk.download('punkt')
    stemmer = PorterStemmer()
    cv = CountVectorizer()
    texts = []  # A list of tokenized texts separated by half-width characters

    # Longformer
    feature_matrix = []
    device = torch.device('cuda')
    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    model = LongformerModel.from_pretrained('allenai/longformer-base-4096').to(device)

    for json_path in tqdm(json_paths):
        with open(json_path) as f:
            json_obj = json.load(f)
        body = json_obj["body"]
        soup = BeautifulSoup(body, "html.parser")
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text()
        with torch.no_grad():
            # Truncate to the model's 4096-token window so very long documents
            # don't overflow the position embeddings.
            input_ids = torch.tensor(
                tokenizer.encode(text, truncation=True,
                                 max_length=4096)).unsqueeze(0).to(device)
            attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                                        device=input_ids.device)
            global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long,
                                                device=input_ids.device)
            outputs = model(input_ids, attention_mask=attention_mask,
                            global_attention_mask=global_attention_mask)
            vec = outputs.last_hidden_state[0].cpu().detach().clone().numpy().mean(0)
        # np.append(feature_matrix, vec)
        feature_matrix.append(list(vec))
        # log.info(f"Done: {len(feature_matrix)}")

    feature_matrix = np.array(feature_matrix)
    log.info(f"Longformer: {feature_matrix.shape}")

    # Calculate distance matrix
    dist_mat = squareform(pdist(feature_matrix, metric='cosine'))
    df = pd.DataFrame(dist_mat)
    df.to_csv(join(self.cache_path, "json_document_longformer.csv"), index=False)
    log.info("Successfully made dataset.")
def set_tokenizer(self, tokenizer="roberta"):
    if tokenizer == "longformer":
        from transformers import LongformerTokenizer
        self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        self.tokenizer_type = tokenizer
    elif tokenizer == "roberta":
        from transformers import RobertaTokenizer
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.tokenizer_type = tokenizer
    elif tokenizer == "bert":
        from transformers import BertTokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer_type = tokenizer
    else:
        print("Error: the allowed tokenizers are 'longformer', 'roberta', and 'bert'")
def __init__(self, params):
    super(LongEntityLinker, self).__init__()
    self.params = params
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.n_gpu = torch.cuda.device_count()
    self.use_golden_tags = params['use_golden_tags']
    # init tokenizer
    if params['use_longformer']:
        self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    else:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.pad_id = -1
    # init model
    self.model = LongEntityLinkerModule(self.params)
    self.model = self.model.to(self.device)
def load(self, k):
    while self.m.get(k, None) == -1:
        time.sleep(1)  # loading, wait till ready
    if self.m.get(k, None) is not None:
        return self.m[k]  # it's already loaded
    self.m[k] = -1  # tell others it's loading, wait
    m = None
    if k == 'sentence-encode':
        m = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
        # word_embedding_model = models.Transformer('allenai/longformer-base-4096')
        # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        # m = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    elif k == 'sentiment-analysis':
        tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
        model = AutoModelWithLMHead.from_pretrained(
            "mrm8488/t5-base-finetuned-emotion").to("cuda")
        # TODO we sure it's not ForSequenceClassification?
        # https://huggingface.co/mrm8488/t5-base-finetuned-emotion
        m = (tokenizer, model, 512)
    elif k == 'summarization':
        # Not using pipelines because they can't handle inputs > max_tokens
        # https://github.com/huggingface/transformers/issues/4501
        # https://github.com/huggingface/transformers/issues/4224
        max_tokens = 1024  # 4096
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn').to("cuda")
        # model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16").to("cuda")
        # tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        m = (tokenizer, model, max_tokens)
    elif k == 'question-answering':
        tokenizer = LongformerTokenizer.from_pretrained(
            "allenai/longformer-large-4096-finetuned-triviaqa")
        model = LongformerForQuestionAnswering.from_pretrained(
            "allenai/longformer-large-4096-finetuned-triviaqa",
            return_dict=True).to("cuda")
        # tokenizer = AutoTokenizer.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2")
        # model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2", return_dict=True).to("cuda")
        m = (tokenizer, model, 4096)
    self.m[k] = m
    return m
def __init__(self, params):
    super().__init__()
    if 'dropout' in params:
        self.dropout = nn.Dropout(p=params['dropout'])
    else:
        self.dropout = None
    # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False, do_basic_tokenize=False)
    # self.bert = BertModel.from_pretrained("bert-base-uncased")
    self.max_length = params['max_length'] if 'max_length' in params else 1024
    self.max_memory_size = params['max_memory_size']
    self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    self.bert = LongformerModel.from_pretrained("allenai/longformer-base-4096",
                                                gradient_checkpointing=True)
    self.num_labels = params["label_length"] if 'label_length' in params else 2
    self.fc = nn.Linear(768, self.num_labels)
def __init__(self, params):
    super(LongEncoderRanker, self).__init__()
    self.params = params
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.n_gpu = torch.cuda.device_count()
    # todo
    # self.num_tags = 4 if not self.params['end_tag'] else 5
    # self.num_tags = 3 if not self.params['end_tag'] else 4
    self.num_tags = 9 if self.params['conll'] else 3
    self.is_biencoder = params['is_biencoder']
    self.use_golden_tags = not params['not_use_golden_tags']
    # init tokenizer
    if params['use_longformer']:
        self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    else:
        # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    # self.pad_id = 0
    self.pad_id = -1
    # init model
    self.model = LongEncoderModule(self.params)
    self.model = self.model.to(self.device)
def get_par_train_data_loader(rank, args) -> (DataLoader, DistributedSampler, int):
    data_frame = read_train_dev_data_frame(file_path=args.data_path,
                                           json_fileName=args.train_data_name)
    data_size = data_frame.shape[0]
    if args.train_data_filtered == 1:
        data_frame = data_frame[data_frame['level'] != 'easy']
        logging.info('Filtered data by removing easy case {} to {}'.format(
            data_size, data_frame.shape[0]))
    elif args.train_data_filtered == 2:
        data_frame = data_frame[data_frame['level'] == 'hard']
        logging.info('Filtered data by removing easy and medium case {} to {}'.format(
            data_size, data_frame.shape[0]))
    else:
        logging.info('Using all training data {}'.format(data_size))

    data_size = data_frame.shape[0]
    num_replicas = args.world_size
    tokenizer = LongformerTokenizer.from_pretrained(args.pretrained_cfg_name,
                                                    do_lower_case=True)
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=tokenizer,
                                               max_length=args.max_ctx_len)
    dataset = HotpotTrainDataset(data_frame=data_frame,
                                 hotpot_tensorizer=hotpot_tensorizer,
                                 max_sent_num=args.max_sent_num)
    batch_size = args.batch_size // num_replicas
    logging.info('Each node batch size = {}'.format(batch_size))
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset=dataset, rank=rank, num_replicas=num_replicas)
    train_dataloader = DataLoader(dataset=dataset,
                                  batch_size=batch_size,
                                  num_workers=max(1, args.cpu_num // 2),
                                  collate_fn=HotpotTrainDataset.collate_fn,
                                  shuffle=False,
                                  pin_memory=True,
                                  sampler=train_sampler)
    return train_dataloader, train_sampler, data_size
def __init__(self, model_name: str = "allenai/longformer-base-4096"):
    self.model = LongformerModel.from_pretrained(model_name)
    self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
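# Hypothetical usage of the wrapper above. The enclosing class name is not
# shown in the snippet; "DocumentEncoder" is assumed here for illustration,
# as is the mean-pooling step.
encoder = DocumentEncoder()
enc = encoder.tokenizer("a very long document ...", return_tensors="pt", truncation=True)
out = encoder.model(**enc)
doc_vec = out.last_hidden_state.mean(dim=1)  # simple mean-pooled document vector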
    '5.0',
    '--per_gpu_eval_batch_size', '2',
    '--per_gpu_train_batch_size', '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '32',
    # '--evaluate_during_training',  # this is removed to reduce training time
    '--do_train',
    '--do_eval',
])

train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'
# these are small files for testing
# train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
# val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'

training_args.val_datapath = val_fn
training_args.train_datapath = train_fn

##################### use pretrained longformer from transformers
longformer_model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
logger.info('Train and eval with Longformer pretrained ...')
pretrain_and_evaluate(
    training_args, longformer_model, longformer_tokenizer, eval_only=False,
    model_path=None
    # model_path=training_args.output_dir  # Local path to the model if the model
    # to train has been instantiated from a local path.
)
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # Parse the json file passed on the command line (the original
        # hardcoded 'args.json' here despite checking sys.argv[1]).
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = LongformerTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = LongformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    # train_dataset = torch.load(data_args.train_file_path)
    # eval_dataset = torch.load(data_args.valid_file_path)
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples, data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples, data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))
        results.update(eval_output)

    return results
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
print(prediction_scores)

## Longformer
from transformers import LongformerModel, LongformerTokenizer

# Hub ids need the 'allenai/' org prefix; bare 'longformer-base-4096' fails to resolve.
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
# (this 0/1/2 convention is from older transformers releases; newer versions
# use a separate global_attention_mask instead of the value 2)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                            device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                   # classification: the <s> token; QA: question tokens
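# For reference, a sketch of the same global-attention setup under the current
# transformers convention, where the binary attention_mask and the global
# attention are separate tensors (assumes a recent transformers version):
import torch
from transformers import LongformerModel, LongformerTokenizer

model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

input_ids = torch.tensor(
    tokenizer.encode(' '.join(['Hello world! '] * 1000))).unsqueeze(0)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  # 1 = attend, 0 = padding
global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long)
global_attention_mask[:, 0] = 1  # e.g. global attention on <s> for classification
outputs = model(input_ids, attention_mask=attention_mask,
                global_attention_mask=global_attention_mask)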