def __init__(self, config_path):
    config = configparser.ConfigParser()
    config.read(config_path)

    self.save_dir = Path(config.get("general", "save_dir"))
    if not self.save_dir.exists():
        self.save_dir.mkdir(parents=True)
    self.clf_th = config.getfloat("general", "clf_th")

    self.mlp_model_path = config.get("model", "mlp")
    assert Path(self.mlp_model_path).exists()

    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    bert_config_path = config.get("bert", "config_path")
    assert Path(bert_config_path).exists()
    self.bert_config = LongformerConfig.from_json_file(bert_config_path)
    self.max_seq_length = self.bert_config.max_position_embeddings - 2
    self.bert_tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
    # bert_tokenizer_path = config.get("bert", "tokenizer_path")
    # assert Path(bert_tokenizer_path).exists()
    # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)

    bert_model_path = config.get("bert", "model_path")
    assert Path(bert_model_path).exists()
    self.bert_model = LongformerModel.from_pretrained(
        bert_model_path, config=self.bert_config)
    self.bert_model.to(self.device)
    self.bert_model.eval()

    gold_dir = Path(config.get("data", "gold_dir"))
    assert gold_dir.exists()
    self.gold_dataset = ConllDataset(gold_dir)

    target_dir = Path(config.get("data", "target_dir"))
    assert target_dir.exists()
    self.target_dataset = ConllDataset(target_dir)

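# A minimal sketch of the INI layout this constructor expects; the section and
# key names mirror the config.get(...) calls above, while the paths themselves
# are hypothetical placeholders.
import configparser

example_config = configparser.ConfigParser()
example_config["general"] = {"save_dir": "./output", "clf_th": "0.5"}
example_config["model"] = {"mlp": "./models/mlp_model.pt"}
example_config["bert"] = {"config_path": "./longformer/config.json",
                          "model_path": "./longformer"}
example_config["data"] = {"gold_dir": "./data/gold", "target_dir": "./data/target"}

with open("example_config.ini", "w") as f:
    example_config.write(f)
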
def __init__(self, pretrained: str, max_query_len: int, max_doc_len: int,
             mode: str = 'cls', task: str = 'ranking') -> None:
    super(LongformerMaxp, self).__init__()
    self._pretrained = pretrained
    self._max_query_len = max_query_len
    self._max_doc_len = max_doc_len
    self._mode = mode
    self._task = task

    self._config = LongformerConfig.from_pretrained(self._pretrained)
    self._config.attention_mode = 'sliding_chunks'
    self._config.gradient_checkpointing = True  # use a real boolean rather than the string 'True'
    # print("attention_mode: " + self._config.attention_mode)
    self._model = LongformerModel.from_pretrained(self._pretrained, config=self._config)
    self._activation = nn.ReLU()
    self.dense = nn.Linear(self._config.hidden_size, 128)
    self.dropout = nn.Dropout(self._config.hidden_dropout_prob)
    self.out_proj = nn.Linear(128, 2)
    if self._task == 'ranking':
        self._dense2 = nn.Linear(128, 1)
    elif self._task == 'classification':
        self._dense2 = nn.Linear(128, 2)
    else:
        raise ValueError('Task must be `ranking` or `classification`.')

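# A hypothetical instantiation of the module above; the checkpoint name and the
# length limits are illustrative values, not taken from the original source.
ranker = LongformerMaxp(pretrained='allenai/longformer-base-4096',
                        max_query_len=32,
                        max_doc_len=2048,
                        mode='cls',
                        task='ranking')
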
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = LongformerConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
        attention_window=self.attention_window,
    )

    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

def __init__(self,
             embed_dim=768,
             max_position_embeddings=2 * 60 * 60,
             num_attention_heads=12,
             num_hidden_layers=3,
             attention_mode='sliding_chunks',
             pad_token_id=-1,
             attention_window=None,
             intermediate_size=3072,
             attention_probs_dropout_prob=0.1,
             hidden_dropout_prob=0.1):
    self.config = LongformerConfig()
    self.config.attention_mode = attention_mode
    self.config.intermediate_size = intermediate_size
    self.config.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.config.hidden_dropout_prob = hidden_dropout_prob
    self.config.attention_dilation = [1, ] * num_hidden_layers
    self.config.attention_window = (
        [256, ] * num_hidden_layers if attention_window is None else attention_window)
    self.config.num_hidden_layers = num_hidden_layers
    self.config.num_attention_heads = num_attention_heads
    self.config.pad_token_id = pad_token_id
    self.config.max_position_embeddings = max_position_embeddings
    self.config.hidden_size = embed_dim
    super(VTNLongformerModel, self).__init__(self.config, add_pooling_layer=False)
    self.embeddings.word_embeddings = None  # to avoid distributed error of unused parameters

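# A hypothetical instantiation of the wrapper above, e.g. as a temporal encoder
# over per-frame embeddings; the argument values simply repeat the defaults and
# are illustrative only.
temporal_encoder = VTNLongformerModel(embed_dim=768,
                                      max_position_embeddings=2 * 60 * 60,
                                      num_attention_heads=12,
                                      num_hidden_layers=3,
                                      attention_mode='sliding_chunks')
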
def __init__(self, data_path):
    super(MafiascumDataset, self).__init__()

    tokenizer = LongformerTokenizer.from_pretrained('longformer-base-4096')
    config = LongformerConfig()

    df = pd.read_pickle(data_path, compression="gzip")
    grouped_df = df.groupby(["author", "game_id"])

    labels = []
    inputs = []
    attention_masks = []
    for key, item in grouped_df:
        posts = grouped_df.get_group(key).content.values  # All the posts made by a user in a game
        label = grouped_df.get_group(key).scum.values[0]  # Boolean
        label = 1 if label else 0  # Int

        if len(posts) == 0:  # Only consider games where the user has spoken at least once
            continue

        num_sentences_in_game = 0
        all_sentences_in_game = []
        all_attention_masks_in_game = []
        for post in posts:
            sentences = post.split('\n\n')
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 0:
                    input_ids = tokenizer.encode(sentence, max_length=MAX_SENTENCE_LEN)
                    # 1 for local attention, 2 for global attention, 0 for none (padding)
                    # (for our task, mark <s> start of sentence with 2 to have global attention)
                    attention_mask = [1 for _ in range(len(input_ids))]
                    attention_mask[0] = 2

                    all_sentences_in_game += input_ids
                    all_attention_masks_in_game += attention_mask
                    num_sentences_in_game += 1

        # If the player said fewer than 10 sentences in a game, we ignore this sample
        if num_sentences_in_game < 10:
            continue

        input_ids = torch.LongTensor(all_sentences_in_game[:MAX_DOC_LEN])
        attention_mask = torch.LongTensor(all_attention_masks_in_game[:MAX_DOC_LEN])
        label = torch.FloatTensor([label])

        inputs.append(input_ids)
        attention_masks.append(attention_mask)
        labels.append(label)

    self.inputs = inputs
    self.attention_masks = attention_masks
    self.labels = labels

def get_config(self):
    return LongformerConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
        attention_window=self.attention_window,
    )

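# A minimal sketch (not from the original test suite) showing how a tiny config
# like the one returned above can be turned into a small randomly initialised
# model; all sizes here are hypothetical and chosen only to keep it fast.
from transformers import LongformerConfig, LongformerModel

tiny_config = LongformerConfig(
    vocab_size=100,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
    max_position_embeddings=512,
    attention_window=[8, 8],  # one (even) window size per layer
)
tiny_model = LongformerModel(tiny_config)
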
def __init__(self, model_path_or_name, device='cuda'):
    """
    Creates a new LitcovidMultiLabelClassifier from a given model path or name.

    :param model_path_or_name: A model name or the path to the saved model
    :param device: which device to use. For example 'cpu' or 'cuda'
    """
    self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
    self.config = LongformerConfig.from_pretrained(
        model_path_or_name, num_labels=len(categories))
    self.config.sep_token_id = 2
    self.config.attention_window = 32
    self.model = LongformerForSequenceClassification.from_pretrained(
        model_path_or_name, config=self.config)
    self.max_sequence_length = 640
    self.device = device
    self.model.to(device)
    self.model.eval()

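# A hedged usage sketch for the classifier above; its own prediction method is
# not shown in this snippet, so this only illustrates calling the wrapped
# tokenizer and model directly. The model path, input text, and the 0.5
# threshold are hypothetical.
import torch

classifier = LitcovidMultiLabelClassifier("path/to/saved_model", device="cpu")
encoded = classifier.tokenizer("Example LitCovid abstract ...",
                               truncation=True,
                               max_length=classifier.max_sequence_length,
                               return_tensors="pt").to(classifier.device)
with torch.no_grad():
    logits = classifier.model(**encoded).logits
predicted = (torch.sigmoid(logits) > 0.5).squeeze(0)  # one flag per category
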
def init_encoder(cls,
                 cfg_name: str,
                 projection_dim: int = 0,
                 attn_dropout: float = 0.1,
                 hidden_dropout: float = 0.1,
                 seq_project=False,
                 **kwargs) -> LongformerModel:
    cfg = LongformerConfig.from_pretrained(
        cfg_name if cfg_name else PRE_TAINED_LONFORMER_BASE)
    if attn_dropout != 0:
        cfg.attention_probs_dropout_prob = attn_dropout
    if hidden_dropout != 0:
        cfg.hidden_dropout_prob = hidden_dropout
    return cls.from_pretrained(cfg_name,
                               config=cfg,
                               project_dim=projection_dim,
                               seq_project=seq_project,
                               **kwargs)

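# A hypothetical call of the classmethod above; `LongformerEncoder` stands in
# for whichever LongformerModel subclass defines init_encoder, and the dropout
# values are illustrative.
encoder = LongformerEncoder.init_encoder('allenai/longformer-base-4096',
                                         projection_dim=0,
                                         attn_dropout=0.1,
                                         hidden_dropout=0.1,
                                         seq_project=False)
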
labels = [label_map[x] for x in labels]

train_ids, val_ids, train_mask, val_mask, train_label, val_label = train_test_split(
    token_ids, token_mask, labels, test_size=.2, random_state=42)

train_dataset = TensorDataset(torch.tensor(train_ids),
                              torch.tensor(train_mask),
                              torch.tensor(train_label))
val_dataset = TensorDataset(torch.tensor(val_ids),
                            torch.tensor(val_mask),
                            torch.tensor(val_label))
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=True)

# Model Setup
config = LongformerConfig.from_pretrained(longformer_pretrained)
config.num_labels = 2

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

model = LongformerForSequenceClassification.from_pretrained(
    longformer_pretrained, config=config)
model.to(device)  # use the detected device rather than calling .cuda() unconditionally

# Model Train
epochs = 20
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6)
bias = [float(i) for i in '1,1'.split(',')]

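# A minimal sketch (not part of the original script) of one fine-tuning loop
# over the dataloaders and model built above; passing `labels` lets the
# Hugging Face classification head compute the cross-entropy loss itself.
for epoch in range(epochs):
    model.train()
    for input_ids, attention_mask, batch_labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids.to(device),
                        attention_mask=attention_mask.to(device),
                        labels=batch_labels.to(device))
        outputs.loss.backward()
        optimizer.step()
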
if args.net_type == "bert_pool_conv": config.num_filters = args.num_filters sizes = args.filter_sizes.split(',') config.filter_sizes = [int(s) for s in sizes] model = BertPoolConv.from_pretrained(args.wgts_dir, config=config) if args.net_type == "bert_pool_linear": config.pool_method_chunks = args.pool_method_chunks model = BertPoolLinear.from_pretrained(args.wgts_dir, config=config) if args.net_type in ["longformer_linear"]: # Default: rob/data/pre_wgts/longformer_base/ # Tokenizer tokenizer = LongformerTokenizer.from_pretrained( 'allenai/longformer-base-4096') # Config config = LongformerConfig.from_pretrained('allenai/longformer-base-4096') config.output_hidden_states = True config.num_labels = args.num_labels # config.unfreeze = args.unfreeze # config.pool_method = args.pool_method # config.pool_layers = args.pool_layers if args.num_hidden_layers: config.num_hidden_layers = args.num_hidden_layers if args.num_attention_heads: config.num_attention_heads = args.num_attention_heads # Model model = LongformerLinear.from_pretrained(args.wgts_dir) # if args.net_type in ["bert_linear", "bert_lstm"]: # # Default: rob/data/pre_wgts/bert_medium/ # # Tokenizer
f"Removed {bef-df.shape[0]} because they were under {min_len} or over {max_len} characters long." ) print(df.target.value_counts()) import torch from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig from transformers import LongformerTokenizerFast, LongformerModel, LongformerConfig #model_name = 'distilbert-base-uncased' model_name = 'allenai/longformer-base-4096' tokenizer = LongformerTokenizerFast.from_pretrained(model_name) df["vecs"] = df.text.map( lambda x: torch.LongTensor(tokenizer.encode(x)).unsqueeze(0)) config = LongformerConfig.from_pretrained(model_name, output_hidden_states=True) model = LongformerModel.from_pretrained(model_name, config=config) device = 'cuda' if torch.cuda.is_available() else 'cpu' device = 'cpu' model = model.to(device) input_tf = tokenizer.batch_encode_plus(df.text.to_list(), return_tensors='pt', padding=True) #vecs = input_tf['input_ids'].to(device) #granola_ids = granola_ids.to(device) model.eval() with torch.no_grad():
def __init__(self):
    super(JeffBERT, self).__init__()  # input_ids=None, inputs_embeds
    config = LongformerConfig(vocab_size=100,
                              num_labels=2,
                              max_length=2560,
                              max_position_embeddings=2560)
    self.l1 = JeffLongformerForSequenceClassification(config=config)

def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: ")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: ")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the dev set.")
    parser.add_argument("--evaluate_during_training", action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--sub_model_type", default='han', type=str,
                        help="Make sure you want sg or not.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    if 'AAN' in args.data_dir:
        from src.cite import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite import glue_output_modes as output_modes
        from src.cite import glue_processors as processors
        print('AAN')
    elif 'OC' in args.data_dir:
        from src.cite_OC import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite_OC import glue_output_modes as output_modes
        from src.cite_OC import glue_processors as processors
        print('OC')
    elif 'S2ORC' in args.data_dir:
        from src.cite_S2ORC import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite_S2ORC import glue_output_modes as output_modes
        from src.cite_S2ORC import glue_processors as processors
        print('S2ORC')
    elif 'pla' in args.data_dir:
        from src.cite_PAN import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite_PAN import glue_output_modes as output_modes
        from src.cite_PAN import glue_processors as processors
        print('PAN')

    if (os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    # config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = LongformerConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    config.output_hidden_states = True
    config.model_type = args.sub_model_type
    config.batch_size = args.per_gpu_train_batch_size
    '''
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    '''
    path = args.model_name_or_path
    tok_path = args.model_name_or_path
    tokenizer = LongformerTokenizer.from_pretrained(tok_path)
    model = LongformerForSequenceClassification.from_pretrained(path)
    model.num_labels = 2
    model.resize_token_embeddings(len(tokenizer) + 1)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = LongformerForSequenceClassification.from_pretrained(args.output_dir)
        tokenizer = LongformerTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    import pickle

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        # eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else args.task_name
        # eval_dataset = load_and_cache_examples(args, eval_task_names, tokenizer, evaluate=True)
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = LongformerForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
        print('')
        print(results)
        with open(args.output_dir + '/eval_results' + '.pkl', 'wb') as f:
            pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)

    if args.do_test and args.local_rank in [-1, 0]:
        # tokenizer = tokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        logger.info("Test the following checkpoint: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = LongformerForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = test(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

    tokenizer=bpe_tokenizer, file_path=input_path_val, block_size=block_size
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=bpe_tokenizer, mlm=True, mlm_probability=mlm_probability
)

# create model
config = LongformerConfig(
    attention_window=attention_window,
    sep_token_id=bpe_tokenizer.get_vocab()["</s>"],
    pad_token_id=bpe_tokenizer.get_vocab()["<pad>"],
    bos_token_id=bpe_tokenizer.get_vocab()["<s>"],
    eos_token_id=bpe_tokenizer.get_vocab()["</s>"],
    vocab_size=bpe_tokenizer.vocab_size,
    max_position_embeddings=max_len + 10,
    num_attention_heads=num_attention_heads,
    num_hidden_layers=num_hidden_layers,
    type_vocab_size=1
)
model = LongformerForMaskedLM(config=config)
_pretty_print(f"Number of model parameters : {model.num_parameters():,}")

model_path = os.path.join(output_path, "lm")
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=epochs,

import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import wandb
import os

# In[2]:

config = LongformerConfig()
config

# The datasets library handles the hassle of downloading and processing NLP datasets, which is quite
# convenient for saving time on preprocessing before modelling. First we need to instantiate the class
# by calling the method `load_dataset`. If the dataset is not already cached, the library downloads it
# and saves it in the datasets default folder.
#
# This example provided by HuggingFace uses an older version of datasets (still called nlp) and
# demonstrates how to use the [trainer class with BERT](https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=5DEWNilys9Ty).
# Today's tutorial will follow several of the concepts described there.
#
# The dataset class has multiple useful methods to easily load, process and apply transformations to
# the dataset. We can even load the data and split it into train and test by feeding a list to the
# split argument.

# In[3]:

train_data, test_data = datasets.load_dataset(
    'imdb', split=['train', 'test'],
    cache_dir='/media/jlealtru/data_files/github/website_tutorials/data')

def train(opt):
    # Set random seed for reproducibility
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.seed)
    else:
        torch.manual_seed(opt.seed)

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                opt.gradient_accumulation_steps))
    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    # Logging
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    output_file = open(opt.saved_path + os.sep + "logs.txt", "w")
    output_file.write("Model's parameters: {}".format(vars(opt)))
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)

    # Data Loading
    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True
    }
    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False
    }

    train_data_path = opt.train_set
    training_set = MafiascumDataset(train_data_path)
    training_generator = DataLoader(training_set, **training_params)

    test_data_path = opt.test_set
    test_set = MafiascumDataset(test_data_path)
    test_generator = DataLoader(test_set, **test_params)

    # Model
    config = LongformerConfig.from_pretrained('longformer-base-4096')
    config.attention_mode = 'sliding_chunks'
    model = LongformerForBinaryClassification(config)
    if torch.cuda.is_available():
        model = model.cuda()
    criterion = nn.BCEWithLogitsLoss()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=opt.lr,
                      weight_decay=0.01,
                      correct_bias=False)
    num_train_optimization_steps = int(
        len(training_generator) / opt.batch_size / opt.gradient_accumulation_steps) * opt.num_epoches
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_train_optimization_steps,
        num_warmup_steps=opt.num_warmup_steps)

    # Training
    if opt.gradient_accumulation_steps > 1:
        labels_mini_batch = []
        logits_mini_batch = []
    best_loss = 1e5
    best_epoch = 0
    model.train()
    num_iter_per_epoch = len(training_generator)
    for epoch in range(opt.num_epoches):
        for iteration, (input_ids, attention_mask, labels) in enumerate(training_generator):
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()

            logits = model(input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)
            if opt.gradient_accumulation_steps > 1:
                loss = loss / opt.gradient_accumulation_steps
                labels_mini_batch.append(labels)
                logits_mini_batch.append(logits)
            loss.backward()

            if (iteration + 1) % opt.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()

                if opt.gradient_accumulation_steps > 1:
                    labels = torch.cat(labels_mini_batch, dim=0)
                    logits = torch.cat(logits_mini_batch, dim=0)
                    labels_mini_batch = []
                    logits_mini_batch = []

                training_metrics = get_evaluation(
                    labels.cpu().numpy(),
                    logits.cpu().detach().numpy(),
                    list_metrics=["accuracy"])
                print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                    epoch + 1, opt.num_epoches, iteration + 1, num_iter_per_epoch,
                    optimizer.param_groups[0]['lr'], loss, training_metrics["accuracy"]))
                writer.add_scalar('Train/Loss', loss,
                                  epoch * num_iter_per_epoch + iteration)
                writer.add_scalar('Train/Accuracy', training_metrics["accuracy"],
                                  epoch * num_iter_per_epoch + iteration)

        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            te_label_ls = []
            te_pred_ls = []
            for (input_ids, attention_mask, labels) in test_generator:
                num_sample = len(labels)
                if torch.cuda.is_available():
                    input_ids = input_ids.cuda()
                    attention_mask = attention_mask.cuda()
                    labels = labels.cuda()
                with torch.no_grad():
                    logits = model(input_ids, attention_mask=attention_mask)
                    te_loss = criterion(logits, labels)
                loss_ls.append(te_loss * num_sample)
                te_label_ls.extend(labels.clone().cpu())
                te_pred_ls.append(logits.clone().cpu())

            te_loss = sum(loss_ls) / test_set.__len__()
            te_pred = torch.cat(te_pred_ls, 0)
            te_label = np.array(te_label_ls)
            test_metrics = get_evaluation(
                te_label,
                te_pred.numpy(),
                list_metrics=["accuracy", "confusion_matrix"])

            output_file.write(
                "Epoch: {}/{} \nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n".format(
                    epoch + 1, opt.num_epoches, te_loss,
                    test_metrics["accuracy"], test_metrics["confusion_matrix"]))
            print("Epoch: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                epoch + 1, opt.num_epoches, optimizer.param_groups[0]['lr'],
                te_loss, test_metrics["accuracy"]))
            writer.add_scalar('Test/Loss', te_loss, epoch)
            writer.add_scalar('Test/Accuracy', test_metrics["accuracy"], epoch)
            model.train()

            # Update if new best loss achieved
            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                torch.save(model, opt.saved_path + os.sep + "longformer")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print("Stop training at epoch {}. The lowest loss achieved is {}".format(
                    epoch, te_loss))
                break

    writer.close()
    output_file.close()

    '1',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '4',
    # '--evaluate_during_training',  # this is removed to reduce training time
    '--do_train',
])

# train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
# val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
# these are small files for test
train_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
val_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'

training_args.train_datapath = train_fn
training_args.val_datapath = val_fn

##################### use pretrained longformer in transformers
init_config = LongformerConfig.from_json_file(
    'config_files/longformer_base_4096/config.json')
mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
word_embeddings = np.loadtxt(
    join('/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
longformer_model = LongformerForMaskedLM(init_config)
longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
# longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

logger.info('Train and eval with Longformer pretrained ...')
pretrain_and_evaluate(training_args,
                      longformer_model,
                      mimic_tokenizer,
                      train_only=True,
                      eval_only=False,
                      model_path=None
                      # model_path=training_args.output_dir  # Local path to the model if the model to train has been instantiated from a local path.
                      )

LR = float(sys.argv[4])
MODEL_NAME = str(sys.argv[5])
WEIGHT_NAME = str(sys.argv[6])

# Sample values
# MAX_LEN = 1024
# BATCH_SIZE = 8
# EPOCHS = 2
# LR = 3e-5
# MODEL_NAME = 'allenai/longformer-base-4096/'
# WEIGHT_NAME = 'weights' (.h5)

# Configs for model training - CHANGE ME
configuration = LongformerConfig()
save_path = "/scratch/aj2885/results/longformer-base-4096_" + WEIGHT_NAME[:-1] + "/"
data_path = "/scratch/aj2885/datasets/hotpotqa/"

# Tokenizer
slow_tokenizer = LongformerTokenizerFast.from_pretrained(MODEL_NAME)
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = ByteLevelBPETokenizer(save_path + "vocab.json",
                                  save_path + "merges.txt",
                                  lowercase=True)

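# A hedged illustration (not from the original script) of how the fast tokenizer
# built above can encode a question/context pair; the strings are placeholders.
encoding = tokenizer.encode("Who wrote the paper?",
                            "The Longformer paper was written by ...")
print(encoding.ids[:10], encoding.offsets[:10])
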
data_training_args.val_datapath_output = training_args.output_dir + "/wikitext-103/train.tfrecord"
data_training_args.train_datapath = training_args.output_dir + "/wikitext-103/valid.tfrecord"

if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    tokenizer = LongformerTokenizer.from_pretrained("longformer-base-4096")
    create_pretraining_data(data_training_args)
    config = LongformerConfig(
        vocab_size=training_args.vocab_size,
        hidden_size=training_args.hidden_size,
        num_hidden_layers=training_args.num_hidden_layers,
        num_attention_heads=training_args.num_attention_heads,
        intermediate_size=training_args.intermediate_size,
        hidden_act=training_args.hidden_act,
        hidden_dropout_prob=training_args.hidden_dropout_prob,
        attention_probs_dropout_prob=training_args.attention_probs_dropout_prob,
        max_position_embeddings=training_args.max_position_embeddings,
        type_vocab_size=training_args.type_vocab_size,
        initializer_range=training_args.initializer_range,
        attention_window=training_args.attention_window,
    )
    # long_formeformer_tokernizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = TFLongformerModel(config=config)
    logger.info('Evaluating roberta-base (seqlen: 4096) for reference ...')
    pretrain_and_evaluate(training_args, model, eval_only=False)