Example #1
    def __init__(self, config_path):
        config = configparser.ConfigParser()
        config.read(config_path)

        self.save_dir = Path(config.get("general", "save_dir"))
        if not self.save_dir.exists():
            self.save_dir.mkdir(parents=True)
        self.clf_th = config.getfloat("general", "clf_th")

        self.mlp_model_path = config.get("model", "mlp")
        assert Path(self.mlp_model_path).exists()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        bert_config_path = config.get("bert", "config_path")
        assert Path(bert_config_path).exists()
        self.bert_config = LongformerConfig.from_json_file(bert_config_path)
        self.max_seq_length = self.bert_config.max_position_embeddings - 2
        self.bert_tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        # bert_tokenizer_path = config.get("bert", "tokenizer_path")
        # assert Path(bert_config_path).exists()
        # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)
        bert_model_path = config.get("bert", "model_path")
        assert Path(bert_model_path).exists()
        self.bert_model = LongformerModel.from_pretrained(
            bert_model_path, config=self.bert_config)
        self.bert_model.to(self.device)
        self.bert_model.eval()

        gold_dir = Path(config.get("data", "gold_dir"))
        assert Path(gold_dir).exists()
        self.gold_dataset = ConllDataset(gold_dir)
        target_dir = Path(config.get("data", "target_dir"))
        assert Path(target_dir).exists()
        self.target_dataset = ConllDataset(target_dir)
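
The configparser calls above imply an INI file with the sections and keys below; this is a hedged sketch only (all paths and the 0.5 threshold are placeholders, not values from the source):

[general]
save_dir = /path/to/output
clf_th = 0.5

[model]
mlp = /path/to/mlp_model.pt

[bert]
config_path = /path/to/longformer/config.json
model_path = /path/to/longformer

[data]
gold_dir = /path/to/conll/gold
target_dir = /path/to/conll/target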
Example #2
    def __init__(self,
                 pretrained: str,
                 max_query_len: int,
                 max_doc_len: int,
                 mode: str = 'cls',
                 task: str = 'ranking') -> None:
        super(LongformerMaxp, self).__init__()
        self._pretrained = pretrained
        self._max_query_len = max_query_len
        self._max_doc_len = max_doc_len
        self._mode = mode
        self._task = task
        self._config = LongformerConfig.from_pretrained(self._pretrained)
        self._config.attention_mode = 'sliding_chunks'
        self._config.gradient_checkpointing = True
        #print("attention_mode: "+self._config.attention_mode)
        self._model = LongformerModel.from_pretrained(self._pretrained,
                                                      config=self._config)
        self._activation = nn.ReLU()
        self.dense = nn.Linear(self._config.hidden_size, 128)
        self.dropout = nn.Dropout(self._config.hidden_dropout_prob)
        self.out_proj = nn.Linear(128, 2)

        if self._task == 'ranking':
            self._dense2 = nn.Linear(128, 1)
        elif self._task == 'classification':
            self._dense2 = nn.Linear(128, 2)
        else:
            raise ValueError('Task must be `ranking` or `classification`.')
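
A hedged usage sketch (argument values are illustrative, not from the source): the head defined above would typically be instantiated along these lines.

ranker = LongformerMaxp(pretrained='allenai/longformer-base-4096',
                        max_query_len=32,
                        max_doc_len=2048,
                        mode='cls',
                        task='ranking')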
Example #3
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = LongformerConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            attention_window=self.attention_window,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
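
For reference, ids_tensor is a small helper from the transformers test utilities that builds a random integer tensor; a minimal stand-in (simplified signature, an assumption rather than the library's exact code) could look like this.

import torch

def ids_tensor(shape, vocab_size):
    # random token ids in [0, vocab_size)
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)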
Example #4
    def __init__(self,
                 embed_dim=768,
                 max_position_embeddings=2 * 60 * 60,
                 num_attention_heads=12,
                 num_hidden_layers=3,
                 attention_mode='sliding_chunks',
                 pad_token_id=-1,
                 attention_window=None,
                 intermediate_size=3072,
                 attention_probs_dropout_prob=0.1,
                 hidden_dropout_prob=0.1):

        self.config = LongformerConfig()
        self.config.attention_mode = attention_mode
        self.config.intermediate_size = intermediate_size
        self.config.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.config.hidden_dropout_prob = hidden_dropout_prob
        self.config.attention_dilation = [
            1,
        ] * num_hidden_layers
        self.config.attention_window = [
            256,
        ] * num_hidden_layers if attention_window is None else attention_window
        self.config.num_hidden_layers = num_hidden_layers
        self.config.num_attention_heads = num_attention_heads
        self.config.pad_token_id = pad_token_id
        self.config.max_position_embeddings = max_position_embeddings
        self.config.hidden_size = embed_dim
        super(VTNLongformerModel, self).__init__(self.config,
                                                 add_pooling_layer=False)
        self.embeddings.word_embeddings = None  # to avoid distributed error of unused parameters
Example #5
  def __init__(self, data_path):
    super(MafiascumDataset, self).__init__()

    tokenizer = LongformerTokenizer.from_pretrained('longformer-base-4096')
    config = LongformerConfig()

    df = pd.read_pickle(data_path, compression="gzip")
    grouped_df = df.groupby(["author", "game_id"])

    labels = []
    inputs = []
    attention_masks = []

    for key, item in grouped_df:
      posts = grouped_df.get_group(key).content.values # All the posts made by a user in a game
      label = grouped_df.get_group(key).scum.values[0] # Boolean
      label = 1 if label else 0 # Int

      num_sentences_in_game = 0
      all_sentences_in_game = []
      all_attention_masks_in_game = []
      for post in posts:
        # games where the user never spoke simply yield zero sentences and are
        # dropped by the 10-sentence filter below
        sentences = post.split('\n\n')
        for sentence in sentences:
          sentence = sentence.strip()
          if len(sentence) > 0:
            input_ids = tokenizer.encode(sentence, max_length=MAX_SENTENCE_LEN)
            # 1 for local attention, 2 for global attention, 0 for none (padding)
            # (for our task, mark the <s> start of each sentence with 2 to get global attention)
            attention_mask = [1 for _ in range(len(input_ids))]
            attention_mask[0] = 2

            all_sentences_in_game += input_ids
            all_attention_masks_in_game += attention_mask
            num_sentences_in_game += 1

      # If the player said less than 10 sentences in a game, we ignore this sample
      if num_sentences_in_game < 10:
        continue

      input_ids = torch.LongTensor(all_sentences_in_game[:MAX_DOC_LEN])
      attention_mask = torch.LongTensor(all_attention_masks_in_game[:MAX_DOC_LEN])
      label = torch.FloatTensor([label])

      inputs.append(input_ids)
      attention_masks.append(attention_mask)
      labels.append(label)

    self.inputs = inputs
    self.attention_masks = attention_masks
    self.labels = labels
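
  # Hedged sketch (not in the original snippet): a torch Dataset subclass
  # normally also defines __len__ and __getitem__; given the lists built in
  # __init__ above, they would plausibly look like this.
  def __len__(self):
    return len(self.labels)

  def __getitem__(self, index):
    return self.inputs[index], self.attention_masks[index], self.labels[index]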
Example #6
    def get_config(self):
        return LongformerConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
            attention_window=self.attention_window,
        )
Example #7
    def __init__(self, model_path_or_name, device='cuda'):
        """
        Creates a new LitcovidMultiLabelClassifier from a given model path or name.
        :param model_path_or_name: A model name or the path to the saved model
        :param device: which device to use. For example 'cpu' or 'cuda'
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
        self.config = LongformerConfig.from_pretrained(
            model_path_or_name, num_labels=len(categories))
        self.config.sep_token_id = 2
        self.config.attention_window = 32
        self.model = LongformerForSequenceClassification.from_pretrained(
            model_path_or_name, config=self.config)

        self.max_sequence_length = 640
        self.device = device
        self.model.to(device)
        self.model.eval()
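
A hedged inference sketch (not part of the original class; the method name, the truncation settings and the sigmoid over the logits are assumptions, and torch is assumed to be imported): with the tokenizer and model initialised above, per-category scores could be produced roughly like this.

    def predict(self, text):
        encoded = self.tokenizer(text, truncation=True,
                                 max_length=self.max_sequence_length,
                                 return_tensors="pt").to(self.device)
        with torch.no_grad():
            logits = self.model(**encoded).logits
        # one independent probability per label for multi-label classification
        return torch.sigmoid(logits).squeeze(0).tolist()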
Example #8
    def init_encoder(cls,
                     cfg_name: str,
                     projection_dim: int = 0,
                     attn_dropout: float = 0.1,
                     hidden_dropout: float = 0.1,
                     seq_project=False,
                     **kwargs) -> LongformerModel:
        cfg = LongformerConfig.from_pretrained(
            cfg_name if cfg_name else PRE_TAINED_LONFORMER_BASE)
        if attn_dropout != 0:
            cfg.attention_probs_dropout_prob = attn_dropout
        if hidden_dropout != 0:
            cfg.hidden_dropout_prob = hidden_dropout
        return cls.from_pretrained(cfg_name,
                                   config=cfg,
                                   project_dim=projection_dim,
                                   seq_project=seq_project,
                                   **kwargs)
Example #9
labels = [label_map[x] for x in labels]

train_ids, val_ids, train_mask, val_mask, train_label, val_label = train_test_split(
    token_ids, token_mask, labels, test_size=.2, random_state=42)

train_dataset = TensorDataset(torch.tensor(train_ids),
                              torch.tensor(train_mask),
                              torch.tensor(train_label))
val_dataset = TensorDataset(torch.tensor(val_ids), torch.tensor(val_mask),
                            torch.tensor(val_label))

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=True)

# Model Setup
config = LongformerConfig.from_pretrained(longformer_pretrained)
config.num_labels = 2

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

model = LongformerForSequenceClassification.from_pretrained(
    longformer_pretrained, config=config)

model.to(device)

# Model Train

epochs = 20
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6)
bias = [float(i) for i in '1,1'.split(',')]
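
The snippet stops before the loop itself; below is a minimal hedged sketch of one way to train with the objects defined above (it ignores the `bias` weights, gradient clipping and any LR schedule the original may use).

model.train()
for epoch in range(epochs):
    for input_ids, attention_mask, label in train_dataloader:
        optimizer.zero_grad()
        output = model(input_ids=input_ids.to(device),
                       attention_mask=attention_mask.to(device),
                       labels=label.to(device))
        # recent transformers versions return a ModelOutput with .loss;
        # older ones return a tuple whose first element is the loss
        output.loss.backward()
        optimizer.step()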
Example #10
    if args.net_type == "bert_pool_conv":
        config.num_filters = args.num_filters
        sizes = args.filter_sizes.split(',')
        config.filter_sizes = [int(s) for s in sizes]
        model = BertPoolConv.from_pretrained(args.wgts_dir, config=config)
    if args.net_type == "bert_pool_linear":
        config.pool_method_chunks = args.pool_method_chunks
        model = BertPoolLinear.from_pretrained(args.wgts_dir, config=config)

if args.net_type in ["longformer_linear"]:
    # Default: rob/data/pre_wgts/longformer_base/
    # Tokenizer
    tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
    # Config
    config = LongformerConfig.from_pretrained('allenai/longformer-base-4096')
    config.output_hidden_states = True
    config.num_labels = args.num_labels
    # config.unfreeze = args.unfreeze
    # config.pool_method = args.pool_method
    # config.pool_layers = args.pool_layers
    if args.num_hidden_layers:
        config.num_hidden_layers = args.num_hidden_layers
    if args.num_attention_heads:
        config.num_attention_heads = args.num_attention_heads
    # Model
    model = LongformerLinear.from_pretrained(args.wgts_dir)

# if args.net_type in ["bert_linear", "bert_lstm"]:
#     # Default: rob/data/pre_wgts/bert_medium/
#     # Tokenizer
Example #11
    f"Removed {bef-df.shape[0]} because they were under {min_len} or over {max_len} characters long."
)
print(df.target.value_counts())

import torch
from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig
from transformers import LongformerTokenizerFast, LongformerModel, LongformerConfig

#model_name = 'distilbert-base-uncased'
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

df["vecs"] = df.text.map(
    lambda x: torch.LongTensor(tokenizer.encode(x)).unsqueeze(0))

config = LongformerConfig.from_pretrained(model_name,
                                          output_hidden_states=True)
model = LongformerModel.from_pretrained(model_name, config=config)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

model = model.to(device)
input_tf = tokenizer.batch_encode_plus(df.text.to_list(),
                                       return_tensors='pt',
                                       padding=True)
#vecs = input_tf['input_ids'].to(device)
#granola_ids = granola_ids.to(device)

model.eval()

with torch.no_grad():
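    # Hedged continuation (the original snippet is truncated here): run the
    # encoded batch through the model; with output_hidden_states=True the
    # per-layer states come back as well (an attribute on recent transformers
    # versions, a plain tuple on older ones). Documents longer than the model's
    # 4096-token limit would additionally need truncation.
    output = model(input_ids=input_tf['input_ids'].to(device),
                   attention_mask=input_tf['attention_mask'].to(device))
    last_hidden = output.last_hidden_state
    all_layers = output.hidden_states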
Example #12
    def __init__(self):
        super(JeffBERT, self).__init__()  # input_ids=None, inputs_embeds
        config = LongformerConfig(vocab_size=100, num_labels=2, max_length=2560, max_position_embeddings=2560)
        self.l1 = JeffLongformerForSequenceClassification(config=config)
Example #13
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: ",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: ",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
    )

    parser.add_argument(
        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
    )

    parser.add_argument(
        "--sub_model_type", default='han', type=str, help="Makesure you want sg or not.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()
    if 'AAN' in args.data_dir:
        from src.cite import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite import glue_output_modes as output_modes
        from src.cite import glue_processors as processors
        print('AAN')
    elif 'OC' in args.data_dir:
        from src.cite_OC import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite_OC import glue_output_modes as output_modes
        from src.cite_OC import glue_processors as processors
        print('OC')
    elif 'S2ORC' in args.data_dir:
        from src.cite_S2ORC import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite_S2ORC import glue_output_modes as output_modes
        from src.cite_S2ORC import glue_processors as processors
        print('S2ORC')
    elif 'pla' in args.data_dir:
        from src.cite_PAN import glue_convert_examples_to_features as convert_examples_to_features
        from src.cite_PAN import glue_output_modes as output_modes
        from src.cite_PAN import glue_processors as processors
        print('PAN')

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()

    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    
    args.model_type = args.model_type.lower()
    
    #config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = LongformerConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    config.output_hidden_states=True
    config.model_type = args.sub_model_type 
    config.batch_size = args.per_gpu_train_batch_size 
    '''
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    '''
    path = args.model_name_or_path
    tok_path = args.model_name_or_path
    tokenizer = LongformerTokenizer.from_pretrained(tok_path)
    model = LongformerForSequenceClassification.from_pretrained(path)
    model.num_labels=2
    model.resize_token_embeddings(len(tokenizer)+1)
    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = LongformerForSequenceClassification.from_pretrained(args.output_dir)
        tokenizer = LongformerTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)
    import pickle

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        # eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else args.task_name
        # eval_dataset = load_and_cache_examples(args,eval_task_names, tokenizer, evaluate=True)
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = LongformerForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
        print('')
        print(results)
        with open(args.output_dir+'/eval_results' + '.pkl', 'wb') as f:
            pickle.dump(results, f, pickle.HIGHEST_PROTOCOL)


    if args.do_test and args.local_rank in [-1, 0]:
        # tokenizer = tokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        logger.info("Test the following checkpoint: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = LongformerForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = test(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
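
A hedged note (the entry point is not shown in the snippet): the script would normally be launched through a standard __main__ guard; the script name and flag values below are illustrative only.

if __name__ == "__main__":
    # e.g. python run_longformer_cite.py --data_dir data/AAN --model_type longformer \
    #          --model_name_or_path allenai/longformer-base-4096 --task_name <task> \
    #          --output_dir output/ --do_train --do_eval
    main()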
Example #14
        tokenizer=bpe_tokenizer,
        file_path=input_path_val,
        block_size=block_size
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=bpe_tokenizer, mlm=True, mlm_probability=mlm_probability
    )

    # create model
    config = LongformerConfig(
        attention_window=attention_window,
        sep_token_id=bpe_tokenizer.get_vocab()["</s>"],
        pad_token_id=bpe_tokenizer.get_vocab()["<pad>"],
        bos_token_id=bpe_tokenizer.get_vocab()["<s>"], 
        eos_token_id=bpe_tokenizer.get_vocab()["</s>"],
        vocab_size=bpe_tokenizer.vocab_size,
        max_position_embeddings=max_len+10,
        num_attention_heads=num_attention_heads,
        num_hidden_layers=num_hidden_layers,
        type_vocab_size=1
    )
    
    model = LongformerForMaskedLM(config=config)

    _pretty_print(f"Number of model parameters : {model.num_parameters():,}")

    model_path = os.path.join(output_path, "lm")
    training_args = TrainingArguments(
        output_dir=model_path,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
Example #15
import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import wandb
import os

# In[2]:

config = LongformerConfig()

config

# The datasets library handles the hassle of downloading and processing NLP datasets, which saves a lot of time that would otherwise go into preprocessing before modelling. First we load the data by calling `load_dataset`. If the dataset is not already cached, the library downloads it and saves it in the default datasets folder.
#
# This example provided by HuggingFace uses an older version of datasets (still called nlp) and demonstrates how to use the [trainer class with BERT](https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=5DEWNilys9Ty). Today's tutorial will follow several of the concepts described there.
#
# The dataset class has multiple useful methods to easily load, process and apply transformations to the dataset. We can even load the data and split it into train and test by feeding a list to the split argument.

# In[3]:

train_data, test_data = datasets.load_dataset(
    'imdb',
    split=['train', 'test'],
    cache_dir='/media/jlealtru/data_files/github/website_tutorials/data')
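
A hedged next step (not shown in the truncated snippet): the splits loaded above are usually tokenized with the fast Longformer tokenizer before being handed to a Trainer; the max_length and batch sizes here are illustrative.

tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')

def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=4096)

train_data = train_data.map(tokenize, batched=True, batch_size=16)
test_data = test_data.map(tokenize, batched=True, batch_size=16)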
Example #16
def train(opt):
    # Set random seed for reproducibility
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.seed)
    else:
        torch.manual_seed(opt.seed)

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(opt.gradient_accumulation_steps))
    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    # Logging
    if not os.path.isdir(opt.saved_path):
        os.makedirs(opt.saved_path)
    output_file = open(opt.saved_path + os.sep + "logs.txt", "w")
    output_file.write("Model's parameters: {}".format(vars(opt)))
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)

    # Data Loading
    training_params = {
        "batch_size": opt.batch_size,
        "shuffle": True,
        "drop_last": True
    }
    test_params = {
        "batch_size": opt.batch_size,
        "shuffle": False,
        "drop_last": False
    }

    train_data_path = opt.train_set
    training_set = MafiascumDataset(train_data_path)
    training_generator = DataLoader(training_set, **training_params)

    test_data_path = opt.test_set
    test_set = MafiascumDataset(test_data_path)
    test_generator = DataLoader(test_set, **test_params)

    # Model
    config = LongformerConfig.from_pretrained('longformer-base-4096')
    config.attention_mode = 'sliding_chunks'
    model = LongformerForBinaryClassification(config)
    if torch.cuda.is_available():
        model = model.cuda()
    criterion = nn.BCEWithLogitsLoss()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=opt.lr,
                      weight_decay=0.01,
                      correct_bias=False)
    num_train_optimization_steps = int(
        len(training_generator) / opt.batch_size /
        opt.gradient_accumulation_steps) * opt.num_epoches
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_train_optimization_steps,
        num_warmup_steps=opt.num_warmup_steps)

    # Training
    if opt.gradient_accumulation_steps > 1:
        labels_mini_batch = []
        logits_mini_batch = []

    best_loss = 1e5
    best_epoch = 0
    model.train()
    num_iter_per_epoch = len(training_generator)
    for epoch in range(opt.num_epoches):
        for iteration, (input_ids, attention_mask,
                        labels) in enumerate(training_generator):
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)
            if opt.gradient_accumulation_steps > 1:
                loss = loss / opt.gradient_accumulation_steps
                labels_mini_batch.append(labels)
                logits_mini_batch.append(logits)
            loss.backward()
            if (iteration + 1) % opt.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                if opt.gradient_accumulation_steps > 1:
                    labels = torch.cat(labels_mini_batch, dim=0)
                    logits = torch.cat(logits_mini_batch, dim=0)
                    labels_mini_batch = []
                    logits_mini_batch = []
                training_metrics = get_evaluation(
                    labels.cpu().numpy(),
                    logits.cpu().detach().numpy(),
                    list_metrics=["accuracy"])
                print(
                    "Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {}, Accuracy: {}"
                    .format(epoch + 1, opt.num_epoches, iteration + 1,
                            num_iter_per_epoch,
                            optimizer.param_groups[0]['lr'], loss,
                            training_metrics["accuracy"]))
                writer.add_scalar('Train/Loss', loss,
                                  epoch * num_iter_per_epoch + iteration)
                writer.add_scalar('Train/Accuracy',
                                  training_metrics["accuracy"],
                                  epoch * num_iter_per_epoch + iteration)

        if epoch % opt.test_interval == 0:
            model.eval()
            loss_ls = []
            te_label_ls = []
            te_pred_ls = []
            for (input_ids, attention_mask, labels) in test_generator:
                num_sample = len(labels)
                if torch.cuda.is_available():
                    input_ids = input_ids.cuda()
                    attention_mask = attention_mask.cuda()
                    labels = labels.cuda()
                with torch.no_grad():
                    logits = model(input_ids, attention_mask=attention_mask)
                te_loss = criterion(logits, labels)
                loss_ls.append(te_loss * num_sample)
                te_label_ls.extend(labels.clone().cpu())
                te_pred_ls.append(logits.clone().cpu())
            te_loss = sum(loss_ls) / len(test_set)
            te_pred = torch.cat(te_pred_ls, 0)
            te_label = np.array(te_label_ls)
            test_metrics = get_evaluation(
                te_label,
                te_pred.numpy(),
                list_metrics=["accuracy", "confusion_matrix"])
            output_file.write(
                "Epoch: {}/{} \nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n"
                .format(epoch + 1, opt.num_epoches, te_loss,
                        test_metrics["accuracy"],
                        test_metrics["confusion_matrix"]))
            print("Epoch: {}/{}, Lr: {}, Loss: {}, Accuracy: {}".format(
                epoch + 1, opt.num_epoches, optimizer.param_groups[0]['lr'],
                te_loss, test_metrics["accuracy"]))
            writer.add_scalar('Test/Loss', te_loss, epoch)
            writer.add_scalar('Test/Accuracy', test_metrics["accuracy"], epoch)
            model.train()
            # Update if new best loss achieved
            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                torch.save(model, opt.saved_path + os.sep + "longformer")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    "Stop training at epoch {}. The lowest loss achieved is {}"
                    .format(epoch, best_loss))
                break

    writer.close()
    output_file.close()
Example #17
            '1',  # 32GB gpu with fp32
            '--gradient_accumulation_steps',
            '4',
            #'--evaluate_during_training', # this is removed to reduce training time
            '--do_train',
        ])
    #train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
    #val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
    # these are small file for test
    train_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
    val_fn = '/scratch/xl3119/capstone/data/sample/sample.txt'
    training_args.train_datapath = train_fn
    training_args.val_datapath = val_fn

    ##################### use pretrained longformer in transformers
    init_config = LongformerConfig.from_json_file(
        'config_files/longformer_base_4096/config.json')
    mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
    word_embeddings = np.loadtxt(
        join('/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
    longformer_model = LongformerForMaskedLM(init_config)
    longformer_model = use_embeddings_fasttext(longformer_model,
                                               word_embeddings)
    # longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, mimic_tokenizer, train_only=True, eval_only=False, model_path=None\
                          #,model_path=training_args.output_dir # Local path to the model if the model to train has been instantiated from a local path.

                         )
Example #18
LR = float(sys.argv[4])
MODEL_NAME = str(sys.argv[5])
WEIGHT_NAME = str(sys.argv[6])

# Sample values
# MAX_LEN = 1024
# BATCH_SIZE = 8
# EPOCHS = 2
# LR = 3e-5
# MODEL_NAME = 'allenai/longformer-base-4096/'
# WEIGHT_NAME = 'weights' (.h5)



# Configs for model training - CHANGE ME
configuration = LongformerConfig()
save_path = "/scratch/aj2885/results/longformer-base-4096_"+WEIGHT_NAME[:-1]+"/"
data_path = "/scratch/aj2885/datasets/hotpotqa/"


# Tokenizer
slow_tokenizer = LongformerTokenizerFast.from_pretrained(MODEL_NAME)
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = ByteLevelBPETokenizer(save_path+"vocab.json",save_path+"merges.txt" ,lowercase=True)


Example #19
data_training_args.val_datapath_output = training_args.output_dir + "/wikitext-103/train.tfrecord"
data_training_args.train_datapath = training_args.output_dir + "/wikitext-103/valid.tfrecord"

if __name__ == "__main__":

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    tokenizer = LongformerTokenizer.from_pretrained("longformer-base-4096")
    create_pretraining_data(data_training_args)

    config = LongformerConfig(
        vocab_size=training_args.vocab_size,
        hidden_size=training_args.hidden_size,
        num_hidden_layers=training_args.num_hidden_layers,
        num_attention_heads=training_args.num_attention_heads,
        intermediate_size=training_args.intermediate_size,
        hidden_act=training_args.hidden_act,
        hidden_dropout_prob=training_args.hidden_dropout_prob,
        attention_probs_dropout_prob=training_args.attention_probs_dropout_prob,
        max_position_embeddings=training_args.max_position_embeddings,
        type_vocab_size=training_args.type_vocab_size,
        initializer_range=training_args.initializer_range,
        attention_window=training_args.attention_window,
    )

    # long_formeformer_tokernizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = TFLongformerModel(config=config)
    logger.info('Evaluating roberta-base (seqlen: 4096) for reference ...')
    pretrain_and_evaluate(training_args, model, eval_only=False)