Example 1
def load_transformer(model_type):
    if model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=1)
    elif model_type == "bert_x12":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=1)
    elif model_type == "bert_x24":
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=1)
    elif model_type == "albert_v2_x12":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1)
    elif model_type == "longformer_x12":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-base-4096", num_labels=1)
    elif model_type == "longformer_x24":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-large-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-large-4096", num_labels=1)
    else:
        raise ValueError(f"invalid model_type: {model_type}")

    return model, tokenizer
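A minimal usage sketch for load_transformer above, assuming the TensorFlow classes it returns; the optimizer, learning rate, and toy labels are illustrative assumptions, not part of the original:

import tensorflow as tf

model, tokenizer = load_transformer("longformer_x12")
enc = tokenizer(["first document", "second document"],
                truncation=True, padding=True, return_tensors="tf")
# num_labels=1 makes the head a single-output regressor, so MSE is a natural loss
model.compile(optimizer=tf.keras.optimizers.Adam(3e-5),
              loss=tf.keras.losses.MeanSquaredError())
model.fit(dict(enc), tf.constant([[0.1], [0.9]]), epochs=1)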
Example 2
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(
        os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')

    tokenizer = LongformerTokenizer.from_pretrained(
        "allenai/longformer-base-4096")

    batch_size = 512
    # np.float was removed from modern NumPy; token ids are integers anyway
    all_tokens = np.zeros((len(dataset), 2, 4096), dtype=np.int64)
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)),
                         total=(len(dataset) + batch_size - 1) // batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk,
                                       return_tensors="pt",
                                       truncation=True,
                                       padding="max_length")
            all_tokens[i * batch_size:i * batch_size + len(chunk),
                       0, :] = tokenized_text["input_ids"].numpy()
            all_tokens[i * batch_size:i * batch_size + len(chunk),
                       1, :] = tokenized_text["attention_mask"].numpy()

    data_df = pd.DataFrame(zip(ids, images, all_tokens))
    data_df.to_pickle(
        os.path.join(
            dataset_directory,
            f"longformer_tokens_{jsonlines_filename.split('.')[0]}.pkl"))
Example 3
 def __init__(self, config):
     super(LongformerForBinaryClassification, self).__init__()
     self.config = config
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
     self.longformer = LongformerModel(config)
     self.classifier = nn.Linear(config.hidden_size, 1)
Example 4
def get_hotpotqa_longformer_tokenizer(model_name=PRE_TAINED_LONFORMER_BASE, do_lower_case=True):
    tokenizer = LongformerTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    special_tokens_dict = {'additional_special_tokens': ['<q>', '</q>', '<d>', '<p>']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('Number of added tokens = {}: {}'.format(num_added_toks, special_tokens_dict))
    print('*' * 75)
    return tokenizer
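Because get_hotpotqa_longformer_tokenizer grows the vocabulary, any model paired with it must have its embedding matrix resized (Example 25 below does the same); a short sketch, assuming the base checkpoint:

from transformers import LongformerModel

tokenizer = get_hotpotqa_longformer_tokenizer('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
# Without this, the added <q>/</q>/<d>/<p> ids would index past the embedding table
model.resize_token_embeddings(len(tokenizer))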
Example 5
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Intiailize it if not initialized.

    Args:
        lm (string): the name of the language model (bert, albert, or distilbert)
    Returns:
        BertTokenizer or DistilBertTokenizer or AlbertTokenizer
    """
    global tokenizer
    if tokenizer is None:
        if lm == 'bert':
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif lm == 'distilbert':
            from transformers import DistilBertTokenizer
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        elif lm == 'albert':
            from transformers import AlbertTokenizer
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        elif lm == 'roberta':
            from transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif lm == 'xlnet':
            from transformers import XLNetTokenizer
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        elif lm == 'longformer':
            from transformers import LongformerTokenizer
            tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        else:
            raise ValueError(f"unsupported language model: {lm}")
    return tokenizer
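get_tokenizer relies on a module-level cache that the snippet does not show; a minimal sketch of the assumed setup and one call:

tokenizer = None  # module-level sentinel assumed by get_tokenizer

# The first call loads the tokenizer; later calls reuse the cached instance.
ids = get_tokenizer(lm='longformer')("a long document", truncation=True)["input_ids"]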
Example 6
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(
        os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')

    text_model = LongformerModel.from_pretrained(
        "allenai/longformer-base-4096").to("cuda")
    text_model.eval()
    tokenizer = LongformerTokenizer.from_pretrained(
        "allenai/longformer-base-4096")

    # pool = Pool(processes=48)
    # processed_text = list(tqdm(pool.map(process_text, dataset), total=len(dataset)))
    # pool.close()
    batch_size = 8
    # np.float was removed from modern NumPy; float64 preserves the old alias
    all_embeddings_avg = np.zeros((len(dataset), 768), dtype=np.float64)
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)),
                         total=(len(dataset) + batch_size - 1) // batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk,
                                       return_tensors="pt",
                                       truncation=True,
                                       padding="max_length")
            model_out = text_model(**(tokenized_text.to("cuda")))
            all_embeddings_avg[i * batch_size:i * batch_size +
                               len(chunk), :] = torch.mean(
                                   model_out[0], dim=1).cpu().numpy()

    data_df = pd.DataFrame(zip(ids, images, all_embeddings_avg))
    data_df.to_pickle(
        os.path.join(dataset_directory,
                     f"longformer_{jsonlines_filename.split('.')[0]}.pkl"))
Example 7
    def __init__(self, config_path):
        config = configparser.ConfigParser()
        config.read(config_path)

        self.save_dir = Path(config.get("general", "save_dir"))
        if not self.save_dir.exists():
            self.save_dir.mkdir(parents=True)
        self.clf_th = config.getfloat("general", "clf_th")

        self.mlp_model_path = config.get("model", "mlp")
        assert Path(self.mlp_model_path).exists()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        bert_config_path = config.get("bert", "config_path")
        assert Path(bert_config_path).exists()
        self.bert_config = LongformerConfig.from_json_file(bert_config_path)
        self.max_seq_length = self.bert_config.max_position_embeddings - 2
        self.bert_tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        # bert_tokenizer_path = config.get("bert", "tokenizer_path")
        # assert Path(bert_config_path).exists()
        # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)
        bert_model_path = config.get("bert", "model_path")
        assert Path(bert_model_path).exists()
        self.bert_model = LongformerModel.from_pretrained(
            bert_model_path, config=self.bert_config)
        self.bert_model.to(self.device)
        self.bert_model.eval()

        gold_dir = Path(config.get("data", "gold_dir"))
        assert Path(gold_dir).exists()
        self.gold_dataset = ConllDataset(gold_dir)
        target_dir = Path(config.get("data", "target_dir"))
        assert Path(target_dir).exists()
        self.target_dataset = ConllDataset(target_dir)
Example 8
 def __init__(self):
     self.model = LongformerModel.from_pretrained(
         'allenai/longformer-base-4096')
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
     self.led_tokenizer = LEDTokenizer.from_pretrained(
         'allenai/led-base-16384')
     self.led_model = LEDModel.from_pretrained('allenai/led-base-16384')
Example 9
 def _test_TFLongformer(self, size, large=False):
     from transformers import LongformerTokenizer, TFLongformerModel
     tokenizer = LongformerTokenizer.from_pretrained(size)
     model = TFLongformerModel.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict, max_length=512)
     outputs = ["last_hidden_state"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
Example 10
    def __init__(self, args):
        super().__init__()

        self.args = args
        self.save_hyperparameters(args)

        self.tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")
Example 11
 def __init__(self):
     self.train = None
     self.test = None
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
     self.processor = squad.SquadV2Processor()
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
Example 12
 def build_model(self):
     super().build_model()
     self.tokenizer = LongformerTokenizer.from_pretrained(
         "allenai/longformer-base-4096")
     self.model = LongformerForSequenceClassification.from_pretrained(
         "allenai/longformer-base-4096",
         num_labels=self.num_categories,
     )
     self.model.to("cuda")
Example 13
  def __init__(self, data_path):
    super(MafiascumDataset, self).__init__()

    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    config = LongformerConfig()

    df = pd.read_pickle(data_path, compression="gzip")
    grouped_df = df.groupby(["author", "game_id"])

    labels = []
    inputs = []
    attention_masks = []

    for key, item in grouped_df:
      posts = grouped_df.get_group(key).content.values # All the posts made by a user in a game
      label = grouped_df.get_group(key).scum.values[0] # Boolean
      label = 1 if label else 0 # Int

      num_sentences_in_game = 0
      all_sentences_in_game = []
      all_attention_masks_in_game = []
      for post in posts:
        # A user who never spoke yields no sentences; such games are filtered out below
        sentences = post.split('\n\n')
        for sentence in sentences:
          sentence = sentence.strip()
          if len(sentence) > 0:
            input_ids = tokenizer.encode(sentence, max_length=MAX_SENTENCE_LEN,
                                         truncation=True)
            # 1 for local attention, 2 for global attention, 0 for none (padding)
            # (for our task, mark the <s> start of sentence with 2 for global attention)
            attention_mask = [1] * len(input_ids)
            attention_mask[0] = 2

            all_sentences_in_game += input_ids
            all_attention_masks_in_game += attention_mask
            num_sentences_in_game += 1

      # If the player said less than 10 sentences in a game, we ignore this sample
      if num_sentences_in_game < 10:
        continue

      input_ids = torch.LongTensor(all_sentences_in_game[:MAX_DOC_LEN])
      attention_mask = torch.LongTensor(all_attention_masks_in_game[:MAX_DOC_LEN])
      label = torch.FloatTensor([label])

      inputs.append(input_ids)
      attention_masks.append(attention_mask)
      labels.append(label)

    self.inputs = inputs
    self.attention_masks = attention_masks
    self.labels = labels
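MafiascumDataset depends on two module-level constants left undefined in the snippet; plausible values (4096 matches Longformer's maximum input length; the per-sentence cap and the file path are assumptions):

MAX_SENTENCE_LEN = 128   # assumed per-sentence token budget
MAX_DOC_LEN = 4096       # Longformer's maximum sequence length

dataset = MafiascumDataset("mafiascum_posts.pkl.gz")  # hypothetical path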
Example 14
 def __init__(self,
              vocab,
              unk_token="<unk>",
              max_input_chars_per_word=100,
              never_split=None):
     self.vocab = vocab
     self.unk_token = unk_token
     self.max_input_chars_per_word = max_input_chars_per_word
     self.never_split = never_split
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
Example 15
def summarise_longformer(long_text_to_summarise):
    model_to_load = "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
    tok_to_load = "allenai/longformer-base-4096"
    tokeniser = LongformerTokenizer.from_pretrained(tok_to_load)
    model = EncoderDecoderModel.from_pretrained(model_to_load)
    input_ids = tokeniser(
        long_text_to_summarise,
        return_tensors="pt").input_ids  #.to(device).input_ids
    outputs = model.generate(input_ids)  #.to(device)
    summary = tokeniser.decode(outputs[0], skip_special_tokens=True)
    return summary
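summarise_longformer is self-contained, so using it is a single call (the article text is a placeholder):

article = "A very long news article ..."  # placeholder input
print(summarise_longformer(article))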
Example 16
    def __init__(self, hparams):
        #super().__init__()
        super(TransformerMarco, self).__init__()
        self.hparams = hparams
        self.tokenizer = LongformerTokenizer.from_pretrained(
            hparams.model_name)
        self.model = LongformerForSequenceClassification.from_pretrained(
            hparams.model_name)

        self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None
        self.DatasetClass = MarcoDataset
Example 17
    def make_dataset(self, data_root: str) -> None:
        """ Make Dataset
        Make dataset from json files and save it as csv.

        Args:
            data_root: Root directory for document json files.
        """

        log.info(f"Making dataset...")
        json_paths = glob.glob(f"{data_root}/**/*.json", recursive=True)

        # nltk settings
        nltk.download('punkt')
        stemmer = PorterStemmer()
        cv = CountVectorizer()
        texts = [] # A list of texts tokenized and joined with half-width (ASCII) spaces

        # Longformer
        feature_matrix = []
        device = torch.device('cuda')
        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        model = LongformerModel.from_pretrained('allenai/longformer-base-4096').to(device)
        for json_path in tqdm(json_paths):
            with open(json_path) as f:
                json_obj = json.load(f)
                body = json_obj["body"]

                soup = BeautifulSoup(body, "html.parser")
                for script in soup(["script", "style"]):
                    script.decompose()
                text = soup.get_text()

                with torch.no_grad():
                    # truncate to the model's 4096-token window to avoid indexing errors on long pages
                    input_ids = torch.tensor(tokenizer.encode(text, truncation=True, max_length=4096)).unsqueeze(0).to(device)
                    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)
                    global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=device)
                    outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

                    vec = outputs.last_hidden_state[0].detach().cpu().numpy().mean(0)
                # np.append(feature_matrix, vec)
                feature_matrix.append(list(vec))
                # log.info(f"Done: {len(feature_matrix)}")

                
        feature_matrix = np.array(feature_matrix)
        log.info(f"Longformer: {feature_matrix.shape}")

        # Calculate distance matrix
        dist_mat = squareform(pdist(feature_matrix, metric='cosine'))

        df = pd.DataFrame(dist_mat)
        df.to_csv(join(self.cache_path, "json_document_longformer.csv"), index=False)
        log.info(f"Successfully made dataset.")
Example 18
 def set_tokenizer(self, tokenizer = "roberta"):
     if tokenizer == "longformer":
         from transformers import LongformerTokenizer
         self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
         self.tokenizer_type = tokenizer
     elif tokenizer == "roberta":
         from transformers import RobertaTokenizer
         self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         self.tokenizer_type = tokenizer
     elif tokenizer == "bert":
         from transformers import BertTokenizer
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         self.tokenizer_type = tokenizer
     else:
         print("Error, the tokenizers allowed are 'longformer' , 'roberta' , 'bert' ")
Example 19
 def __init__(self, params):
     super(LongEntityLinker, self).__init__()
     self.params = params
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.n_gpu = torch.cuda.device_count()
     self.use_golden_tags = params['use_golden_tags']
     # init tokenizer
     if params['use_longformer']:
         self.tokenizer = LongformerTokenizer.from_pretrained(
             'allenai/longformer-base-4096')
     else:
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.pad_id = -1
     # init model
     self.model = LongEntityLinkerModule(self.params)
     self.model = self.model.to(self.device)
Example 20
 def load(self, k):
     while self.m.get(k, None) == -1:
         time.sleep(1)  # still loading; wait until ready
     if self.m.get(k, None) is not None:
         return self.m[k]  # it's already loaded
     self.m[k] = -1  # tell others it's loading, wait
     m = None
     if k == 'sentence-encode':
         m = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
         # word_embedding_model = models.Transformer('allenai/longformer-base-4096')
         # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         # m = SentenceTransformer(modules=[word_embedding_model, pooling_model])
     elif k == 'sentiment-analysis':
         tokenizer = AutoTokenizer.from_pretrained(
             "mrm8488/t5-base-finetuned-emotion")
         model = AutoModelWithLMHead.from_pretrained(
             "mrm8488/t5-base-finetuned-emotion").to("cuda")
         # TODO we sure it's not ForSequenceClassification? https://huggingface.co/mrm8488/t5-base-finetuned-emotion
         m = (tokenizer, model, 512)
     elif k == 'summarization':
         # Not using pipelines because can't handle >max_tokens
         # https://github.com/huggingface/transformers/issues/4501
         # https://github.com/huggingface/transformers/issues/4224
         max_tokens = 1024  # 4096
         tokenizer = BartTokenizer.from_pretrained(
             'facebook/bart-large-cnn')
         model = BartForConditionalGeneration.from_pretrained(
             'facebook/bart-large-cnn').to("cuda")
         # model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16").to("cuda")
         # tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
         m = (tokenizer, model, max_tokens)
     elif k == 'question-answering':
         tokenizer = LongformerTokenizer.from_pretrained(
             "allenai/longformer-large-4096-finetuned-triviaqa")
         model = LongformerForQuestionAnswering.from_pretrained(
             "allenai/longformer-large-4096-finetuned-triviaqa",
             return_dict=True).to("cuda")
         # tokenizer = AutoTokenizer.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2")
         # model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2", return_dict=True).to("cuda")
         m = (tokenizer, model, 4096)
     self.m[k] = m
     return m
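A hedged sketch of how the sentinel-based cache above might be called; loader is a hypothetical instance of the class that defines load(). The first caller stores -1 while loading and concurrent callers poll until the entry is populated:

tokenizer, model, max_tokens = loader.load('question-answering')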
Example 21
    def __init__(self, params):
        super().__init__()
        
        if 'dropout' in params:
            self.dropout = nn.Dropout(p=params['dropout'])
        else:
            self.dropout = None
            
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False, do_basic_tokenize=False)
#         self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.max_length = params['max_length'] if 'max_length' in params else 1024
        self.max_memory_size = params['max_memory_size']
        
        self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        self.bert = LongformerModel.from_pretrained("allenai/longformer-base-4096", gradient_checkpointing=True)

        self.num_labels = params["label_length"] if 'label_length' in params else 2

        self.fc = nn.Linear(768, self.num_labels)
Example 22
 def __init__(self, params):
     super(LongEncoderRanker, self).__init__()
     self.params = params
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     self.n_gpu = torch.cuda.device_count() # todo
     #self.num_tags = 4 if not self.params['end_tag'] else 5
     #self.num_tags = 3 if not self.params['end_tag'] else 4
     self.num_tags = 9 if self.params['conll'] else 3
     self.is_biencoder = params['is_biencoder']
     self.use_golden_tags = not params['not_use_golden_tags']
     # init tokenizer
     if params['use_longformer']:
         self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
     else:
         #self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
     #self.pad_id = 0
     self.pad_id = -1
     # init model
     self.model = LongEncoderModule(self.params)
     self.model = self.model.to(self.device)
Example 23
def get_par_train_data_loader(rank,
                              args) -> (DataLoader, DistributedSampler, int):
    data_frame = read_train_dev_data_frame(file_path=args.data_path,
                                           json_fileName=args.train_data_name)
    data_size = data_frame.shape[0]
    if args.train_data_filtered == 1:
        data_frame = data_frame[data_frame['level'] != 'easy']
        logging.info('Filtered data by removing easy case {} to {}'.format(
            data_size, data_frame.shape[0]))
    elif args.train_data_filtered == 2:
        data_frame = data_frame[data_frame['level'] == 'hard']
        logging.info(
            'Filtered data by removing easy and medium case {} to {}'.format(
                data_size, data_frame.shape[0]))
    else:
        logging.info('Using all training data {}'.format(data_size))
    data_size = data_frame.shape[0]

    num_replicas = args.world_size
    tokenizer = LongformerTokenizer.from_pretrained(args.pretrained_cfg_name,
                                                    do_lower_case=True)
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=tokenizer,
                                               max_length=args.max_ctx_len)
    dataset = HotpotTrainDataset(data_frame=data_frame,
                                 hotpot_tensorizer=hotpot_tensorizer,
                                 max_sent_num=args.max_sent_num)
    batch_size = args.batch_size // num_replicas
    logging.info('Each node batch size = {}'.format(batch_size))
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset=dataset, rank=rank, num_replicas=num_replicas)
    train_dataloader = DataLoader(dataset=dataset,
                                  batch_size=batch_size,
                                  num_workers=max(1, args.cpu_num // 2),
                                  collate_fn=HotpotTrainDataset.collate_fn,
                                  shuffle=False,
                                  pin_memory=True,
                                  sampler=train_sampler)
    return train_dataloader, train_sampler, data_size
Example 24
 def __init__(self, model_name: str = "allenai/longformer-base-4096"):
     self.model = LongformerModel.from_pretrained(model_name)
     self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
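A sketch of embedding one document with this wrapper; the class name and the mean pooling are assumptions (the pooling mirrors Example 6 above):

import torch

encoder = LongformerEncoder()  # hypothetical name for the class this __init__ belongs to
inputs = encoder.tokenizer("a long document", return_tensors="pt", truncation=True)
with torch.no_grad():
    doc_vec = encoder.model(**inputs).last_hidden_state.mean(dim=1)  # shape (1, 768)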
Example 25
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)

    model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')
    tokenizer.add_tokens(['<doc-s>', '</doc-s>'], special_tokens=True)

    data_args.block_size = 4096

    train_dataset = get_dataset(data_args,
                                tokenizer=tokenizer,
                                local_rank=training_args.local_rank)
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability,
        globalize_special_tokens=data_args.globalize_special_tokens)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        # eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    model_path = (model_args.model_name_or_path
                  if model_args.model_name_or_path is not None
                  and os.path.isdir(model_args.model_name_or_path) else None)
    trainer.train(model_path=model_path)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

    results = {}
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    perplexity = math.exp(eval_output["loss"])
    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(training_args.output_dir,
                                    "eval_results_lm.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    results.update(result)

    return results
Example 26
            '5.0',
            '--per_gpu_eval_batch_size',
            '2',
            '--per_gpu_train_batch_size',
            '1',  # 32GB gpu with fp32
            '--gradient_accumulation_steps',
            '32',
            #'--evaluate_during_training', # this is removed to reduce training time
            '--do_train',
            '--do_eval',
        ])
    train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
    val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'
    # these are small files for testing
    #     train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
    #     val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'
    training_args.val_datapath = val_fn
    training_args.train_datapath = train_fn

    ##################### use pretrained Longformer from transformers
    longformer_model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    longformer_tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, longformer_tokenizer,
                          eval_only=False, model_path=None)
    # model_path=training_args.output_dir  # local path if the model to train
    # was instantiated from a local checkpoint
Example 27
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = LongformerTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = LongformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    #train_dataset  = torch.load(data_args.train_file_path)
    #eval_dataset = torch.load(data_args.valid_file_path)
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples,
                              data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples,
                             data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
Example 28
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=input_ids)  # current transformers uses labels (formerly masked_lm_labels)

loss, prediction_scores = outputs[:2]
print(prediction_scores)

## Longformer
from transformers import LongformerModel, LongformerTokenizer

model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(
    0)  # batch of size 1

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(
    input_ids.shape, dtype=torch.long,
    device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task
# (for example, classification: the <s> token)
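Recent transformers releases split the old 2-valued mask into a binary attention_mask plus a separate global_attention_mask (compare Example 17 above); a sketch of the equivalent global-attention marking:

global_attention_mask = torch.zeros_like(input_ids)
global_attention_mask[:, [1, 4, 21]] = 1  # global attention at task-chosen positions
outputs = model(input_ids,
                attention_mask=torch.ones_like(input_ids),
                global_attention_mask=global_attention_mask)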
Example 29
def data_consistent_checker():
    tokenizer = LongformerTokenizer.from_pretrained(PRE_TAINED_LONFORMER_BASE, do_lower_case=True)
    def supp_fact_check(row):
        support_facts, filtered_support_facts = row['supporting_facts'], row['supp_facts_filtered']
        for x in support_facts:
            print('supp {}'.format(x))

        for x in filtered_support_facts:
            print('filtered supp {}'.format(x))

    def answer_check(row):
        answer_encode_id = row['answer_encode']
        answer_norm = row['norm_answer']
        orig_answer = row['answer']
        print('Decode = {}\nnorm = {}\norig = {}'.format(tokenizer.decode(answer_encode_id, skip_special_tokens=True), answer_norm, orig_answer))

    def support_sentence_checker(row):
        filtered_support_facts = row['supp_facts_filtered']
        for x in filtered_support_facts:
            print(x)
        print('=' * 100)
        p_ctx = row['p_ctx']
        for idx, context in enumerate(p_ctx):
            print(context[0])
            print('context={}\nnum sents={}'.format(context[1], len(context[1])))
            print('*'*75)
        print('+'*100)
        p_ctx_encode = row['p_ctx_encode']
        # print(len(p_ctx_encode), len(p_ctx))
        for idx, context in enumerate(p_ctx_encode):
            p_doc_encode_ids, p_doc_weight, p_doc_len_i, sent_start_end_pair, supp_sent_labels, ctx_with_answer, answer_positions, p_title_len = context
            print('encode {}\nwith len {}\nstore len {}'.format(p_doc_encode_ids, len(p_doc_encode_ids), p_doc_len_i))
            print('sent pair = {}\nnum sents ={}'.format(sent_start_end_pair, len(sent_start_end_pair)))
            print('sent labels = {}'.format(supp_sent_labels))
            print('context len = {}'.format(len(context)))
            print('context with answer = {}'.format(ctx_with_answer))
            print('title = {}'.format(tokenizer.decode(p_doc_encode_ids[:p_title_len], skip_special_tokens=True)))
            print('answer position = {}'.format(answer_positions))
            if len(answer_positions) > 0:
                sent_start, sent_end = sent_start_end_pair[answer_positions[0][0]]
                support_sentence = tokenizer.decode(p_doc_encode_ids[sent_start:(sent_end + 1)], skip_special_tokens=True)
                print('sentence idx={}, Decode sentence = {}'.format(answer_positions[0][0], support_sentence))
                sentence_ids = p_doc_encode_ids[sent_start:(sent_end + 1)]
                decode_answer = tokenizer.decode(sentence_ids[answer_positions[0][1]:(answer_positions[0][2]+1)], skip_special_tokens=True)
                print('decode answer = {}, orig answer = {}'.format(decode_answer, row['norm_answer']))
            print(context[1])
            print('*'*75)
        print('+' * 100)

        print('p_ctx_lens', row['p_ctx_lens'])

    def doc_order_checker(row):
        pos_docs = row['p']

    '''
    _id, answer, question, supporting_facts, context, type, level, norm_query, norm_answer, p_ctx, n_ctx, supp_facts_filtered,
    answer_type, p_doc_num, n_doc_num, yes_no, no_found, ques_encode, ques_len, answer_encode, answer_len, p_ctx_encode,
    p_ctx_lens, pc_max_len, n_ctx_encode, n_ctx_lens, nc_max_len
    :return:
    '''
    data_frame = loadWikiData(PATH=abs_distractor_wiki_path,
                              json_fileName='hotpot_train_distractor_wiki_tokenized.json')
    print('Data frame size = {}'.format(data_frame.shape))
    record_num = data_frame.shape[0]
    row_num = 2
    random_idx = np.random.choice(record_num, row_num, replace=False)
    for idx in range(row_num):
        row_i = data_frame.loc[random_idx[idx], :]
        # supp_fact_check(row=row_i)
        # answer_check(row=row_i)
        support_sentence_checker(row=row_i)
        print('$' * 90)
Example 30
    def __init__(self, config_path):
        config = configparser.ConfigParser()
        config.read(config_path)

        self.n_epoch = config.getint("general", "n_epoch")
        self.batch_size = config.getint("general", "batch_size")
        self.train_bert = config.getboolean("general", "train_bert")
        self.lr = config.getfloat("general", "lr")
        self.cut_frac = config.getfloat("general", "cut_frac")
        self.log_dir = Path(config.get("general", "log_dir"))
        if not self.log_dir.exists():
            self.log_dir.mkdir(parents=True)
        self.model_save_freq = config.getint("general", "model_save_freq")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # bert_config_path = config.get("bert", "config_path")
        # bert_tokenizer_path = config.get("bert", "tokenizer_path")
        # bert_model_path = config.get("bert", "model_path")

        self.bert_tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        # self.bert_tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_path)
        tkzer_save_dir = self.log_dir / "tokenizer"
        if not tkzer_save_dir.exists():
            tkzer_save_dir.mkdir()
        self.bert_tokenizer.save_pretrained(tkzer_save_dir)
        self.bert_model = LongformerModel.from_pretrained(
            'allenai/longformer-base-4096')
        self.bert_config = self.bert_model.config
        # self.bert_config = BertConfig.from_pretrained(bert_config_path)
        # self.bert_model = BertModel.from_pretrained(bert_model_path, config=self.bert_config)
        self.max_seq_length = self.bert_config.max_position_embeddings - 2
        # self.max_seq_length = self.bert_config.max_position_embeddings
        self.bert_model.to(self.device)

        if self.train_bert:
            self.bert_model.train()
        else:
            self.bert_model.eval()

        train_conll_path = config.get("data", "train_path")
        print("train path", train_conll_path)
        assert Path(train_conll_path).exists()
        dev_conll_path = config.get("data", "dev_path")
        print("dev path", dev_conll_path)
        assert Path(dev_conll_path).exists()
        dev1_conll_path = Path(dev_conll_path) / "1"
        print("dev1 path", dev1_conll_path)
        assert dev1_conll_path.exists()
        dev2_conll_path = Path(dev_conll_path) / "2"
        print("dev2 path", dev2_conll_path)
        assert dev2_conll_path.exists()
        self.train_dataset = ConllDataset(train_conll_path)
        # self.dev_dataset = ConllDataset(dev_conll_path)
        self.dev1_dataset = ConllDataset(dev1_conll_path)
        self.dev2_dataset = ConllDataset(dev2_conll_path)
        if self.batch_size == -1:
            self.batch_size = len(self.train_dataset)

        self.scaler = torch.cuda.amp.GradScaler()
        tb_cmt = f"lr_{self.lr}_cut-frac_{self.cut_frac}"
        self.writer = SummaryWriter(log_dir=self.log_dir, comment=tb_cmt)