Code Example #1
File: hotpotQAUtils.py Project: xjtuwgt/multiDocQA
import itertools
import operator

from transformers import LongformerTokenizer

# SPECIAL_TITLE_START, SPECIAL_TITLE_END, and SPECIAL_SENTENCE_TOKEN are
# special-token strings defined elsewhere in this project.
def document_encoder(title: str, doc_sents: list,
                     tokenizer: LongformerTokenizer):
    title_res = SPECIAL_TITLE_START + title + SPECIAL_TITLE_END
    title_tokens = tokenizer.tokenize(text=title_res)
    title_encode_ids = tokenizer.encode(text=title_tokens,
                                        add_special_tokens=False)
    assert len(title_tokens) == len(title_encode_ids)
    title_len = len(title_encode_ids)
    encode_id_lens = [title_len]
    doc_encode_id_list = [title_encode_ids]
    for sent_idx, sent_text in enumerate(doc_sents):
        sent_text_res = sent_text + SPECIAL_SENTENCE_TOKEN
        sent_tokens = tokenizer.tokenize(text=sent_text_res)
        sent_encode_ids = tokenizer.encode(text=sent_tokens,
                                           add_special_tokens=False)
        assert len(sent_tokens) == len(sent_encode_ids)
        doc_encode_id_list.append(sent_encode_ids)
        encode_id_lens.append(len(sent_encode_ids))
    doc_sent_len_cum_list = list(
        itertools.accumulate(encode_id_lens, operator.add))
    sent_start_end_pair = [(doc_sent_len_cum_list[i],
                            doc_sent_len_cum_list[i + 1] - 1)
                           for i in range(len(encode_id_lens) - 1)]
    doc_encode_ids = list(itertools.chain.from_iterable(doc_encode_id_list))
    assert len(doc_encode_ids) == doc_sent_len_cum_list[-1]
    assert len(sent_start_end_pair) == len(doc_sents)
    return doc_encode_ids, sent_start_end_pair, len(doc_encode_ids), title_len
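A usage sketch (hypothetical inputs; it assumes the SPECIAL_* constants have been registered as additional special tokens, e.g. via get_hotpotqa_longformer_tokenizer from Code Example #5):

doc_ids, sent_spans, doc_len, title_len = document_encoder(
    title="Example Title",
    doc_sents=["First sentence.", "Second sentence."],
    tokenizer=get_hotpotqa_longformer_tokenizer())
# sent_spans[i] is the inclusive (start, end) token span of sentence i,
# offset past the encoded title.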
Code Example #2
# CLS_TOKEN, SPECIAL_QUERY_START, and SPECIAL_QUERY_END are special-token
# strings defined elsewhere in this project.
def query_encoder(query: str, tokenizer: LongformerTokenizer):
    query_res = CLS_TOKEN + SPECIAL_QUERY_START + query + SPECIAL_QUERY_END
    query_tokens = tokenizer.tokenize(text=query_res)
    query_encode_ids = tokenizer.encode(text=query_tokens, add_special_tokens=False)
    assert len(query_tokens) == len(query_encode_ids)
    query_len = len(query_encode_ids)
    return query_encode_ids, query_len
Code Example #3
def load_transformer(model_type):
    if model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=1)
    elif model_type == "bert_x12":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=1)
    elif model_type == "bert_x24":
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=1)
    elif model_type == "albert_v2_x12":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1)
    elif model_type == "longformer_x12":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-base-4096", num_labels=1)
    elif model_type == "longformer_x24":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-large-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-large-4096", num_labels=1)
    else:
        raise ValueError(model_type + " was invalid")

    return model, tokenizer
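A usage sketch (hypothetical input text; every branch above configures the classification head with num_labels=1, i.e. a single regression-style score):

model, tokenizer = load_transformer("longformer_x12")
inputs = tokenizer("some document text", return_tensors="tf",
                   truncation=True, padding=True)
logits = model(inputs).logits  # shape (1, 1): one unnormalized score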
Code Example #4
File: hotpotQAUtils.py Project: xjtuwgt/multiDocQA
def answer_span_token_finder(norm_answer: str, sentence: str,
                             tokenizer: LongformerTokenizer):
    answer_encode_ids = tokenizer.encode(text=norm_answer,
                                         add_special_tokens=False)
    sentence_encode_ids = tokenizer.encode(text=sentence,
                                           add_special_tokens=False)
    idx = sub_list_finder(target=answer_encode_ids, source=sentence_encode_ids)
    flag = idx >= 0
    return flag, answer_encode_ids, sentence_encode_ids
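sub_list_finder itself is not shown; a minimal implementation consistent with how it is used here (returning the start index of the answer ids inside the sentence ids, or -1 when absent) might look like:

def sub_list_finder(target: list, source: list) -> int:
    # Scan `source` for the first contiguous occurrence of `target`.
    if not target:
        return -1
    for i in range(len(source) - len(target) + 1):
        if source[i:i + len(target)] == target:
            return i
    return -1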
Code Example #5
def get_hotpotqa_longformer_tokenizer(model_name=PRE_TAINED_LONFORMER_BASE, do_lower_case=True):
    tokenizer = LongformerTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    special_tokens_dict = {'additional_special_tokens': ['<q>', '</q>', '<d>', '<p>']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('Number of added tokens = {}: {}'.format(num_added_toks, special_tokens_dict))
    print('*' * 75)
    return tokenizer
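Since add_special_tokens grows the vocabulary, any model paired with this tokenizer must have its embedding matrix resized; a minimal sketch (model checkpoint assumed):

from transformers import LongformerModel

tokenizer = get_hotpotqa_longformer_tokenizer('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model.resize_token_embeddings(len(tokenizer))  # make room for <q> </q> <d> <p>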
Code Example #6
    def __init__(self, config_path):
        config = configparser.ConfigParser()
        config.read(config_path)

        self.save_dir = Path(config.get("general", "save_dir"))
        if not self.save_dir.exists():
            self.save_dir.mkdir(parents=True)
        self.clf_th = config.getfloat("general", "clf_th")

        self.mlp_model_path = config.get("model", "mlp")
        assert Path(self.mlp_model_path).exists()

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        bert_config_path = config.get("bert", "config_path")
        assert Path(bert_config_path).exists()
        self.bert_config = LongformerConfig.from_json_file(bert_config_path)
        # RoBERTa-derived models reserve two position-embedding slots, so the
        # usable sequence length is max_position_embeddings - 2.
        self.max_seq_length = self.bert_config.max_position_embeddings - 2
        self.bert_tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        # bert_tokenizer_path = config.get("bert", "tokenizer_path")
        # assert Path(bert_config_path).exists()
        # self.bert_tokenizer = LongformerTokenizer.from_pretrained(bert_tokenizer_path)
        bert_model_path = config.get("bert", "model_path")
        assert Path(bert_model_path).exists()
        self.bert_model = LongformerModel.from_pretrained(
            bert_model_path, config=self.bert_config)
        self.bert_model.to(self.device)
        self.bert_model.eval()

        gold_dir = Path(config.get("data", "gold_dir"))
        assert Path(gold_dir).exists()
        self.gold_dataset = ConllDataset(gold_dir)
        target_dir = Path(config.get("data", "target_dir"))
        assert Path(target_dir).exists()
        self.target_dataset = ConllDataset(target_dir)
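The reads above imply a config file with [general], [model], [bert], and [data] sections; a sketch of building one programmatically (all paths are placeholders, not from the original project):

import configparser

config = configparser.ConfigParser()
config.read_dict({
    "general": {"save_dir": "./output", "clf_th": "0.5"},
    "model":   {"mlp": "./models/mlp.pt"},
    "bert":    {"config_path": "./longformer/config.json",
                "model_path": "./longformer-model"},
    "data":    {"gold_dir": "./data/gold", "target_dir": "./data/target"},
})
with open("config.ini", "w") as f:
    config.write(f)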
Code Example #7
 def __init__(self, config):
     super(LongformerForBinaryClassification, self).__init__()
     self.config = config
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')  # full hub id; bare 'longformer-base-4096' does not resolve
     self.longformer = LongformerModel(config)
     self.classifier = nn.Linear(config.hidden_size, 1)
Code Example #8
def get_tokenizer(lm='bert'):
    """Return the tokenizer. Intiailize it if not initialized.

    Args:
        lm (string): the name of the language model (bert, albert, or distilbert)
    Returns:
        BertTokenizer or DistilBertTokenizer or AlbertTokenizer
    """
    global tokenizer
    if tokenizer is None:
        if lm == 'bert':
            from transformers import BertTokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif lm == 'distilbert':
            from transformers import DistilBertTokenizer
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        elif lm == 'albert':
            from transformers import AlbertTokenizer
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        elif lm == 'roberta':
            from transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        elif lm == 'xlnet':
            from transformers import XLNetTokenizer
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        elif lm == 'longformer':
            from transformers import LongformerTokenizer
            tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        else:
            raise ValueError(lm + " was invalid")
    return tokenizer
Code Example #9
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(
        os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')

    text_model = LongformerModel.from_pretrained(
        "allenai/longformer-base-4096").to("cuda")
    text_model.eval()
    tokenizer = LongformerTokenizer.from_pretrained(
        "allenai/longformer-base-4096")

    # pool = Pool(processes=48)
    # processed_text = list(tqdm(pool.map(process_text, dataset), total=len(dataset)))
    # pool.close()
    batch_size = 8
    all_embeddings_avg = np.zeros((len(dataset), 768), dtype=np.float64)  # np.float was removed in NumPy >= 1.24
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)),
                         total=len(dataset) / batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk,
                                       return_tensors="pt",
                                       truncation=True,
                                       padding="max_length")
            model_out = text_model(**(tokenized_text.to("cuda")))
            all_embeddings_avg[i * batch_size:i * batch_size +
                               len(chunk), :] = torch.mean(
                                   model_out[0], dim=1).cpu().numpy()

    data_df = pd.DataFrame(zip(ids, images, all_embeddings_avg))
    data_df.to_pickle(
        os.path.join(dataset_directory,
                     f"longformer_{jsonlines_filename.split('.')[0]}.pkl"))
Code Example #10
def main(dataset_directory, jsonlines_filename):
    dataset, ids, images = extract_article_list(
        os.path.join(dataset_directory, jsonlines_filename))
    print(f'Len dataset = {len(dataset)}')

    tokenizer = LongformerTokenizer.from_pretrained(
        "allenai/longformer-base-4096")

    batch_size = 512
    all_tokens = np.zeros((len(dataset), 2, 4096), dtype=np.float64)  # np.float was removed in NumPy >= 1.24
    for i, chunk in tqdm(enumerate(chunks(dataset, batch_size)),
                         total=len(dataset) / batch_size):
        with torch.no_grad():
            tokenized_text = tokenizer(chunk,
                                       return_tensors="pt",
                                       truncation=True,
                                       padding="max_length")
            all_tokens[i * batch_size:i * batch_size + len(chunk),
                       0, :] = tokenized_text["input_ids"].numpy()
            all_tokens[i * batch_size:i * batch_size + len(chunk),
                       1, :] = tokenized_text["attention_mask"].numpy()

    data_df = pd.DataFrame(zip(ids, images, all_tokens.astype(np.int_)))
    data_df.to_pickle(
        os.path.join(
            dataset_directory,
            f"longformer_tokens_{jsonlines_filename.split('.')[0]}.pkl"))
Code Example #11
 def __init__(self):
     self.model = LongformerModel.from_pretrained(
         'allenai/longformer-base-4096')
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
     self.led_tokenizer = LEDTokenizer.from_pretrained(
         'allenai/led-base-16384')
     self.led_model = LEDModel.from_pretrained('allenai/led-base-16384')
Code Example #12
 def _test_TFLongformer(self, size, large=False):
     from transformers import LongformerTokenizer, TFLongformerModel
     tokenizer = LongformerTokenizer.from_pretrained(size)
     model = TFLongformerModel.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict, max_length=512)
     outputs = ["last_hidden_state"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
Code Example #13
    def __init__(self, args):
        super().__init__()

        self.args = args
        self.save_hyperparameters(args)

        self.tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096")
Code Example #14
 def __init__(self):
     self.train = None
     self.test = None
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
     self.processor = squad.SquadV2Processor()
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
Code Example #15
 def build_model(self):
     super().build_model()
     self.tokenizer = LongformerTokenizer.from_pretrained(
         "allenai/longformer-base-4096")
     self.model = LongformerForSequenceClassification.from_pretrained(
         "allenai/longformer-base-4096",
         num_labels=self.num_categories,
     )
     self.model.to("cuda")
Code Example #16
  def __init__(self, data_path):
    super(MafiascumDataset, self).__init__()

    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    config = LongformerConfig()

    df = pd.read_pickle(data_path, compression="gzip")
    grouped_df = df.groupby(["author", "game_id"])

    labels = []
    inputs = []
    attention_masks = []

    for key, item in grouped_df:
      posts = grouped_df.get_group(key).content.values # All the posts made by a user in a game
      label = grouped_df.get_group(key).scum.values[0] # Boolean
      label = 1 if label else 0 # Int

      num_sentences_in_game = 0
      all_sentences_in_game = []
      all_attention_masks_in_game = []
      # Only consider games where the user has spoken at least once; iterating
      # over `posts` already skips users with no posts, so no extra check is needed.
      for post in posts:

          sentences = post.split('\n\n')
          for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) > 0:
              input_ids = tokenizer.encode(sentence, max_length=MAX_SENTENCE_LEN)
              # 1 for local attention, 2 for global attention, 0 for none (padding)
              # (for our task, mark <s> start of sentence with 2 to have global attention)
              attention_mask  = [1 for _ in range(len(input_ids))]
              attention_mask[0] = 2

              all_sentences_in_game += input_ids
              all_attention_masks_in_game += attention_mask
              num_sentences_in_game += 1

      # If the player said less than 10 sentences in a game, we ignore this sample
      if num_sentences_in_game < 10:
        continue

      input_ids = torch.LongTensor(all_sentences_in_game[:MAX_DOC_LEN])
      attention_mask = torch.LongTensor(all_attention_masks_in_game[:MAX_DOC_LEN])
      label = torch.FloatTensor([label])

      inputs.append(input_ids)
      attention_masks.append(attention_mask)
      labels.append(label)

    self.inputs = inputs
    self.attention_masks = attention_masks
    self.labels = labels
Code Example #17
 def __init__(self,
              vocab,
              unk_token="<unk>",
              max_input_chars_per_word=100,
              never_split=None):
     self.vocab = vocab
     self.unk_token = unk_token
     self.max_input_chars_per_word = max_input_chars_per_word
     self.never_split = never_split
     self.tokenizer = LongformerTokenizer.from_pretrained(
         'allenai/longformer-base-4096')
Code Example #18
    def __init__(self, hparams):
        #super().__init__()
        super(TransformerMarco, self).__init__()
        self.hparams = hparams
        self.tokenizer = LongformerTokenizer.from_pretrained(
            hparams.model_name)
        self.model = LongformerForSequenceClassification.from_pretrained(
            hparams.model_name)

        self.train_dataloader_object = self.val_dataloader_object = self.test_dataloader_object = None
        self.DatasetClass = MarcoDataset
Code Example #19
def summarise_longformer(long_text_to_summarise):
    model_to_load = "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
    tok_to_load = "allenai/longformer-base-4096"
    tokeniser = LongformerTokenizer.from_pretrained(tok_to_load)
    model = EncoderDecoderModel.from_pretrained(model_to_load)
    input_ids = tokeniser(
        long_text_to_summarise,
        return_tensors="pt").input_ids  #.to(device).input_ids
    outputs = model.generate(input_ids)  #.to(device)
    summary = tokeniser.decode(outputs[0], skip_special_tokens=True)
    return summary
Code Example #20
    def make_dataset(self, data_root: str) -> None:
        """ Make Dataset
        Make dataset from json files and save it as csv.

        Args:
            data_root: Root directory for document json files.
        """

        log.info(f"Making dataset...")
        json_paths = glob.glob(f"{data_root}/**/*.json", recursive=True)

        # nltk settings
        nltk.download('punkt')
        stemmer = PorterStemmer()
        cv = CountVectorizer()
        texts = [] # A list of texts whose tokens are joined with ASCII (half-width) spaces

        # Longformer
        feature_matrix = []
        device = torch.device('cuda')
        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        model = LongformerModel.from_pretrained('allenai/longformer-base-4096').to(device)
        for json_path in tqdm(json_paths):
            with open(json_path) as f:
                json_obj = json.load(f)
                body = json_obj["body"]

                soup = BeautifulSoup(body, "html.parser")
                for script in soup(["script", "style"]):
                    script.decompose()
                text = soup.get_text()

                with torch.no_grad():
                    # Truncate to the model's 4096-token window so long
                    # documents do not overrun the position embeddings.
                    input_ids = torch.tensor(tokenizer.encode(
                        text, truncation=True, max_length=4096)).unsqueeze(0).to(device)
                    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device).to(device)
                    global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device).to(device)
                    outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

                    vec = outputs.last_hidden_state[0].cpu().detach().clone().numpy().mean(0)
                # np.append(feature_matrix, vec)
                feature_matrix.append(list(vec))
                # log.info(f"Done: {len(feature_matrix)}")

        feature_matrix = np.array(feature_matrix)
        log.info(f"Longformer: {feature_matrix.shape}")

        # Calculate distance matrix
        dist_mat = squareform(pdist(feature_matrix, metric='cosine'))

        df = pd.DataFrame(dist_mat)
        df.to_csv(join(self.cache_path, "json_document_longformer.csv"), index=False)
        log.info(f"Successfully made dataset.")
Code Example #21
File: long_answers.py Project: giuid/HLT
 def set_tokenizer(self, tokenizer = "roberta"):
     if tokenizer == "longformer":
         from transformers import LongformerTokenizer
         self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
         self.tokenizer_type = tokenizer
     elif tokenizer == "roberta":
         from transformers import RobertaTokenizer
         self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         self.tokenizer_type = tokenizer
     elif tokenizer == "bert":
         from transformers import BertTokenizer
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         self.tokenizer_type = tokenizer
     else:
         print("Error, the tokenizers allowed are 'longformer' , 'roberta' , 'bert' ")
Code Example #22
 def __init__(self, params):
     super(LongEntityLinker, self).__init__()
     self.params = params
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.n_gpu = torch.cuda.device_count()
     self.use_golden_tags = params['use_golden_tags']
     # init tokenizer
     if params['use_longformer']:
         self.tokenizer = LongformerTokenizer.from_pretrained(
             'allenai/longformer-base-4096')
     else:
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.pad_id = -1
     # init model
     self.model = LongEntityLinkerModule(self.params)
     self.model = self.model.to(self.device)
Code Example #23
 def load(self, k):
     while self.m.get(k, None) == -1:
         time.sleep(1)  # another thread is loading it; wait till ready
     if self.m.get(k, None) is not None:
         return self.m[k]  # it's already loaded
     self.m[k] = -1  # tell others it's loading, wait
     m = None
     if k == 'sentence-encode':
         m = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
         # word_embedding_model = models.Transformer('allenai/longformer-base-4096')
         # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         # m = SentenceTransformer(modules=[word_embedding_model, pooling_model])
     elif k == 'sentiment-analysis':
         tokenizer = AutoTokenizer.from_pretrained(
             "mrm8488/t5-base-finetuned-emotion")
         model = AutoModelWithLMHead.from_pretrained(
             "mrm8488/t5-base-finetuned-emotion").to("cuda")
         # TODO we sure it's not ForSequenceClassification? https://huggingface.co/mrm8488/t5-base-finetuned-emotion
         m = (tokenizer, model, 512)
     elif k == 'summarization':
         # Not using pipelines because can't handle >max_tokens
         # https://github.com/huggingface/transformers/issues/4501
         # https://github.com/huggingface/transformers/issues/4224
         max_tokens = 1024  # 4096
         tokenizer = BartTokenizer.from_pretrained(
             'facebook/bart-large-cnn')
         model = BartForConditionalGeneration.from_pretrained(
             'facebook/bart-large-cnn').to("cuda")
         # model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16").to("cuda")
         # tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
         m = (tokenizer, model, max_tokens)
     elif k == 'question-answering':
         tokenizer = LongformerTokenizer.from_pretrained(
             "allenai/longformer-large-4096-finetuned-triviaqa")
         model = LongformerForQuestionAnswering.from_pretrained(
             "allenai/longformer-large-4096-finetuned-triviaqa",
             return_dict=True).to("cuda")
         # tokenizer = AutoTokenizer.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2")
         # model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2", return_dict=True).to("cuda")
         m = (tokenizer, model, 4096)
     self.m[k] = m
     return m
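A usage sketch for the 'question-answering' entry (the span decoding is an assumption, not part of the snippet; `loader` stands for an instance of the class above):

tokenizer, model, max_tokens = loader.load('question-answering')
encoding = tokenizer("Who wrote Hamlet?",
                     "Hamlet is a tragedy written by William Shakespeare.",
                     return_tensors="pt", truncation=True,
                     max_length=max_tokens).to("cuda")
outputs = model(**encoding)
start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
answer = tokenizer.decode(encoding["input_ids"][0][start:end + 1])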
Code Example #24
    def __init__(self, params):
        super().__init__()
        
        if 'dropout' in params:
            self.dropout = nn.Dropout(p=params['dropout'])
        else:
            self.dropout = None
            
#         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False, do_basic_tokenize=False)
#         self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.max_length = params['max_length'] if 'max_length' in params else 1024
        self.max_memory_size = params['max_memory_size']
        
        self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
        self.bert = LongformerModel.from_pretrained("allenai/longformer-base-4096", gradient_checkpointing=True)

        self.num_labels = params["label_length"] if 'label_length' in params else 2

        self.fc = nn.Linear(768, self.num_labels)
Code Example #25
 def __init__(self, params):
     super(LongEncoderRanker, self).__init__()
     self.params = params
     self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     self.n_gpu = torch.cuda.device_count() # todo
     #self.num_tags = 4 if not self.params['end_tag'] else 5
     #self.num_tags = 3 if not self.params['end_tag'] else 4
     self.num_tags = 9 if self.params['conll'] else 3
     self.is_biencoder = params['is_biencoder']
     self.use_golden_tags = not params['not_use_golden_tags']
     # init tokenizer
     if params['use_longformer']:
         self.tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
     else:
         #self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
     #self.pad_id = 0
     self.pad_id = -1
     # init model
     self.model = LongEncoderModule(self.params)
     self.model = self.model.to(self.device)
Code Example #26
def get_par_train_data_loader(rank,
                              args) -> (DataLoader, DistributedSampler, int):
    data_frame = read_train_dev_data_frame(file_path=args.data_path,
                                           json_fileName=args.train_data_name)
    data_size = data_frame.shape[0]
    if args.train_data_filtered == 1:
        data_frame = data_frame[data_frame['level'] != 'easy']
        logging.info('Filtered data by removing easy case {} to {}'.format(
            data_size, data_frame.shape[0]))
    elif args.train_data_filtered == 2:
        data_frame = data_frame[data_frame['level'] == 'hard']
        logging.info(
            'Filtered data by removing easy and medium case {} to {}'.format(
                data_size, data_frame.shape[0]))
    else:
        logging.info('Using all training data {}'.format(data_size))
    data_size = data_frame.shape[0]

    num_replicas = args.world_size
    tokenizer = LongformerTokenizer.from_pretrained(args.pretrained_cfg_name,
                                                    do_lower_case=True)
    hotpot_tensorizer = LongformerQATensorizer(tokenizer=tokenizer,
                                               max_length=args.max_ctx_len)
    dataset = HotpotTrainDataset(data_frame=data_frame,
                                 hotpot_tensorizer=hotpot_tensorizer,
                                 max_sent_num=args.max_sent_num)
    batch_size = args.batch_size // num_replicas
    logging.info('Each node batch size = {}'.format(batch_size))
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset=dataset, rank=rank, num_replicas=num_replicas)
    train_dataloader = DataLoader(dataset=dataset,
                                  batch_size=batch_size,
                                  num_workers=max(1, args.cpu_num // 2),
                                  collate_fn=HotpotTrainDataset.collate_fn,
                                  shuffle=False,
                                  pin_memory=True,
                                  sampler=train_sampler)
    return train_dataloader, train_sampler, data_size
Code Example #27
 def __init__(self, model_name: str = "allenai/longformer-base-4096"):
     self.model = LongformerModel.from_pretrained(model_name)
     self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
Code Example #28
            '5.0',
            '--per_gpu_eval_batch_size',
            '2',
            '--per_gpu_train_batch_size',
            '1',  # 32GB gpu with fp32
            '--gradient_accumulation_steps',
            '32',
            #'--evaluate_during_training', # this is removed to reduce training time
            '--do_train',
            '--do_eval',
        ])
    train_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_train.txt'
    val_fn = './Preprocessed_Data/Preproc0_clinical_sentences_all_with_number_val.txt'
    # these are small file for test
    #     train_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_train.txt'
    #     val_fn = './Preprocessed_Data/test_clinical_sentences_all_with_number_val.txt'
    training_args.val_datapath = val_fn
    training_args.train_datapath = train_fn

    ##################### use pretrained longformer from transformers
    longformer_model = LongformerForMaskedLM.from_pretrained(
        'allenai/longformer-base-4096')
    longformer_tokenizer = LongformerTokenizer.from_pretrained(
        'allenai/longformer-base-4096')

    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args, longformer_model, longformer_tokenizer, eval_only=False, model_path=None\
                          #,model_path=training_args.output_dir # Local path to the model if the model to train has been instantiated from a local path.

                         )
Code Example #29
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = LongformerTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = LongformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    #train_dataset  = torch.load(data_args.train_file_path)
    #eval_dataset = torch.load(data_args.valid_file_path)
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples,
                              data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples,
                             data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
Code Example #30
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=input_ids)  # masked_lm_labels was renamed to labels in transformers v4

loss, prediction_scores = outputs[:2]
print(prediction_scores)

## Longformer
from transformers import LongformerModel, LongformerTokenizer

model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(
    0)  # batch of size 1

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(
    input_ids.shape, dtype=torch.long,
    device=input_ids.device)  # initialize to local attention
attention_mask[:, [
    1,
    4,
    21,
]] = 2  # Set global attention based on the task. For example,
# classification: the <s> token
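Recent transformers releases split this 0/1/2 scheme into a binary attention_mask plus a separate global_attention_mask; an equivalent modern call (a sketch, not from the original snippet) looks like:

attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  # local attention everywhere
global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long)
global_attention_mask[:, [1, 4, 21]] = 1  # global attention on task-specific tokens
outputs = model(input_ids, attention_mask=attention_mask,
                global_attention_mask=global_attention_mask)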